aboutsummaryrefslogtreecommitdiff
path: root/fs/fs-writeback.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--fs/fs-writeback.c592
1 files changed, 318 insertions, 274 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5b4a9362d5a..be568b7311d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -14,7 +14,7 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
@@ -22,11 +22,11 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
-#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
+#include <linux/device.h>
#include "internal.h"
/*
@@ -46,17 +46,13 @@ struct wb_writeback_work {
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
+ unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
struct completion *done; /* set if the caller waits */
};
-/*
- * We don't actually have pdflush, but this one is exported though /proc...
- */
-int nr_pdflush_threads;
-
/**
* writeback_in_progress - determine whether there is writeback in progress
* @bdi: the device's backing_dev_info structure.
@@ -68,12 +64,13 @@ int writeback_in_progress(struct backing_dev_info *bdi)
{
return test_bit(BDI_writeback_running, &bdi->state);
}
+EXPORT_SYMBOL(writeback_in_progress);
static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- if (strcmp(sb->s_type->name, "bdev") == 0)
+ if (sb_is_blkdev_sb(sb))
return inode->i_mapping->backing_dev_info;
return sb->s_bdi;
@@ -92,18 +89,14 @@ static inline struct inode *wb_inode(struct list_head *head)
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>
-/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
+EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
+
+static void bdi_wakeup_thread(struct backing_dev_info *bdi)
{
- if (bdi->wb.task) {
- wake_up_process(bdi->wb.task);
- } else {
- /*
- * The bdi thread isn't there, wake up the forker thread which
- * will create and run it.
- */
- wake_up_process(default_backing_dev_info.wb.task);
- }
+ spin_lock_bh(&bdi->wb_lock);
+ if (test_bit(BDI_registered, &bdi->state))
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ spin_unlock_bh(&bdi->wb_lock);
}
static void bdi_queue_work(struct backing_dev_info *bdi,
@@ -112,10 +105,14 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
trace_writeback_queue(bdi, work);
spin_lock_bh(&bdi->wb_lock);
+ if (!test_bit(BDI_registered, &bdi->state)) {
+ if (work->done)
+ complete(work->done);
+ goto out_unlock;
+ }
list_add_tail(&work->list, &bdi->work_list);
- if (!bdi->wb.task)
- trace_writeback_nothread(bdi, work);
- bdi_wakeup_flusher(bdi);
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+out_unlock:
spin_unlock_bh(&bdi->wb_lock);
}
@@ -131,10 +128,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- if (bdi->wb.task) {
- trace_writeback_nowork(bdi);
- wake_up_process(bdi->wb.task);
- }
+ trace_writeback_nowork(bdi);
+ bdi_wakeup_thread(bdi);
return;
}
@@ -181,9 +176,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
* writeback as soon as there is no other work to do.
*/
trace_writeback_wake_background(bdi);
- spin_lock_bh(&bdi->wb_lock);
- bdi_wakeup_flusher(bdi);
- spin_unlock_bh(&bdi->wb_lock);
+ bdi_wakeup_thread(bdi);
}
/*
@@ -231,11 +224,10 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
static void inode_sync_complete(struct inode *inode)
{
- /*
- * Prevent speculative execution through
- * spin_unlock(&wb->list_lock);
- */
-
+ inode->i_state &= ~I_SYNC;
+ /* If inode is clean and unused, put it into LRU now... */
+ inode_add_lru(inode);
+ /* Waiters must see I_SYNC cleared before being woken up */
smp_mb();
wake_up_bit(&inode->i_state, __I_SYNC);
}
@@ -256,7 +248,8 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
}
/*
- * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
+ * Move expired (dirtied before work->older_than_this) dirty inodes from
+ * @delaying_queue to @dispatch_queue.
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
@@ -274,11 +267,13 @@ static int move_expired_inodes(struct list_head *delaying_queue,
if (work->older_than_this &&
inode_dirtied_after(inode, *work->older_than_this))
break;
+ list_move(&inode->i_wb_list, &tmp);
+ moved++;
+ if (sb_is_blkdev_sb(inode->i_sb))
+ continue;
if (sb && sb != inode->i_sb)
do_sb_sort = 1;
sb = inode->i_sb;
- list_move(&inode->i_wb_list, &tmp);
- moved++;
}
/* just one sb in list, splice to dispatch_queue and we're done */
@@ -322,16 +317,24 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
- return inode->i_sb->s_op->write_inode(inode, wbc);
+ int ret;
+
+ if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
+ trace_writeback_write_inode_start(inode, wbc);
+ ret = inode->i_sb->s_op->write_inode(inode, wbc);
+ trace_writeback_write_inode(inode, wbc);
+ return ret;
+ }
return 0;
}
/*
- * Wait for writeback on an inode to complete.
+ * Wait for writeback on an inode to complete. Called with i_lock held.
+ * Caller must make sure inode cannot go away when we drop i_lock.
*/
-static void inode_wait_for_writeback(struct inode *inode,
- struct bdi_writeback *wb)
+static void __inode_wait_for_writeback(struct inode *inode)
+ __releases(inode->i_lock)
+ __acquires(inode->i_lock)
{
DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
wait_queue_head_t *wqh;
@@ -339,79 +342,131 @@ static void inode_wait_for_writeback(struct inode *inode,
wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
while (inode->i_state & I_SYNC) {
spin_unlock(&inode->i_lock);
- spin_unlock(&wb->list_lock);
__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
- spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
}
}
/*
- * Write out an inode's dirty pages. Called under wb->list_lock and
- * inode->i_lock. Either the caller has an active reference on the inode or
- * the inode has I_WILL_FREE set.
- *
- * If `wait' is set, wait on the writeout.
- *
- * The whole writeout design is quite complex and fragile. We want to avoid
- * starvation of particular inodes when others are being redirtied, prevent
- * livelocks, etc.
+ * Wait for writeback on an inode to complete. Caller must have inode pinned.
*/
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
- struct writeback_control *wbc)
+void inode_wait_for_writeback(struct inode *inode)
{
- struct address_space *mapping = inode->i_mapping;
- long nr_to_write = wbc->nr_to_write;
- unsigned dirty;
- int ret;
+ spin_lock(&inode->i_lock);
+ __inode_wait_for_writeback(inode);
+ spin_unlock(&inode->i_lock);
+}
- assert_spin_locked(&wb->list_lock);
- assert_spin_locked(&inode->i_lock);
+/*
+ * Sleep until I_SYNC is cleared. This function must be called with i_lock
+ * held and drops it. It is aimed for callers not holding any inode reference
+ * so once i_lock is dropped, inode can go away.
+ */
+static void inode_sleep_on_writeback(struct inode *inode)
+ __releases(inode->i_lock)
+{
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
+ int sleep;
- if (!atomic_read(&inode->i_count))
- WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
- else
- WARN_ON(inode->i_state & I_WILL_FREE);
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ sleep = inode->i_state & I_SYNC;
+ spin_unlock(&inode->i_lock);
+ if (sleep)
+ schedule();
+ finish_wait(wqh, &wait);
+}
- if (inode->i_state & I_SYNC) {
+/*
+ * Find proper writeback list for the inode depending on its current state and
+ * possibly also change of its state while we were doing writeback. Here we
+ * handle things such as livelock prevention or fairness of writeback among
+ * inodes. This function can be called only by flusher thread - no one else
+ * processes all inodes in writeback lists and requeueing inodes behind flusher
+ * thread's back can have unexpected consequences.
+ */
+static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
+ struct writeback_control *wbc)
+{
+ if (inode->i_state & I_FREEING)
+ return;
+
+ /*
+ * Sync livelock prevention. Each inode is tagged and synced in one
+ * shot. If still dirty, it will be redirty_tail()'ed below. Update
+ * the dirty time to prevent enqueue and sync it again.
+ */
+ if ((inode->i_state & I_DIRTY) &&
+ (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+ inode->dirtied_when = jiffies;
+
+ if (wbc->pages_skipped) {
/*
- * If this inode is locked for writeback and we are not doing
- * writeback-for-data-integrity, move it to b_more_io so that
- * writeback can proceed with the other inodes on s_io.
- *
- * We'll have another go at writing back this inode when we
- * completed a full scan of b_io.
+ * writeback is not making progress due to locked
+ * buffers. Skip this inode for now.
+ */
+ redirty_tail(inode, wb);
+ return;
+ }
+
+ if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+ /*
+ * We didn't write back all the pages. nfs_writepages()
+ * sometimes bails out without doing anything.
*/
- if (wbc->sync_mode != WB_SYNC_ALL) {
+ if (wbc->nr_to_write <= 0) {
+ /* Slice used up. Queue for next turn. */
requeue_io(inode, wb);
- trace_writeback_single_inode_requeue(inode, wbc,
- nr_to_write);
- return 0;
+ } else {
+ /*
+ * Writeback blocked by something other than
+ * congestion. Delay the inode for some time to
+ * avoid spinning on the CPU (100% iowait)
+ * retrying writeback of the dirty page/inode
+ * that cannot be performed immediately.
+ */
+ redirty_tail(inode, wb);
}
-
+ } else if (inode->i_state & I_DIRTY) {
/*
- * It's a data-integrity sync. We must wait.
+ * Filesystems can dirty the inode during writeback operations,
+ * such as delayed allocation during submission or metadata
+ * updates after data IO completion.
*/
- inode_wait_for_writeback(inode, wb);
+ redirty_tail(inode, wb);
+ } else {
+ /* The inode is clean. Remove from writeback lists. */
+ list_del_init(&inode->i_wb_list);
}
+}
- BUG_ON(inode->i_state & I_SYNC);
+/*
+ * Write out an inode and its dirty pages. Do not update the writeback list
+ * linkage. That is left to the caller. The caller is also responsible for
+ * setting I_SYNC flag and calling inode_sync_complete() to clear it.
+ */
+static int
+__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct address_space *mapping = inode->i_mapping;
+ long nr_to_write = wbc->nr_to_write;
+ unsigned dirty;
+ int ret;
- /* Set I_SYNC, reset I_DIRTY_PAGES */
- inode->i_state |= I_SYNC;
- inode->i_state &= ~I_DIRTY_PAGES;
- spin_unlock(&inode->i_lock);
- spin_unlock(&wb->list_lock);
+ WARN_ON(!(inode->i_state & I_SYNC));
+
+ trace_writeback_single_inode_start(inode, wbc, nr_to_write);
ret = do_writepages(mapping, wbc);
/*
* Make sure to wait on the data before writing out the metadata.
* This is important for filesystems that modify metadata on data
- * I/O completion.
+ * I/O completion. We don't do it for sync(2) writeback because it has a
+ * separate, external IO completion path and ->sync_fs for guaranteeing
+ * inode metadata is written back correctly.
*/
- if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
@@ -423,6 +478,9 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* write_inode()
*/
spin_lock(&inode->i_lock);
+ /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ inode->i_state &= ~I_DIRTY_PAGES;
dirty = inode->i_state & I_DIRTY;
inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
spin_unlock(&inode->i_lock);
@@ -432,60 +490,70 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
if (ret == 0)
ret = err;
}
+ trace_writeback_single_inode(inode, wbc, nr_to_write);
+ return ret;
+}
+
+/*
+ * Write out an inode's dirty pages. Either the caller has an active reference
+ * on the inode or the inode has I_WILL_FREE set.
+ *
+ * This function is designed to be called for writing back one inode which
+ * we got e.g. from filesystem. Flusher thread uses __writeback_single_inode()
+ * and does more profound writeback list handling in writeback_sb_inodes().
+ */
+static int
+writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
- spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_SYNC;
- if (!(inode->i_state & I_FREEING)) {
+ if (!atomic_read(&inode->i_count))
+ WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
+ else
+ WARN_ON(inode->i_state & I_WILL_FREE);
+
+ if (inode->i_state & I_SYNC) {
+ if (wbc->sync_mode != WB_SYNC_ALL)
+ goto out;
/*
- * Sync livelock prevention. Each inode is tagged and synced in
- * one shot. If still dirty, it will be redirty_tail()'ed below.
- * Update the dirty time to prevent enqueue and sync it again.
+ * It's a data-integrity sync. We must wait. Since callers hold
+ * inode reference or inode has I_WILL_FREE set, it cannot go
+ * away under us.
*/
- if ((inode->i_state & I_DIRTY) &&
- (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
- inode->dirtied_when = jiffies;
-
- if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- /*
- * We didn't write back all the pages. nfs_writepages()
- * sometimes bales out without doing anything.
- */
- inode->i_state |= I_DIRTY_PAGES;
- if (wbc->nr_to_write <= 0) {
- /*
- * slice used up: queue for next turn
- */
- requeue_io(inode, wb);
- } else {
- /*
- * Writeback blocked by something other than
- * congestion. Delay the inode for some time to
- * avoid spinning on the CPU (100% iowait)
- * retrying writeback of the dirty page/inode
- * that cannot be performed immediately.
- */
- redirty_tail(inode, wb);
- }
- } else if (inode->i_state & I_DIRTY) {
- /*
- * Filesystems can dirty the inode during writeback
- * operations, such as delayed allocation during
- * submission or metadata updates after data IO
- * completion.
- */
- redirty_tail(inode, wb);
- } else {
- /*
- * The inode is clean. At this point we either have
- * a reference to the inode or it's on it's way out.
- * No need to add it back to the LRU.
- */
- list_del_init(&inode->i_wb_list);
- }
+ __inode_wait_for_writeback(inode);
}
+ WARN_ON(inode->i_state & I_SYNC);
+ /*
+ * Skip inode if it is clean and we have no outstanding writeback in
+ * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
+ * function since flusher thread may be doing for example sync in
+ * parallel and if we move the inode, it could get skipped. So here we
+ * make sure inode is on some writeback list and leave it there unless
+ * we have completely cleaned the inode.
+ */
+ if (!(inode->i_state & I_DIRTY) &&
+ (wbc->sync_mode != WB_SYNC_ALL ||
+ !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
+ goto out;
+ inode->i_state |= I_SYNC;
+ spin_unlock(&inode->i_lock);
+
+ ret = __writeback_single_inode(inode, wbc);
+
+ spin_lock(&wb->list_lock);
+ spin_lock(&inode->i_lock);
+ /*
+ * If inode is clean, remove it from writeback lists. Otherwise don't
+ * touch it. See comment above for explanation.
+ */
+ if (!(inode->i_state & I_DIRTY))
+ list_del_init(&inode->i_wb_list);
+ spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
- trace_writeback_single_inode(inode, wbc, nr_to_write);
+out:
+ spin_unlock(&inode->i_lock);
return ret;
}
@@ -523,10 +591,6 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
/*
* Write a portion of b_io inodes which belong to @sb.
*
- * If @only_this_sb is true, then find and write all such
- * inodes. Otherwise write only ones which go sequentially
- * in reverse order.
- *
* Return the number of pages and/or inodes written.
*/
static long writeback_sb_inodes(struct super_block *sb,
@@ -538,6 +602,7 @@ static long writeback_sb_inodes(struct super_block *sb,
.tagged_writepages = work->tagged_writepages,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
+ .for_sync = work->for_sync,
.range_cyclic = work->range_cyclic,
.range_start = 0,
.range_end = LLONG_MAX,
@@ -569,8 +634,8 @@ static long writeback_sb_inodes(struct super_block *sb,
}
/*
- * Don't bother with new inodes or inodes beeing freed, first
- * kind does not need peridic writeout yet, and for the latter
+ * Don't bother with new inodes or inodes being freed, first
+ * kind does not need periodic writeout yet, and for the latter
* kind writeout is handled by the freer.
*/
spin_lock(&inode->i_lock);
@@ -579,29 +644,58 @@ static long writeback_sb_inodes(struct super_block *sb,
redirty_tail(inode, wb);
continue;
}
- __iget(inode);
+ if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
+ /*
+ * If this inode is locked for writeback and we are not
+ * doing writeback-for-data-integrity, move it to
+ * b_more_io so that writeback can proceed with the
+ * other inodes on s_io.
+ *
+ * We'll have another go at writing back this inode
+ * when we completed a full scan of b_io.
+ */
+ spin_unlock(&inode->i_lock);
+ requeue_io(inode, wb);
+ trace_writeback_sb_inodes_requeue(inode);
+ continue;
+ }
+ spin_unlock(&wb->list_lock);
+
+ /*
+ * We already requeued the inode if it had I_SYNC set and we
+ * are doing WB_SYNC_NONE writeback. So this catches only the
+ * WB_SYNC_ALL case.
+ */
+ if (inode->i_state & I_SYNC) {
+ /* Wait for I_SYNC. This function drops i_lock... */
+ inode_sleep_on_writeback(inode);
+ /* Inode may be gone, start again */
+ spin_lock(&wb->list_lock);
+ continue;
+ }
+ inode->i_state |= I_SYNC;
+ spin_unlock(&inode->i_lock);
+
write_chunk = writeback_chunk_size(wb->bdi, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
- writeback_single_inode(inode, wb, &wbc);
+ /*
+ * We use I_SYNC to pin the inode in memory. While it is set
+ * evict_inode() will wait so the inode cannot be freed.
+ */
+ __writeback_single_inode(inode, &wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
+ spin_lock(&wb->list_lock);
+ spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY))
wrote++;
- if (wbc.pages_skipped) {
- /*
- * writeback is not making progress due to locked
- * buffers. Skip this inode for now.
- */
- redirty_tail(inode, wb);
- }
+ requeue_inode(inode, wb, &wbc);
+ inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&wb->list_lock);
- iput(inode);
- cond_resched();
- spin_lock(&wb->list_lock);
+ cond_resched_lock(&wb->list_lock);
/*
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
@@ -650,7 +744,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
return wrote;
}
-long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
enum wb_reason reason)
{
struct wb_writeback_work work = {
@@ -795,8 +889,10 @@ static long wb_writeback(struct bdi_writeback *wb,
trace_writeback_wait(wb->bdi, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
- inode_wait_for_writeback(inode, wb);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&wb->list_lock);
+ /* This function drops i_lock... */
+ inode_sleep_on_writeback(inode);
+ spin_lock(&wb->list_lock);
}
}
spin_unlock(&wb->list_lock);
@@ -888,7 +984,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
/*
* Retrieve work items and do the writeback they describe
*/
-long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+static long wb_do_writeback(struct bdi_writeback *wb)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
@@ -896,12 +992,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) {
- /*
- * Override sync mode, in case we must wait for completion
- * because this thread is exiting now.
- */
- if (force_wait)
- work->sync_mode = WB_SYNC_ALL;
trace_writeback_exec(bdi, work);
@@ -929,66 +1019,49 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
/*
* Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
+ * reschedules periodically and does kupdated style flushing.
*/
-int bdi_writeback_thread(void *data)
+void bdi_writeback_workfn(struct work_struct *work)
{
- struct bdi_writeback *wb = data;
+ struct bdi_writeback *wb = container_of(to_delayed_work(work),
+ struct bdi_writeback, dwork);
struct backing_dev_info *bdi = wb->bdi;
long pages_written;
+ set_worker_desc("flush-%s", dev_name(bdi->dev));
current->flags |= PF_SWAPWRITE;
- set_freezable();
- wb->last_active = jiffies;
-
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(current, 0);
- trace_writeback_thread_start(bdi);
-
- while (!kthread_freezable_should_stop(NULL)) {
+ if (likely(!current_is_workqueue_rescuer() ||
+ !test_bit(BDI_registered, &bdi->state))) {
/*
- * Remove own delayed wake-up timer, since we are already awake
- * and we'll take care of the preriodic write-back.
+ * The normal path. Keep writing back @bdi until its
+ * work_list is empty. Note that this path is also taken
+ * if @bdi is shutting down even when we're running off the
+ * rescuer as work_list needs to be drained.
*/
- del_timer(&wb->wakeup_timer);
-
- pages_written = wb_do_writeback(wb, 0);
-
+ do {
+ pages_written = wb_do_writeback(wb);
+ trace_writeback_pages_written(pages_written);
+ } while (!list_empty(&bdi->work_list));
+ } else {
+ /*
+ * bdi_wq can't get enough workers and we're running off
+ * the emergency worker. Don't hog it. Hopefully, 1024 is
+ * enough for efficient IO.
+ */
+ pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+ WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);
-
- if (pages_written)
- wb->last_active = jiffies;
-
- set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- continue;
- }
-
- if (wb_has_dirty_io(wb) && dirty_writeback_interval)
- schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
- else {
- /*
- * We have nothing to do, so can go sleep without any
- * timeout and save power. When a work is queued or
- * something is made dirty - we will be woken up.
- */
- schedule();
- }
}
- /* Flush any work that raced with us exiting */
if (!list_empty(&bdi->work_list))
- wb_do_writeback(wb, 1);
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
+ else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+ bdi_wakeup_thread_delayed(bdi);
- trace_writeback_thread_stop(bdi);
- return 0;
+ current->flags &= ~PF_SWAPWRITE;
}
-
/*
* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
* the whole world.
@@ -997,10 +1070,8 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
{
struct backing_dev_info *bdi;
- if (!nr_pages) {
- nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- }
+ if (!nr_pages)
+ nr_pages = get_nr_dirty_pages();
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
@@ -1067,8 +1138,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* dirty the inode itself
*/
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ trace_writeback_dirty_inode_start(inode, flags);
+
if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode, flags);
+
+ trace_writeback_dirty_inode(inode, flags);
}
/*
@@ -1117,6 +1192,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
bool wakeup_bdi = false;
bdi = inode_to_bdi(inode);
+ spin_unlock(&inode->i_lock);
+ spin_lock(&bdi->wb.list_lock);
if (bdi_cap_writeback_dirty(bdi)) {
WARN(!test_bit(BDI_registered, &bdi->state),
"bdi-%s not registered\n", bdi->name);
@@ -1131,8 +1208,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
wakeup_bdi = true;
}
- spin_unlock(&inode->i_lock);
- spin_lock(&bdi->wb.list_lock);
inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
spin_unlock(&bdi->wb.list_lock);
@@ -1148,23 +1223,6 @@ out_unlock_inode:
}
EXPORT_SYMBOL(__mark_inode_dirty);
-/*
- * Write out a superblock's list of dirty inodes. A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched. For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * The inodes to be written are parked on bdi->b_io. They are moved back onto
- * bdi->b_dirty as they are selected for writing. This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
- */
static void wait_sb_inodes(struct super_block *sb)
{
struct inode *inode, *old_inode = NULL;
@@ -1242,6 +1300,8 @@ void writeback_inodes_sb_nr(struct super_block *sb,
.reason = reason,
};
+ if (sb->s_bdi == &noop_backing_dev_info)
+ return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
bdi_queue_work(sb->s_bdi, &work);
wait_for_completion(&done);
@@ -1264,47 +1324,43 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
EXPORT_SYMBOL(writeback_inodes_sb);
/**
- * writeback_inodes_sb_if_idle - start writeback if none underway
+ * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
* @sb: the superblock
- * @reason: reason why some writeback work was initiated
+ * @nr: the number of pages to write
+ * @reason: the reason of writeback
*
- * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
* Returns 1 if writeback was started, 0 if not.
*/
-int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
+int try_to_writeback_inodes_sb_nr(struct super_block *sb,
+ unsigned long nr,
+ enum wb_reason reason)
{
- if (!writeback_in_progress(sb->s_bdi)) {
- down_read(&sb->s_umount);
- writeback_inodes_sb(sb, reason);
- up_read(&sb->s_umount);
+ if (writeback_in_progress(sb->s_bdi))
return 1;
- } else
+
+ if (!down_read_trylock(&sb->s_umount))
return 0;
+
+ writeback_inodes_sb_nr(sb, nr, reason);
+ up_read(&sb->s_umount);
+ return 1;
}
-EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
/**
- * writeback_inodes_sb_if_idle - start writeback if none underway
+ * try_to_writeback_inodes_sb - try to start writeback if none underway
* @sb: the superblock
- * @nr: the number of pages to write
* @reason: reason why some writeback work was initiated
*
- * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Implemented by try_to_writeback_inodes_sb_nr()
* Returns 1 if writeback was started, 0 if not.
*/
-int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
- unsigned long nr,
- enum wb_reason reason)
+int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
- if (!writeback_in_progress(sb->s_bdi)) {
- down_read(&sb->s_umount);
- writeback_inodes_sb_nr(sb, nr, reason);
- up_read(&sb->s_umount);
- return 1;
- } else
- return 0;
+ return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
-EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+EXPORT_SYMBOL(try_to_writeback_inodes_sb);
/**
* sync_inodes_sb - sync sb inode pages
@@ -1323,8 +1379,12 @@ void sync_inodes_sb(struct super_block *sb)
.range_cyclic = 0,
.done = &done,
.reason = WB_REASON_SYNC,
+ .for_sync = 1,
};
+ /* Nothing to do? */
+ if (sb->s_bdi == &noop_backing_dev_info)
+ return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
bdi_queue_work(sb->s_bdi, &work);
@@ -1347,7 +1407,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
int write_inode_now(struct inode *inode, int sync)
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- int ret;
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -1359,14 +1418,7 @@ int write_inode_now(struct inode *inode, int sync)
wbc.nr_to_write = 0;
might_sleep();
- spin_lock(&wb->list_lock);
- spin_lock(&inode->i_lock);
- ret = writeback_single_inode(inode, wb, &wbc);
- spin_unlock(&inode->i_lock);
- spin_unlock(&wb->list_lock);
- if (sync)
- inode_sync_wait(inode);
- return ret;
+ return writeback_single_inode(inode, wb, &wbc);
}
EXPORT_SYMBOL(write_inode_now);
@@ -1383,15 +1435,7 @@ EXPORT_SYMBOL(write_inode_now);
*/
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- int ret;
-
- spin_lock(&wb->list_lock);
- spin_lock(&inode->i_lock);
- ret = writeback_single_inode(inode, wb, wbc);
- spin_unlock(&inode->i_lock);
- spin_unlock(&wb->list_lock);
- return ret;
+ return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
}
EXPORT_SYMBOL(sync_inode);