From 0c38a2512df272b14ef4238b476a2e4f70da1479 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 25 Aug 2011 07:17:01 +0000
Subject: xfs: don't serialise direct IO reads on page cache checks

There is no need to grab the i_mutex of the IO lock in exclusive
mode if we don't need to invalidate the page cache. Taking these
locks on every direct IO effective serialises them as taking the IO
lock in exclusive mode has to wait for all shared holders to drop
the lock. That only happens when IO is complete, so effective it
prevents dispatch of concurrent direct IO reads to the same inode.

Fix this by taking the IO lock shared to check the page cache state,
and only then drop it and take the IO lock exclusively if there is
work to be done. Hence for the normal direct IO case, no exclusive
locking will occur.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Joern Engel <joern@logfs.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_file.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7f7b42469ea..8fd4a0708d3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -317,7 +317,19 @@ xfs_file_aio_read(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (unlikely(ioflags & IO_ISDIRECT)) {
+	/*
+	 * Locking is a bit tricky here. If we take an exclusive lock
+	 * for direct IO, we effectively serialise all new concurrent
+	 * read IO to this file and block it behind IO that is currently in
+	 * progress because IO in progress holds the IO lock shared. We only
+	 * need to hold the lock exclusive to blow away the page cache, so
+	 * only take lock exclusively if the page cache needs invalidation.
+	 * This allows the normal direct IO case of no page cache pages to
+	 * proceeed concurrently without serialisation.
+	 */
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
 		if (inode->i_mapping->nrpages) {
@@ -330,8 +342,7 @@ xfs_file_aio_read(
 			}
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-	} else
-		xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	}
 
 	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
 
-- 
cgit v1.2.3-18-g5258


From 7271d243f9d1b4106289e4cf876c8b1203de59ab Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 25 Aug 2011 07:17:02 +0000
Subject: xfs: don't serialise adjacent concurrent direct IO appending writes

For append write workloads, extending the file requires a certain
amount of exclusive locking to be done up front to ensure sanity in
things like ensuring that we've zeroed any allocated regions
between the old EOF and the start of the new IO.

For single threads, this typically isn't a problem, and for large
IOs we don't serialise enough for it to be a problem for two
threads on really fast block devices. However for smaller IO and
larger thread counts we have a problem.

Take 4 concurrent sequential, single block sized and aligned IOs.
After the first IO is submitted but before it completes, we end up
with this state:

        IO 1    IO 2    IO 3    IO 4
      +-------+-------+-------+-------+
      ^       ^
      |       |
      |       |
      |       |
      |       \- ip->i_new_size
      \- ip->i_size

And the IO is done without exclusive locking because offset <=
ip->i_size. When we submit IO 2, we see offset > ip->i_size, and
grab the IO lock exclusive, because there is a chance we need to do
EOF zeroing. However, there is already an IO in progress that avoids
the need for IO zeroing because offset <= ip->i_new_size. hence we
could avoid holding the IO lock exlcusive for this. Hence after
submission of the second IO, we'd end up this state:

        IO 1    IO 2    IO 3    IO 4
      +-------+-------+-------+-------+
      ^               ^
      |               |
      |               |
      |               |
      |               \- ip->i_new_size
      \- ip->i_size

There is no need to grab the i_mutex of the IO lock in exclusive
mode if we don't need to invalidate the page cache. Taking these
locks on every direct IO effective serialises them as taking the IO
lock in exclusive mode has to wait for all shared holders to drop
the lock. That only happens when IO is complete, so effective it
prevents dispatch of concurrent direct IO writes to the same inode.

And so you can see that for the third concurrent IO, we'd avoid
exclusive locking for the same reason we avoided the exclusive lock
for the second IO.

Fixing this is a bit more complex than that, because we need to hold
a write-submission local value of ip->i_new_size to that clearing
the value is only done if no other thread has updated it before our
IO completes.....

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_file.c | 68 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 16 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8fd4a0708d3..cbbac5cc9c2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -418,11 +418,13 @@ xfs_aio_write_isize_update(
  */
 STATIC void
 xfs_aio_write_newsize_update(
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	xfs_fsize_t		new_size)
 {
-	if (ip->i_new_size) {
+	if (new_size == ip->i_new_size) {
 		xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-		ip->i_new_size = 0;
+		if (new_size == ip->i_new_size)
+			ip->i_new_size = 0;
 		if (ip->i_d.di_size > ip->i_size)
 			ip->i_d.di_size = ip->i_size;
 		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
@@ -473,7 +475,7 @@ xfs_file_splice_write(
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
 
 	xfs_aio_write_isize_update(inode, ppos, ret);
-	xfs_aio_write_newsize_update(ip);
+	xfs_aio_write_newsize_update(ip, new_size);
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	return ret;
 }
@@ -670,6 +672,7 @@ xfs_file_aio_write_checks(
 	struct file		*file,
 	loff_t			*pos,
 	size_t			*count,
+	xfs_fsize_t		*new_sizep,
 	int			*iolock)
 {
 	struct inode		*inode = file->f_mapping->host;
@@ -677,6 +680,8 @@ xfs_file_aio_write_checks(
 	xfs_fsize_t		new_size;
 	int			error = 0;
 
+	*new_sizep = 0;
+restart:
 	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
 	if (error) {
 		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
@@ -684,20 +689,41 @@ xfs_file_aio_write_checks(
 		return error;
 	}
 
-	new_size = *pos + *count;
-	if (new_size > ip->i_size)
-		ip->i_new_size = new_size;
-
 	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
 		file_update_time(file);
 
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
-	 * write.
+	 * write. There is no need to issue zeroing if another in-flght IO ends
+	 * at or before this one If zeronig is needed and we are currently
+	 * holding the iolock shared, we need to update it to exclusive which
+	 * involves dropping all locks and relocking to maintain correct locking
+	 * order. If we do this, restart the function to ensure all checks and
+	 * values are still valid.
 	 */
-	if (*pos > ip->i_size)
+	if ((ip->i_new_size && *pos > ip->i_new_size) ||
+	    (!ip->i_new_size && *pos > ip->i_size)) {
+		if (*iolock == XFS_IOLOCK_SHARED) {
+			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+			*iolock = XFS_IOLOCK_EXCL;
+			xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+			goto restart;
+		}
 		error = -xfs_zero_eof(ip, *pos, ip->i_size);
+	}
+
+	/*
+	 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
+	 * We have already zeroed space beyond EOF (if necessary).  Only update
+	 * ip->i_new_size if this IO ends beyond any other in-flight writes.
+	 */
+	new_size = *pos + *count;
+	if (new_size > ip->i_size) {
+		if (new_size > ip->i_new_size)
+			ip->i_new_size = new_size;
+		*new_sizep = new_size;
+	}
 
 	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 	if (error)
@@ -744,6 +770,7 @@ xfs_file_dio_aio_write(
 	unsigned long		nr_segs,
 	loff_t			pos,
 	size_t			ocount,
+	xfs_fsize_t		*new_size,
 	int			*iolock)
 {
 	struct file		*file = iocb->ki_filp;
@@ -764,13 +791,20 @@ xfs_file_dio_aio_write(
 	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
 		unaligned_io = 1;
 
-	if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+	/*
+	 * We don't need to take an exclusive lock unless there page cache needs
+	 * to be invalidated or unaligned IO is being executed. We don't need to
+	 * consider the EOF extension case here because
+	 * xfs_file_aio_write_checks() will relock the inode as necessary for
+	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 */
+	if (unaligned_io || mapping->nrpages)
 		*iolock = XFS_IOLOCK_EXCL;
 	else
 		*iolock = XFS_IOLOCK_SHARED;
 	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
 	if (ret)
 		return ret;
 
@@ -809,6 +843,7 @@ xfs_file_buffered_aio_write(
 	unsigned long		nr_segs,
 	loff_t			pos,
 	size_t			ocount,
+	xfs_fsize_t		*new_size,
 	int			*iolock)
 {
 	struct file		*file = iocb->ki_filp;
@@ -822,7 +857,7 @@ xfs_file_buffered_aio_write(
 	*iolock = XFS_IOLOCK_EXCL;
 	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
 	if (ret)
 		return ret;
 
@@ -862,6 +897,7 @@ xfs_file_aio_write(
 	ssize_t			ret;
 	int			iolock;
 	size_t			ocount = 0;
+	xfs_fsize_t		new_size = 0;
 
 	XFS_STATS_INC(xs_write_calls);
 
@@ -881,10 +917,10 @@ xfs_file_aio_write(
 
 	if (unlikely(file->f_flags & O_DIRECT))
 		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &iolock);
+						ocount, &new_size, &iolock);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &iolock);
+						ocount, &new_size, &iolock);
 
 	xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
 
@@ -905,7 +941,7 @@ xfs_file_aio_write(
 	}
 
 out_unlock:
-	xfs_aio_write_newsize_update(ip);
+	xfs_aio_write_newsize_update(ip, new_size);
 	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
-- 
cgit v1.2.3-18-g5258


From 375ec69d2ef6e0797f19f5823e36e249765c3d41 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:03 +0000
Subject: xfs: remove delwri buffer handling from xfs_buf_iorequest

We cannot ever reach xfs_buf_iorequest for a buffer with XBF_DELWRI set,
given that all write handlers make sure that the buffer is remove from
the delwri queue before, and we never do reads with the XBF_DELWRI flag
set (which the code would not handle correctly anyway).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_buf.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c57836dc778..2e71a26da22 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1275,15 +1275,10 @@ xfs_buf_iorequest(
 {
 	trace_xfs_buf_iorequest(bp, _RET_IP_);
 
-	if (bp->b_flags & XBF_DELWRI) {
-		xfs_buf_delwri_queue(bp, 1);
-		return 0;
-	}
+	ASSERT(!(bp->b_flags & XBF_DELWRI));
 
-	if (bp->b_flags & XBF_WRITE) {
+	if (bp->b_flags & XBF_WRITE)
 		xfs_buf_wait_unpin(bp);
-	}
-
 	xfs_buf_hold(bp);
 
 	/* Set the count to 1 initially, this will stop an I/O
-- 
cgit v1.2.3-18-g5258


From 527cfdf19dd538a5a9e46b9bed0f30a38c28438d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:04 +0000
Subject: xfs: remove the unlock argument to xfs_buf_delwri_queue

We can just unlock the buffer in the caller, and the decrement of b_hold
would also be needed in the !unlock, we just never hit that case currently
given that the caller handles that case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_buf.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 2e71a26da22..04689dbbcbb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -43,7 +43,7 @@
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
+STATIC void xfs_buf_delwri_queue(xfs_buf_t *);
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -940,7 +940,7 @@ xfs_buf_unlock(
 	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
 		atomic_inc(&bp->b_hold);
 		bp->b_flags |= XBF_ASYNC;
-		xfs_buf_delwri_queue(bp, 0);
+		xfs_buf_delwri_queue(bp);
 	}
 
 	XB_CLEAR_OWNER(bp);
@@ -1049,7 +1049,8 @@ xfs_bdwrite(
 	bp->b_flags &= ~XBF_READ;
 	bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
 
-	xfs_buf_delwri_queue(bp, 1);
+	xfs_buf_delwri_queue(bp);
+	xfs_buf_unlock(bp);
 }
 
 /*
@@ -1562,8 +1563,7 @@ error:
  */
 STATIC void
 xfs_buf_delwri_queue(
-	xfs_buf_t		*bp,
-	int			unlock)
+	xfs_buf_t		*bp)
 {
 	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
 	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
@@ -1576,8 +1576,7 @@ xfs_buf_delwri_queue(
 	/* If already in the queue, dequeue and place at tail */
 	if (!list_empty(&bp->b_list)) {
 		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-		if (unlock)
-			atomic_dec(&bp->b_hold);
+		atomic_dec(&bp->b_hold);
 		list_del(&bp->b_list);
 	}
 
@@ -1590,9 +1589,6 @@ xfs_buf_delwri_queue(
 	list_add_tail(&bp->b_list, dwq);
 	bp->b_queuetime = jiffies;
 	spin_unlock(dwlk);
-
-	if (unlock)
-		xfs_buf_unlock(bp);
 }
 
 void
-- 
cgit v1.2.3-18-g5258


From 5a8ee6bafdd0ab8555adceac8b2cec539a552a1f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:05 +0000
Subject: xfs: move more delwri setup into xfs_buf_delwri_queue

Do not transfer a reference held by the caller to the buffer on the list,
or decrement it in xfs_buf_delwri_queue, but instead grab a new reference
if needed, and let the caller drop its own reference.  Also move setting
of the XBF_DELWRI and XBF_ASYNC flags into xfs_buf_delwri_queue, and
only do it if needed.  Note that for now xfs_buf_unlock already has
XBF_DELWRI, but that will change in the following patches.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_buf.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 04689dbbcbb..86c0945053c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -937,11 +937,8 @@ void
 xfs_buf_unlock(
 	struct xfs_buf		*bp)
 {
-	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
-		atomic_inc(&bp->b_hold);
-		bp->b_flags |= XBF_ASYNC;
+	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI)
 		xfs_buf_delwri_queue(bp);
-	}
 
 	XB_CLEAR_OWNER(bp);
 	up(&bp->b_sema);
@@ -1046,11 +1043,8 @@ xfs_bdwrite(
 {
 	trace_xfs_buf_bdwrite(bp, _RET_IP_);
 
-	bp->b_flags &= ~XBF_READ;
-	bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
-
 	xfs_buf_delwri_queue(bp);
-	xfs_buf_unlock(bp);
+	xfs_buf_relse(bp);
 }
 
 /*
@@ -1570,23 +1564,22 @@ xfs_buf_delwri_queue(
 
 	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
 
-	ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
+	ASSERT(!(bp->b_flags & XBF_READ));
 
 	spin_lock(dwlk);
-	/* If already in the queue, dequeue and place at tail */
 	if (!list_empty(&bp->b_list)) {
+		/* if already in the queue, move it to the tail */
 		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-		atomic_dec(&bp->b_hold);
-		list_del(&bp->b_list);
-	}
-
-	if (list_empty(dwq)) {
+		list_move_tail(&bp->b_list, dwq);
+	} else {
 		/* start xfsbufd as it is about to have something to do */
-		wake_up_process(bp->b_target->bt_task);
-	}
+		if (list_empty(dwq))
+			wake_up_process(bp->b_target->bt_task);
 
-	bp->b_flags |= _XBF_DELWRI_Q;
-	list_add_tail(&bp->b_list, dwq);
+		atomic_inc(&bp->b_hold);
+		bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
+		list_add_tail(&bp->b_list, dwq);
+	}
 	bp->b_queuetime = jiffies;
 	spin_unlock(dwlk);
 }
-- 
cgit v1.2.3-18-g5258


From 61551f1ee536289084a4a8f1c4f187e2f371c440 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:06 +0000
Subject: xfs: call xfs_buf_delwri_queue directly

Unify the ways we add buffers to the delwri queue by always calling
xfs_buf_delwri_queue directly.  The xfs_bdwrite functions is removed and
opencoded in its callers, and the two places setting XBF_DELWRI while a
buffer is locked and expecting xfs_buf_unlock to pick it up are converted
to call xfs_buf_delwri_queue directly, too.  Also replace the
XFS_BUF_UNDELAYWRITE macro with direct calls to xfs_buf_delwri_dequeue
to make the explicit queuing/dequeuing more obvious.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_attr.c        |  2 +-
 fs/xfs/xfs_buf.c         | 21 +++------------------
 fs/xfs/xfs_buf.h         |  8 +++-----
 fs/xfs/xfs_buf_item.c    |  4 ++--
 fs/xfs/xfs_dquot.c       |  6 ++++--
 fs/xfs/xfs_inode.c       |  6 ++++--
 fs/xfs/xfs_log_recover.c |  9 ++++++---
 fs/xfs/xfs_mount.c       |  2 +-
 fs/xfs/xfs_qm.c          |  3 ++-
 fs/xfs/xfs_rw.c          |  2 +-
 fs/xfs/xfs_trace.h       |  1 -
 fs/xfs/xfs_trans_buf.c   |  5 +++--
 12 files changed, 30 insertions(+), 39 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 160bcdc34a6..b097fd5a2b3 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2189,7 +2189,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
 		if (bp) {
 			XFS_BUF_STALE(bp);
-			XFS_BUF_UNDELAYWRITE(bp);
+			xfs_buf_delwri_dequeue(bp);
 			xfs_buf_relse(bp);
 			bp = NULL;
 		}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 86c0945053c..309eca75fad 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -43,7 +43,6 @@
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC void xfs_buf_delwri_queue(xfs_buf_t *);
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -937,9 +936,6 @@ void
 xfs_buf_unlock(
 	struct xfs_buf		*bp)
 {
-	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI)
-		xfs_buf_delwri_queue(bp);
-
 	XB_CLEAR_OWNER(bp);
 	up(&bp->b_sema);
 
@@ -1036,17 +1032,6 @@ xfs_bwrite(
 	return error;
 }
 
-void
-xfs_bdwrite(
-	void			*mp,
-	struct xfs_buf		*bp)
-{
-	trace_xfs_buf_bdwrite(bp, _RET_IP_);
-
-	xfs_buf_delwri_queue(bp);
-	xfs_buf_relse(bp);
-}
-
 /*
  * Called when we want to stop a buffer from getting written or read.
  * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
@@ -1069,7 +1054,7 @@ xfs_bioerror(
 	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
 	 */
 	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
+	xfs_buf_delwri_dequeue(bp);
 	XFS_BUF_UNDONE(bp);
 	XFS_BUF_STALE(bp);
 
@@ -1098,7 +1083,7 @@ xfs_bioerror_relse(
 	 * change that interface.
 	 */
 	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
+	xfs_buf_delwri_dequeue(bp);
 	XFS_BUF_DONE(bp);
 	XFS_BUF_STALE(bp);
 	bp->b_iodone = NULL;
@@ -1555,7 +1540,7 @@ error:
 /*
  *	Delayed write buffer handling
  */
-STATIC void
+void
 xfs_buf_delwri_queue(
 	xfs_buf_t		*bp)
 {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 620972b8094..f1a8933becb 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -198,7 +198,6 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
-extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
 
 extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
 extern int xfs_bdstrat_cb(struct xfs_buf *);
@@ -221,8 +220,9 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
 
 /* Delayed Write Buffer Routines */
-extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
-extern void xfs_buf_delwri_promote(xfs_buf_t *);
+extern void xfs_buf_delwri_queue(struct xfs_buf *);
+extern void xfs_buf_delwri_dequeue(struct xfs_buf *);
+extern void xfs_buf_delwri_promote(struct xfs_buf *);
 
 /* Buffer Daemon Setup Routines */
 extern int xfs_buf_init(void);
@@ -251,8 +251,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
 					XFS_BUF_DONE(bp);	\
 				} while (0)
 
-#define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
-#define XFS_BUF_UNDELAYWRITE(bp)	xfs_buf_delwri_dequeue(bp)
 #define XFS_BUF_ISDELAYWRITE(bp)	((bp)->b_flags & XBF_DELWRI)
 
 #define XFS_BUF_DONE(bp)	((bp)->b_flags |= XBF_DONE)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index cac2ecfa674..3243083d869 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -992,7 +992,7 @@ xfs_buf_iodone_callbacks(
 		xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
 
 		if (!XFS_BUF_ISSTALE(bp)) {
-			XFS_BUF_DELAYWRITE(bp);
+			xfs_buf_delwri_queue(bp);
 			XFS_BUF_DONE(bp);
 		}
 		ASSERT(bp->b_iodone != NULL);
@@ -1007,7 +1007,7 @@ xfs_buf_iodone_callbacks(
 	 */
 	XFS_BUF_STALE(bp);
 	XFS_BUF_DONE(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
+	xfs_buf_delwri_dequeue(bp);
 
 	trace_xfs_buf_error_relse(bp, _RET_IP_);
 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index db62959bed1..0f78dd46415 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1243,8 +1243,10 @@ xfs_qm_dqflush(
 
 	if (flags & SYNC_WAIT)
 		error = xfs_bwrite(mp, bp);
-	else
-		xfs_bdwrite(mp, bp);
+	else {
+		xfs_buf_delwri_queue(bp);
+		xfs_buf_relse(bp);
+	}
 
 	trace_xfs_dqflush_done(dqp);
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0239a7c7c88..f8fe1c66d42 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2598,8 +2598,10 @@ xfs_iflush(
 
 	if (flags & SYNC_WAIT)
 		error = xfs_bwrite(mp, bp);
-	else
-		xfs_bdwrite(mp, bp);
+	else {
+		xfs_buf_delwri_queue(bp);
+		xfs_buf_relse(bp);
+	}
 	return error;
 
 corrupt_out:
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a199dbcee7d..22946949bf5 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2176,7 +2176,8 @@ xlog_recover_buffer_pass2(
 	} else {
 		ASSERT(bp->b_target->bt_mount == mp);
 		bp->b_iodone = xlog_recover_iodone;
-		xfs_bdwrite(mp, bp);
+		xfs_buf_delwri_queue(bp);
+		xfs_buf_relse(bp);
 	}
 
 	return (error);
@@ -2439,7 +2440,8 @@ xlog_recover_inode_pass2(
 write_inode_buffer:
 	ASSERT(bp->b_target->bt_mount == mp);
 	bp->b_iodone = xlog_recover_iodone;
-	xfs_bdwrite(mp, bp);
+	xfs_buf_delwri_queue(bp);
+	xfs_buf_relse(bp);
 error:
 	if (need_free)
 		kmem_free(in_f);
@@ -2561,7 +2563,8 @@ xlog_recover_dquot_pass2(
 	ASSERT(dq_f->qlf_size == 2);
 	ASSERT(bp->b_target->bt_mount == mp);
 	bp->b_iodone = xlog_recover_iodone;
-	xfs_bdwrite(mp, bp);
+	xfs_buf_delwri_queue(bp);
+	xfs_buf_relse(bp);
 
 	return (0);
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0081657ad98..a957e437bee 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1612,7 +1612,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 
 		XFS_BUF_UNDONE(sbp);
 		XFS_BUF_UNREAD(sbp);
-		XFS_BUF_UNDELAYWRITE(sbp);
+		xfs_buf_delwri_dequeue(sbp);
 		XFS_BUF_WRITE(sbp);
 		XFS_BUF_UNASYNC(sbp);
 		ASSERT(sbp->b_target == mp->m_ddev_targp);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 9a0aa76facd..f51bef885e6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1296,7 +1296,8 @@ xfs_qm_dqiter_bufs(
 			break;
 
 		xfs_qm_reset_dqcounts(mp, bp, firstid, type);
-		xfs_bdwrite(mp, bp);
+		xfs_buf_delwri_queue(bp);
+		xfs_buf_relse(bp);
 		/*
 		 * goto the next block.
 		 */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index c96a8a05ac0..99823c3b9ac 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -149,7 +149,7 @@ xfs_read_buf(
 		}
 		if (bp) {
 			XFS_BUF_UNDONE(bp);
-			XFS_BUF_UNDELAYWRITE(bp);
+			xfs_buf_delwri_dequeue(bp);
 			XFS_BUF_STALE(bp);
 			/*
 			 * brelse clears B_ERROR and b_error
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 690fc7a7bd7..bb5e660e0fa 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -320,7 +320,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
 DEFINE_BUF_EVENT(xfs_buf_iorequest);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
-DEFINE_BUF_EVENT(xfs_buf_bdwrite);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
 DEFINE_BUF_EVENT(xfs_buf_trylock);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 137e2b9e294..5e5196a269d 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -643,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 	 * inside the b_bdstrat callback so that this won't get written to
 	 * disk.
 	 */
-	XFS_BUF_DELAYWRITE(bp);
 	XFS_BUF_DONE(bp);
 
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bp->b_iodone = xfs_buf_iodone_callbacks;
 	bip->bli_item.li_cb = xfs_buf_iodone;
 
+	xfs_buf_delwri_queue(bp);
+
 	trace_xfs_trans_log_buf(bip);
 
 	/*
@@ -738,7 +739,7 @@ xfs_trans_binval(
 	 * We set the stale bit in the buffer as well since we're getting
 	 * rid of it.
 	 */
-	XFS_BUF_UNDELAYWRITE(bp);
+	xfs_buf_delwri_dequeue(bp);
 	XFS_BUF_STALE(bp);
 	bip->bli_flags |= XFS_BLI_STALE;
 	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
-- 
cgit v1.2.3-18-g5258


From c2b006c1da1602551def200e4661535f02b82488 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:07 +0000
Subject: xfs: let xfs_bwrite callers handle the xfs_buf_relse

Remove the xfs_buf_relse from xfs_bwrite and let the caller handle it to
mirror the delwri and read paths.

Also remove the mount pointer passed to xfs_bwrite, which is superflous now
that we have a mount pointer in the buftarg.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_attr.c        |  7 ++++---
 fs/xfs/xfs_buf.c         |  8 ++++----
 fs/xfs/xfs_buf.h         |  2 +-
 fs/xfs/xfs_dquot.c       |  8 ++++----
 fs/xfs/xfs_fsops.c       | 40 ++++++++++++++++++++++------------------
 fs/xfs/xfs_inode.c       |  8 ++++----
 fs/xfs/xfs_log_recover.c | 11 +++++++----
 fs/xfs/xfs_sync.c        |  6 ++++--
 8 files changed, 50 insertions(+), 40 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b097fd5a2b3..8f0f6583350 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2128,9 +2128,10 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
 		if (tmp < XFS_BUF_SIZE(bp))
 			xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
-		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
-			return (error);
-		}
+		error = xfs_bwrite(bp);	/* GROT: NOTE: synchronous write */
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
 		src += tmp;
 		valuelen -= tmp;
 
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 309eca75fad..63dbeb9efc4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1014,7 +1014,6 @@ xfs_buf_ioerror(
 
 int
 xfs_bwrite(
-	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
 	int			error;
@@ -1026,9 +1025,10 @@ xfs_bwrite(
 	xfs_bdstrat_cb(bp);
 
 	error = xfs_buf_iowait(bp);
-	if (error)
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-	xfs_buf_relse(bp);
+	if (error) {
+		xfs_force_shutdown(bp->b_target->bt_mount,
+				   SHUTDOWN_META_IO_ERROR);
+	}
 	return error;
 }
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index f1a8933becb..3f543ed3009 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -197,7 +197,7 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 	((bp)->b_sema.count <= 0)
 
 /* Buffer Read and Write Routines */
-extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
+extern int xfs_bwrite(struct xfs_buf *bp);
 
 extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
 extern int xfs_bdstrat_cb(struct xfs_buf *);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 0f78dd46415..3e2ccaedc51 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1242,11 +1242,11 @@ xfs_qm_dqflush(
 	}
 
 	if (flags & SYNC_WAIT)
-		error = xfs_bwrite(mp, bp);
-	else {
+		error = xfs_bwrite(bp);
+	else
 		xfs_buf_delwri_queue(bp);
-		xfs_buf_relse(bp);
-	}
+
+	xfs_buf_relse(bp);
 
 	trace_xfs_dqflush_done(dqp);
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 9153d2c77ca..e023f940a3d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -216,10 +216,11 @@ xfs_growfs_data_private(
 		tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
 		agf->agf_freeblks = cpu_to_be32(tmpsize);
 		agf->agf_longest = cpu_to_be32(tmpsize);
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
+
 		/*
 		 * AG inode header block
 		 */
@@ -240,10 +241,11 @@ xfs_growfs_data_private(
 		agi->agi_dirino = cpu_to_be32(NULLAGINO);
 		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
 			agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
+
 		/*
 		 * BNO btree root block
 		 */
@@ -262,10 +264,11 @@ xfs_growfs_data_private(
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
 			agsize - be32_to_cpu(arec->ar_startblock));
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
+
 		/*
 		 * CNT btree root block
 		 */
@@ -285,10 +288,11 @@ xfs_growfs_data_private(
 		arec->ar_blockcount = cpu_to_be32(
 			agsize - be32_to_cpu(arec->ar_startblock));
 		nfree += be32_to_cpu(arec->ar_blockcount);
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
+
 		/*
 		 * INO btree root block
 		 */
@@ -303,10 +307,10 @@ xfs_growfs_data_private(
 		block->bb_numrecs = 0;
 		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
 		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		error = xfs_bwrite(mp, bp);
-		if (error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
 			goto error0;
-		}
 	}
 	xfs_trans_agblocks_delta(tp, nfree);
 	/*
@@ -396,9 +400,9 @@ xfs_growfs_data_private(
 		 * just issue a warning and continue.  The real work is
 		 * already done and committed.
 		 */
-		if (!(error = xfs_bwrite(mp, bp))) {
-			continue;
-		} else {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error) {
 			xfs_warn(mp,
 		"write error %d updating secondary superblock for ag %d",
 				error, agno);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f8fe1c66d42..7f237ba3c29 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2597,11 +2597,11 @@ xfs_iflush(
 		goto cluster_corrupt_out;
 
 	if (flags & SYNC_WAIT)
-		error = xfs_bwrite(mp, bp);
-	else {
+		error = xfs_bwrite(bp);
+	else
 		xfs_buf_delwri_queue(bp);
-		xfs_buf_relse(bp);
-	}
+
+	xfs_buf_relse(bp);
 	return error;
 
 corrupt_out:
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22946949bf5..be173852b2c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -268,9 +268,12 @@ xlog_bwrite(
 	xfs_buf_lock(bp);
 	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
 
-	if ((error = xfs_bwrite(log->l_mp, bp)))
+	error = xfs_bwrite(bp);
+	if (error) {
 		xfs_ioerror_alert("xlog_bwrite", log->l_mp,
 				  bp, XFS_BUF_ADDR(bp));
+	}
+	xfs_buf_relse(bp);
 	return error;
 }
 
@@ -2172,15 +2175,15 @@ xlog_recover_buffer_pass2(
 	    (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
 			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
 		XFS_BUF_STALE(bp);
-		error = xfs_bwrite(mp, bp);
+		error = xfs_bwrite(bp);
 	} else {
 		ASSERT(bp->b_target->bt_mount == mp);
 		bp->b_iodone = xlog_recover_iodone;
 		xfs_buf_delwri_queue(bp);
-		xfs_buf_relse(bp);
 	}
 
-	return (error);
+	xfs_buf_relse(bp);
+	return error;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 4604f90f86a..90cc197e043 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -322,6 +322,7 @@ xfs_sync_fsdata(
 	struct xfs_mount	*mp)
 {
 	struct xfs_buf		*bp;
+	int			error;
 
 	/*
 	 * If the buffer is pinned then push on the log so we won't get stuck
@@ -334,8 +335,9 @@ xfs_sync_fsdata(
 	bp = xfs_getsb(mp, 0);
 	if (xfs_buf_ispinned(bp))
 		xfs_log_force(mp, 0);
-
-	return xfs_bwrite(mp, bp);
+	error = xfs_bwrite(bp);
+	xfs_buf_relse(bp);
+	return error;
 }
 
 /*
-- 
cgit v1.2.3-18-g5258


From c4e1c098ee8a72ea563a697a2b175868be86fdc9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:08 +0000
Subject: xfs: use the "delwri" terminology consistently

And also remove the strange local lock and delwri list pointers in a few
functions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_buf.c | 43 +++++++++++++++++++------------------------
 fs/xfs/xfs_buf.h |  4 ++--
 2 files changed, 21 insertions(+), 26 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 63dbeb9efc4..d3c2b58d7d7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1489,12 +1489,12 @@ xfs_setsize_buftarg(
 }
 
 STATIC int
-xfs_alloc_delwrite_queue(
+xfs_alloc_delwri_queue(
 	xfs_buftarg_t		*btp,
 	const char		*fsname)
 {
-	INIT_LIST_HEAD(&btp->bt_delwrite_queue);
-	spin_lock_init(&btp->bt_delwrite_lock);
+	INIT_LIST_HEAD(&btp->bt_delwri_queue);
+	spin_lock_init(&btp->bt_delwri_lock);
 	btp->bt_flags = 0;
 	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
 	if (IS_ERR(btp->bt_task))
@@ -1524,7 +1524,7 @@ xfs_alloc_buftarg(
 	spin_lock_init(&btp->bt_lru_lock);
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
-	if (xfs_alloc_delwrite_queue(btp, fsname))
+	if (xfs_alloc_delwri_queue(btp, fsname))
 		goto error;
 	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1544,46 +1544,44 @@ void
 xfs_buf_delwri_queue(
 	xfs_buf_t		*bp)
 {
-	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
-	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
+	struct xfs_buftarg	*btp = bp->b_target;
 
 	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
 
 	ASSERT(!(bp->b_flags & XBF_READ));
 
-	spin_lock(dwlk);
+	spin_lock(&btp->bt_delwri_lock);
 	if (!list_empty(&bp->b_list)) {
 		/* if already in the queue, move it to the tail */
 		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-		list_move_tail(&bp->b_list, dwq);
+		list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
 	} else {
 		/* start xfsbufd as it is about to have something to do */
-		if (list_empty(dwq))
+		if (list_empty(&btp->bt_delwri_queue))
 			wake_up_process(bp->b_target->bt_task);
 
 		atomic_inc(&bp->b_hold);
 		bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
-		list_add_tail(&bp->b_list, dwq);
+		list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
 	}
 	bp->b_queuetime = jiffies;
-	spin_unlock(dwlk);
+	spin_unlock(&btp->bt_delwri_lock);
 }
 
 void
 xfs_buf_delwri_dequeue(
 	xfs_buf_t		*bp)
 {
-	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
 	int			dequeued = 0;
 
-	spin_lock(dwlk);
+	spin_lock(&bp->b_target->bt_delwri_lock);
 	if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
 		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
 		list_del_init(&bp->b_list);
 		dequeued = 1;
 	}
 	bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-	spin_unlock(dwlk);
+	spin_unlock(&bp->b_target->bt_delwri_lock);
 
 	if (dequeued)
 		xfs_buf_rele(bp);
@@ -1615,9 +1613,9 @@ xfs_buf_delwri_promote(
 	if (bp->b_queuetime < jiffies - age)
 		return;
 	bp->b_queuetime = jiffies - age;
-	spin_lock(&btp->bt_delwrite_lock);
-	list_move(&bp->b_list, &btp->bt_delwrite_queue);
-	spin_unlock(&btp->bt_delwrite_lock);
+	spin_lock(&btp->bt_delwri_lock);
+	list_move(&bp->b_list, &btp->bt_delwri_queue);
+	spin_unlock(&btp->bt_delwri_lock);
 }
 
 STATIC void
@@ -1638,15 +1636,13 @@ xfs_buf_delwri_split(
 	unsigned long	age)
 {
 	xfs_buf_t	*bp, *n;
-	struct list_head *dwq = &target->bt_delwrite_queue;
-	spinlock_t	*dwlk = &target->bt_delwrite_lock;
 	int		skipped = 0;
 	int		force;
 
 	force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
 	INIT_LIST_HEAD(list);
-	spin_lock(dwlk);
-	list_for_each_entry_safe(bp, n, dwq, b_list) {
+	spin_lock(&target->bt_delwri_lock);
+	list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
 		ASSERT(bp->b_flags & XBF_DELWRI);
 
 		if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
@@ -1663,10 +1659,9 @@ xfs_buf_delwri_split(
 		} else
 			skipped++;
 	}
-	spin_unlock(dwlk);
 
+	spin_unlock(&target->bt_delwri_lock);
 	return skipped;
-
 }
 
 /*
@@ -1716,7 +1711,7 @@ xfsbufd(
 		}
 
 		/* sleep for a long time if there is nothing to do. */
-		if (list_empty(&target->bt_delwrite_queue))
+		if (list_empty(&target->bt_delwri_queue))
 			tout = MAX_SCHEDULE_TIMEOUT;
 		schedule_timeout_interruptible(tout);
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3f543ed3009..c23826685d3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -105,8 +105,8 @@ typedef struct xfs_buftarg {
 
 	/* per device delwri queue */
 	struct task_struct	*bt_task;
-	struct list_head	bt_delwrite_queue;
-	spinlock_t		bt_delwrite_lock;
+	struct list_head	bt_delwri_queue;
+	spinlock_t		bt_delwri_lock;
 	unsigned long		bt_flags;
 
 	/* LRU control structures */
-- 
cgit v1.2.3-18-g5258


From 398d25ef23b10ce75424e0336a8d059dda1dbc8d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:09 +0000
Subject: xfs: remove dead ENODEV handling in xfs_destroy_ioend

No driver returns ENODEV from it bio completion handler, not has this
ever been documented.  Remove the dead code dealing with it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_aops.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8c37dde4c52..d91564404ab 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -122,17 +122,6 @@ xfs_destroy_ioend(
 		bh->b_end_io(bh, !ioend->io_error);
 	}
 
-	/*
-	 * Volume managers supporting multiple paths can send back ENODEV
-	 * when the final path disappears.  In this case continuing to fill
-	 * the page cache with dirty data which cannot be written out is
-	 * evil, so prevent that.
-	 */
-	if (unlikely(ioend->io_error == -ENODEV)) {
-		xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
-				      __FILE__, __LINE__);
-	}
-
 	xfs_ioend_wake(ip);
 	mempool_free(ioend, xfs_ioend_pool);
 }
-- 
cgit v1.2.3-18-g5258


From c859cdd1da008b3825555be3242908088a3de366 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:10 +0000
Subject: xfs: defer AIO/DIO completions

We really shouldn't complete AIO or DIO requests until we have finished
the unwritten extent conversion and size update.  This means fsync never
has to pick up any ioends as all work has been completed when signalling
I/O completion.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_aops.c | 26 +++++++++-----------------
 fs/xfs/xfs_aops.h |  1 +
 2 files changed, 10 insertions(+), 17 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d91564404ab..10660c36410 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -122,6 +122,11 @@ xfs_destroy_ioend(
 		bh->b_end_io(bh, !ioend->io_error);
 	}
 
+	if (ioend->io_iocb) {
+		if (ioend->io_isasync)
+			aio_complete(ioend->io_iocb, ioend->io_result, 0);
+		inode_dio_done(ioend->io_inode);
+	}
 	xfs_ioend_wake(ip);
 	mempool_free(ioend, xfs_ioend_pool);
 }
@@ -236,8 +241,6 @@ xfs_end_io(
 		/* ensure we don't spin on blocked ioends */
 		delay(1);
 	} else {
-		if (ioend->io_iocb)
-			aio_complete(ioend->io_iocb, ioend->io_result, 0);
 		xfs_destroy_ioend(ioend);
 	}
 }
@@ -274,6 +277,7 @@ xfs_alloc_ioend(
 	 * all the I/O from calling the completion routine too early.
 	 */
 	atomic_set(&ioend->io_remaining, 1);
+	ioend->io_isasync = 0;
 	ioend->io_error = 0;
 	ioend->io_list = NULL;
 	ioend->io_type = type;
@@ -1289,7 +1293,6 @@ xfs_end_io_direct_write(
 	bool			is_async)
 {
 	struct xfs_ioend	*ioend = iocb->private;
-	struct inode		*inode = ioend->io_inode;
 
 	/*
 	 * blockdev_direct_IO can return an error even after the I/O
@@ -1300,28 +1303,17 @@ xfs_end_io_direct_write(
 
 	ioend->io_offset = offset;
 	ioend->io_size = size;
+	ioend->io_iocb = iocb;
+	ioend->io_result = ret;
 	if (private && size > 0)
 		ioend->io_type = IO_UNWRITTEN;
 
 	if (is_async) {
-		/*
-		 * If we are converting an unwritten extent we need to delay
-		 * the AIO completion until after the unwrittent extent
-		 * conversion has completed, otherwise do it ASAP.
-		 */
-		if (ioend->io_type == IO_UNWRITTEN) {
-			ioend->io_iocb = iocb;
-			ioend->io_result = ret;
-		} else {
-			aio_complete(iocb, ret, 0);
-		}
+		ioend->io_isasync = 1;
 		xfs_finish_ioend(ioend);
 	} else {
 		xfs_finish_ioend_sync(ioend);
 	}
-
-	/* XXX: probably should move into the real I/O completion handler */
-	inode_dio_done(inode);
 }
 
 STATIC ssize_t
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 71f721e1a71..ce3dcb50762 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -47,6 +47,7 @@ typedef struct xfs_ioend {
 	unsigned int		io_type;	/* delalloc / unwritten */
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
+	unsigned int		io_isasync : 1;	/* needs aio_complete */
 	struct inode		*io_inode;	/* file being written to */
 	struct buffer_head	*io_buffer_head;/* buffer linked list head */
 	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
-- 
cgit v1.2.3-18-g5258


From fc0063c4474599b7a066ba76b90902abe21bc675 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:11 +0000
Subject: xfs: reduce ioend latency

There is no reason to queue up ioends for processing in user context
unless we actually need it.  Just complete ioends that do not convert
unwritten extents or need a size update from the end_io context.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_aops.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 10660c36410..e1ff0770784 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -149,6 +149,15 @@ xfs_ioend_new_eof(
 	return isize > ip->i_d.di_size ? isize : 0;
 }
 
+/*
+ * Fast and loose check if this write could update the on-disk inode size.
+ */
+static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+{
+	return ioend->io_offset + ioend->io_size >
+		XFS_I(ioend->io_inode)->i_d.di_size;
+}
+
 /*
  * Update on-disk file size now that data has been written to disk.  The
  * current in-memory file size is i_size.  If a write is beyond eof i_new_size
@@ -186,6 +195,9 @@ xfs_setfilesize(
 
 /*
  * Schedule IO completion handling on the final put of an ioend.
+ *
+ * If there is no work to do we might as well call it a day and free the
+ * ioend right now.
  */
 STATIC void
 xfs_finish_ioend(
@@ -194,8 +206,10 @@ xfs_finish_ioend(
 	if (atomic_dec_and_test(&ioend->io_remaining)) {
 		if (ioend->io_type == IO_UNWRITTEN)
 			queue_work(xfsconvertd_workqueue, &ioend->io_work);
-		else
+		else if (xfs_ioend_is_append(ioend))
 			queue_work(xfsdatad_workqueue, &ioend->io_work);
+		else
+			xfs_destroy_ioend(ioend);
 	}
 }
 
-- 
cgit v1.2.3-18-g5258


From 2b3ffd7eb7b4392e3657c5046b055ca9f1f7cf5e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:12 +0000
Subject: xfs: wait for I/O completion when writing out pages in
 xfs_setattr_size

The current code relies on the xfs_ioend_wait call later on to make sure
all I/O actually has completed.  The xfs_ioend_wait call will go away soon,
so prepare for that by using the waiting filemap function.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_iops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 673704fab74..32aca87bde5 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -833,8 +833,8 @@ xfs_setattr_size(
 	 * care about here.
 	 */
 	if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
-		error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size,
-					XBF_ASYNC, FI_NONE);
+		error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+					FI_NONE);
 		if (error)
 			goto out_unlock;
 	}
-- 
cgit v1.2.3-18-g5258


From 4a06fd262dbeb70a2c315f7259e063efa493fe3d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 23 Aug 2011 08:28:13 +0000
Subject: xfs: remove i_iocount

We now have an i_dio_count filed and surrounding infrastructure to wait
for direct I/O completion instead of i_icount, and we have never needed
to iocount waits for buffered I/O given that we only set the page uptodate
after finishing all required work.  Thus remove i_iocount, and replace
the actually needed waits with calls to inode_dio_wait.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_aops.c     | 39 +--------------------------------------
 fs/xfs/xfs_aops.h     |  3 ---
 fs/xfs/xfs_file.c     |  8 ++------
 fs/xfs/xfs_iget.c     |  2 --
 fs/xfs/xfs_inode.h    |  1 -
 fs/xfs/xfs_iops.c     |  4 ++--
 fs/xfs/xfs_super.c    |  7 +------
 fs/xfs/xfs_sync.c     |  8 ++------
 fs/xfs/xfs_vnodeops.c |  4 +---
 9 files changed, 9 insertions(+), 67 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e1ff0770784..46bba3e0af4 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -38,40 +38,6 @@
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
-
-/*
- * Prime number of hash buckets since address is used as the key.
- */
-#define NVSYNC		37
-#define to_ioend_wq(v)	(&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
-static wait_queue_head_t xfs_ioend_wq[NVSYNC];
-
-void __init
-xfs_ioend_init(void)
-{
-	int i;
-
-	for (i = 0; i < NVSYNC; i++)
-		init_waitqueue_head(&xfs_ioend_wq[i]);
-}
-
-void
-xfs_ioend_wait(
-	xfs_inode_t	*ip)
-{
-	wait_queue_head_t *wq = to_ioend_wq(ip);
-
-	wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
-}
-
-STATIC void
-xfs_ioend_wake(
-	xfs_inode_t	*ip)
-{
-	if (atomic_dec_and_test(&ip->i_iocount))
-		wake_up(to_ioend_wq(ip));
-}
-
 void
 xfs_count_page_state(
 	struct page		*page,
@@ -115,7 +81,6 @@ xfs_destroy_ioend(
 	xfs_ioend_t		*ioend)
 {
 	struct buffer_head	*bh, *next;
-	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 
 	for (bh = ioend->io_buffer_head; bh; bh = next) {
 		next = bh->b_private;
@@ -127,7 +92,7 @@ xfs_destroy_ioend(
 			aio_complete(ioend->io_iocb, ioend->io_result, 0);
 		inode_dio_done(ioend->io_inode);
 	}
-	xfs_ioend_wake(ip);
+
 	mempool_free(ioend, xfs_ioend_pool);
 }
 
@@ -298,7 +263,6 @@ xfs_alloc_ioend(
 	ioend->io_inode = inode;
 	ioend->io_buffer_head = NULL;
 	ioend->io_buffer_tail = NULL;
-	atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
 	ioend->io_offset = 0;
 	ioend->io_size = 0;
 	ioend->io_iocb = NULL;
@@ -558,7 +522,6 @@ xfs_cancel_ioend(
 			unlock_buffer(bh);
 		} while ((bh = next_bh) != NULL);
 
-		xfs_ioend_wake(XFS_I(ioend->io_inode));
 		mempool_free(ioend, xfs_ioend_pool);
 	} while ((ioend = next) != NULL);
 }
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index ce3dcb50762..116dd5c3703 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -61,9 +61,6 @@ typedef struct xfs_ioend {
 extern const struct address_space_operations xfs_address_space_operations;
 extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 
-extern void xfs_ioend_init(void);
-extern void xfs_ioend_wait(struct xfs_inode *);
-
 extern void xfs_count_page_state(struct page *, int *, int *);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index cbbac5cc9c2..ee63c4fb363 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -149,10 +149,6 @@ xfs_file_fsync(
 
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	xfs_ioend_wait(ip);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
 	if (mp->m_flags & XFS_MOUNT_BARRIER) {
 		/*
 		 * If we have an RT and/or log subvolume we need to make sure
@@ -758,7 +754,7 @@ restart:
  * the dio layer.  To avoid the problem with aio, we also need to wait for
  * outstanding IOs to complete so that unwritten extent conversion is completed
  * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ * hitting it with a big hammer (i.e. inode_dio_wait()).
  *
  * Returns with locks held indicated by @iolock and errors indicated by
  * negative return values.
@@ -821,7 +817,7 @@ xfs_file_dio_aio_write(
 	 * otherwise demote the lock if we had to flush cached pages
 	 */
 	if (unaligned_io)
-		xfs_ioend_wait(ip);
+		inode_dio_wait(inode);
 	else if (*iolock == XFS_IOLOCK_EXCL) {
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		*iolock = XFS_IOLOCK_SHARED;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 7759812c1bb..0fa98b1c70e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -75,7 +75,6 @@ xfs_inode_alloc(
 		return NULL;
 	}
 
-	ASSERT(atomic_read(&ip->i_iocount) == 0);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
@@ -150,7 +149,6 @@ xfs_inode_free(
 	}
 
 	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_iocount) == 0);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 2380a4bcbec..760140d1dd6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -257,7 +257,6 @@ typedef struct xfs_inode {
 
 	xfs_fsize_t		i_size;		/* in-memory size */
 	xfs_fsize_t		i_new_size;	/* size when write completes */
-	atomic_t		i_iocount;	/* outstanding I/O count */
 
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 32aca87bde5..e041e917c1d 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -840,9 +840,9 @@ xfs_setattr_size(
 	}
 
 	/*
-	 * Wait for all I/O to complete.
+	 * Wait for all direct I/O to complete.
 	 */
-	xfs_ioend_wait(ip);
+	inode_dio_wait(inode);
 
 	error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
 				     xfs_get_blocks);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2366c54cc4f..54d5e102ffe 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -796,8 +796,6 @@ xfs_fs_destroy_inode(
 	if (is_bad_inode(inode))
 		goto out_reclaim;
 
-	xfs_ioend_wait(ip);
-
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
 
 	/*
@@ -837,7 +835,6 @@ xfs_fs_inode_init_once(
 	inode_init_once(VFS_I(ip));
 
 	/* xfs inode */
-	atomic_set(&ip->i_iocount, 0);
 	atomic_set(&ip->i_pincount, 0);
 	spin_lock_init(&ip->i_flags_lock);
 	init_waitqueue_head(&ip->i_ipin_wait);
@@ -914,9 +911,8 @@ xfs_fs_write_inode(
 		 * of forcing it all the way to stable storage using a
 		 * synchronous transaction we let the log force inside the
 		 * ->sync_fs call do that for thus, which reduces the number
-		 * of synchronous log foces dramatically.
+		 * of synchronous log forces dramatically.
 		 */
-		xfs_ioend_wait(ip);
 		error = xfs_log_inode(ip);
 		if (error)
 			goto out;
@@ -1681,7 +1677,6 @@ init_xfs_fs(void)
 	printk(KERN_INFO XFS_VERSION_STRING " with "
 			 XFS_BUILD_OPTIONS " enabled\n");
 
-	xfs_ioend_init();
 	xfs_dir_startup();
 
 	error = xfs_init_zones();
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 90cc197e043..bf2b38c21ca 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -227,21 +227,17 @@ xfs_sync_inode_data(
 	int			error = 0;
 
 	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		goto out_wait;
+		return 0;
 
 	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
 		if (flags & SYNC_TRYLOCK)
-			goto out_wait;
+			return 0;
 		xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	}
 
 	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
 				0 : XBF_ASYNC, FI_NONE);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- out_wait:
-	if (flags & SYNC_WAIT)
-		xfs_ioend_wait(ip);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 51fc429527b..c2ff0fc8656 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -647,8 +647,6 @@ xfs_inactive(
 	if (truncate) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-		xfs_ioend_wait(ip);
-
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_ITRUNCATE_LOG_RES(mp),
 					  0, XFS_TRANS_PERM_LOG_RES,
@@ -2076,7 +2074,7 @@ xfs_free_file_space(
 	if (need_iolock) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 		/* wait for the completion of any pending DIOs */
-		xfs_ioend_wait(ip);
+		inode_dio_wait(VFS_I(ip));
 	}
 
 	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-- 
cgit v1.2.3-18-g5258


From 859f57ca00805e6c482eef1a7ab073097d02c8ca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 27 Aug 2011 14:45:11 +0000
Subject: xfs: avoid synchronous transactions when deleting attr blocks

Currently xfs_attr_inactive causes a synchronous transactions if we are
removing a file that has any extents allocated to the attribute fork, and
thus makes XFS extremely slow at removing files with out of line extended
attributes. The code looks a like a relict from the days before the busy
extent list, but with the busy extent list we avoid reusing data and attr
extents that have been freed but not commited yet, so this code is just
as superflous as the synchronous transactions for data blocks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_attr.c | 12 ------------
 fs/xfs/xfs_bmap.c | 10 +---------
 2 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 8f0f6583350..3dd5c9c374c 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	if (error)
 		goto out;
 
-	/*
-	 * Signal synchronous inactive transactions unless this is a
-	 * synchronous mount filesystem in which case we know that we're here
-	 * because we've been called out of xfs_inactive which means that the
-	 * last reference is gone and the unlink transaction has already hit
-	 * the disk so async inactive transactions are safe.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_WSYNC)) {
-		if (dp->i_d.di_anextents > 0)
-			xfs_trans_set_sync(trans);
-	}
-
 	error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
 	if (error)
 		goto out;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 452a291383a..a6075c0f845 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3782,19 +3782,11 @@ xfs_bmap_compute_maxlevels(
  * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
  * caller.  Frees all the extents that need freeing, which must be done
  * last due to locking considerations.  We never free any extents in
- * the first transaction.  This is to allow the caller to make the first
- * transaction a synchronous one so that the pointers to the data being
- * broken in this transaction will be permanent before the data is actually
- * freed.  This is necessary to prevent blocks from being reallocated
- * and written to before the free and reallocation are actually permanent.
- * We do not just make the first transaction synchronous here, because
- * there are more efficient ways to gain the same protection in some cases
- * (see the file truncation code).
+ * the first transaction.
  *
  * Return 1 if the given transaction was committed and a new one
  * started, and 0 otherwise in the committed parameter.
  */
-/*ARGSUSED*/
 int						/* error */
 xfs_bmap_finish(
 	xfs_trans_t		**tp,		/* transaction pointer addr */
-- 
cgit v1.2.3-18-g5258


From c58cb165bd44de8aaee9755a144136ae743be116 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 27 Aug 2011 14:42:53 +0000
Subject: xfs: avoid direct I/O write vs buffered I/O race

Currently a buffered reader or writer can add pages to the pagecache
while we are waiting for the iolock in xfs_file_dio_aio_write.  Prevent
this by re-checking mapping->nrpages after we got the iolock, and if
nessecary upgrade the lock to exclusive mode.  To simplify this a bit
only take the ilock inside of xfs_file_aio_write_checks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_file.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ee63c4fb363..06fe97e56e4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -676,6 +676,7 @@ xfs_file_aio_write_checks(
 	xfs_fsize_t		new_size;
 	int			error = 0;
 
+	xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
 	*new_sizep = 0;
 restart:
 	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
@@ -798,14 +799,24 @@ xfs_file_dio_aio_write(
 		*iolock = XFS_IOLOCK_EXCL;
 	else
 		*iolock = XFS_IOLOCK_SHARED;
-	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+	xfs_rw_ilock(ip, *iolock);
+
+	/*
+	 * Recheck if there are cached pages that need invalidate after we got
+	 * the iolock to protect against other threads adding new pages while
+	 * we were waiting for the iolock.
+	 */
+	if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
+		xfs_rw_iunlock(ip, *iolock);
+		*iolock = XFS_IOLOCK_EXCL;
+		xfs_rw_ilock(ip, *iolock);
+	}
 
 	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
 	if (ret)
 		return ret;
 
 	if (mapping->nrpages) {
-		WARN_ON(*iolock != XFS_IOLOCK_EXCL);
 		ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
 							FI_REMAPF_LOCKED);
 		if (ret)
@@ -851,7 +862,7 @@ xfs_file_buffered_aio_write(
 	size_t			count = ocount;
 
 	*iolock = XFS_IOLOCK_EXCL;
-	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+	xfs_rw_ilock(ip, *iolock);
 
 	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
 	if (ret)
-- 
cgit v1.2.3-18-g5258


From 04f658ee229f60dbb9a0dc2f3d6871b12b758051 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 24 Aug 2011 05:59:25 +0000
Subject: xfs: improve ioend error handling

Return unwritten extent conversion errors to aio_complete.

Skip both unwritten extent conversion and size updates if we had an
I/O error or the filesystem has been shut down.

Return -EIO to the aio/buffer completion handlers in case of a
forced shutdown.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_aops.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 46bba3e0af4..22aadf66786 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -88,8 +88,10 @@ xfs_destroy_ioend(
 	}
 
 	if (ioend->io_iocb) {
-		if (ioend->io_isasync)
-			aio_complete(ioend->io_iocb, ioend->io_result, 0);
+		if (ioend->io_isasync) {
+			aio_complete(ioend->io_iocb, ioend->io_error ?
+					ioend->io_error : ioend->io_result, 0);
+		}
 		inode_dio_done(ioend->io_inode);
 	}
 
@@ -141,9 +143,6 @@ xfs_setfilesize(
 	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
 	xfs_fsize_t		isize;
 
-	if (unlikely(ioend->io_error))
-		return 0;
-
 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 		return EAGAIN;
 
@@ -189,17 +188,24 @@ xfs_end_io(
 	struct xfs_inode *ip = XFS_I(ioend->io_inode);
 	int		error = 0;
 
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		error = -EIO;
+		goto done;
+	}
+	if (ioend->io_error)
+		goto done;
+
 	/*
 	 * For unwritten extents we need to issue transactions to convert a
 	 * range to normal written extens after the data I/O has finished.
 	 */
-	if (ioend->io_type == IO_UNWRITTEN &&
-	    likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
-
+	if (ioend->io_type == IO_UNWRITTEN) {
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						 ioend->io_size);
-		if (error)
-			ioend->io_error = error;
+		if (error) {
+			ioend->io_error = -error;
+			goto done;
+		}
 	}
 
 	/*
@@ -209,6 +215,7 @@ xfs_end_io(
 	error = xfs_setfilesize(ioend);
 	ASSERT(!error || error == EAGAIN);
 
+done:
 	/*
 	 * If we didn't complete processing of the ioend, requeue it to the
 	 * tail of the workqueue for another attempt later. Otherwise destroy
-- 
cgit v1.2.3-18-g5258


From b522950f0ab8551f2ef56c210ebd50e6c6396601 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Wed, 7 Sep 2011 19:37:54 +0000
Subject: xfs: Check the return value of xfs_buf_get()

Check the return value of xfs_buf_get() and fail appropriately.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_attr.c  |  4 ++--
 fs/xfs/xfs_fsops.c | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 3dd5c9c374c..c7dab0c0bdd 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2109,8 +2109,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 
 		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
 				 XBF_LOCK | XBF_DONT_BLOCK);
-		ASSERT(!xfs_buf_geterror(bp));
-
+		if (!bp)
+			return ENOMEM;
 		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
 							XFS_BUF_SIZE(bp);
 		xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index e023f940a3d..1c6fdeb702f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -194,6 +194,10 @@ xfs_growfs_data_private(
 		bp = xfs_buf_get(mp->m_ddev_targp,
 				 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
 				 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		agf = XFS_BUF_TO_AGF(bp);
 		memset(agf, 0, mp->m_sb.sb_sectsize);
 		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -227,6 +231,10 @@ xfs_growfs_data_private(
 		bp = xfs_buf_get(mp->m_ddev_targp,
 				 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
 				 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		agi = XFS_BUF_TO_AGI(bp);
 		memset(agi, 0, mp->m_sb.sb_sectsize);
 		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -253,6 +261,10 @@ xfs_growfs_data_private(
 				 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
 				 BTOBB(mp->m_sb.sb_blocksize),
 				 XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		block = XFS_BUF_TO_BLOCK(bp);
 		memset(block, 0, mp->m_sb.sb_blocksize);
 		block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
@@ -276,6 +288,10 @@ xfs_growfs_data_private(
 				 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
 				 BTOBB(mp->m_sb.sb_blocksize),
 				 XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		block = XFS_BUF_TO_BLOCK(bp);
 		memset(block, 0, mp->m_sb.sb_blocksize);
 		block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
@@ -300,6 +316,10 @@ xfs_growfs_data_private(
 				 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
 				 BTOBB(mp->m_sb.sb_blocksize),
 				 XBF_LOCK | XBF_MAPPED);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
 		block = XFS_BUF_TO_BLOCK(bp);
 		memset(block, 0, mp->m_sb.sb_blocksize);
 		block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
-- 
cgit v1.2.3-18-g5258


From 2a30f36d9069b0646dcdd73def5fd7ab674bffd6 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Tue, 20 Sep 2011 13:56:55 +0000
Subject: xfs: Check the return value of xfs_trans_get_buf()