aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--fs/xfs/xfs_file.c1031
1 files changed, 684 insertions, 347 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7e5bc872f2b..1f66779d7a4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -17,27 +17,32 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_trans.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
#include "xfs_error.h"
-#include "xfs_vnodeops.h"
-#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
+#include <linux/aio.h>
#include <linux/dcache.h>
#include <linux/falloc.h>
+#include <linux/pagevec.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -85,7 +90,7 @@ xfs_rw_ilock_demote(
* valid before the operation, it will be read from disk before
* being partially zeroed.
*/
-STATIC int
+int
xfs_iozero(
struct xfs_inode *ip, /* inode */
loff_t pos, /* offset in file */
@@ -150,7 +155,7 @@ xfs_dir_fsync(
if (!lsn)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
STATIC int
@@ -163,7 +168,6 @@ xfs_file_fsync(
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
int error = 0;
int log_flushed = 0;
xfs_lsn_t lsn = 0;
@@ -194,75 +198,18 @@ xfs_file_fsync(
}
/*
- * We always need to make sure that the required inode state is safe on
- * disk. The inode might be clean but we still might need to force the
- * log because of committed transactions that haven't hit the disk yet.
- * Likewise, there could be unflushed non-transactional changes to the
- * inode core that have to go to disk and this requires us to issue
- * a synchronous transaction to capture these changes correctly.
- *
- * This code relies on the assumption that if the i_update_core field
- * of the inode is clear and the inode is unpinned then it is clean
- * and no action is required.
+ * All metadata updates are logged, which means that we just have
+ * to flush the log up to the latest LSN that touched the inode.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
-
- /*
- * First check if the VFS inode is marked dirty. All the dirtying
- * of non-transactional updates do not go through mark_inode_dirty*,
- * which allows us to distinguish between pure timestamp updates
- * and i_size updates which need to be caught for fdatasync.
- * After that also check for the dirty state in the XFS inode, which
- * might gets cleared when the inode gets written out via the AIL
- * or xfs_iflush_cluster.
- */
- if (((inode->i_state & I_DIRTY_DATASYNC) ||
- ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
- ip->i_update_core) {
- /*
- * Kick off a transaction to log the inode core to get the
- * updates. The sync transaction will also force the log.
- */
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0,
- XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return -error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /*
- * Note - it's possible that we might have pushed ourselves out
- * of the way during trans_reserve which would flush the inode.
- * But there's no guarantee that the inode buffer has actually
- * gone out yet (it's delwri). Plus the buffer could be pinned
- * anyway if it's part of an inode in another recent
- * transaction. So we play it safe and fire off the
- * transaction anyway.
- */
- xfs_trans_ijoin(tp, ip, 0);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp, 0);
-
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- } else {
- /*
- * Timestamps/size haven't changed since last inode flush or
- * inode transaction commit. That means either nothing got
- * written or a transaction committed which caught the updates.
- * If the latter happened and the transaction hasn't hit the
- * disk yet, the inode will be still be pinned. If it is,
- * force the log.
- */
- if (xfs_ipincount(ip))
+ if (xfs_ipincount(ip)) {
+ if (!datasync ||
+ (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
}
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (!error && lsn)
+ if (lsn)
error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
/*
@@ -282,58 +229,40 @@ xfs_file_fsync(
}
STATIC ssize_t
-xfs_file_aio_read(
+xfs_file_read_iter(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- size_t size = 0;
+ size_t size = iov_iter_count(to);
ssize_t ret = 0;
int ioflags = 0;
xfs_fsize_t n;
- unsigned long seg;
+ loff_t pos = iocb->ki_pos;
XFS_STATS_INC(xs_read_calls);
- BUG_ON(iocb->ki_pos != pos);
-
if (unlikely(file->f_flags & O_DIRECT))
ioflags |= IO_ISDIRECT;
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- /* START copy & waste from filemap.c */
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *iv = &iovp[seg];
-
- /*
- * If any segment has a negative length, or the cumulative
- * length ever wraps negative then return -EINVAL.
- */
- size += iv->iov_len;
- if (unlikely((ssize_t)(size|iv->iov_len) < 0))
- return XFS_ERROR(-EINVAL);
- }
- /* END copy & waste from filemap.c */
-
if (unlikely(ioflags & IO_ISDIRECT)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((iocb->ki_pos & target->bt_smask) ||
- (size & target->bt_smask)) {
- if (iocb->ki_pos == i_size_read(inode))
+ /* DIO must be aligned to device logical sector size */
+ if ((pos | size) & target->bt_logical_sectormask) {
+ if (pos == i_size_read(inode))
return 0;
return -XFS_ERROR(EINVAL);
}
}
- n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
+ n = mp->m_super->s_maxbytes - pos;
if (n <= 0 || size == 0)
return 0;
@@ -359,20 +288,21 @@ xfs_file_aio_read(
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
if (inode->i_mapping->nrpages) {
- ret = -xfs_flushinval_pages(ip,
- (iocb->ki_pos & PAGE_CACHE_MASK),
- -1, FI_REMAPF_LOCKED);
+ ret = filemap_write_and_wait_range(
+ VFS_I(ip)->i_mapping,
+ pos, -1);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
+ truncate_pagecache_range(VFS_I(ip), pos, -1);
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
- trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+ trace_xfs_file_read(ip, size, pos, ioflags);
- ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+ ret = generic_file_read_iter(iocb, to);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@@ -413,155 +343,96 @@ xfs_file_splice_read(
}
/*
- * xfs_file_splice_write() does not use xfs_rw_ilock() because
- * generic_file_splice_write() takes the i_mutex itself. This, in theory,
- * couuld cause lock inversions between the aio_write path and the splice path
- * if someone is doing concurrent splice(2) based writes and write(2) based
- * writes to the same inode. The only real way to fix this is to re-implement
- * the generic code here with correct locking orders.
- */
-STATIC ssize_t
-xfs_file_splice_write(
- struct pipe_inode_info *pipe,
- struct file *outfilp,
- loff_t *ppos,
- size_t count,
- unsigned int flags)
-{
- struct inode *inode = outfilp->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- int ioflags = 0;
- ssize_t ret;
-
- XFS_STATS_INC(xs_write_calls);
-
- if (outfilp->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
- trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-
- ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return ret;
-}
-
-/*
- * This routine is called to handle zeroing any space in the last
- * block of the file that is beyond the EOF. We do this since the
- * size is being increased without writing anything to that block
- * and we don't want anyone to read the garbage on the disk.
+ * This routine is called to handle zeroing any space in the last block of the
+ * file that is beyond the EOF. We do this since the size is being increased
+ * without writing anything to that block and we don't want to read the
+ * garbage on the disk.
*/
STATIC int /* error (positive) */
xfs_zero_last_block(
- xfs_inode_t *ip,
- xfs_fsize_t offset,
- xfs_fsize_t isize)
+ struct xfs_inode *ip,
+ xfs_fsize_t offset,
+ xfs_fsize_t isize)
{
- xfs_fileoff_t last_fsb;
- xfs_mount_t *mp = ip->i_mount;
- int nimaps;
- int zero_offset;
- int zero_len;
- int error = 0;
- xfs_bmbt_irec_t imap;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- if (zero_offset == 0) {
- /*
- * There are no extra bytes in the last block on disk to
- * zero, so return.
- */
- return 0;
- }
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
+ int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+ int zero_len;
+ int nimaps = 1;
+ int error = 0;
+ struct xfs_bmbt_irec imap;
- last_fsb = XFS_B_TO_FSBT(mp, isize);
- nimaps = 1;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
+
ASSERT(nimaps > 0);
+
/*
* If the block underlying isize is just a hole, then there
* is nothing to zero.
*/
- if (imap.br_startblock == HOLESTARTBLOCK) {
+ if (imap.br_startblock == HOLESTARTBLOCK)
return 0;
- }
- /*
- * Zero the part of the last block beyond the EOF, and write it
- * out sync. We need to drop the ilock while we do this so we
- * don't deadlock when the buffer cache calls back to us.
- */
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
zero_len = mp->m_sb.sb_blocksize - zero_offset;
if (isize + zero_len > offset)
zero_len = offset - isize;
- error = xfs_iozero(ip, isize, zero_len);
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ASSERT(error >= 0);
- return error;
+ return xfs_iozero(ip, isize, zero_len);
}
/*
- * Zero any on disk space between the current EOF and the new,
- * larger EOF. This handles the normal case of zeroing the remainder
- * of the last block in the file and the unusual case of zeroing blocks
- * out beyond the size of the file. This second case only happens
- * with fixed size extents and when the system crashes before the inode
- * size was updated but after blocks were allocated. If fill is set,
- * then any holes in the range are filled and zeroed. If not, the holes
- * are left alone as holes.
+ * Zero any on disk space between the current EOF and the new, larger EOF.
+ *
+ * This handles the normal case of zeroing the remainder of the last block in
+ * the file and the unusual case of zeroing blocks out beyond the size of the
+ * file. This second case only happens with fixed size extents and when the
+ * system crashes before the inode size was updated but after blocks were
+ * allocated.
+ *
+ * Expects the iolock to be held exclusive, and will take the ilock internally.
*/
-
int /* error (positive) */
xfs_zero_eof(
- xfs_inode_t *ip,
- xfs_off_t offset, /* starting I/O offset */
- xfs_fsize_t isize) /* current inode size */
+ struct xfs_inode *ip,
+ xfs_off_t offset, /* starting I/O offset */
+ xfs_fsize_t isize) /* current inode size */
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fileoff_t zero_off;
- xfs_fsize_t zero_len;
- int nimaps;
- int error = 0;
- xfs_bmbt_irec_t imap;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_zero_fsb;
+ xfs_fileoff_t end_zero_fsb;
+ xfs_fileoff_t zero_count_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_fileoff_t zero_off;
+ xfs_fsize_t zero_len;
+ int nimaps;
+ int error = 0;
+ struct xfs_bmbt_irec imap;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
/*
* First handle zeroing the block on which isize resides.
+ *
* We only zero a part of that block so it is handled specially.
*/
- error = xfs_zero_last_block(ip, offset, isize);
- if (error) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- return error;
+ if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
+ error = xfs_zero_last_block(ip, offset, isize);
+ if (error)
+ return error;
}
/*
- * Calculate the range between the new size and the old
- * where blocks needing to be zeroed may exist. To get the
- * block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back
- * to a block boundary. We subtract 1 in case the size is
- * exactly on a block boundary.
+ * Calculate the range between the new size and the old where blocks
+ * needing to be zeroed may exist.
+ *
+ * To get the block where the last byte in the file currently resides,
+ * we need to subtract one from the size and truncate back to a block
+ * boundary. We subtract 1 in case the size is exactly on a block
+ * boundary.
*/
last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
@@ -579,23 +450,18 @@ xfs_zero_eof(
while (start_zero_fsb <= end_zero_fsb) {
nimaps = 1;
zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
&imap, &nimaps, 0);
- if (error) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
return error;
- }
+
ASSERT(nimaps > 0);
if (imap.br_state == XFS_EXT_UNWRITTEN ||
imap.br_startblock == HOLESTARTBLOCK) {
- /*
- * This loop handles initializing pages that were
- * partially initialized by the code below this
- * loop. It basically zeroes the part of the page
- * that sits on a hole and sets the page as P_HOLE
- * and calls remapf if it is a mapped file.
- */
start_zero_fsb = imap.br_startoff + imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
continue;
@@ -603,11 +469,7 @@ xfs_zero_eof(
/*
* There are blocks we need to zero.
- * Drop the inode lock while we're doing the I/O.
- * We'll still have the iolock to protect us.
*/
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
@@ -615,22 +477,14 @@ xfs_zero_eof(
zero_len = offset - zero_off;
error = xfs_iozero(ip, zero_off, zero_len);
- if (error) {
- goto out_lock;
- }
+ if (error)
+ return error;
start_zero_fsb = imap.br_startoff + imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
}
return 0;
-
-out_lock:
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ASSERT(error >= 0);
- return error;
}
/*
@@ -651,38 +505,41 @@ xfs_file_aio_write_checks(
struct xfs_inode *ip = XFS_I(inode);
int error = 0;
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
restart:
error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
- if (error) {
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
return error;
- }
-
- if (likely(!(file->f_mode & FMODE_NOCMTIME)))
- file_update_time(file);
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
* write. If zeroing is needed and we are currently holding the
- * iolock shared, we need to update it to exclusive which involves
- * dropping all locks and relocking to maintain correct locking order.
- * If we do this, restart the function to ensure all checks and values
- * are still valid.
+ * iolock shared, we need to update it to exclusive which implies
+ * having to redo all checks before.
*/
if (*pos > i_size_read(inode)) {
if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+ xfs_rw_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+ xfs_rw_ilock(ip, *iolock);
goto restart;
}
error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
+ if (error)
+ return error;
+ }
+
+ /*
+ * Updating the timestamps will grab the ilock again from
+ * xfs_fs_dirty_inode, so we have to call it after dropping the
+ * lock above. Eventually we should look into a way to avoid
+ * the pointless lock roundtrip.
+ */
+ if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
+ error = file_update_time(file);
+ if (error)
+ return error;
}
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
/*
* If we're writing the file then make sure to clear the setuid and
@@ -690,7 +547,6 @@ restart:
* people from modifying setuid and setgid binaries.
*/
return file_remove_suid(file);
-
}
/*
@@ -721,10 +577,7 @@ restart:
STATIC ssize_t
xfs_file_dio_aio_write(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t ocount)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -732,15 +585,18 @@ xfs_file_dio_aio_write(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t ret = 0;
- size_t count = ocount;
int unaligned_io = 0;
int iolock;
+ size_t count = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((pos & target->bt_smask) || (count & target->bt_smask))
+ /* DIO must be aligned to device logical sector size */
+ if ((pos | count) & target->bt_logical_sectormask)
return -XFS_ERROR(EINVAL);
+ /* "unaligned" here means not aligned to a filesystem block */
if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
unaligned_io = 1;
@@ -771,12 +627,14 @@ xfs_file_dio_aio_write(
ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
goto out;
+ iov_iter_truncate(from, count);
if (mapping->nrpages) {
- ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
- FI_REMAPF_LOCKED);
+ ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ pos, -1);
if (ret)
goto out;
+ truncate_pagecache_range(VFS_I(ip), pos, -1);
}
/*
@@ -791,8 +649,7 @@ xfs_file_dio_aio_write(
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_direct_write(iocb, iovp,
- &nr_segs, pos, &iocb->ki_pos, count, ocount);
+ ret = generic_file_direct_write(iocb, from, pos);
out:
xfs_rw_iunlock(ip, iolock);
@@ -805,10 +662,7 @@ out:
STATIC ssize_t
xfs_file_buffered_aio_write(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t ocount)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -817,7 +671,8 @@ xfs_file_buffered_aio_write(
ssize_t ret;
int enospc = 0;
int iolock = XFS_IOLOCK_EXCL;
- size_t count = ocount;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
xfs_rw_ilock(ip, iolock);
@@ -825,22 +680,24 @@ xfs_file_buffered_aio_write(
if (ret)
goto out;
+ iov_iter_truncate(from, count);
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_buffered_write(iocb, iovp, nr_segs,
- pos, &iocb->ki_pos, count, ret);
+ ret = generic_perform_write(file, from, pos);
+ if (likely(ret >= 0))
+ iocb->ki_pos = pos + ret;
/*
- * if we just got an ENOSPC, flush the inode now we aren't holding any
- * page locks and retry *once*
+ * If we just got an ENOSPC, try to write back all dirty inodes to
+ * convert delalloc space to free up some of the excess reserved
+ * metadata space.
*/
if (ret == -ENOSPC && !enospc) {
enospc = 1;
- ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
- if (!ret)
- goto write_retry;
+ xfs_flush_inodes(ip->i_mount);
+ goto write_retry;
}
current->backing_dev_info = NULL;
@@ -850,40 +707,29 @@ out:
}
STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_write_iter(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- size_t ocount = 0;
+ size_t ocount = iov_iter_count(from);
XFS_STATS_INC(xs_write_calls);
- BUG_ON(iocb->ki_pos != pos);
-
- ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
- if (ret)
- return ret;
-
if (ocount == 0)
return 0;
- xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
-
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
if (unlikely(file->f_flags & O_DIRECT))
- ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
+ ret = xfs_file_dio_aio_write(iocb, from);
else
- ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount);
+ ret = xfs_file_buffered_aio_write(iocb, from);
if (ret > 0) {
ssize_t err;
@@ -891,54 +737,99 @@ xfs_file_aio_write(
XFS_STATS_ADD(xs_write_bytes, ret);
/* Handle various SYNC-type writes */
- err = generic_write_sync(file, pos, ret);
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
-
return ret;
}
STATIC long
xfs_file_fallocate(
- struct file *file,
- int mode,
- loff_t offset,
- loff_t len)
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- long error;
- loff_t new_size = 0;
- xfs_flock64_t bf;
- xfs_inode_t *ip = XFS_I(inode);
- int cmd = XFS_IOC_RESVSP;
- int attr_flags = XFS_ATTR_NOLOCK;
+ struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_trans *tp;
+ long error;
+ loff_t new_size = 0;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
- bf.l_whence = 0;
- bf.l_start = offset;
- bf.l_len = len;
-
xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- cmd = XFS_IOC_UNRESVSP;
+ if (offset & blksize_mask || len & blksize_mask) {
+ error = EINVAL;
+ goto out_unlock;
+ }
- /* check the new inode size is valid before allocating */
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
- new_size = offset + len;
- error = inode_newsize_ok(inode, new_size);
+ /*
+ * There is no need to overlap collapse range with EOF,
+ * in which case it is effectively a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ error = EINVAL;
+ goto out_unlock;
+ }
+
+ new_size = i_size_read(inode) - len;
+
+ error = xfs_collapse_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else {
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = -inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ error = xfs_zero_file_space(ip, offset, len);
+ else
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
if (error)
goto out_unlock;
}
- if (file->f_flags & O_DSYNC)
- attr_flags |= XFS_ATTR_SYNC;
+ tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+ error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ ip->i_d.di_mode &= ~S_ISUID;
+ if (ip->i_d.di_mode & S_IXGRP)
+ ip->i_d.di_mode &= ~S_ISGID;
+
+ if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
+ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
+ if (file->f_flags & O_DSYNC)
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
if (error)
goto out_unlock;
@@ -948,12 +839,12 @@ xfs_file_fallocate(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK);
+ error = xfs_setattr_size(ip, &iattr);
}
out_unlock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
+ return -error;
}
@@ -986,9 +877,9 @@ xfs_dir_open(
* If there are any blocks, read-ahead block 0 as we're almost
* certain to have the next operation be a read there.
*/
- mode = xfs_ilock_map_shared(ip);
+ mode = xfs_ilock_data_map_shared(ip);
if (ip->i_d.di_nextents > 0)
- xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+ xfs_dir3_data_readahead(ip, 0, -1);
xfs_iunlock(ip, mode);
return 0;
}
@@ -1003,11 +894,10 @@ xfs_file_release(
STATIC int
xfs_file_readdir(
- struct file *filp,
- void *dirent,
- filldir_t filldir)
+ struct file *file,
+ struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
xfs_inode_t *ip = XFS_I(inode);
int error;
size_t bufsize;
@@ -1026,8 +916,7 @@ xfs_file_readdir(
*/
bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
- error = xfs_readdir(ip, dirent, bufsize,
- (xfs_off_t *)&filp->f_pos, filldir);
+ error = xfs_readdir(ip, ctx, bufsize);
if (error)
return -error;
return 0;
@@ -1039,7 +928,6 @@ xfs_file_mmap(
struct vm_area_struct *vma)
{
vma->vm_ops = &xfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
file_accessed(filp);
return 0;
@@ -1059,14 +947,461 @@ xfs_vm_page_mkwrite(
return block_page_mkwrite(vma, vmf, xfs_get_blocks);
}
+/*
+ * This type is designed to indicate the type of offset we would like
+ * to search from page cache for either xfs_seek_data() or xfs_seek_hole().
+ */
+enum {
+ HOLE_OFF = 0,
+ DATA_OFF,
+};
+
+/*
+ * Lookup the desired type of offset from the given page.
+ *
+ * On success, return true and the offset argument will point to the
+ * start of the region that was found. Otherwise this function will
+ * return false and keep the offset argument unchanged.
+ */
+STATIC bool
+xfs_lookup_buffer_offset(
+ struct page *page,
+ loff_t *offset,
+ unsigned int type)
+{
+ loff_t lastoff = page_offset(page);
+ bool found = false;
+ struct buffer_head *bh, *head;
+
+ bh = head = page_buffers(page);
+ do {
+ /*
+ * Unwritten extents that have data in the page
+ * cache covering them can be identified by the
+ * BH_Unwritten state flag. Pages with multiple
+ * buffers might have a mix of holes, data and
+ * unwritten extents - any buffer with valid
+ * data in it should have BH_Uptodate flag set
+ * on it.
+ */
+ if (buffer_unwritten(bh) ||
+ buffer_uptodate(bh)) {
+ if (type == DATA_OFF)
+ found = true;
+ } else {
+ if (type == HOLE_OFF)
+ found = true;
+ }
+
+ if (found) {
+ *offset = lastoff;
+ break;
+ }
+ lastoff += bh->b_size;
+ } while ((bh = bh->b_this_page) != head);
+
+ return found;
+}
+
+/*
+ * This routine is called to find out and return a data or hole offset
+ * from the page cache for unwritten extents according to the desired
+ * type for xfs_seek_data() or xfs_seek_hole().
+ *
+ * The argument offset is used to tell where we start to search from the
+ * page cache. Map is used to figure out the end points of the range to
+ * lookup pages.
+ *
+ * Return true if the desired type of offset was found, and the argument
+ * offset is filled with that address. Otherwise, return false and keep
+ * offset unchanged.
+ */
+STATIC bool
+xfs_find_get_desired_pgoff(
+ struct inode *inode,
+ struct xfs_bmbt_irec *map,
+ unsigned int type,
+ loff_t *offset)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct pagevec pvec;
+ pgoff_t index;
+ pgoff_t end;
+ loff_t endoff;
+ loff_t startoff = *offset;
+ loff_t lastoff = startoff;
+ bool found = false;
+
+ pagevec_init(&pvec, 0);
+
+ index = startoff >> PAGE_CACHE_SHIFT;
+ endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
+ end = endoff >> PAGE_CACHE_SHIFT;
+ do {
+ int want;
+ unsigned nr_pages;
+ unsigned int i;
+
+ want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+ want);
+ /*
+ * No page mapped into given range. If we are searching holes
+ * and if this is the first time we got into the loop, it means
+ * that the given offset is landed in a hole, return it.
+ *
+ * If we have already stepped through some block buffers to find
+ * holes but they all contains data. In this case, the last
+ * offset is already updated and pointed to the end of the last
+ * mapped page, if it does not reach the endpoint to search,
+ * that means there should be a hole between them.
+ */
+ if (nr_pages == 0) {
+ /* Data search found nothing */
+ if (type == DATA_OFF)
+ break;
+
+ ASSERT(type == HOLE_OFF);
+ if (lastoff == startoff || lastoff < endoff) {
+ found = true;
+ *offset = lastoff;
+ }
+ break;
+ }
+
+ /*
+ * At lease we found one page. If this is the first time we
+ * step into the loop, and if the first page index offset is
+ * greater than the given search offset, a hole was found.
+ */
+ if (type == HOLE_OFF && lastoff == startoff &&
+ lastoff < page_offset(pvec.pages[0])) {
+ found = true;
+ break;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ loff_t b_offset;
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL),
+ * or even swizzled back from swapper_space to tmpfs
+ * file mapping. However, page->index will not change
+ * because we have a reference on the page.
+ *
+ * Searching done if the page index is out of range.
+ * If the current offset is not reaches the end of
+ * the specified search range, there should be a hole
+ * between them.
+ */
+ if (page->index > end) {
+ if (type == HOLE_OFF && lastoff < endoff) {
+ *offset = lastoff;
+ found = true;
+ }
+ goto out;
+ }
+
+ lock_page(page);
+ /*
+ * Page truncated or invalidated(page->mapping == NULL).
+ * We can freely skip it and proceed to check the next
+ * page.
+ */
+ if (unlikely(page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!page_has_buffers(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ found = xfs_lookup_buffer_offset(page, &b_offset, type);
+ if (found) {
+ /*
+ * The found offset may be less than the start
+ * point to search if this is the first time to
+ * come here.
+ */
+ *offset = max_t(loff_t, startoff, b_offset);
+ unlock_page(page);
+ goto out;
+ }
+
+ /*
+ * We either searching data but nothing was found, or
+ * searching hole but found a data buffer. In either
+ * case, probably the next page contains the desired
+ * things, update the last offset to it so.
+ */
+ lastoff = page_offset(page) + PAGE_SIZE;
+ unlock_page(page);
+ }
+
+ /*
+ * The number of returned pages less than our desired, search
+ * done. In this case, nothing was found for searching data,
+ * but we found a hole behind the last offset.
+ */
+ if (nr_pages < want) {
+ if (type == HOLE_OFF) {
+ *offset = lastoff;
+ found = true;
+ }
+ break;
+ }
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index <= end);
+
+out:
+ pagevec_release(&pvec);
+ return found;
+}
+
+STATIC loff_t
+xfs_seek_data(
+ struct file *file,
+ loff_t start)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ loff_t uninitialized_var(offset);
+ xfs_fsize_t isize;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t end;
+ uint lock;
+ int error;
+
+ lock = xfs_ilock_data_map_shared(ip);
+
+ isize = i_size_read(inode);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ /*
+ * Try to read extents from the first block indicated
+ * by fsbno to the end block of the file.
+ */
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ end = XFS_B_TO_FSB(mp, isize);
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ unsigned int i;
+
+ error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_unlock;
+
+ /* No extents at given offset, must be beyond EOF */
+ if (nmap == 0) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < nmap; i++) {
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[i].br_startoff));
+
+ /* Landed in a data extent */
+ if (map[i].br_startblock == DELAYSTARTBLOCK ||
+ (map[i].br_state == XFS_EXT_NORM &&
+ !isnullstartblock(map[i].br_startblock)))
+ goto out;
+
+ /*
+ * Landed in an unwritten extent, try to search data
+ * from page cache.
+ */
+ if (map[i].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_find_get_desired_pgoff(inode, &map[i],
+ DATA_OFF, &offset))
+ goto out;
+ }
+ }
+
+ /*
+ * map[0] is hole or its an unwritten extent but
+ * without data in page cache. Probably means that
+ * we are reading after EOF if nothing in map[1].
+ */
+ if (nmap == 1) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ ASSERT(i > 1);
+
+ /*
+ * Nothing was found, proceed to the next round of search
+ * if reading offset not beyond or hit EOF.
+ */
+ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
+ start = XFS_FSB_TO_B(mp, fsbno);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+ }
+
+out:
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out_unlock:
+ xfs_iunlock(ip, lock);
+
+ if (error)
+ return -error;
+ return offset;
+}
+
+STATIC loff_t
+xfs_seek_hole(
+ struct file *file,
+ loff_t start)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ loff_t uninitialized_var(offset);
+ xfs_fsize_t isize;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t end;
+ uint lock;
+ int error;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ lock = xfs_ilock_data_map_shared(ip);
+
+ isize = i_size_read(inode);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ end = XFS_B_TO_FSB(mp, isize);
+
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ unsigned int i;
+
+ error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_unlock;
+
+ /* No extents at given offset, must be beyond EOF */
+ if (nmap == 0) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < nmap; i++) {
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[i].br_startoff));
+
+ /* Landed in a hole */
+ if (map[i].br_startblock == HOLESTARTBLOCK)
+ goto out;
+
+ /*
+ * Landed in an unwritten extent, try to search hole
+ * from page cache.
+ */
+ if (map[i].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_find_get_desired_pgoff(inode, &map[i],
+ HOLE_OFF, &offset))
+ goto out;
+ }
+ }
+
+ /*
+ * map[0] contains data or its unwritten but contains
+ * data in page cache, probably means that we are
+ * reading after EOF. We should fix offset to point
+ * to the end of the file(i.e., there is an implicit
+ * hole at the end of any file).
+ */
+ if (nmap == 1) {
+ offset = isize;
+ break;
+ }
+
+ ASSERT(i > 1);
+
+ /*
+ * Both mappings contains data, proceed to the next round of
+ * search if the current reading offset not beyond or hit EOF.
+ */
+ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
+ start = XFS_FSB_TO_B(mp, fsbno);
+ if (start >= isize) {
+ offset = isize;
+ break;
+ }
+ }
+
+out:
+ /*
+ * At this point, we must have found a hole. However, the returned
+ * offset may be bigger than the file size as it may be aligned to
+ * page boundary for unwritten extents, we need to deal with this
+ * situation in particular.
+ */
+ offset = min_t(loff_t, offset, isize);
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out_unlock:
+ xfs_iunlock(ip, lock);
+
+ if (error)
+ return -error;
+ return offset;
+}
+
+STATIC loff_t
+xfs_file_llseek(
+ struct file *file,
+ loff_t offset,
+ int origin)
+{
+ switch (origin) {
+ case SEEK_END:
+ case SEEK_CUR:
+ case SEEK_SET:
+ return generic_file_llseek(file, offset, origin);
+ case SEEK_DATA:
+ return xfs_seek_data(file, offset);
+ case SEEK_HOLE:
+ return xfs_seek_hole(file, offset);
+ default:
+ return -EINVAL;
+ }
+}
+
const struct file_operations xfs_file_operations = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = xfs_file_aio_read,
- .aio_write = xfs_file_aio_write,
+ .llseek = xfs_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = xfs_file_read_iter,
+ .write_iter = xfs_file_write_iter,
.splice_read = xfs_file_splice_read,
- .splice_write = xfs_file_splice_write,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
@@ -1081,7 +1416,7 @@ const struct file_operations xfs_file_operations = {
const struct file_operations xfs_dir_file_operations = {
.open = xfs_dir_open,
.read = generic_read_dir,
- .readdir = xfs_file_readdir,
+ .iterate = xfs_file_readdir,
.llseek = generic_file_llseek,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
@@ -1092,5 +1427,7 @@ const struct file_operations xfs_dir_file_operations = {
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = xfs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};