aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/xfs.txt13
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile4
-rw-r--r--fs/xfs/uuid.h6
-rw-r--r--fs/xfs/xfs_ag.h5
-rw-r--r--fs/xfs/xfs_alloc.c140
-rw-r--r--fs/xfs/xfs_alloc.h3
-rw-r--r--fs/xfs/xfs_alloc_btree.c77
-rw-r--r--fs/xfs/xfs_alloc_btree.h2
-rw-r--r--fs/xfs/xfs_aops.c83
-rw-r--r--fs/xfs/xfs_attr.c103
-rw-r--r--fs/xfs/xfs_attr_leaf.c143
-rw-r--r--fs/xfs/xfs_attr_leaf.h6
-rw-r--r--fs/xfs/xfs_bmap.c64
-rw-r--r--fs/xfs/xfs_bmap_btree.c63
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c111
-rw-r--r--fs/xfs/xfs_btree.h22
-rw-r--r--fs/xfs/xfs_buf.c59
-rw-r--r--fs/xfs/xfs_buf.h27
-rw-r--r--fs/xfs/xfs_cksum.h63
-rw-r--r--fs/xfs/xfs_da_btree.c141
-rw-r--r--fs/xfs/xfs_da_btree.h10
-rw-r--r--fs/xfs/xfs_dfrag.c13
-rw-r--r--fs/xfs/xfs_dir2_block.c436
-rw-r--r--fs/xfs/xfs_dir2_data.c170
-rw-r--r--fs/xfs/xfs_dir2_leaf.c172
-rw-r--r--fs/xfs/xfs_dir2_node.c288
-rw-r--r--fs/xfs/xfs_dir2_priv.h19
-rw-r--r--fs/xfs/xfs_dquot.c134
-rw-r--r--fs/xfs/xfs_dquot.h2
-rw-r--r--fs/xfs/xfs_export.c1
-rw-r--r--fs/xfs/xfs_file.c42
-rw-r--r--fs/xfs/xfs_fs.h33
-rw-r--r--fs/xfs/xfs_fs_subr.c96
-rw-r--r--fs/xfs/xfs_fsops.c141
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_ialloc.c83
-rw-r--r--fs/xfs/xfs_ialloc.h4
-rw-r--r--fs/xfs/xfs_ialloc_btree.c55
-rw-r--r--fs/xfs/xfs_ialloc_btree.h2
-rw-r--r--fs/xfs/xfs_icache.c (renamed from fs/xfs/xfs_sync.c)914
-rw-r--r--fs/xfs/xfs_icache.h (renamed from fs/xfs/xfs_sync.h)28
-rw-r--r--fs/xfs/xfs_iget.c705
-rw-r--r--fs/xfs/xfs_inode.c437
-rw-r--r--fs/xfs/xfs_inode.h12
-rw-r--r--fs/xfs/xfs_ioctl.c21
-rw-r--r--fs/xfs/xfs_iomap.c31
-rw-r--r--fs/xfs/xfs_iops.c8
-rw-r--r--fs/xfs/xfs_itable.c4
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c241
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_priv.h12
-rw-r--r--fs/xfs/xfs_log_recover.c146
-rw-r--r--fs/xfs/xfs_mount.c163
-rw-r--r--fs/xfs/xfs_mount.h13
-rw-r--r--fs/xfs/xfs_qm.c22
-rw-r--r--fs/xfs/xfs_qm_syscalls.c6
-rw-r--r--fs/xfs/xfs_rtalloc.c16
-rw-r--r--fs/xfs/xfs_sb.h7
-rw-r--r--fs/xfs/xfs_super.c148
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_sysctl.c9
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_trace.h60
-rw-r--r--fs/xfs/xfs_trans.h19
-rw-r--r--fs/xfs/xfs_trans_buf.c9
-rw-r--r--fs/xfs/xfs_vnodeops.c168
-rw-r--r--fs/xfs/xfs_vnodeops.h9
70 files changed, 3747 insertions, 2311 deletions
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 3fc0c31a6f5..3e4b3dd1e04 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -43,7 +43,7 @@ When mounting an XFS filesystem, the following options are accepted.
Issue command to let the block device reclaim space freed by the
filesystem. This is useful for SSD devices, thinly provisioned
LUNs and virtual machine images, but may have a performance
- impact. This option is incompatible with the nodelaylog option.
+ impact.
dmapi
Enable the DMAPI (Data Management API) event callouts.
@@ -72,8 +72,15 @@ When mounting an XFS filesystem, the following options are accepted.
Indicates that XFS is allowed to create inodes at any location
in the filesystem, including those which will result in inode
numbers occupying more than 32 bits of significance. This is
- provided for backwards compatibility, but causes problems for
- backup applications that cannot handle large inode numbers.
+ the default allocation option. Applications which do not handle
+ inode numbers bigger than 32 bits, should use inode32 option.
+
+ inode32
+ Indicates that XFS is limited to create inodes at locations which
+ will not result in inode numbers with more than 32 bits of
+ significance. This is provided for backwards compatibility, since
+ 64 bits inode numbers might cause problems for some applications
+ that cannot handle large inode numbers.
largeio/nolargeio
If "nolargeio" is specified, the optimal I/O reported in
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 6100ec0fa1d..5a7ffe54f5d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -2,6 +2,7 @@ config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
select EXPORTFS
+ select LIBCRC32C
help
XFS is a high performance journaling filesystem which originated
on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2..d02201df855 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \
xfs_file.o \
xfs_filestream.o \
xfs_fsops.o \
- xfs_fs_subr.o \
xfs_globals.o \
- xfs_iget.o \
+ xfs_icache.o \
xfs_ioctl.o \
xfs_iomap.o \
xfs_iops.o \
@@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \
xfs_message.o \
xfs_mru_cache.o \
xfs_super.o \
- xfs_sync.o \
xfs_xattr.o \
xfs_rename.o \
xfs_utils.o \
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
index 4732d71262c..104db0f3bed 100644
--- a/fs/xfs/uuid.h
+++ b/fs/xfs/uuid.h
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
+static inline void
+uuid_copy(uuid_t *dst, uuid_t *src)
+{
+ memcpy(dst, src, sizeof(uuid_t));
+}
+
#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 44d65c1533c..f2aeedb6a57 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+
/*
* Size of the unlinked inode hash table in the agi.
*/
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, struct xfs_buf **bpp);
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
/*
* The third a.g. block contains the a.g. freelist, an array
* of block pointers to blocks owned by the allocation btree code.
@@ -233,6 +237,7 @@ typedef struct xfs_perag {
#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
in xfs_inode_ag_iterator */
#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 335206a9c69..393055fe3ae 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees(
return 0;
}
+static void
+xfs_agfl_verify(
+ struct xfs_buf *bp)
+{
+#ifdef WHEN_CRCS_COME_ALONG
+ /*
+ * we cannot actually do any verification of the AGFL because mkfs does
+ * not initialise the AGFL to zero or NULL. Hence the only valid part of
+ * the AGFL is what the AGF says is active. We can't get to the AGF, so
+ * we can't verify just those entries are valid.
+ *
+ * This problem goes away when the CRC format change comes along as that
+ * requires the AGFL to be initialised by mkfs. At that point, we can
+ * verify the blocks in the agfl -active or not- lie within the bounds
+ * of the AG. Until then, just leave this check ifdef'd out.
+ */
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ int agfl_ok = 1;
+
+ int i;
+
+ for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+ if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
+ be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ agfl_ok = 0;
+ }
+
+ if (!agfl_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+#endif
+}
+
+static void
+xfs_agfl_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agfl_verify(bp);
+}
+
+static void
+xfs_agfl_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agfl_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .verify_read = xfs_agfl_read_verify,
+ .verify_write = xfs_agfl_write_verify,
+};
+
/*
* Read in the allocation group free block array.
*/
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl(
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
ASSERT(!xfs_buf_geterror(bp));
@@ -2091,6 +2145,63 @@ xfs_alloc_put_freelist(
return 0;
}
+static void
+xfs_agf_verify(
+ struct xfs_buf *bp)
+ {
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agf *agf;
+ int agf_ok;
+
+ agf = XFS_BUF_TO_AGF(bp);
+
+ agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+ XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag)
+ agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
+ bp->b_pag->pag_agno;
+
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+ be32_to_cpu(agf->agf_length);
+
+ if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+ XFS_RANDOM_ALLOC_READ_AGF))) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_agf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agf_verify(bp);
+}
+
+static void
+xfs_agf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .verify_read = xfs_agf_read_verify,
+ .verify_write = xfs_agf_write_verify,
+};
+
/*
* Read in the allocation group header (free/alloc section).
*/
@@ -2102,44 +2213,19 @@ xfs_read_agf(
int flags, /* XFS_BUF_ */
struct xfs_buf **bpp) /* buffer for the ag freelist header */
{
- struct xfs_agf *agf; /* ag freelist header */
- int agf_ok; /* set if agf is consistent */
int error;
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), flags, bpp);
+ XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
if (error)
return error;
if (!*bpp)
return 0;
ASSERT(!(*bpp)->b_error);
- agf = XFS_BUF_TO_AGF(*bpp);
-
- /*
- * Validate the magic number of the agf block.
- */
- agf_ok =
- agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
- be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_seqno) == agno;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
- agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
- be32_to_cpu(agf->agf_length);
- if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
- XFS_RANDOM_ALLOC_READ_AGF))) {
- XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
- XFS_ERRLEVEL_LOW, mp, agf);
- xfs_trans_brelse(tp, *bpp);
- return XFS_ERROR(EFSCORRUPTED);
- }
xfs_buf_set_ref(*bpp, XFS_AGF_REF);
return 0;
}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index feacb061bab..99d0a610155 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -231,4 +231,7 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f7876c6d616..b1ddef6b268 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -272,6 +272,82 @@ xfs_allocbt_key_diff(
return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
+static void
+xfs_allocbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ unsigned int level;
+ int sblock_ok; /* block passes checks */
+
+ /*
+ * magic number and level verification
+ *
+ * During growfs operations, we can't verify the exact level as the
+ * perag is not fully initialised and hence not attached to the buffer.
+ * In this case, check against the maximum tree depth.
+ */
+ level = be16_to_cpu(block->bb_level);
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_ABTB_MAGIC):
+ if (pag)
+ sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
+ else
+ sblock_ok = level < mp->m_ag_maxlevels;
+ break;
+ case cpu_to_be32(XFS_ABTC_MAGIC):
+ if (pag)
+ sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
+ else
+ sblock_ok = level < mp->m_ag_maxlevels;
+ break;
+ default:
+ sblock_ok = 0;
+ break;
+ }
+
+ /* numrecs verification */
+ sblock_ok = sblock_ok &&
+ be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
+
+ /* sibling pointer verification */
+ sblock_ok = sblock_ok &&
+ (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_leftsib &&
+ (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_rightsib;
+
+ if (!sblock_ok) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_allocbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_allocbt_verify(bp);
+}
+
+static void
+xfs_allocbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_allocbt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+};
+
+
#ifdef DEBUG
STATIC int
xfs_allocbt_keys_inorder(
@@ -327,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_allocbt_key_diff,
+ .buf_ops = &xfs_allocbt_buf_ops,
#ifdef DEBUG
.keys_inorder = xfs_allocbt_keys_inorder,
.recs_inorder = xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 359fb86ed87..7e89a2b429d 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
xfs_agnumber_t, xfs_btnum_t);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e57e2daa357..4111a40ebe1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
ioend->io_append_trans = tp;
/*
- * We will pass freeze protection with a transaction. So tell lockdep
+ * We may pass freeze protection with a transaction. So tell lockdep
* we released it.
*/
rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
xfs_fsize_t isize;
/*
- * The transaction was allocated in the I/O submission thread,
- * thus we need to mark ourselves as beeing in a transaction
- * manually.
+ * The transaction may have been allocated in the I/O submission thread,
+ * thus we need to mark ourselves as beeing in a transaction manually.
+ * Similarly for freeze protection.
*/
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
xfs_ilock(ip, XFS_ILOCK_EXCL);
isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
if (ioend->io_type == XFS_IO_UNWRITTEN)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
- else if (ioend->io_append_trans)
+ else if (ioend->io_append_trans ||
+ (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
queue_work(mp->m_data_workqueue, &ioend->io_work);
else
xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
struct xfs_inode *ip = XFS_I(ioend->io_inode);
int error = 0;
- if (ioend->io_append_trans) {
- /*
- * We've got freeze protection passed with the transaction.
- * Tell lockdep about it.
- */
- rwsem_acquire_read(
- &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
- }
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
ioend->io_error = -EIO;
goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
* range to normal written extens after the data I/O has finished.
*/
if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+ ioend->io_size);
+ } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
/*
- * For buffered I/O we never preallocate a transaction when
- * doing the unwritten extent conversion, but for direct I/O
- * we do not know if we are converting an unwritten extent
- * or not at the point where we preallocate the transaction.
+ * For direct I/O we do not know if we need to allocate blocks
+ * or not so we can't preallocate an append transaction as that
+ * results in nested reservations and log space deadlocks. Hence
+ * allocate the transaction here. While this is sub-optimal and
+ * can block IO completion for some time, we're stuck with doing
+ * it this way until we can pass the ioend to the direct IO
+ * allocation callbacks and avoid nesting that way.
*/
- if (ioend->io_append_trans) {
- ASSERT(ioend->io_isdirect);
-
- current_set_flags_nested(
- &ioend->io_append_trans->t_pflags, PF_FSTRANS);
- xfs_trans_cancel(ioend->io_append_trans, 0);
- }
-
- error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
- ioend->io_size);
- if (error) {
- ioend->io_error = -error;
+ error = xfs_setfilesize_trans_alloc(ioend);
+ if (error)
goto done;
- }
+ error = xfs_setfilesize(ioend);
} else if (ioend->io_append_trans) {
error = xfs_setfilesize(ioend);
- if (error)
- ioend->io_error = -error;
} else {
ASSERT(!xfs_ioend_is_append(ioend));
}
done:
+ if (error)
+ ioend->io_error = -error;
xfs_destroy_ioend(ioend);
}
@@ -1432,25 +1422,21 @@ xfs_vm_direct_IO(
size_t size = iov_length(iov, nr_segs);
/*
- * We need to preallocate a transaction for a size update
- * here. In the case that this write both updates the size
- * and converts at least on unwritten extent we will cancel
- * the still clean transaction after the I/O has finished.
+ * We cannot preallocate a size update transaction here as we
+ * don't know whether allocation is necessary or not. Hence we
+ * can only tell IO completion that one is necessary if we are
+ * not doing unwritten extent conversion.
*/
iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
- if (offset + size > XFS_I(inode)->i_d.di_size) {
- ret = xfs_setfilesize_trans_alloc(ioend);
- if (ret)
- goto out_destroy_ioend;
+ if (offset + size > XFS_I(inode)->i_d.di_size)
ioend->io_isdirect = 1;
- }
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
xfs_get_blocks_direct,
xfs_end_io_direct_write, NULL, 0);
if (ret != -EIOCBQUEUED && iocb->private)
- goto out_trans_cancel;
+ goto out_destroy_ioend;
} else {
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
@@ -1460,15 +1446,6 @@ xfs_vm_direct_IO(
return ret;
-out_trans_cancel:
- if (ioend->io_append_trans) {
- current_set_flags_nested(&ioend->io_append_trans->t_pflags,
- PF_FSTRANS);
- rwsem_acquire_read(
- &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
- xfs_trans_cancel(ioend->io_append_trans, 0);
- }
out_destroy_ioend:
xfs_destroy_ioend(ioend);
return ret;
@@ -1641,7 +1618,7 @@ xfs_vm_bmap(
trace_xfs_vm_bmap(XFS_I(inode));
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
+ filemap_write_and_wait(mapping);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
}
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 0ca1f0be62d..aaf472532b3 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
dp = args->dp;
args->blkno = 0;