aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c5935
1 files changed, 3815 insertions, 2120 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 945cbf6cb1f..8a064734e6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -12,20 +12,14 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
- * Goal-directed block allocation by Stephen Tweedie
- * (sct@redhat.com), 1993, 1998
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
* 64-bit file support on 64-bit platforms by Jakub Jelinek
* (jj@sunsite.ms.mff.cuni.cz)
*
* Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
-#include <linux/ext4_jbd2.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
@@ -33,134 +27,131 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/aio.h>
+#include <linux/bitops.h>
+
+#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "truncate.h"
-/*
- * Test whether an inode is a fast symlink.
- */
-static int ext4_inode_is_fast_symlink(struct inode *inode)
-{
- int ea_blocks = EXT4_I(inode)->i_file_acl ?
- (inode->i_sb->s_blocksize >> 9) : 0;
+#include <trace/events/ext4.h>
- return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
-}
+#define MPAGE_DA_EXTENT_TAIL 0x01
-/*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled. Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t blocknr)
+static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- int err;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u16 csum_lo;
+ __u16 csum_hi = 0;
+ __u32 csum;
+
+ csum_lo = le16_to_cpu(raw->i_checksum_lo);
+ raw->i_checksum_lo = 0;
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+ csum_hi = le16_to_cpu(raw->i_checksum_hi);
+ raw->i_checksum_hi = 0;
+ }
- might_sleep();
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
+ EXT4_INODE_SIZE(inode->i_sb));
- BUFFER_TRACE(bh, "enter");
+ raw->i_checksum_lo = cpu_to_le16(csum_lo);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = cpu_to_le16(csum_hi);
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %lx\n",
- bh, is_metadata, inode->i_mode,
- test_opt(inode->i_sb, DATA_FLAGS));
+ return csum;
+}
- /* Never use the revoke function if we are doing full data
- * journaling: there is no need to, and a V1 superblock won't
- * support it. Otherwise, only skip the revoke on un-journaled
- * data blocks. */
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
+{
+ __u32 provided, calculated;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- (!is_metadata && !ext4_should_journal_data(inode))) {
- if (bh) {
- BUFFER_TRACE(bh, "call jbd2_journal_forget");
- return ext4_journal_forget(handle, bh);
- }
- return 0;
- }
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
+
+ provided = le16_to_cpu(raw->i_checksum_lo);
+ calculated = ext4_inode_csum(inode, raw, ei);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
+ else
+ calculated &= 0xFFFF;
- /*
- * data!=journal && (is_metadata || should_journal_data(inode))
- */
- BUFFER_TRACE(bh, "call ext4_journal_revoke");
- err = ext4_journal_revoke(handle, blocknr, bh);
- if (err)
- ext4_abort(inode->i_sb, __FUNCTION__,
- "error %d when attempting revoke", err);
- BUFFER_TRACE(bh, "exit");
- return err;
+ return provided == calculated;
}
-/*
- * Work out how many blocks we need to proceed with the next chunk of a
- * truncate transaction.
- */
-static unsigned long blocks_for_truncate(struct inode *inode)
+static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- ext4_lblk_t needed;
-
- needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+ __u32 csum;
- /* Give ourselves just enough room to cope with inodes in which
- * i_blocks is corrupt: we've seen disk corruptions in the past
- * which resulted in random data in an inode which looked enough
- * like a regular file for ext4 to try to delete it. Things
- * will go a bit crazy if that happens, but at least we should
- * try not to panic the whole kernel. */
- if (needed < 2)
- needed = 2;
-
- /* But we need to bound the transaction so we don't overflow the
- * journal. */
- if (needed > EXT4_MAX_TRANS_DATA)
- needed = EXT4_MAX_TRANS_DATA;
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
- return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+ csum = ext4_inode_csum(inode, raw, ei);
+ raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}
-/*
- * Truncate transactions can be complex and absolutely huge. So we need to
- * be able to restart the transaction at a conventient checkpoint to make
- * sure we don't overflow the journal.
- *
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
{
- handle_t *result;
-
- result = ext4_journal_start(inode, blocks_for_truncate(inode));
- if (!IS_ERR(result))
- return result;
-
- ext4_std_error(inode->i_sb, PTR_ERR(result));
- return result;
+ trace_ext4_begin_ordered_truncate(inode, new_size);
+ /*
+ * If jinode is zero, then we never opened the file for
+ * writing, so there's no need to call
+ * jbd2_journal_begin_ordered_truncate() since there's no
+ * outstanding writes we need to flush.
+ */
+ if (!EXT4_I(inode)->jinode)
+ return 0;
+ return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+ EXT4_I(inode)->jinode,
+ new_size);
}
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
+
/*
- * Try to extend this transaction for the purposes of truncation.
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * Test whether an inode is a fast symlink.
*/
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+static int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
- return 0;
- if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
+ int ea_blocks = EXT4_I(inode)->i_file_acl ?
+ EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
+
+ if (ext4_has_inline_data(inode))
return 0;
- return 1;
+
+ return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
/*
@@ -168,40 +159,134 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+ int nblocks)
{
+ int ret;
+
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
- return ext4_journal_restart(handle, blocks_for_truncate(inode));
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_journal_restart(handle, nblocks);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ return ret;
}
/*
* Called at the last iput() if i_nlink is zero.
*/
-void ext4_delete_inode (struct inode * inode)
+void ext4_evict_inode(struct inode *inode)
{
handle_t *handle;
+ int err;
- truncate_inode_pages(&inode->i_data, 0);
+ trace_ext4_evict_inode(inode);
+
+ if (inode->i_nlink) {
+ /*
+ * When journalling data dirty buffers are tracked only in the
+ * journal. So although mm thinks everything is clean and
+ * ready for reaping the inode might still have some pages to
+ * write in the running transaction or waiting to be
+ * checkpointed. Thus calling jbd2_journal_invalidatepage()
+ * (via truncate_inode_pages()) to discard these buffers can
+ * cause data loss. Also even if we did not discard these
+ * buffers, we would have no way to find them after the inode
+ * is reaped and thus user could see stale data if he tries to
+ * read them before the transaction is checkpointed. So be
+ * careful and force everything to disk here... We use
+ * ei->i_datasync_tid to store the newest transaction
+ * containing inode's data.
+ *
+ * Note that directories do not have this problem because they
+ * don't use page cache.
+ */
+ if (ext4_should_journal_data(inode) &&
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_ino != EXT4_JOURNAL_INO) {
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
+
+ jbd2_complete_transaction(journal, commit_tid);
+ filemap_write_and_wait(&inode->i_data);
+ }
+ truncate_inode_pages_final(&inode->i_data);
+
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
+ goto no_delete;
+ }
+
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
+ truncate_inode_pages_final(&inode->i_data);
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
goto no_delete;
- handle = start_transaction(inode);
+ /*
+ * Protect us against freezing - iput() caller didn't have to have any
+ * protection against it
+ */
+ sb_start_intwrite(inode->i_sb);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+ ext4_blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
* If we're going to skip the normal cleanup, we still need to
* make sure that the in-core orphan linked list is properly
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode->i_size = 0;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext4_warning(inode->i_sb,
+ "couldn't mark inode dirty (err %d)", err);
+ goto stop_handle;
+ }
if (inode->i_blocks)
ext4_truncate(inode);
+
+ /*
+ * ext4_ext_truncate() doesn't reserve any slop when it
+ * restarts journal transactions; therefore there may not be
+ * enough credits left in the handle to remove the inode from
+ * the orphan list and set the dtime field.
+ */
+ if (!ext4_handle_has_enough_credits(handle, 3)) {
+ err = ext4_journal_extend(handle, 3);
+ if (err > 0)
+ err = ext4_journal_restart(handle, 3);
+ if (err != 0) {
+ ext4_warning(inode->i_sb,
+ "couldn't extend journal (err %d)", err);
+ stop_handle:
+ ext4_journal_stop(handle);
+ ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
+ goto no_delete;
+ }
+ }
+
/*
* Kill off the orphan record which ext4_truncate created.
* AKPM: I think this can be inside the above `if'.
@@ -222,887 +307,543 @@ void ext4_delete_inode (struct inode * inode)
*/
if (ext4_mark_inode_dirty(handle, inode))
/* If that failed, just do the required in-core inode clear. */
- clear_inode(inode);
+ ext4_clear_inode(inode);
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
+ sb_end_intwrite(inode->i_sb);
return;
no_delete:
- clear_inode(inode); /* We must guarantee clearing of inode... */
+ ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
-typedef struct {
- __le32 *p;
- __le32 key;
- struct buffer_head *bh;
-} Indirect;
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
{
- p->key = *(p->p = v);
- p->bh = bh;
+ return &EXT4_I(inode)->i_reserved_quota;
}
-
-/**
- * ext4_block_to_path - parse the block number into array of offsets
- * @inode: inode in question (we are only interested in its superblock)
- * @i_block: block number to be parsed
- * @offsets: array to store the offsets in
- * @boundary: set this non-zero if the referred-to block is likely to be
- * followed (on disk) by an indirect block.
- *
- * To store the locations of file's data ext4 uses a data structure common
- * for UNIX filesystems - tree of pointers anchored in the inode, with
- * data blocks at leaves and indirect blocks in intermediate nodes.
- * This function translates the block number into path in that tree -
- * return value is the path length and @offsets[n] is the offset of
- * pointer to (n+1)th node in the nth one. If @block is out of range
- * (negative or too large) warning is printed and zero returned.
- *
- * Note: function doesn't find node addresses, so no IO is needed. All
- * we need to know is the capacity of indirect blocks (taken from the
- * inode->i_sb).
- */
+#endif
/*
- * Portability note: the last comparison (check that we fit into triple
- * indirect block) is spelled differently, because otherwise on an
- * architecture with 32-bit longs and 8Kb pages we might get into trouble
- * if our filesystem had 8Kb blocks. We might use long long, but that would
- * kill us on x86. Oh, well, at least the sign propagation does not matter -
- * i_block would have to be negative in the very beginning, so we would not
- * get there at all.
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate a block located at @lblock
*/
-
-static int ext4_block_to_path(struct inode *inode,
- ext4_lblk_t i_block,
- ext4_lblk_t offsets[4], int *boundary)
-{
- int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
- const long direct_blocks = EXT4_NDIR_BLOCKS,
- indirect_blocks = ptrs,
- double_blocks = (1 << (ptrs_bits * 2));
- int n = 0;
- int final = 0;
-
- if (i_block < 0) {
- ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
- } else if (i_block < direct_blocks) {
- offsets[n++] = i_block;
- final = direct_blocks;
- } else if ( (i_block -= direct_blocks) < indirect_blocks) {
- offsets[n++] = EXT4_IND_BLOCK;
- offsets[n++] = i_block;
- final = ptrs;
- } else if ((i_block -= indirect_blocks) < double_blocks) {
- offsets[n++] = EXT4_DIND_BLOCK;
- offsets[n++] = i_block >> ptrs_bits;
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
- offsets[n++] = EXT4_TIND_BLOCK;
- offsets[n++] = i_block >> (ptrs_bits * 2);
- offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else {
- ext4_warning(inode->i_sb, "ext4_block_to_path",
- "block %lu > max",
- i_block + direct_blocks +
- indirect_blocks + double_blocks);
- }
- if (boundary)
- *boundary = final - 1 - (i_block & (ptrs - 1));
- return n;
-}
-
-/**
- * ext4_get_branch - read the chain of indirect blocks leading to data
- * @inode: inode in question
- * @depth: depth of the chain (1 - direct pointer, etc.)
- * @offsets: offsets of pointers in inode/indirect blocks
- * @chain: place to store the result
- * @err: here we store the error value
- *
- * Function fills the array of triples <key, p, bh> and returns %NULL
- * if everything went OK or the pointer to the last filled triple
- * (incomplete one) otherwise. Upon the return chain[i].key contains
- * the number of (i+1)-th block in the chain (as it is stored in memory,
- * i.e. little-endian 32-bit), chain[i].p contains the address of that
- * number (it points into struct inode for i==0 and into the bh->b_data
- * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
- * block for i>0 and NULL for i==0. In other words, it holds the block
- * numbers of the chain, addresses they were taken from (and where we can
- * verify that chain did not change) and buffer_heads hosting these
- * numbers.
- *
- * Function stops when it stumbles upon zero pointer (absent block)
- * (pointer to last triple returned, *@err == 0)
- * or when it gets an IO error reading an indirect block
- * (ditto, *@err == -EIO)
- * or when it reads all @depth-1 indirect blocks successfully and finds
- * the whole chain, all way to the data (returns %NULL, *err == 0).
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem)
- */
-static Indirect *ext4_get_branch(struct inode *inode, int depth,
- ext4_lblk_t *offsets,
- Indirect chain[4], int *err)
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
- struct super_block *sb = inode->i_sb;
- Indirect *p = chain;
- struct buffer_head *bh;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return ext4_ext_calc_metadata_amount(inode, lblock);
- *err = 0;
- /* i_data is not going away, no lock needed */
- add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
- if (!p->key)
- goto no_block;
- while (--depth) {
- bh = sb_bread(sb, le32_to_cpu(p->key));
- if (!bh)
- goto failure;
- add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
- /* Reader: end */
- if (!p->key)
- goto no_block;
- }
- return NULL;
-
-failure:
- *err = -EIO;
-no_block:
- return p;
+ return ext4_ind_calc_metadata_amount(inode, lblock);
}
-/**
- * ext4_find_near - find a place for allocation with sufficient locality
- * @inode: owner
- * @ind: descriptor of indirect block.
- *
- * This function returns the prefered place for block allocation.
- * It is used when heuristic for sequential allocation fails.
- * Rules are:
- * + if there is a block to the left of our position - allocate near it.
- * + if pointer will live in indirect block - allocate near that block.
- * + if pointer will live in inode - allocate in the same
- * cylinder group.
- *
- * In the latter case we colour the starting block by the callers PID to
- * prevent it from clashing with concurrent allocations for a different inode
- * in the same block group. The PID is used here so that functionally related
- * files will be close-by on-disk.
- *
- * Caller must make sure that @ind is valid and will stay that way.
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
*/
-static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
+void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
- __le32 *p;
- ext4_fsblk_t bg_start;
- ext4_fsblk_t last_block;
- ext4_grpblk_t colour;
- /* Try to find previous block */
- for (p = ind->p - 1; p >= start; p--) {
- if (*p)
- return le32_to_cpu(*p);
+ spin_lock(&ei->i_block_reservation_lock);
+ trace_ext4_da_update_reserve_space(inode, used, quota_claim);
+ if (unlikely(used > ei->i_reserved_data_blocks)) {
+ ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
+ "with only %d reserved data blocks",
+ __func__, inode->i_ino, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ used = ei->i_reserved_data_blocks;
}
- /* No such thing, so let's try location of indirect block */
- if (ind->bh)
- return ind->bh->b_blocknr;
-
- /*
- * It is going to be referred to from the inode itself? OK, just put it
- * into the same cylinder group then.
- */
- bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
- last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
-
- if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
- colour = (current->pid % 16) *
- (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
- else
- colour = (current->pid % 16) * ((last_block - bg_start) / 16);
- return bg_start + colour;
-}
-
-/**
- * ext4_find_goal - find a prefered place for allocation.
- * @inode: owner
- * @block: block we want
- * @partial: pointer to the last triple within a chain
- *
- * Normally this function find the prefered place for block allocation,
- * returns it.
- */
-static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
- Indirect *partial)
-{
- struct ext4_block_alloc_info *block_i;
-
- block_i = EXT4_I(inode)->i_block_alloc_info;
-
- /*
- * try the heuristic for sequential allocation,
- * failing that at least try to get decent locality.
- */
- if (block_i && (block == block_i->last_alloc_logical_block + 1)
- && (block_i->last_alloc_physical_block != 0)) {
- return block_i->last_alloc_physical_block + 1;
+ if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
+ ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+ "with only %d reserved metadata blocks "
+ "(releasing %d blocks with reserved %d data blocks)",
+ inode->i_ino, ei->i_allocated_meta_blocks,
+ ei->i_reserved_meta_blocks, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
}
- return ext4_find_near(inode, partial);
-}
-
-/**
- * ext4_blks_to_allocate: Look up the block map and count the number
- * of direct blocks need to be allocated for the given branch.
- *
- * @branch: chain of indirect blocks
- * @k: number of blocks need for indirect blocks
- * @blks: number of data blocks to be mapped.
- * @blocks_to_boundary: the offset in the indirect block
- *
- * return the total number of blocks to be allocate, including the
- * direct and indirect blocks.
- */
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
- int blocks_to_boundary)
-{
- unsigned long count = 0;
+ /* Update per-inode reservations */
+ ei->i_reserved_data_blocks -= used;
+ ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ used + ei->i_allocated_meta_blocks);
+ ei->i_allocated_meta_blocks = 0;
- /*
- * Simple case, [t,d]Indirect block(s) has not allocated yet
- * then it's clear blocks on that path have not allocated
- */
- if (k > 0) {
- /* right now we don't handle cross boundary allocation */
- if (blks < blocks_to_boundary + 1)
- count += blks;
- else
- count += blocks_to_boundary + 1;
- return count;
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ ei->i_reserved_meta_blocks);
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
}
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- count++;
- while (count < blks && count <= blocks_to_boundary &&
- le32_to_cpu(*(branch[0].p + count)) == 0) {
- count++;
+ /* Update quota subsystem for data blocks */
+ if (quota_claim)
+ dquot_claim_block(inode, EXT4_C2B(sbi, used));
+ else {
+ /*
+ * We did fallocate with an offset that is already delayed
+ * allocated. So on delayed allocated writeback we should
+ * not re-claim the quota for fallocated blocks.
+ */
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
}
- return count;
-}
-
-/**
- * ext4_alloc_blocks: multiple allocate blocks needed for a branch
- * @indirect_blks: the number of blocks need to allocate for indirect
- * blocks
- *
- * @new_blocks: on return it will store the new block numbers for
- * the indirect blocks(if needed) and the first direct block,
- * @blks: on return it will store the total number of allocated
- * direct blocks
- */
-static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
-{
- int target, i;
- unsigned long count = 0;
- int index = 0;
- ext4_fsblk_t current_block = 0;
- int ret = 0;
/*
- * Here we try to allocate the requested multiple blocks at once,
- * on a best-effort basis.
- * To build a branch, we should allocate blocks for
- * the indirect blocks(if not allocated yet), and at least
- * the first direct block of this branch. That's the
- * minimum number of blocks need to allocate(required)
+ * If we have done all the pending block allocations and if
+ * there aren't any writers on the inode, we can discard the
+ * inode's preallocations.
*/
- target = blks + indirect_blks;
-
- while (1) {
- count = target;
- /* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
- if (*err)
- goto failed_out;
-
- target -= count;
- /* allocate blocks for indirect blocks */
- while (index < indirect_blks && count) {
- new_blocks[index++] = current_block++;
- count--;
- }
-
- if (count > 0)
- break;
- }
-
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
- /* total number of blocks allocated for direct blocks */
- ret = count;
- *err = 0;
- return ret;
-failed_out:
- for (i = 0; i <index; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
- return ret;
+ if ((ei->i_reserved_data_blocks == 0) &&
+ (atomic_read(&inode->i_writecount) == 0))
+ ext4_discard_preallocations(inode);
}
-/**
- * ext4_alloc_branch - allocate and set up a chain of blocks.
- * @inode: owner
- * @indirect_blks: number of allocated indirect blocks
- * @blks: number of allocated direct blocks
- * @offsets: offsets (in the blocks) to store the pointers to next.
- * @branch: place to store the chain in.
- *
- * This function allocates blocks, zeroes out all but the last one,
- * links them into chain and (if we are synchronous) writes them to disk.
- * In other words, it prepares a branch that can be spliced onto the
- * inode. It stores the information about that chain in the branch[], in
- * the same format as ext4_get_branch() would do. We are calling it after
- * we had read the existing part of chain and partial points to the last
- * triple of that (one with zero ->key). Upon the exit we have the same
- * picture as after the successful ext4_get_block(), except that in one
- * place chain is disconnected - *branch->p is still zero (we did not
- * set the last link), but branch->key contains the number that should
- * be placed into *branch->p to fill that gap.
- *
- * If allocation fails we free all blocks we've allocated (and forget
- * their buffer_heads) and return the error value the from failed
- * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
- * as described above and return 0.
- */
-static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+static int __check_block_validity(struct inode *inode, const char *func,
+ unsigned int line,
+ struct ext4_map_blocks *map)
{
- int blocksize = inode->i_sb->s_blocksize;
- int i, n = 0;
- int err = 0;
- struct buffer_head *bh;
- int num;
- ext4_fsblk_t new_blocks[4];
- ext4_fsblk_t current_block;
-
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
- *blks, new_blocks, &err);
- if (err)
- return err;
-
- branch[0].key = cpu_to_le32(new_blocks[0]);
- /*
- * metadata blocks and data blocks are allocated.
- */
- for (n = 1; n <= indirect_blks; n++) {
- /*
- * Get buffer_head for parent block, zero it out
- * and set the pointer to new one, then send
- * parent to disk.
- */
- bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
- branch[n].bh = bh;
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
- if (err) {
- unlock_buffer(bh);
- brelse(bh);
- goto failed;
- }
-
- memset(bh->b_data, 0, blocksize);
- branch[n].p = (__le32 *) bh->b_data + offsets[n];
- branch[n].key = cpu_to_le32(new_blocks[n]);
- *branch[n].p = branch[n].key;
- if ( n == indirect_blks) {
- current_block = new_blocks[n];
- /*
- * End of chain, update the last new metablock of
- * the chain to point to the new allocated
- * data blocks numbers
- */
- for (i=1; i < num; i++)
- *(branch[n].p + i) = cpu_to_le32(++current_block);
- }
- BUFFER_TRACE(bh, "marking uptodate");
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
-
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh);
- if (err)
- goto failed;
- }
- *blks = num;
- return err;
-failed:
- /* Allocation failed, free what we already allocated */
- for (i = 1; i <= n ; i++) {
- BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, branch[i].bh);
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
+ map->m_len)) {
+ ext4_error_inode(inode, func, line, map->m_pblk,
+ "lblock %lu mapped to illegal pblock "
+ "(length %d)", (unsigned long) map->m_lblk,
+ map->m_len);
+ return -EIO;
}
- for (i = 0; i <indirect_blks; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
-
- ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
-
- return err;
+ return 0;
}
-/**
- * ext4_splice_branch - splice the allocated branch onto inode.
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- * ext4_alloc_branch)
- * @where: location of missing link
- * @num: number of indirect blocks we are adding
- * @blks: number of direct blocks we are adding
- *
- * This function fills the missing link and does all housekeeping needed in
- * inode (->i_blocks, etc.). In case of success we end up with the full
- * chain to new block and return 0.
- */
-static int ext4_splice_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, Indirect *where, int num, int blks)
-{
- int i;
- int err = 0;
- struct ext4_block_alloc_info *block_i;
- ext4_fsblk_t current_block;
+#define check_block_validity(inode, map) \
+ __check_block_validity((inode), __func__, __LINE__, (map))
- block_i = EXT4_I(inode)->i_block_alloc_info;
- /*
- * If we're splicing into a [td]indirect block (as opposed to the
- * inode) then we need to get write access to the [td]indirect block
- * before the splice.
- */
- if (where->bh) {
- BUFFER_TRACE(where->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, where->bh);
- if (err)
- goto err_out;
- }
- /* That's it */
-
- *where->p = where->key;
-
- /*
- * Update the host buffer_head or inode to point to more just allocated
- * direct blocks blocks
- */
- if (num == 0 && blks > 1) {
- current_block = le32_to_cpu(where->key) + 1;
- for (i = 1; i < blks; i++)
- *(where->p + i ) = cpu_to_le32(current_block++);
- }
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_map_blocks_es_recheck(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *es_map,
+ struct ext4_map_blocks *map,
+ int flags)
+{
+ int retval;
+ map->m_flags = 0;
/*
- * update the most recently allocated logical & physical block
- * in i_block_alloc_info, to assist find the proper goal block for next
- * allocation
+ * There is a race window that the result is not the same.
+ * e.g. xfstests #223 when dioread_nolock enables. The reason
+ * is that we lookup a block mapping in extent status tree with
+ * out taking i_data_sem. So at the time the unwritten extent
+ * could be converted.
*/
- if (block_i) {
- block_i->last_alloc_logical_block = block + blks - 1;
- block_i->last_alloc_physical_block =
- le32_to_cpu(where[num].key) + blks - 1;
- }
-
- /* We are done with atomic stuff, now do the rest of housekeeping */
-
- inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
-
- /* had we spliced it onto indirect block? */
- if (where->bh) {
- /*
- * If we spliced it onto an indirect block, we haven't
- * altered the inode. Note however that if it is being spliced
- * onto an indirect block at the very end of the file (the
- * file is growing) then we *will* alter the inode to reflect
- * the new i_size. But that is not done here - it is done in
- * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
- */
- jbd_debug(5, "splicing indirect only\n");
- BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, where->bh);
- if (err)
- goto err_out;
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
} else {
- /*
- * OK, we spliced it into the inode itself on a direct block.
- * Inode was dirtied above.
- */
- jbd_debug(5, "splicing direct\n");
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
}
- return err;
-
-err_out:
- for (i = 1; i <= num; i++) {
- BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, where[i].bh);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(where[i-1].key), 1, 0);
- }
- ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
-
- return err;
-}
-
-/*
- * Allocation strategy is simple: if we have to allocate something, we will
- * have to go the whole way to leaf. So let's do it before attaching anything
- * to tree, set linkage between the newborn blocks, write them if sync is
- * required, recheck the path, free and repeat if check fails, otherwise
- * set the last missing link (that will protect us from any truncate-generated
- * removals - all blocks on the path are immune now) and possibly force the
- * write on the parent block.
- * That has a nice additional property: no special recovery from the failed
- * allocations is needed - we simply release blocks and do not touch anything
- * reachable from inode.
- *
- * `handle' can be NULL if create == 0.
- *
- * return > 0, # of blocks mapped or allocated.
- * return = 0, if plain lookup failed.
- * return < 0, error case.
- *
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
- */
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned long maxblocks,
- struct buffer_head *bh_result,
- int create, int extend_disksize)
-{
- int err = -EIO;
- ext4_lblk_t offsets[4];
- Indirect chain[4];
- Indirect *partial;
- ext4_fsblk_t goal;
- int indirect_blks;
- int blocks_to_boundary = 0;
- int depth;
- struct ext4_inode_info *ei = EXT4_I(inode);
- int count = 0;
- ext4_fsblk_t first_block = 0;
-
-
- J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
- J_ASSERT(handle != NULL || create == 0);
- depth = ext4_block_to_path(inode, iblock, offsets,
- &blocks_to_boundary);
-
- if (depth == 0)
- goto out;
-
- partial = ext4_get_branch(inode, depth, offsets, chain, &err);
-
- /* Simplest case - block found, no allocation needed */
- if (!partial) {
- first_block = le32_to_cpu(chain[depth - 1].key);
- clear_buffer_new(bh_result);
- count++;
- /*map more blocks*/
- while (count < maxblocks && count <= blocks_to_boundary) {
- ext4_fsblk_t blk;
-
- blk = le32_to_cpu(*(chain[depth-1].p + count));
-
- if (blk == first_block + count)
- count++;
- else
- break;
- }
- goto got_it;
- }
-
- /* Next simple case - plain lookup or failed read of indirect block */
- if (!create || err == -EIO)
- goto cleanup;
-
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
/*
- * Okay, we need to do block allocation. Lazily initialize the block
- * allocation info here if necessary
- */
- if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
- ext4_init_block_alloc_info(inode);
-
- goal = ext4_find_goal(inode, iblock, partial);
-
- /* the number of blocks need to allocate for [d,t]indirect blocks */
- indirect_blks = (chain + depth) - partial - 1;
-
- /*
- * Next look up the indirect map to count the totoal number of
- * direct blocks to allocate for this branch.
+ * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
+ * because it shouldn't be marked in es_map->m_flags.
*/
- count = ext4_blks_to_allocate(partial, indirect_blks,
- maxblocks, blocks_to_boundary);
- /*
- * Block out ext4_truncate while we alter the tree
- */
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
+ map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
/*
- * The ext4_splice_branch call will free and forget any buffers
- * on the new chain if there is a failure, but that risks using
- * up transaction credits, especially for bitmaps where the
- * credits cannot be returned. Can we handle this somehow? We
- * may need to return -EAGAIN upwards in the worst case. --sct
+ * We don't check m_len because extent will be collpased in status
+ * tree. So the m_len might not equal.
*/
- if (!err)
- err = ext4_splice_branch(handle, inode, iblock,
- partial, indirect_blks, count);
- /*
- * i_disksize growing is protected by i_data_sem. Don't forget to
- * protect it if you're about to implement concurrent
- * ext4_get_block() -bzzz
- */
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
- if (err)
- goto cleanup;
-
- set_buffer_new(bh_result);
-got_it:
- map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
- if (count > blocks_to_boundary)
- set_buffer_boundary(bh_result);
- err = count;
- /* Clean up and exit */
- partial = chain + depth - 1; /* the whole chain */
-cleanup:
- while (partial > chain) {
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- partial--;
- }
- BUFFER_TRACE(bh_result, "returned");
-out:
- return err;
+ if (es_map->m_lblk != map->m_lblk ||
+ es_map->m_flags != map->m_flags ||
+ es_map->m_pblk != map->m_pblk) {
+ printk("ES cache assertion failed for inode: %lu "
+ "es_cached ex [%d/%d/%llu/%x] != "
+ "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
+ inode->i_ino, es_map->m_lblk, es_map->m_len,
+ es_map->m_pblk, es_map->m_flags, map->m_lblk,
+ map->m_len, map->m_pblk, map->m_flags,
+ retval, flags);
+ }
}
+#endif /* ES_AGGRESSIVE_TEST */
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
/*
- * Number of credits we need for writing DIO_MAX_BLOCKS:
- * We need sb + group descriptor + bitmap + inode -> 4
- * For B blocks with A block pointers per block we need:
- * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
- * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
- */
-#define DIO_CREDITS 25
-
-
-/*
- *
+ * The ext4_map_blocks() function tries to look up the requested blocks,
+ * and returns if the blocks are already mapped.
*
- * ext4_ext4 get_block() wrapper function
- * It will do a look up first, and returns if the blocks already mapped.
* Otherwise it takes the write lock of the i_data_sem and allocate blocks
* and store the allocated blocks in the result buffer head and mark it
* mapped.
*
- * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping
+ * If file type is extents based, it will call ext4_ext_map_blocks(),
+ * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocate.
- * if create==0 and the blocks are pre-allocated and uninitialized block,
+ * On success, it returns the number of blocks being mapped or allocated.
+ * if create==0 and the blocks are pre-allocated and unwritten block,
* the result buffer head is unmapped. If the create ==1, it will make sure
* the buffer head is mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
- * that casem, buffer head is unmapped
+ * that case, buffer head is unmapped
*
* It returns the error in case of allocation failure.
*/
-int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
- unsigned long max_blocks, struct buffer_head *bh,
- int create, int extend_disksize)
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
{
+ struct extent_status es;
int retval;
+ int ret = 0;
+#ifdef ES_AGGRESSIVE_TEST
+ struct ext4_map_blocks orig_map;
+
+ memcpy(&orig_map, map, sizeof(*map));
+#endif
- clear_buffer_mapped(bh);
+ map->m_flags = 0;
+ ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, map->m_len,
+ (unsigned long) map->m_lblk);
/*
- * Try to see if we can get the block without requesting
- * for new file system block.
+ * ext4_map_blocks returns an int, and m_len is an unsigned int
*/
- down_read((&EXT4_I(inode)->i_data_sem));
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, 0, 0);
+ if (unlikely(map->m_len > INT_MAX))
+ map->m_len = INT_MAX;
+
+ /* We can handle the block number less than EXT_MAX_BLOCKS */
+ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+ return -EIO;
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lru_add(inode);
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) +
+ map->m_lblk - es.es_lblk;
+ map->m_flags |= ext4_es_is_written(&es) ?
+ EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+ retval = 0;
+ } else {
+ BUG_ON(1);
+ }
+#ifdef ES_AGGRESSIVE_TEST
+ ext4_map_blocks_es_recheck(handle, inode, map,
+ &orig_map, flags);
+#endif
+ goto found;
+ }
+
+ /*
+ * Try to see if we can get the block without requesting a new
+ * file system block.
+ */
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
} else {
- retval = ext4_get_blocks_handle(handle,
- inode, block, max_blocks, bh, 0, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
+ }
+ if (retval > 0) {
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len, map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+found:
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ ret = check_block_validity(inode, map);
+ if (ret != 0)
+ return ret;
}
- up_read((&EXT4_I(inode)->i_data_sem));
/* If it is only a block(s) look up */
- if (!create)
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
return retval;
/*
* Returns if the blocks have already allocated
*
* Note that if blocks have been preallocated
- * ext4_ext_get_block() returns th create = 0
+ * ext4_ext_get_block() returns the create = 0
* with buffer head unmapped.
*/
- if (retval > 0 && buffer_mapped(bh))
- return retval;
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+ /*
+ * If we need to convert extent to unwritten
+ * we continue and do the actual work in
+ * ext4_ext_map_blocks()
+ */
+ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+ return retval;
/*
- * New blocks allocate and/or writing to uninitialized extent
+ * Here we clear m_flags because after allocating an new extent,
+ * it will be set again.
+ */
+ map->m_flags &= ~EXT4_MAP_FLAGS;
+
+ /*
+ * New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_blocks()
* with create == 1 flag.
*/
- down_write((&EXT4_I(inode)->i_data_sem));
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+ /*
+ * if the caller is from delayed allocation writeout path
+ * we have already reserved fs blocks for allocation
+ * let the underlying get_block() function know to
+ * avoid double accounting
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
*/
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, create, extend_disksize);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
} else {
- retval = ext4_get_blocks_handle(handle, inode, block,
- max_blocks, bh, create, extend_disksize);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
+
+ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
+ /*
+ * We allocated new blocks which will result in
+ * i_data's format changing. Force the migrate
+ * to fail by clearing migrate flags
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ }
+
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now. We don't
+ * support fallocate for non extent files. So we can update
+ * reserve space here.
+ */
+ if ((retval > 0) &&
+ (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+ ext4_da_update_reserve_space(inode, retval, 1);
}
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
+
+ if (retval > 0) {
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ /*
+ * If the extent has been zeroed out, we don't need to update
+ * extent status tree.
+ */
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
+ ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ if (ext4_es_is_written(&es))
+ goto has_zeroout;
+ }
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
+
+has_zeroout:
up_write((&EXT4_I(inode)->i_data_sem));
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ ret = check_block_validity(inode, map);
+ if (ret != 0)
+ return ret;
+ }
return retval;
}
-static int ext4_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int flags)
{
handle_t *handle = ext4_journal_current_handle();
+ struct ext4_map_blocks map;
int ret = 0, started = 0;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+
+ if (ext4_has_inline_data(inode))
+ return -ERANGE;
+
+ map.m_lblk = iblock;
+ map.m_len = bh->b_size >> inode->i_blkbits;
- if (create && !handle) {
+ if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
/* Direct IO write... */
- if (max_blocks > DIO_MAX_BLOCKS)
- max_blocks = DIO_MAX_BLOCKS;
- handle = ext4_journal_start(inode, DIO_CREDITS +
- 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+ if (map.m_len > DIO_MAX_BLOCKS)
+ map.m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out;
+ return ret;
}
started = 1;
}
- ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
+ ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+ set_buffer_defer_completion(bh);
+ bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
if (started)
ext4_journal_stop(handle);
-out:
return ret;
}
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ return _ext4_get_block(inode, iblock, bh,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+}
+
/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int create, int *errp)
{
- struct buffer_head dummy;
+ struct ext4_map_blocks map;
+ struct buffer_head *bh;
int fatal = 0, err;
J_ASSERT(handle != NULL || create == 0);
- dummy.b_state = 0;
- dummy.b_blocknr = -1000;
- buffer_trace_init(&dummy.b_history);
- err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1);
- /*
- * ext4_get_blocks_handle() returns number of blocks
- * mapped. 0 in case of a HOLE.
- */
- if (err > 0) {
- if (err > 1)
- WARN_ON(1);
- err = 0;
+ map.m_lblk = block;
+ map.m_len = 1;
+ err = ext4_map_blocks(handle, inode, &map,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+
+ /* ensure we send some value back into *errp */
+ *errp = 0;
+
+ if (create && err == 0)
+ err = -ENOSPC; /* should never happen */
+ if (err < 0)
+ *errp = err;
+ if (err <= 0)
+ return NULL;
+
+ bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
+ return NULL;
}
- *errp = err;
- if (!err && buffer_mapped(&dummy)) {
- struct buffer_head *bh;
- bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
- goto err;
- }
- if (buffer_new(&dummy)) {
- J_ASSERT(create != 0);
- J_ASSERT(handle != NULL);
+ if (map.m_flags & EXT4_MAP_NEW) {
+ J_ASSERT(create != 0);
+ J_ASSERT(handle != NULL);
- /*
- * Now that we do not always journal data, we should
- * keep in mind whether this should always journal the
- * new buffer as metadata. For now, regular file
- * writes use ext4_get_block instead, so it's not a
- * problem.
- */
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- fatal = ext4_journal_get_create_access(handle, bh);
- if (!fatal && !buffer_uptodate(bh)) {
- memset(bh->b_data,0,inode->i_sb->s_blocksize);
- set_buffer_uptodate(bh);
- }
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh);
- if (!fatal)
- fatal = err;
- } else {
- BUFFER_TRACE(bh, "not a new buffer");
- }
- if (fatal) {
- *errp = fatal;
- brelse(bh);
- bh = NULL;
+ /*
+ * Now that we do not always journal data, we should
+ * keep in mind whether this should always journal the
+ * new buffer as metadata. For now, regular file
+ * writes use ext4_get_block instead, so it's not a
+ * problem.
+ */
+ lock_buffer(bh);
+ BUFFER_TRACE(bh, "call get_create_access");
+ fatal = ext4_journal_get_create_access(handle, bh);
+ if (!fatal && !buffer_uptodate(bh)) {
+ memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+ set_buffer_uptodate(bh);
}
- return bh;
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!fatal)
+ fatal = err;
+ } else {
+ BUFFER_TRACE(bh, "not a new buffer");
}
-err:
- return NULL;
+ if (fatal) {
+ *errp = fatal;
+ brelse(bh);
+ bh = NULL;
+ }
+ return bh;
}
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int create, int *err)
{
- struct buffer_head * bh;
+ struct buffer_head *bh;
bh = ext4_getblk(handle, inode, block, create, err);
if (!bh)
return bh;
if (buffer_uptodate(bh))
return bh;
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1111,13 +852,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL;
}
-static int walk_page_buffers( handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)( handle_t *handle,
- struct buffer_head *bh))
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -1125,10 +866,9 @@ static int walk_page_buffers( handle_t *handle,
int err, ret = 0;
struct buffer_head *next;
- for ( bh = head, block_start = 0;
- ret == 0 && (bh != head || !block_start);
- block_start = block_end, bh = next)
- {
+ for (bh = head, block_start = 0;
+ ret == 0 && (bh != head || !block_start);
+ block_start = block_end, bh = next) {
next = bh->b_this_page;
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
@@ -1150,11 +890,10 @@ static int walk_page_buffers( handle_t *handle,
* and the commit_write(). So doing the jbd2_journal_start at the start of
* prepare_write() is the right place.
*
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page. So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage(). In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page. So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
*
* By accident, ext4 can be reentered when a transaction is open via
* quota file writes. If we were to commit the transaction while thus
@@ -1168,103 +907,153 @@ static int walk_page_buffers( handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile
* write.
*/
-static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
{
+ int dirty = buffer_dirty(bh);
+ int ret;
+
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
- return ext4_journal_get_write_access(handle, bh);
+ /*
+ * __block_write_begin() could have dirtied some buffers. Clean
+ * the dirty bit as jbd2_journal_get_write_access() could complain
+ * otherwise about fs integrity issues. Setting of the dirty bit
+ * by __block_write_begin() isn't a real problem here as we clear
+ * the bit before releasing a page lock and thus writeback cannot
+ * ever write the buffer.
+ */
+ if (dirty)
+ clear_buffer_dirty(bh);
+ BUFFER_TRACE(bh, "get write access");
+ ret = ext4_journal_get_write_access(handle, bh);
+ if (!ret && dirty)
+ ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+ return ret;
}
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
- struct inode *inode = mapping->host;
- int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+ struct inode *inode = mapping->host;
+ int ret, needed_blocks;
handle_t *handle;
int retries = 0;
- struct page *page;
- pgoff_t index;
- unsigned from, to;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
- index = pos >> PAGE_CACHE_SHIFT;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
+ trace_ext4_write_begin(inode, pos, len, flags);
+ /*
+ * Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason
+ */
+ needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
-retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
+ flags, pagep);
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
+ }
- handle = ext4_journal_start(inode, needed_blocks);
- if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
- ret = PTR_ERR(handle);
- goto out;
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ page_cache_release(page);
+ return PTR_ERR(handle);
}
- ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext4_get_block);
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
+ ext4_journal_stop(handle);
+ goto retry_grab;
+ }
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
+
+ if (ext4_should_dioread_nolock(inode))
+ ret = __block_write_begin(page, pos, len, ext4_get_block_write);
+ else
+ ret = __block_write_begin(page, pos, len, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
}
if (ret) {
+ unlock_page(page);
+ /*
+ * __block_write_begin may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ *
+ * Add inode to orphan list in case we crash before
+ * truncate finishes
+ */
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ ext4_orphan_add(handle, inode);
+
ext4_journal_stop(handle);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might
+ * still be on the orphan list; we need to
+ * make sure the inode is removed from the
+ * orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ page_cache_release(page);
+ return ret;
+ }
+ *pagep = page;
return ret;
}
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = jbd2_journal_dirty_data(handle, bh);
- if (err)
- ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
- bh, handle, err);
- return err;
-}
-
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
+ int ret;
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
set_buffer_uptodate(bh);
- return ext4_journal_dirty_metadata(handle, bh);
-}
-
-/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
+ ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+ clear_buffer_meta(bh);
+ clear_buffer_prio(bh);
+ return ret;
}
/*
@@ -1274,168 +1063,660 @@ static int ext4_generic_write_end(struct file *file,
* ext4 never places buffers on inode->i_mapping->private_list. metadata
* buffers are managed internally.
*/
-static int ext4_ordered_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+static int ext4_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
- unsigned from, to;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
+ int i_size_changed = 0;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
+ trace_ext4_write_end(inode, pos, len, copied);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto errout;
+ }
+ }
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext4_journal_dirty_data);
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ if (ret < 0)
+ goto errout;
+ copied = ret;
+ } else
+ copied = block_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
- if (ret == 0) {
- /*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
- */
- loff_t new_i_size;
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hole i_mutex.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos + copied > inode->i_size) {
+ i_size_write(inode, pos + copied);
+ i_size_changed = 1;
+ }
- new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = new_i_size;
- copied = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- if (copied < 0)
- ret = copied;
+ if (pos + copied > EXT4_I(inode)->i_disksize) {
+ /* We need to mark inode dirty even if
+ * new_i_size is less that inode->i_size
+ * but greater than i_disksize. (hint delalloc)
+ */
+ ext4_update_i_disksize(inode, (pos + copied));
+ i_size_changed = 1;
}
- ret2 = ext4_journal_stop(handle);
- if (!ret)
- ret = ret2;
unlock_page(page);
page_cache_release(page);
- return ret ? ret : copied;
-}
-
-static int ext4_writeback_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
- int ret = 0, ret2;
- loff_t new_i_size;
-
- new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = new_i_size;
-
- copied = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- if (copied < 0)
- ret = copied;
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ ext4_mark_inode_dirty(handle, inode);
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ /* if we have allocated more blocks and copied
+ * less. We will have blocks allocated outside
+ * inode->i_size. So truncate them
+ */
+ ext4_orphan_add(handle, inode);
+errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
+
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
return ret ? ret : copied;
}
static int ext4_journalled_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
+ loff_t new_i_size;
+ trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- if (copied < len) {
- if (!PageUptodate(page))
- copied = 0;
- page_zero_new_buffers(page, from+copied, to);
- }
+ BUG_ON(!ext4_handle_valid(handle));
- ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, write_end_fn);
- if (!partial)
- SetPageUptodate(page);
- if (pos+copied > inode->i_size)
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else {
+ if (copied < len) {
+ if (!PageUptodate(page))
+ copied = 0;
+ page_zero_new_buffers(page, from+copied, to);
+ }
+
+ ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
+ to, &partial, write_end_fn);
+ if (!partial)
+ SetPageUptodate(page);
+ }
+ new_i_size = pos + copied;
+ if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- if (inode->i_size > EXT4_I(inode)->i_disksize) {
- EXT4_I(inode)->i_disksize = inode->i_size;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, new_i_size);
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
}
+ unlock_page(page);
+ page_cache_release(page);
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ /* if we have allocated more blocks and copied
+ * less. We will have blocks allocated outside
+ * inode->i_size. So truncate them
+ */
+ ext4_orphan_add(handle, inode);
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
return ret ? ret : copied;
}
/*
- * bmap() is special. It gets used by applications such as lilo and by
- * the swapper to find the on-disk block of a specific piece of data.
- *
- * Naturally, this is dangerous if the block concerned is still in the
- * journal. If somebody makes a swapfile on an ext4 data-journaling
- * filesystem and enables swap, then they may get a nasty shock when the
- * data getting swapped to that swapfile suddenly gets overwritten by
- * the original zero's written out previously to the journal and
- * awaiting writeback in the kernel's buffer cache.
- *
- * So, if we see any bmap calls here on a modified, data-journaled file,
- * take extra steps to flush any blocks which might be in the cache.
+ * Reserve a metadata for a single block located at lblock
*/
-static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
{
- struct inode *inode = mapping->host;
- journal_t *journal;
- int err;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int md_needed;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
- if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&ei->i_block_reservation_lock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
+ trace_ext4_da_reserve_space(inode, md_needed);
+
+ /*
+ * We do still charge estimated metadata to the sb though;
+ * we cannot afford to run out of free blocks.
+ */
+ if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+ ei->i_reserved_meta_blocks += md_needed;
+ spin_unlock(&ei->i_block_reservation_lock);
+
+ return 0; /* success */
+}
+
+/*
+ * Reserve a single cluster located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int md_needed;
+ int ret;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
+
+ /*
+ * We will charge metadata quota at writeout time; this saves
+ * us from metadata over-estimation, though we may go over by
+ * a small amount in the end. Here we just reserve for data.
+ */
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ if (ret)
+ return ret;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&ei->i_block_reservation_lock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
+ trace_ext4_da_reserve_space(inode, md_needed);
+
+ /*
+ * We do still charge estimated metadata to the sb though;
+ * we cannot afford to run out of free blocks.
+ */
+ if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ return -ENOSPC;
+ }
+ ei->i_reserved_data_blocks++;
+ ei->i_reserved_meta_blocks += md_needed;
+ spin_unlock(&ei->i_block_reservation_lock);
+
+ return 0; /* success */
+}
+
+static void ext4_da_release_space(struct inode *inode, int to_free)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!to_free)
+ return; /* Nothing to release, exit */
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ trace_ext4_da_release_space(inode, to_free);
+ if (unlikely(to_free > ei->i_reserved_data_blocks)) {
/*
- * This is a REALLY heavyweight approach, but the use of
- * bmap on dirty files is expected to be extremely rare:
- * only if we run lilo or swapon on a freshly made file
- * do we expect this to happen.
- *
- * (bmap requires CAP_SYS_RAWIO so this does not
- * represent an unprivileged user DOS attack --- we'd be
- * in trouble if mortal users could trigger this path at
- * will.)
- *
- * NB. EXT4_STATE_JDATA is not set on files other than
- * regular files. If somebody wants to bmap a directory
- * or symlink and gets confused because the buffer
- * hasn't yet been flushed to disk, they deserve
- * everything they get.
+ * if there aren't enough reserved blocks, then the
+ * counter is messed up somewhere. Since this
+ * function is called from invalidate page, it's
+ * harmless to return without any action.
*/
+ ext4_warning(inode->i_sb, "ext4_da_release_space: "
+ "ino %lu, to_free %d with only %d reserved "
+ "data blocks", inode->i_ino, to_free,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ to_free = ei->i_reserved_data_blocks;
+ }
+ ei->i_reserved_data_blocks -= to_free;
- EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
- journal = EXT4_JOURNAL(inode);
- jbd2_journal_lock_updates(journal);
- err = jbd2_journal_flush(journal);
- jbd2_journal_unlock_updates(journal);
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ * Note that in case of bigalloc, i_reserved_meta_blocks,
+ * i_reserved_data_blocks, etc. refer to number of clusters.
+ */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ ei->i_reserved_meta_blocks);
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
+ }
- if (err)
+ /* update fs dirty data blocks counter */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
+
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
+}
+
+static void ext4_da_page_release_reservation(struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ int to_release = 0;
+ struct buffer_head *head, *bh;
+ unsigned int curr_off = 0;
+ struct inode *inode = page->mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int stop = offset + length;
+ int num_clusters;
+ ext4_fsblk_t lblk;
+
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+
+ if (next_off > stop)
+ break;
+
+ if ((offset <= curr_off) && (buffer_delay(bh))) {
+ to_release++;
+ clear_buffer_delay(bh);
+ }
+ curr_off = next_off;
+ } while ((bh = bh->b_this_page) != head);
+
+ if (to_release) {
+ lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, lblk, to_release);
+ }
+
+ /* If we have released all the blocks belonging to a cluster, then we
+ * need to release the reserved space for that cluster. */
+ num_clusters = EXT4_NUM_B2C(sbi, to_release);
+ while (num_clusters > 0) {
+ lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+ ((num_clusters - 1) << sbi->s_cluster_bits);
+ if (sbi->s_cluster_ratio == 1 ||
+ !ext4_find_delalloc_cluster(inode, lblk))
+ ext4_da_release_space(inode, 1);
+
+ num_clusters--;
+ }
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+ struct inode *inode;
+ struct writeback_control *wbc;
+
+ pgoff_t first_page; /* The first page to write */
+ pgoff_t next_page; /* Current page to examine */
+ pgoff_t last_page; /* Last page to examine */
+ /*
+ * Extent to map - this can be after first_page because that can be
+ * fully mapped. We somewhat abuse m_flags to store whether the extent
+ * is delalloc or unwritten.
+ */
+ struct ext4_map_blocks map;
+ struct ext4_io_submit io_submit; /* IO submission data */
+};
+
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+ bool invalidate)
+{
+ int nr_pages, i;
+ pgoff_t index, end;
+ struct pagevec pvec;
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ /* This is necessary when next_page == 0. */
+ if (mpd->first_page >= mpd->next_page)
+ return;
+
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+ if (invalidate) {
+ ext4_lblk_t start, last;
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+ }
+
+ pagevec_init(&pvec, 0);
+ while (index <= end) {
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ if (page->index > end)
+ break;
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ if (invalidate) {
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ ClearPageUptodate(page);
+ }
+ unlock_page(page);
+ }
+ index = pvec.pages[nr_pages - 1]->index + 1;
+ pagevec_release(&pvec);
+ }
+}
+
+static void ext4_print_free_blocks(struct inode *inode)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
+ EXT4_C2B(EXT4_SB(inode->i_sb),
+ ext4_count_free_clusters(sb)));
+ ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
+ ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
+ (long long) EXT4_C2B(EXT4_SB(sb),
+ percpu_counter_sum(&sbi->s_freeclusters_counter)));
+ ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
+ (long long) EXT4_C2B(EXT4_SB(sb),
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+ ext4_msg(sb, KERN_CRIT, "Block reservation details");
+ ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
+ ei->i_reserved_data_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
+ ei->i_reserved_meta_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
+ ei->i_allocated_meta_blocks);
+ return;
+}
+
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
+{
+ return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
+}
+
+/*
+ * This function is grabs code from the very beginning of
+ * ext4_map_blocks, but assumes that the caller is from delayed write
+ * time. This function looks up the requested blocks and sets the
+ * buffer delay bit under the protection of i_data_sem.
+ */
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+ struct ext4_map_blocks *map,
+ struct buffer_head *bh)
+{
+ struct extent_status es;
+ int retval;
+ sector_t invalid_block = ~((sector_t) 0xffff);
+#ifdef ES_AGGRESSIVE_TEST
+ struct ext4_map_blocks orig_map;
+
+ memcpy(&orig_map, map, sizeof(*map));
+#endif
+
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
+
+ map->m_flags = 0;
+ ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, map->m_len,
+ (unsigned long) map->m_lblk);
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, iblock, &es)) {
+ ext4_es_lru_add(inode);
+ if (ext4_es_is_hole(&es)) {
+ retval = 0;
+ down_read(&EXT4_I(inode)->i_data_sem);
+ goto add_delayed;
+ }
+
+ /*
+ * Delayed extent could be allocated by fallocate.
+ * So we need to check it.
+ */
+ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
return 0;
+ }
+
+ map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+ retval = es.es_len - (iblock - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ if (ext4_es_is_written(&es))
+ map->m_flags |= EXT4_MAP_MAPPED;
+ else if (ext4_es_is_unwritten(&es))
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ else
+ BUG_ON(1);
+
+#ifdef ES_AGGRESSIVE_TEST
+ ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+ return retval;
}
- return generic_block_bmap(mapping,block,ext4_get_block);
+ /*
+ * Try to see if we can get the block without requesting a new
+ * file system block.
+ */
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_has_inline_data(inode)) {
+ /*
+ * We will soon create blocks for this page, and let
+ * us pretend as if the blocks aren't allocated yet.
+ * In case of clusters, we have to handle the work
+ * of mapping from cluster so that the reserved space
+ * is calculated properly.
+ */
+ if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ retval = 0;
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+ else
+ retval = ext4_ind_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+
+add_delayed:
+ if (retval == 0) {
+ int ret;
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ /*
+ * If the block was allocated from previously allocated cluster,
+ * then we don't need to reserve it again. However we still need
+ * to reserve metadata for every block we're going to write.
+ */
+ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+ ret = ext4_da_reserve_space(inode, iblock);
+ if (ret) {
+ /* not enough space to reserve */
+ retval = ret;
+ goto out_unlock;
+ }
+ } else {
+ ret = ext4_da_reserve_metadata(inode, iblock);
+ if (ret) {
+ /* not enough space to reserve */
+ retval = ret;
+ goto out_unlock;
+ }
+ }
+
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ ~0, EXTENT_STATUS_DELAYED);
+ if (ret) {
+ retval = ret;
+ goto out_unlock;
+ }
+
+ /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
+ * and it should not appear on the bh->b_state.
+ */
+ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ } else if (retval > 0) {
+ int ret;
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret != 0)
+ retval = ret;
+ }
+
+out_unlock:
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ return retval;
+}
+
+/*
+ * This is a special get_blocks_t callback which is used by
+ * ext4_da_write_begin(). It will either return mapped block or
+ * reserve space for a single block.
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
+ */
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ struct ext4_map_blocks map;
+ int ret = 0;
+
+ BUG_ON(create == 0);
+ BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+ map.m_lblk = iblock;
+ map.m_len = 1;
+
+ /*
+ * first, we need to know whether the block is allocated already
+ * preallocated blocks are unmapped but should treated
+ * the same as allocated blocks.
+ */
+ ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+ if (ret <= 0)
+ return ret;
+
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+
+ if (buffer_unwritten(bh)) {
+ /* A delayed write to unwritten bh should be marked
+ * new and mapped. Mapped ensures that we don't do
+ * get_block multiple times when we write to the same
+ * offset and new ensures that we do proper zero out
+ * for partial write.
+ */
+ set_buffer_new(bh);
+ set_buffer_mapped(bh);
+ }
+ return 0;
}
static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -1450,338 +1731,1527 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+static int __ext4_journalled_writepage(struct page *page,
+ unsigned int len)
{
- if (buffer_mapped(bh))
- return ext4_journal_dirty_data(handle, bh);
- return 0;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs = NULL;
+ handle_t *handle = NULL;
+ int ret = 0, err = 0;
+ int inline_data = ext4_has_inline_data(inode);
+ struct buffer_head *inode_bh = NULL;
+
+ ClearPageChecked(page);
+
+ if (inline_data) {
+ BUG_ON(page->index != 0);
+ BUG_ON(len > ext4_get_max_inline_size(inode));
+ inode_bh = ext4_journalled_write_inline_data(inode, len, page);
+ if (inode_bh == NULL)
+ goto out;
+ } else {
+ page_bufs = page_buffers(page);
+ if (!page_bufs) {
+ BUG();
+ goto out;
+ }
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bget_one);
+ }
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);
+
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ BUG_ON(!ext4_handle_valid(handle));
+
+ if (inline_data) {
+ BUFFER_TRACE(inode_bh, "get write access");
+ ret = ext4_journal_get_write_access(handle, inode_bh);
+
+ err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
+
+ } else {
+ ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ do_journal_get_write_access);
+
+ err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ write_end_fn);
+ }
+ if (ret == 0)
+ ret = err;
+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+
+ if (!ext4_has_inline_data(inode))
+ ext4_walk_page_buffers(NULL, page_bufs, 0, len,
+ NULL, bput_one);
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+out:
+ brelse(inode_bh);
+ return ret;
}
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
+ *
+ * This function can get called via...
+ * - ext4_writepages after taking page lock (have journal handle)
+ * - journal_submit_inode_data_buffers (no journal handle)
+ * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
+ * - grab_page_cache when doing write_begin (have journal handle)
*
- * In all journalling modes block_write_full_page() will start the I/O.
+ * We don't do any block allocation in this function. If we have page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmaped based write. So if we do with blocksize 1K
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * we have in the page first buffer_head mapped via page_mkwrite call back
+ * but other buffer_heads would be unmapped but dirty (dirty done via the
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
*
- * Problem:
+ * We redirty the page if we have any buffer_heads that is either delay or
+ * unwritten in the page.
+ *
+ * We can get recursively called as show below.
*
* ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
* ext4_writepage()
*
- * Similar for:
+ * But since we don't do any block allocation we should not deadlock.
+ * Page also have the dirty flag cleared so we don't get recurive page_lock.
+ */
+static int ext4_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned int len;
+ struct buffer_head *page_bufs = NULL;
+ struct inode *inode = page->mapping->host;
+ struct ext4_io_submit io_submit;
+ bool keep_towrite = false;
+
+ trace_ext4_writepage(page);
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ page_bufs = page_buffers(page);
+ /*
+ * We cannot do block allocation or other extent handling in this
+ * function. If there are buffers needing that, we have to redirty
+ * the page. But we may reach here when we do a journal commit via
+ * journal_submit_inode_data_buffers() and in that case we must write
+ * allocated buffers to achieve data=ordered mode guarantees.
+ */
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ redirty_page_for_writepage(wbc, page);
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * For memory cleaning there's no point in writing only
+ * some buffers. So just bail out. Warn if we came here
+ * from direct reclaim.
+ */
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+ == PF_MEMALLOC);
+ unlock_page(page);
+ return 0;
+ }
+ keep_towrite = true;
+ }
+
+ if (PageChecked(page) && ext4_should_journal_data(inode))
+ /*
+ * It's mmapped pagecache. Add buffers and journal it. There
+ * doesn't seem much point in redirtying the page here.
+ */
+ return __ext4_journalled_writepage(page, len);
+
+ ext4_io_submit_init(&io_submit, wbc);
+ io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_submit.io_end) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return -ENOMEM;
+ }
+ ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
+ ext4_io_submit(&io_submit);
+ /* Drop io_end reference we got from init */
+ ext4_put_io_end_defer(io_submit.io_end);
+ return ret;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
+/*
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks up to full group size.
+ */
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
+
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @bh - buffer head we want to add to the extent
*
- * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
+ * The function is used to collect contig. blocks in the same state. If the
+ * buffer doesn't require mapping for writeback and we haven't started the
+ * extent of buffers to map yet, the function returns 'true' immediately - the
+ * caller can write the buffer right away. Otherwise the function returns true
+ * if the block has been added to the extent, false if the block couldn't be
+ * added.
+ */
+static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ struct buffer_head *bh)
+{
+ struct ext4_map_blocks *map = &mpd->map;
+
+ /* Buffer that doesn't need mapping for writeback? */
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh))) {
+ /* So far no extent to map => we write the buffer right away */
+ if (map->m_len == 0)
+ return true;
+ return false;
+ }
+
+ /* First block in the extent? */
+ if (map->m_len == 0) {
+ map->m_lblk = lblk;
+ map->m_len = 1;
+ map->m_flags = bh->b_state & BH_FLAGS;
+ return true;
+ }
+
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return false;
+
+ /* Can we merge the block to our big extent? */
+ if (lblk == map->m_lblk + map->m_len &&
+ (bh->b_state & BH_FLAGS) == map->m_flags) {
+ map->m_len++;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * mpage_process_page_bufs - submit page buffers for IO or add them to extent
*
- * Same applies to ext4_get_block(). We will deadlock on various things like
- * lock_journal and i_data_sem
+ * @mpd - extent of blocks for mapping
+ * @head - the first buffer in the page
+ * @bh - buffer we should start processing from
+ * @lblk - logical number of the block in the file corresponding to @bh
*
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
+ * Walk through page buffers from @bh upto @head (exclusive) and either submit
+ * the page for IO if all buffers in this page were mapped and there's no
+ * accumulated extent of buffers to map or add buffers in the page to the
+ * extent of buffers to map. The function returns 1 if the caller can continue
+ * by processing the next page, 0 if it should stop adding buffers to the
+ * extent to map because we cannot extend it anymore. It can also return value
+ * < 0 in case of error during IO submission.
+ */
+static int mpage_process_page_bufs(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct inode *inode = mpd->inode;
+ int err;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+
+ do {
+ BUG_ON(buffer_locked(bh));
+
+ if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
+ /* Found extent to map? */
+ if (mpd->map.m_len)
+ return 0;
+ /* Everything mapped so far and we hit EOF */
+ break;
+ }
+ } while (lblk++, (bh = bh->b_this_page) != head);
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, head->b_page);
+ if (err < 0)
+ return err;
+ }
+ return lblk < blocks;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ * submit fully mapped pages for IO
*
- * 16May01: If we're reentered then journal_current_handle() will be
- * non-zero. We simply *return*.
+ * @mpd - description of extent to map, on return next extent to map
*
- * 1 July 2001: @@@ FIXME:
- * In journalled data mode, a data buffer may be metadata against the
- * current transaction. But the same file is part of a shared mapping
- * and someone does a writepage() on it.
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to unwritten extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+ struct pagevec pvec;
+ int nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *head, *bh;
+ int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ pgoff_t start, end;
+ ext4_lblk_t lblk;
+ sector_t pblock;
+ int err;
+
+ start = mpd->map.m_lblk >> bpp_bits;
+ end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+ lblk = start << bpp_bits;
+ pblock = mpd->map.m_pblk;
+
+ pagevec_init(&pvec, 0);
+ while (start <= end) {
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+ PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+ /* Up to 'end' pages must be contiguous */
+ BUG_ON(page->index != start);
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ /*
+ * FIXME: If dioread_nolock supports
+ * blocksize < pagesize, we need to make
+ * sure we add size mapped so far to
+ * io_end->size as the following call
+ * can submit the page for IO.
+ */
+ err = mpage_process_page_bufs(mpd, head,
+ bh, lblk);
+ pagevec_release(&pvec);
+ if (err > 0)
+ err = 0;
+ return err;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ } while (lblk++, (bh = bh->b_this_page) != head);
+
+ /*
+ * FIXME: This is going to break if dioread_nolock
+ * supports blocksize < pagesize as we will try to
+ * convert potentially unmapped parts of inode.
+ */
+ mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ /* Page fully mapped - let IO run! */
+ err = mpage_submit_page(mpd, page);
+ if (err < 0) {
+ pagevec_release(&pvec);
+ return err;
+ }
+ start++;
+ }
+ pagevec_release(&pvec);
+ }
+ /* Extent fully mapped and matches with page boundary. We are done. */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int get_blocks_flags;
+ int err, dioread_nolock;
+
+ trace_ext4_da_write_pages_extent(inode, map);
+ /*
+ * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+ * to convert an unwritten extent to be initialized (in the case
+ * where we have written into one or more preallocated blocks). It is
+ * possible that we're going to need more metadata blocks than
+ * previously reserved. However we must not fail because we're in
+ * writeback and there is nothing we can do about it so it might result
+ * in data loss. So use reserved blocks to allocate metadata if
+ * possible.
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+ * in question are delalloc blocks. This affects functions in many
+ * different parts of the allocation call path. This flag exists
+ * primarily because we don't want to change *many* call functions, so
+ * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+ * once the inode's allocation semaphore is taken.
+ */
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ dioread_nolock = ext4_should_dioread_nolock(inode);
+ if (dioread_nolock)
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ if (map->m_flags & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+ if (err < 0)
+ return err;
+ if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (!mpd->io_submit.io_end->handle &&
+ ext4_handle_valid(handle)) {
+ mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ }
+
+ BUG_ON(map->m_len == 0);
+ if (map->m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int i;
+
+ for (i = 0; i < map->m_len; i++)
+ unmap_underlying_metadata(bdev, map->m_pblk + i);
+ }
+ return 0;
+}
+
+/*
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ * mpd->len and submit pages underlying it for IO
*
- * We will move the buffer onto the async_data list, but *after* it has
- * been dirtied. So there's a small window where we have dirty data on
- * BJ_Metadata.
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ * @give_up_on_write - we set this to true iff there is a fatal error and there
+ * is no hope of writing the data. The caller should discard
+ * dirty pages to avoid infinite loops.
*
- * Note that this only applies to the last partial page in the file. The
- * bit which block_write_full_page() uses prepare/commit for. (That's
- * broken code anyway: it's wrong for msync()).
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
+ */
+static int mpage_map_and_submit_extent(handle_t *handle,
+ struct mpage_da_data *mpd,
+ bool *give_up_on_write)
+{
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int err;
+ loff_t disksize;
+
+ mpd->io_submit.io_end->offset =
+ ((loff_t)map->m_lblk) << inode->i_blkbits;
+ do {
+ err = mpage_map_one_extent(handle, mpd);
+ if (err < 0) {
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ goto invalidate_dirty_pages;
+ /*
+ * Let the uper layers retry transient errors.
+ * In the case of ENOSPC, if ext4_count_free_blocks()
+ * is non-zero, a commit should free up blocks.
+ */
+ if ((err == -ENOMEM) ||
+ (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ return err;
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with"
+ " max blocks %u with error %d",
+ inode->i_ino,
+ (unsigned long long)map->m_lblk,
+ (unsigned)map->m_len, -err);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (err == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ invalidate_dirty_pages:
+ *give_up_on_write = true;
+ return err;
+ }
+ /*
+ * Update buffer state, submit mapped pages, and get us new
+ * extent to map
+ */
+ err = mpage_map_and_submit_buffers(mpd);
+ if (err < 0)
+ return err;
+ } while (map->m_len);
+
+ /*
+ * Update on-disk size after IO is submitted. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
+ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ int err2;
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (disksize > i_size)
+ disksize = i_size;
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (err2)
+ ext4_error(inode->i_sb,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
+ if (!err)
+ err = err2;
+ }
+ return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+ int bpp = ext4_journal_blocks_per_page(inode);
+
+ return ext4_meta_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * and underlying extent to map
*
- * It's a rare case: affects the final partial page, for journalled data
- * where the file is subject to bith write() and writepage() in the same
- * transction. To fix it we'll need a custom block_write_full_page().
- * We'll probably need that anyway for journalling writepage() output.
+ * @mpd - where to look for pages
*
- * We don't honour synchronous mounts for writepage(). That would be
- * disastrous. Any write() or metadata operation will sync the fs for
- * us.
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->lblk with length mpd->len blocks).
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
*/
-static int ext4_ordered_writepage(struct page *page,
- struct writeback_control *wbc)
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
{
- struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct pagevec pvec;
+ unsigned int nr_pages;
+ long left = mpd->wbc->nr_to_write;
+ pgoff_t index = mpd->first_page;
+ pgoff_t end = mpd->last_page;
+ int tag;
+ int i, err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct buffer_head *head;
+
+ if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+
+ pagevec_init(&pvec, 0);
+ mpd->map.m_len = 0;
+ mpd->next_page = index;
+ while (index <= end) {
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ goto out;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end)
+ goto out;
+
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+ goto out;
+
+ /* If we can't merge this page, we are done. */
+ if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+ goto out;
+
+ lock_page(page);
+ /*
+ * If the page is no longer dirty, or its mapping no
+ * longer corresponds to inode we are writing (which
+ * means it has been truncated or invalidated), or the
+ * page is already under writeback and we are not doing
+ * a data integrity writeback, skip the page
+ */
+ if (!PageDirty(page) ||
+ (PageWriteback(page) &&
+ (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
+ unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ wait_on_page_writeback(page);
+ BUG_ON(PageWriteback(page));
+
+ if (mpd->map.m_len == 0)
+ mpd->first_page = page->index;
+ mpd->next_page = page->index + 1;
+ /* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_CACHE_SHIFT - blkbits);
+ head = page_buffers(page);
+ err = mpage_process_page_bufs(mpd, head, head, lblk);
+ if (err <= 0)
+ goto out;
+ err = 0;
+ left--;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return 0;
+out:
+ pagevec_release(&pvec);
+ return err;
+}
+
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = ext4_writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ pgoff_t writeback_index = 0;
+ long nr_to_write = wbc->nr_to_write;
+ int range_whole = 0;
+ int cycled = 1;
handle_t *handle = NULL;
- int ret = 0;
- int err;
+ struct mpage_da_data mpd;
+ struct inode *inode = mapping->host;
+ int needed_blocks, rsv_blocks = 0, ret = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+ bool done;
+ struct blk_plug plug;
+ bool give_up_on_write = false;
- J_ASSERT(PageLocked(page));
+ trace_ext4_writepages(inode, wbc);
/*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
*/
- if (ext4_journal_current_handle())
- goto out_fail;
+ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ goto out_writepages;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (ext4_should_journal_data(inode)) {
+ struct blk_plug plug;
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ goto out_writepages;
}
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ /*
+ * If the filesystem has aborted, it is read-only, so return
+ * right away instead of dumping stack traces later on that
+ * will obscure the real source of the problem. We test
+ * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
+ * the latter could be true if the filesystem is mounted
+ * read-only, and in that case, ext4_writepages should
+ * *never* be called, so if that ever happens, we would want
+ * the stack trace.
+ */
+ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ret = -EROFS;
+ goto out_writepages;
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ }
/*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
*/
+ if (ext4_has_inline_data(inode)) {
+ /* Just inode will be modified... */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+
+ if (wbc->range_cyclic) {
+ writeback_index = mapping->writeback_index;
+ if (writeback_index)
+ cycled = 0;
+ mpd.first_page = writeback_index;
+ mpd.last_page = -1;
+ } else {
+ mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
+ }
+
+ mpd.inode = inode;
+ mpd.wbc = wbc;
+ ext4_io_submit_init(&mpd.io_submit, wbc);
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ done = false;
+ blk_start_plug(&plug);
+ while (!done && mpd.first_page <= mpd.last_page) {
+ /* For each extent of pages we use new io_end */
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /*
+ * We have two constraints: We find one extent to map and we
+ * must always write out whole page (makes a difference when
+ * blocksize < pagesize) so that we don't block on IO when we
+ * try to write out the rest of the page. Journalled mode is
+ * not supported by delalloc.
+ */
+ BUG_ON(ext4_should_journal_data(inode));
+ needed_blocks = ext4_da_writepages_trans_blocks(inode);
+
+ /* start a new transaction */
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
+ "%ld pages, ino %lu; err %d", __func__,
+ wbc->nr_to_write, inode->i_ino, ret);
+ /* Release allocated io_end */
+ ext4_put_io_end(mpd.io_submit.io_end);
+ break;
+ }
+
+ trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+ ret = mpage_prepare_extent_to_map(&mpd);
+ if (!ret) {
+ if (mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
+ &give_up_on_write);
+ else {
+ /*
+ * We scanned the whole range (or exhausted
+ * nr_to_write), submitted what was mapped and
+ * didn't find anything needing mapping. We are
+ * done.
+ */
+ done = true;
+ }
+ }
+ ext4_journal_stop(handle);
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Drop our io_end reference we got from init */
+ ext4_put_io_end(mpd.io_submit.io_end);
+
+ if (ret == -ENOSPC && sbi->s_journal) {
+ /*
+ * Commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
+ jbd2_journal_force_commit_nested(sbi->s_journal);
+ ret = 0;
+ continue;
+ }
+ /* Fatal error - ENOMEM, EIO... */
+ if (ret)
+ break;
+ }
+ blk_finish_plug(&plug);
+ if (!ret && !cycled && wbc->nr_to_write > 0) {
+ cycled = 1;
+ mpd.last_page = writeback_index - 1;
+ mpd.first_page = 0;
+ goto retry;
+ }
+
+ /* Update index */
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * Set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = mpd.first_page;
+
+out_writepages:
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
+ return ret;
+}
+
+static int ext4_nonda_switch(struct super_block *sb)
+{
+ s64 free_clusters, dirty_clusters;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
/*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
+ * switch to non delalloc mode if we are running low
+ * on free block. The free block accounting via percpu
+ * counters can get slightly wrong with percpu_counter_batch getting
+ * accumulated on each CPU without updating global counters
+ * Delalloc need an accurate free block accounting. So switch
+ * to non delalloc when we are near to error range.
*/
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, jbd2_journal_dirty_data_fn);
- if (!ret)
- ret = err;
+ free_clusters =
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ dirty_clusters =
+ percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ /*
+ * Start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
+ try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+
+ if (2 * free_clusters < 3 * dirty_clusters ||
+ free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
+ /*
+ * free block count is less than 150% of dirty blocks
+ * or free blocks is less than watermark
+ */
+ return 1;
}
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
+ return 0;
+}
-out_fail:
- redirty_page_for_writepage(wbc, page);
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret, retries = 0;
+ struct page *page;
+ pgoff_t index;
+ struct inode *inode = mapping->host;
+ handle_t *handle;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+
+ if (ext4_nonda_switch(inode->i_sb)) {
+ *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+ return ext4_write_begin(file, mapping, pos,
+ len, flags, pagep, fsdata);
+ }
+ *fsdata = (void *)0;
+ trace_ext4_da_write_begin(inode, pos, len, flags);
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_da_write_inline_data_begin(mapping, inode,
+ pos, len, flags,
+ pagep, fsdata);
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
+ }
+
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
unlock_page(page);
+
+ /*
+ * With delayed allocation, we don't log the i_disksize update
+ * if there is delayed block allocation. But we still need
+ * to journalling the i_disksize update if writes to the end
+ * of file which has an already mapped buffer.
+ */
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
+ if (IS_ERR(handle)) {
+ page_cache_release(page);
+ return PTR_ERR(handle);
+ }
+
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
+ ext4_journal_stop(handle);
+ goto retry_grab;
+ }
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
+
+ ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
+ if (ret < 0) {
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ /*
+ * block_write_begin may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ */
+ if (pos + len > inode->i_size)
+ ext4_truncate_failed_write(inode);
+
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+
+ page_cache_release(page);
+ return ret;
+ }
+
+ *pagep = page;
return ret;
}
-static int ext4_writeback_writepage(struct page *page,
- struct writeback_control *wbc)
+/*
+ * Check if we should update i_disksize
+ * when write to the end of file but not require block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+ unsigned long offset)
{
+ struct buffer_head *bh;
struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ unsigned int idx;
+ int i;
- if (ext4_journal_current_handle())
- goto out_fail;
+ bh = page_buffers(page);
+ idx = offset >> inode->i_blkbits;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
+ for (i = 0; i < idx; i++)
+ bh = bh->b_this_page;
+
+ if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
+ return 0;
+ return 1;
+}
+
+static int ext4_da_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ handle_t *handle = ext4_journal_current_handle();
+ loff_t new_i_size;
+ unsigned long start, end;
+ int write_mode = (int)(unsigned long)fsdata;
+
+ if (write_mode == FALL_BACK_TO_NONDELALLOC)
+ return ext4_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
+
+ trace_ext4_da_write_end(inode, pos, len, copied);
+ start = pos & (PAGE_CACHE_SIZE - 1);
+ end = start + copied - 1;
+
+ /*
+ * generic_write_end() will run mark_inode_dirty() if i_size
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
+ * into that.
+ */
+ new_i_size = pos + copied;
+ if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_has_inline_data(inode) ||
+ ext4_da_should_update_i_disksize(page, end)) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (new_i_size > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = new_i_size;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ /* We need to mark inode dirty even if
+ * new_i_size is less that inode->i_size
+ * bu greater than i_disksize.(hint delalloc)
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
}
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
+ if (write_mode != CONVERT_INLINE_DATA &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+ ext4_has_inline_data(inode))
+ ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+ page);
else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
- err = ext4_journal_stop(handle);
+ copied = ret2;
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = ext4_journal_stop(handle);
if (!ret)
- ret = err;
- return ret;
+ ret = ret2;
-out_fail:
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return ret;
+ return ret ? ret : copied;
}
-static int ext4_journalled_writepage(struct page *page,
- struct writeback_control *wbc)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ /*
+ * Drop reserved blocks
+ */
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ goto out;
- if (ext4_journal_current_handle())
- goto no_write;
+ ext4_da_page_release_reservation(page, offset, length);
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto no_write;
- }
+out:
+ ext4_invalidatepage(page, offset, length);
+
+ return;
+}
+
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+ trace_ext4_alloc_da_blocks(inode);
+
+ if (!EXT4_I(inode)->i_reserved_data_blocks &&
+ !EXT4_I(inode)->i_reserved_meta_blocks)
+ return 0;
+
+ /*
+ * We do something simple for now. The filemap_flush() will
+ * also start triggering a write of the data blocks, which is
+ * not strictly speaking necessary (and for users of
+ * laptop_mode, not even desirable). However, to do otherwise
+ * would require replicating code paths in:
+ *
+ * ext4_writepages() ->
+ * write_cache_pages() ---> (via passed in callback function)
+ * __mpage_da_writepage() -->
+ * mpage_add_bh_to_extent()
+ * mpage_da_map_blocks()
+ *
+ * The problem is that write_cache_pages(), located in
+ * mm/page-writeback.c, marks pages clean in preparation for
+ * doing I/O, which is not desirable if we're not planning on
+ * doing I/O at all.
+ *
+ * We could call write_cache_pages(), and then redirty all of
+ * the pages by calling redirty_page_for_writepage() but that
+ * would be ugly in the extreme. So instead we would need to
+ * replicate parts of the code in the above functions,
+ * simplifying them because we wouldn't actually intend to
+ * write out the pages, but rather only collect contiguous
+ * logical block extents, call the multi-block allocator, and
+ * then update the buffer heads with the block allocations.
+ *
+ * For now, though, we'll cheat by calling filemap_flush(),
+ * which will map the blocks, and start the I/O, but not
+ * actually wait for the I/O to complete.
+ */
+ return filemap_flush(inode->i_mapping);
+}
- if (!page_has_buffers(page) || PageChecked(page)) {
+/*
+ * bmap() is special. It gets used by applications such as lilo and by
+ * the swapper to find the on-disk block of a specific piece of data.
+ *
+ * Naturally, this is dangerous if the block concerned is still in the
+ * journal. If somebody makes a swapfile on an ext4 data-journaling
+ * filesystem and enables swap, then they may get a nasty shock when the
+ * data getting swapped to that swapfile suddenly gets overwritten by
+ * the original zero's written out previously to the journal and
+ * awaiting writeback in the kernel's buffer cache.
+ *
+ * So, if we see any bmap calls here on a modified, data-journaled file,
+ * take extra steps to flush any blocks which might be in the cache.
+ */
+static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+{
+ struct inode *inode = mapping->host;
+ journal_t *journal;
+ int err;
+
+ /*
+ * We can get here for an inline file via the FIBMAP ioctl
+ */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
/*
- * It's mmapped pagecache. Add buffers and journal it. There
- * doesn't seem much point in redirtying the page here.
+ * With delalloc we want to sync the file
+ * so that we can make sure we allocate
+ * blocks for file
*/
- ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+ filemap_write_and_wait(mapping);
+ }
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
- } else {
+ if (EXT4_JOURNAL(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
/*
- * It may be a page full of checkpoint-mode buffers. We don't
- * really know unless we go poke around in the buffer_heads.
- * But block_write_full_page will do the right thing.
+ * This is a REALLY heavyweight approach, but the use of
+ * bmap on dirty files is expected to be extremely rare:
+ * only if we run lilo or swapon on a freshly made file
+ * do we expect this to happen.
+ *
+ * (bmap requires CAP_SYS_RAWIO so this does not
+ * represent an unprivileged user DOS attack --- we'd be
+ * in trouble if mortal users could trigger this path at
+ * will.)
+ *
+ * NB. EXT4_STATE_JDATA is not set on files other than
+ * regular files. If somebody wants to bmap a directory
+ * or symlink and gets confused because the buffer
+ * hasn't yet been flushed to disk, they deserve
+ * everything they get.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+
+ ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
+ journal = EXT4_JOURNAL(inode);
+ jbd2_journal_lock_updates(journal);
+ err = jbd2_journal_flush(journal);
+ jbd2_journal_unlock_updates(journal);
+
+ if (err)
+ return 0;
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-no_write:
- redirty_page_for_writepage(wbc, page);
-out_unlock:
- unlock_page(page);
- goto out;
+ return generic_block_bmap(mapping, block, ext4_get_block);
}
static int ext4_readpage(struct file *file, struct page *page)
{
- return mpage_readpage(page, ext4_get_block);
+ int ret = -EAGAIN;
+ struct inode *inode = page->mapping->host;
+
+ trace_ext4_readpage(page);
+
+ if (ext4_has_inline_data(inode))
+ ret = ext4_readpage_inline(inode, page);
+
+ if (ret == -EAGAIN)
+ return mpage_readpage(page, ext4_get_block);
+
+ return ret;
}
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
+ struct inode *inode = mapping->host;
+
+ /* If the file has inline data, no need to do readpages. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ trace_ext4_invalidatepage(page, offset, length);
+
+ /* No journalling happens on data buffers when this function is used */
+ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+ block_invalidatepage(page, offset, length);
+}
+
+static int __ext4_journalled_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ trace_ext4_journalled_invalidatepage(page, offset, length);
+
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- jbd2_journal_invalidatepage(journal, page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset, length);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- WARN_ON(PageChecked(page));
- if (!page_has_buffers(page))
+ trace_ext4_releasepage(page);
+
+ /* Page has dirty journalled data -> cannot release */
+ if (PageChecked(page))
return 0;
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ if (journal)
+ return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ else
+ return try_to_free_buffers(page);
+}
+
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+}
+
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+
+ /* if not async direct IO just return */
+ if (!io_end)
+ return;
+
+ ext_debug("ext4_end_io_dio(): io_end 0x%p "
+ "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+
+ iocb->private = NULL;
+ io_end->offset = offset;
+ io_end->size = size;
+ ext4_put_io_end(io_end);
}
/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unwritten
+ * If those blocks were preallocated, we mark sure they are split, but
+ * still keep the range to write as unwritten.
+ *
+ * The unwritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the conversion
+ * when async direct IO completed.
+ *
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
*
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- handle_t *handle;
ssize_t ret;
- int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
+ int overwrite = 0;
+ get_block_t *get_block_func = NULL;
+ int dio_flags = 0;
+ loff_t final_size = offset + count;
+ ext4_io_end_t *io_end = NULL;
- if (rw == WRITE) {
- loff_t final_size = offset + count;
+ /* Use the old path for reads and writes beyond i_size. */
+ if (rw != WRITE || final_size > inode->i_size)
+ return ext4_ind_direct_IO(rw, iocb, iter, offset);
- if (final_size > inode->i_size) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ei->i_disksize = inode->i_size;
- ext4_journal_stop(handle);
- }
- }
+ BUG_ON(iocb->private == NULL);
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block, NULL);
+ /*
+ * Make all waiters for direct IO properly wait also for extent
+ * conversion. This also disallows race between truncate() and
+ * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ */
+ if (rw == WRITE)
+ atomic_inc(&inode->i_dio_count);
- if (orphan) {
- int err;
+ /* If we do a overwrite dio, i_mutex locking can be released */
+ overwrite = *((int *)iocb->private);
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- /* This is really bad luck. We've written the data
- * but cannot extend i_size. Bail out and pretend
- * the write failed... */
- ret = PTR_ERR(handle);
- goto out;
+ if (overwrite) {
+ down_read(&EXT4_I(inode)->i_data_sem);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as
+ * unwritten to prevent parallel buffered read to expose
+ * the stale data before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block will
+ * just simply mark the buffer mapped but still keep the
+ * extents unwritten.
+ *
+ * For non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * For async DIO, the conversion needs to be deferred when the
+ * IO is completed. The ext4 end_io callback function will be
+ * called to take care of the conversion work. Here for async
+ * case, we allocate an io_end structure to hook to the iocb.
+ */
+ iocb->private = NULL;
+ ext4_inode_aio_set(inode, NULL);
+ if (!is_sync_kiocb(iocb)) {
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end) {
+ ret = -ENOMEM;
+ goto retake_lock;
}
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size) {
- ei->i_disksize = end;
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
+ /*
+ * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
+ /*
+ * we save the io structure for current async direct
+ * IO, so that later ext4_map_blocks() could flag the
+ * io structure whether there is a unwritten extents
+ * needs to be converted when IO is completed.
+ */
+ ext4_inode_aio_set(inode, io_end);
+ }
+
+ if (overwrite) {
+ get_block_func = ext4_get_block_write_nolock;
+ } else {
+ get_block_func = ext4_get_block_write;
+ dio_flags = DIO_LOCKING;
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter,
+ offset,
+ get_block_func,
+ ext4_end_io_dio,
+ NULL,
+ dio_flags);
+
+ /*
+ * Put our reference to io_end. This can free the io_end structure e.g.
+ * in sync IO case or in case of error. It can even perform extent
+ * conversion if all bios we submitted finished before we got here.
+ * Note that in that case iocb->private can be already set to NULL
+ * here.
+ */
+ if (io_end) {
+ ext4_inode_aio_set(inode, NULL);
+ ext4_put_io_end(io_end);
+ /*
+ * When no IO was submitted ext4_end_io_dio() was not
+ * called so we have to put iocb's reference.
+ */
+ if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+ WARN_ON(iocb->private != io_end);
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ ext4_put_io_end(io_end);
+ iocb->private = NULL;
}
- err = ext4_journal_stop(handle);
- if (ret == 0)
+ }
+ if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the conversion right here
+ */
+ err = ext4_convert_unwritten_extents(NULL, inode,
+ offset, ret);
+ if (err < 0)
ret = err;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
-out:
+
+retake_lock:
+ if (rw == WRITE)
+ inode_dio_done(inode);
+ /* take i_mutex locking again if we do a ovewrite dio */
+ if (overwrite) {
+ up_read(&EXT4_I(inode)->i_data_sem);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ return ret;
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+
+ /*
+ * If we are doing data journalling we don't support O_DIRECT
+ */
+ if (ext4_should_journal_data(inode))
+ return 0;
+
+ /* Let buffer I/O handle the inline data case. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
+ trace_ext4_direct_IO_enter(inode, offset, count, rw);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
+ else
+ ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+ trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
@@ -1804,88 +3274,110 @@ static int ext4_journalled_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
-static const struct address_space_operations ext4_ordered_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_ordered_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_ordered_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
+static const struct address_space_operations ext4_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .writepages = ext4_writepages,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
-static const struct address_space_operations ext4_writeback_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_writeback_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_writeback_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
+static const struct address_space_operations ext4_journalled_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .writepages = ext4_writepages,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_journalled_write_end,
+ .set_page_dirty = ext4_journalled_set_page_dirty,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_journalled_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
-static const struct address_space_operations ext4_journalled_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_journalled_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_journalled_write_end,
- .set_page_dirty = ext4_journalled_set_page_dirty,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
+static const struct address_space_operations ext4_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .writepages = ext4_writepages,
+ .write_begin = ext4_da_write_begin,
+ .write_end = ext4_da_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
- inode->i_mapping->a_ops = &ext4_ordered_aops;
- else if (ext4_should_writeback_data(inode))
- inode->i_mapping->a_ops = &ext4_writeback_aops;
- else
+ switch (ext4_inode_journal_mode(inode)) {
+ case EXT4_INODE_ORDERED_DATA_MODE:
+ ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+ break;
+ case EXT4_INODE_WRITEBACK_DATA_MODE:
+ ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+ break;
+ case EXT4_INODE_JOURNAL_DATA_MODE:
inode->i_mapping->a_ops = &ext4_journalled_aops;
+ return;
+ default:
+ BUG();
+ }
+ if (test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else
+ inode->i_mapping->a_ops = &ext4_aops;
}
/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
- struct address_space *mapping, loff_t from)
+static int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, length, pos;
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
+ page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+ mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (!page)
+ return -ENOMEM;
+
blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ max = blocksize - (offset & (blocksize - 1));
/*
- * For "nobh" option, we can only work if we don't need to
- * read-in the page - otherwise we create buffers to do the IO.
+ * correct length if it does not fall between
+ * 'from' and the end of the block
*/
- if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
- ext4_should_writeback_data(inode) && PageUptodate(page)) {
- zero_user(page, offset, length);
- set_page_dirty(page);
- goto unlock;
- }
+ if (length > max || length < 0)
+ length = max;
+
+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
@@ -1898,13 +3390,10 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
iblock++;
pos += blocksize;
}
-
- err = 0;
if (buffer_freed(bh)) {
BUFFER_TRACE(bh, "freed: skip");
goto unlock;
}
-
if (!buffer_mapped(bh)) {
BUFFER_TRACE(bh, "unmapped");
ext4_get_block(inode, iblock, bh, 0);
@@ -1927,25 +3416,22 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
if (!buffer_uptodate(bh))
goto unlock;
}
-
if (ext4_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
goto unlock;
}
-
zero_user(page, offset, length);
-
BUFFER_TRACE(bh, "zeroed end of block");
- err = 0;
if (ext4_should_journal_data(inode)) {
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
} else {
- if (ext4_should_order_data(inode))
- err = ext4_journal_dirty_data(handle, bh);
+ err = 0;
mark_buffer_dirty(bh);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+ err = ext4_jbd2_file_inode(handle, inode);
}
unlock:
@@ -1955,342 +3441,235 @@ unlock:
}
/*
- * Probably it should be a library function... search for first non-zero word
- * or memcmp with zero_page, whatever is better for particular architecture.
- * Linus?
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
*/
-static inline int all_zeroes(__le32 *p, __le32 *q)
+static int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
{
- while (p < q)
- if (*p++)
- return 0;
- return 1;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
+ struct inode *inode = mapping->host;
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+
+ return ext4_block_zero_page_range(handle, mapping, from, length);
}
-/**
- * ext4_find_shared - find the indirect blocks for partial truncation.
- * @inode: inode in question
- * @depth: depth of the affected branch
- * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
- * @chain: place to store the pointers to partial indirect blocks
- * @top: place to the (detached) top of branch
- *
- * This is a helper function used by ext4_truncate().
- *
- * When we do truncate() we may have to clean the ends of several
- * indirect blocks but leave the blocks themselves alive. Block is
- * partially truncated if some data below the new i_size is refered
- * from it (and it is on the path to the first completely truncated
- * data block, indeed). We have to free the top of that path along
- * with everything to the right of the path. Since no allocation
- * past the truncation point is possible until ext4_truncate()
- * finishes, we may safely do the latter, but top of branch may
- * require special attention - pageout below the truncation point
- * might try to populate it.
- *
- * We atomically detach the top of branch from the tree, store the
- * block number of its root in *@top, pointers to buffer_heads of
- * partially truncated blocks - in @chain[].bh and pointers to
- * their last elements that should not be removed - in
- * @chain[].p. Return value is the pointer to last filled element
- * of @chain.
- *
- * The work left to caller to do the actual freeing of subtrees:
- * a) free the subtree starting from *@top
- * b) free the subtrees whose roots are stored in
- * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
- * c) free the subtrees growing from the inode past the @chain[0].
- * (no partially truncated stuff there). */
-
-static Indirect *ext4_find_shared(struct inode *inode, int depth,
- ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
-{
- Indirect *partial, *p;
- int k, err;
-
- *top = 0;
- /* Make k index the deepest non-null offest + 1 */
- for (k = depth; k > 1 && !offsets[k-1]; k--)
- ;
- partial = ext4_get_branch(inode, k, offsets, chain, &err);
- /* Writer: pointers */
- if (!partial)
- partial = chain + k-1;
- /*
- * If the branch acquired continuation since we've looked at it -
- * fine, it should all survive and (new) top doesn't belong to us.
- */
- if (!partial->key && *partial->p)
- /* Writer: end */
- goto no_top;
- for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
- ;
- /*
- * OK, we've found the last block that must survive. The rest of our
- * branch should be detached before unlocking. However, if that rest
- * of branch is all ours and does not grow immediately from the inode
- * it's easier to cheat and just decrement partial->p.
- */
- if (p == chain + k - 1 && p > chain) {
- p->p--;
- } else {
- *top = *p->p;
- /* Nope, don't do this in ext4. Must leave the tree intact */
-#if 0
- *p->p = 0;
-#endif
- }
- /* Writer: end */
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t length)
+{
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned partial_start, partial_end;
+ ext4_fsblk_t start, end;
+ loff_t byte_end = (lstart + length - 1);
+ int err = 0;
+
+ partial_start = lstart & (sb->s_blocksize - 1);
+ partial_end = byte_end & (sb->s_blocksize - 1);
+
+ start = lstart >> sb->s_blocksize_bits;
+ end = byte_end >> sb->s_blocksize_bits;
- while(partial > p) {
- brelse(partial->bh);
- partial--;
+ /* Handle partial zero within the single block */
+ if (start == end &&
+ (partial_start || (partial_end != sb->s_blocksize - 1))) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, length);
+ return err;
+ }
+ /* Handle partial zero out on the start of the range */
+ if (partial_start) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, sb->s_blocksize);
+ if (err)
+ return err;
}
-no_top:
- return partial;
+ /* Handle partial zero out on the end of the range */
+ if (partial_end != sb->s_blocksize - 1)
+ err = ext4_block_zero_page_range(handle, mapping,
+ byte_end - partial_end,
+ partial_end + 1);
+ return err;
+}
+
+int ext4_can_truncate(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
+ return 0;
}
/*
- * Zero a number of block pointers in either an inode or an indirect block.
- * If we restart the transaction we must again get write access to the
- * indirect block for further modification.
+ * ext4_punch_hole: punches a hole in a file by releaseing the blocks
+ * associated with the given offset and length
+ *
+ * @inode: File inode
+ * @offset: The offset where the hole will begin
+ * @len: The length of the hole
*
- * We release `count' blocks on disk, but (last - first) may be greater
- * than `count' because there can be holes in there.
+ * Returns: 0 on success or negative on failure
*/
-static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block_to_free,
- unsigned long count, __le32 *first, __le32 *last)
-{
- __le32 *p;
- if (try_to_extend_transaction(handle, inode)) {
- if (bh) {
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, bh);
- }
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
- if (bh) {
- BUFFER_TRACE(bh, "retaking write access");
- ext4_journal_get_write_access(handle, bh);
- }
- }
+
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+{
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t first_block, stop_block;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t first_block_offset, last_block_offset;
+ handle_t *handle;
+ unsigned int credits;
+ int ret = 0;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ trace_ext4_punch_hole(inode, offset, length, 0);
/*
- * Any buffers which are on the journal will be in memory. We find
- * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
- * on them. We've already detached each block from the file, so
- * bforget() in jbd2_journal_forget() should be safe.
- *
- * AKPM: turn on bforget in jbd2_journal_forget()!!!
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
*/
- for (p = first; p < last; p++) {
- u32 nr = le32_to_cpu(*p);
- if (nr) {
- struct buffer_head *tbh;
-
- *p = 0;
- tbh = sb_find_get_block(inode->i_sb, nr);
- ext4_forget(handle, 0, inode, tbh, nr);
- }
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + length - 1);
+ if (ret)
+ return ret;
}
- ext4_free_blocks(handle, inode, block_to_free, count, 0);
-}
+ mutex_lock(&inode->i_mutex);
-/**
- * ext4_free_data - free a list of data blocks
- * @handle: handle for this transaction
- * @inode: inode we are dealing with
- * @this_bh: indirect buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: points immediately past the end of array
- *
- * We are freeing all blocks refered from that array (numbers are stored as
- * little-endian 32-bit) and updating @inode->i_blocks appropriately.
- *
- * We accumulate contiguous runs of blocks to free. Conveniently, if these
- * blocks are contiguous then releasing them at one time will only affect one
- * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
- * actually use a lot of journal space.
- *
- * @this_bh will be %NULL if @first and @last point into the inode's direct
- * block pointers.
- */
-static void ext4_free_data(handle_t *handle, struct inode *inode,
- struct buffer_head *this_bh,
- __le32 *first, __le32 *last)
-{
- ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
- unsigned long count = 0; /* Number of blocks in the run */
- __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
- corresponding to
- block_to_free */
- ext4_fsblk_t nr; /* Current block # */
- __le32 *p; /* Pointer into inode/ind
- for current block */
- int err;
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ goto out_mutex;
- if (this_bh) { /* For indirect block */
- BUFFER_TRACE(this_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, this_bh);
- /* Important: if we can't update the indirect pointers
- * to the blocks, we can't free them. */
- if (err)
- return;
+ /*
+ * If the hole extends beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
}
- for (p = first; p < last; p++) {
- nr = le32_to_cpu(*p);
- if (nr) {
- /* accumulate blocks to free if they're contiguous */
- if (count == 0) {
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- } else if (nr == block_to_free + count) {
- count++;
- } else {
- ext4_clear_blocks(handle, inode, this_bh,
- block_to_free,
- count, block_to_free_p, p);
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- }
- }
+ if (offset & (sb->s_blocksize - 1) ||
+ (offset + length) & (sb->s_blocksize - 1)) {
+ /*
+ * Attach jinode to inode for jbd2 if we do any zeroing of
+ * partial block
+ */
+ ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ goto out_mutex;
+
}
- if (count > 0)
- ext4_clear_blocks(handle, inode, this_bh, block_to_free,
- count, block_to_free_p, p);
+ first_block_offset = round_up(offset, sb->s_blocksize);
+ last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+
+ /* Now release the pages and zero block aligned part of pages*/
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
- if (this_bh) {
- BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, this_bh);
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(sb, ret);
+ goto out_dio;
}
-}
-/**
- * ext4_free_branches - free an array of branches
- * @handle: JBD handle for this transaction
- * @inode: inode we are dealing with
- * @parent_bh: the buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: pointer immediately past the end of array
- * @depth: depth of the branches to free
- *
- * We are freeing all blocks refered from these branches (numbers are
- * stored as little-endian 32-bit) and updating @inode->i_blocks
- * appropriately.
- */
-static void ext4_free_branches(handle_t *handle, struct inode *inode,
- struct buffer_head *parent_bh,
- __le32 *first, __le32 *last, int depth)
-{
- ext4_fsblk_t nr;
- __le32 *p;
+ ret = ext4_zero_partial_blocks(handle, inode, offset,
+ length);
+ if (ret)
+ goto out_stop;
- if (is_handle_aborted(handle))
- return;
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
- if (depth--) {
- struct buffer_head *bh;
- int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- p = last;
- while (--p >= first) {
- nr = le32_to_cpu(*p);
- if (!nr)
- continue; /* A hole */
+ /* If there are no blocks to remove, return now */
+ if (first_block >= stop_block)
+ goto out_stop;
- /* Go read the buffer for the next level down */
- bh = sb_bread(inode->i_sb, nr);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
- /*
- * A read failure? Report error and clear slot
- * (should be rare).
- */
- if (!bh) {
- ext4_error(inode->i_sb, "ext4_free_branches",
- "Read failure, inode=%lu, block=%llu",
- inode->i_ino, nr);
- continue;
- }
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
- /* This zaps the entire block. Bottom up. */
- BUFFER_TRACE(bh, "free child branches");
- ext4_free_branches(handle, inode, bh,
- (__le32*)bh->b_data,
- (__le32*)bh->b_data + addr_per_block,
- depth);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_remove_space(inode, first_block,
+ stop_block - 1);
+ else
+ ret = ext4_free_hole_blocks(handle, inode, first_block,
+ stop_block);
- /*
- * We've probably journalled the indirect block several
- * times during the truncate. But it's no longer
- * needed and we now drop it from the transaction via
- * jbd2_journal_revoke().
- *
- * That's easy if it's exclusively part of this
- * transaction. But if it's part of the committing
- * transaction then jbd2_journal_forget() will simply
- * brelse() it. That means that if the underlying
- * block is reallocated in ext4_get_block(),
- * unmap_underlying_metadata() will find this block
- * and will try to get rid of it. damn, damn.
- *
- * If this block has already been committed to the
- * journal, a revoke record will be written. And
- * revoke records must be emitted *before* clearing
- * this block's bit in the bitmaps.
- */
- ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
- /*
- * Everything below this this pointer has been
- * released. Now let this top-of-subtree go.
- *
- * We want the freeing of this indirect block to be
- * atomic in the journal with the updating of the
- * bitmap block which owns it. So make some room in
- * the journal.
- *
- * We zero the parent pointer *after* freeing its
- * pointee in the bitmaps, so if extend_transaction()
- * for some reason fails to put the bitmap changes and
- * the release into the same transaction, recovery
- * will merely complain about releasing a free block,
- * rather than leaking blocks.
- */
- if (is_handle_aborted(handle))
- return;
- if (try_to_extend_transaction(handle, inode)) {
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
- }
+ /* Now release the pages again to reduce race window */
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
- ext4_free_blocks(handle, inode, nr, 1, 1);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
- if (parent_bh) {
- /*
- * The block which we have just freed is
- * pointed to by an indirect block: journal it
- */
- BUFFER_TRACE(parent_bh, "get_write_access");
- if (!ext4_journal_get_write_access(handle,
- parent_bh)){
- *p = 0;
- BUFFER_TRACE(parent_bh,
- "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle,
- parent_bh);
- }
- }
+int ext4_inode_attach_jinode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct jbd2_inode *jinode;
+
+ if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
+ return 0;
+
+ jinode = jbd2_alloc_inode(GFP_KERNEL);
+ spin_lock(&inode->i_lock);
+ if (!ei->jinode) {
+ if (!jinode) {
+ spin_unlock(&inode->i_lock);
+ return -ENOMEM;
}
- } else {
- /* We have reached the bottom of the tree. */
- BUFFER_TRACE(parent_bh, "free data blocks");
- ext4_free_data(handle, inode, parent_bh, first, last);
+ ei->jinode = jinode;
+ jbd2_journal_init_jbd_inode(ei->jinode, inode);
+ jinode = NULL;
}
+ spin_unlock(&inode->i_lock);
+ if (unlikely(jinode != NULL))
+ jbd2_free_inode(jinode);
+ return 0;
}
/*
@@ -2300,7 +3679,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
* transaction, and VFS/VM ensures that ext4_truncate() cannot run
* simultaneously on behalf of the same inode.
*
- * As we work through the truncate and commmit bits of it to the journal there
+ * As we work through the truncate and commit bits of it to the journal there
* is one core, guiding principle: the file's tree must always be consistent on
* disk. We must be able to restart the truncate after a crash.
*
@@ -2323,73 +3702,61 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
*/
void ext4_truncate(struct inode *inode)
{
- handle_t *handle;
struct ext4_inode_info *ei = EXT4_I(inode);
- __le32 *i_data = ei->i_data;
- int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ unsigned int credits;
+ handle_t *handle;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t offsets[4];
- Indirect chain[4];
- Indirect *partial;
- __le32 nr = 0;
- int n;
- ext4_lblk_t last_block;
- unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
-
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext4_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
/*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
+ * There is a possibility that we're either freeing the inode
+ * or it's a completely new inode. In those cases we might not
+ * have i_mutex locked because it's not necessary.
*/
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
+ if (!(inode->i_state & (I_NEW|I_FREEING)))
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
+ trace_ext4_truncate_enter(inode);
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ if (!ext4_can_truncate(inode))
return;
+
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ ext4_inline_data_truncate(inode, &has_inline);
+ if (has_inline)
+ return;
}
- handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
- return; /* AKPM: return what? */
+ /* If we zero-out tail of the page, we have to create jinode for jbd2 */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
+ if (ext4_inode_attach_jinode(inode) < 0)
+ return;
}
- last_block = (inode->i_size + blocksize-1)
- >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
+ return;
+ }
- n = ext4_block_to_path(inode, last_block, offsets, NULL);
- if (n == 0)
- goto out_stop; /* error */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
/*
- * OK. This truncate is going to happen. We add the inode to the
- * orphan list, so that if this truncate spans multiple transactions,
- * and we crash, we will resume the truncate when the filesystem
- * recovers. It also marks the inode dirty, to catch the new size.
+ * We add the inode to the orphan list, so that if this
+ * truncate spans multiple transactions, and we crash, we will
+ * resume the truncate when the filesystem recovers. It also
+ * marks the inode dirty, to catch the new size.
*
* Implication: the file must always be in a sane, consistent
* truncatable state while each transaction commits.
@@ -2397,96 +3764,23 @@ void ext4_truncate(struct inode *inode)
if (ext4_orphan_add(handle, inode))
goto out_stop;
- /*
- * The orphan list entry will now protect us from any crash which
- * occurs before the truncate completes, so it is now safe to propagate
- * the new, shorter inode size (held for now in i_size) into the
- * on-disk inode. We do this via i_disksize, which is the value which
- * ext4 *really* writes onto the disk inode.
- */
- ei->i_disksize = inode->i_size;
-
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
+ down_write(&EXT4_I(inode)->i_data_sem);
- if (n == 1) { /* direct blocks */
- ext4_free_data(handle, inode, NULL, i_data+offsets[0],
- i_data + EXT4_NDIR_BLOCKS);
- goto do_indirects;
- }
+ ext4_discard_preallocations(inode);
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
- /* Kill the top of shared branch (not detached) */
- if (nr) {
- if (partial == chain) {
- /* Shared branch grows from the inode */
- ext4_free_branches(handle, inode, NULL,
- &nr, &nr+1, (chain+n-1) - partial);
- *partial->p = 0;
- /*
- * We mark the inode dirty prior to restart,
- * and prior to stop. No need for it here.
- */
- } else {
- /* Shared branch grows from an indirect block */
- BUFFER_TRACE(partial->bh, "get_write_access");
- ext4_free_branches(handle, inode, partial->bh,
- partial->p,
- partial->p+1, (chain+n-1) - partial);
- }
- }
- /* Clear the ends of indirect blocks on the shared branch */
- while (partial > chain) {
- ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
- (__le32*)partial->bh->b_data+addr_per_block,
- (chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse (partial->bh);
- partial--;
- }
-do_indirects:
- /* Kill the remaining (whole) subtrees */
- switch (offsets[0]) {
- default:
- nr = i_data[EXT4_IND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
- i_data[EXT4_IND_BLOCK] = 0;
- }
- case EXT4_IND_BLOCK:
- nr = i_data[EXT4_DIND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
- i_data[EXT4_DIND_BLOCK] = 0;
- }
- case EXT4_DIND_BLOCK:
- nr = i_data[EXT4_TIND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
- i_data[EXT4_TIND_BLOCK] = 0;
- }
- case EXT4_TIND_BLOCK:
- ;
- }
-
- ext4_discard_reservation(inode);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ext4_ext_truncate(handle, inode);
+ else
+ ext4_ind_truncate(handle, inode);
up_write(&ei->i_data_sem);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- /*
- * In a multi-transaction truncate, we only make the final transaction
- * synchronous
- */
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
+
out_stop:
/*
- * If this was a simple ftruncate(), and the file will remain alive
+ * If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
* However, if this was a real unlink then we were called by
* ext4_delete_inode(), and we allow that function to clean up the
@@ -2495,56 +3789,11 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
-}
-static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
- unsigned long ino, struct ext4_iloc *iloc)
-{
- unsigned long desc, group_desc;
- ext4_group_t block_group;
- unsigned long offset;
- ext4_fsblk_t block;
- struct buffer_head *bh;
- struct ext4_group_desc * gdp;
-
- if (!ext4_valid_inum(sb, ino)) {
- /*
- * This error is already checked for in namei.c unless we are
- * looking at an NFS filehandle, in which case no error
- * report is needed
- */
- return 0;
- }
-
- block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
- if (block_group >= EXT4_SB(sb)->s_groups_count) {
- ext4_error(sb,"ext4_get_inode_block","group >= groups count");
- return 0;
- }
- smp_rmb();
- group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
- desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- bh = EXT4_SB(sb)->s_group_desc[group_desc];
- if (!bh) {
- ext4_error (sb, "ext4_get_inode_block",
- "Descriptor not loaded");
- return 0;
- }
-
- gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data +
- desc * EXT4_DESC_SIZE(sb));
- /*
- * Figure out the offset within the block group inode table
- */
- offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
- EXT4_INODE_SIZE(sb);
- block = ext4_inode_table(sb, gdp) +
- (offset >> EXT4_BLOCK_SIZE_BITS(sb));
-
- iloc->block_group = block_group;
- iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
- return block;
+ trace_ext4_truncate_exit(inode);
}
/*
@@ -2556,23 +3805,45 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
static int __ext4_get_inode_loc(struct inode *inode,
struct ext4_iloc *iloc, int in_mem)
{
- ext4_fsblk_t block;
- struct buffer_head *bh;
-
- block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
- if (!block)
+ struct ext4_group_desc *gdp;
+ struct buffer_head *bh;
+ struct super_block *sb = inode->i_sb;
+ ext4_fsblk_t block;
+ int inodes_per_block, inode_offset;
+
+ iloc->bh = NULL;
+ if (!ext4_valid_inum(sb, inode->i_ino))
return -EIO;
- bh = sb_getblk(inode->i_sb, block);
- if (!bh) {
- ext4_error (inode->i_sb, "ext4_get_inode_loc",
- "unable to read inode block - "
- "inode=%lu, block=%llu",
- inode->i_ino, block);
+ iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+ if (!gdp)
return -EIO;
- }
+
+ /*
+ * Figure out the offset within the block group inode table
+ */
+ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ inode_offset = ((inode->i_ino - 1) %
+ EXT4_INODES_PER_GROUP(sb));
+ block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+ iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+ bh = sb_getblk(sb, block);
+ if (unlikely(!bh))
+ return -ENOMEM;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
+
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out another inode in the same block. In this
+ * case, we don't have to read the block because we may
+ * read the old inode data successfully.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+
if (buffer_uptodate(bh)) {
/* someone brought it uptodate while we waited */
unlock_buffer(bh);
@@ -2586,29 +3857,13 @@ static int __ext4_get_inode_loc(struct inode *inode,
*/
if (in_mem) {
struct buffer_head *bitmap_bh;
- struct ext4_group_desc *desc;
- int inodes_per_buffer;
- int inode_offset, i;
- ext4_group_t block_group;
- int start;
-
- block_group = (inode->i_ino - 1) /
- EXT4_INODES_PER_GROUP(inode->i_sb);
- inodes_per_buffer = bh->b_size /
- EXT4_INODE_SIZE(inode->i_sb);
- inode_offset = ((inode->i_ino - 1) %
- EXT4_INODES_PER_GROUP(inode->i_sb));
- start = inode_offset & ~(inodes_per_buffer - 1);
+ int i, start;
- /* Is the inode bitmap in cache? */
- desc = ext4_get_group_desc(inode->i_sb,
- block_group, NULL);
- if (!desc)
- goto make_io;
+ start = inode_offset & ~(inodes_per_block - 1);
- bitmap_bh = sb_getblk(inode->i_sb,
- ext4_inode_bitmap(inode->i_sb, desc));
- if (!bitmap_bh)
+ /* Is the inode bitmap in cache? */
+ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
@@ -2620,14 +3875,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
brelse(bitmap_bh);
goto make_io;
}
- for (i = start; i < start + inodes_per_buffer; i++) {
+ for (i = start; i < start + inodes_per_block; i++) {
if (i == inode_offset)
continue;
if (ext4_test_bit(i, bitmap_bh->b_data))
break;
}
brelse(bitmap_bh);
- if (i == start + inodes_per_buffer) {
+ if (i == start + inodes_per_block) {
/* all other inodes are free, so skip I/O */
memset(bh->b_data, 0, bh->b_size);
set_buffer_uptodate(bh);
@@ -2638,19 +3893,43 @@ static int __ext4_get_inode_loc(struct inode *inode,
make_io:
/*
+ * If we need to do any I/O, try to pre-readahead extra
+ * blocks from the inode table.
+ */
+ if (EXT4_SB(sb)->s_inode_readahead_blks) {
+ ext4_fsblk_t b, end, table;
+ unsigned num;
+ __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
+
+ table = ext4_inode_table(sb, gdp);
+ /* s_inode_readahead_blks is always a power of 2 */
+ b = block & ~((ext4_fsblk_t) ra_blks - 1);
+ if (table > b)
+ b = table;
+ end = b + ra_blks;
+ num = EXT4_INODES_PER_GROUP(sb);
+ if (ext4_has_group_desc_csum(sb))
+ num -= ext4_itable_unused_count(sb, gdp);
+ table += num / inodes_per_block;
+ if (end > table)
+ end = table;
+ while (b <= end)
+ sb_breadahead(sb, b++);
+ }
+
+ /*
* There are other valid inodes in the buffer, this inode
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
+ trace_ext4_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ_META, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_error(inode->i_sb, "ext4_get_inode_loc",
- "unable to read inode block - "
- "inode=%lu, block=%llu",
- inode->i_ino, block);
+ EXT4_ERROR_INODE_BLOCK(inode, block,
+ "unable to read itable block");
brelse(bh);
return -EIO;
}
@@ -2664,46 +3943,55 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext4_get_inode_loc(inode, iloc,
- !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+ !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}
void ext4_set_inode_flags(struct inode *inode)
{
unsigned int flags = EXT4_I(inode)->i_flags;
+ unsigned int new_fl = 0;
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT4_SYNC_FL)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (flags & EXT4_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & EXT4_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+ inode_set_flags(inode, new_fl,
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
void ext4_get_inode_flags(struct ext4_inode_info *ei)
{
- unsigned int flags = ei->vfs_inode.i_flags;
-
- ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
- EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
- if (flags & S_SYNC)
- ei->i_flags |= EXT4_SYNC_FL;
- if (flags & S_APPEND)
- ei->i_flags |= EXT4_APPEND_FL;
- if (flags & S_IMMUTABLE)
- ei->i_flags |= EXT4_IMMUTABLE_FL;
- if (flags & S_NOATIME)
- ei->i_flags |= EXT4_NOATIME_FL;
- if (flags & S_DIRSYNC)
- ei->i_flags |= EXT4_DIRSYNC_FL;
+ unsigned int vfs_fl;
+ unsigned long old_fl, new_fl;
+
+ do {
+ vfs_fl = ei->vfs_inode.i_flags;
+ old_fl = ei->i_flags;
+ new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+ EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
+ EXT4_DIRSYNC_FL);
+ if (vfs_fl & S_SYNC)
+ new_fl |= EXT4_SYNC_FL;
+ if (vfs_fl & S_APPEND)
+ new_fl |= EXT4_APPEND_FL;
+ if (vfs_fl & S_IMMUTABLE)
+ new_fl |= EXT4_IMMUTABLE_FL;
+ if (vfs_fl & S_NOATIME)
+ new_fl |= EXT4_NOATIME_FL;
+ if (vfs_fl & S_DIRSYNC)
+ new_fl |= EXT4_DIRSYNC_FL;
+ } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
}
+
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
- struct ext4_inode_info *ei)
+ struct ext4_inode_info *ei)
{
blkcnt_t i_blocks ;
struct inode *inode = &(ei->vfs_inode);
@@ -2714,7 +4002,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
/* we are using combined 48 bit field */
i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
le32_to_cpu(raw_inode->i_blocks_lo);
- if (ei->i_flags & EXT4_HUGE_FILE_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
/* i_blocks represent file system block size */
return i_blocks << (inode->i_blkbits - 9);
} else {
@@ -2725,15 +4013,30 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
}
}
+static inline void ext4_iget_extra_inode(struct inode *inode,
+ struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_find_inline_data_nolock(inode);
+ } else
+ EXT4_I(inode)->i_inline_off = 0;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
- struct buffer_head *bh;
struct inode *inode;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
+ uid_t i_uid;
+ gid_t i_gid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -2742,27 +4045,58 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
- ei->i_acl = EXT4_ACL_NOT_CACHED;
- ei->i_default_acl = EXT4_ACL_NOT_CACHED;
-#endif
- ei->i_block_alloc_info = NULL;
+ iloc.bh = NULL;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
goto bad_inode;
- bh = iloc.bh;
raw_inode = ext4_raw_inode(&iloc);
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+ EXT4_INODE_SIZE(inode->i_sb)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ ret = -EIO;
+ goto bad_inode;
+ }
+ } else
+ ei->i_extra_isize = 0;
+
+ /* Precompute checksum seed for inode metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = raw_inode->i_generation;
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+ sizeof(gen));
+ }
+
+ if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+ EXT4_ERROR_INODE(inode, "checksum invalid");
+ ret = -EIO;
+ goto bad_inode;
+ }
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
- if(!(test_opt (inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
- ei->i_state = 0;
+ ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+ ei->i_inline_off = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -2771,30 +4105,34 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0) {
- if (inode->i_mode == 0 ||
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ if ((inode->i_mode == 0 ||
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+ ino != EXT4_BOOT_LOADER_INO) {
/* this inode is deleted */
- brelse (bh);
ret = -ESTALE;
goto bad_inode;
}
/* The only unlinked inodes we let through here have
* valid i_mode and are being read by the orphan
* recovery code: that's fine, we're about to complete
- * the process of deleting those. */
+ * the process of deleting those.
+ * OR it is the EXT4_BOOT_LOADER_INO which is
+ * not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
- if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
- cpu_to_le32(EXT4_OS_HURD)) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
- }
inode->i_size = ext4_isize(raw_inode);
ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
+ ei->i_last_alloc_group = ~0;
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
@@ -2803,40 +4141,79 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ read_unlock(&journal->j_state_lock);
+ ei->i_sync_tid = tid;
+ ei->i_datasync_tid = tid;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
- EXT4_INODE_SIZE(inode->i_sb)) {
- brelse (bh);
- ret = -EIO;
- goto bad_inode;
- }
if (ei->i_extra_isize == 0) {
/* The extra space is currently unused. Use it. */
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
- __le32 *magic = (void *)raw_inode +
- EXT4_GOOD_OLD_INODE_SIZE +
- ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ei->i_state |= EXT4_STATE_XATTR;
+ ext4_iget_extra_inode(inode, raw_inode, ei);
}
- } else
- ei->i_extra_isize = 0;
+ }
EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
- inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- inode->i_version |=
- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ inode->i_version |=
+ (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ }
}
+ ret = 0;
+ if (ei->i_file_acl &&
+ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+ ei->i_file_acl);
+ ret = -EIO;
+ goto bad_inode;
+ } else if (!ext4_has_inline_data(inode)) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))))
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_ind_check_inode(inode);
+ }
+ }
+ if (ret)
+ goto bad_inode;
+
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
@@ -2845,13 +4222,16 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext4_inode_is_fast_symlink(inode))
+ if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
- else {
+ nd_terminate_link(ei->i_data, inode->i_size,
+ sizeof(ei->i_data) - 1);
+ } else {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
if (raw_inode->i_block[0])
init_special_inode(inode, inode->i_mode,
@@ -2859,13 +4239,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else if (ino == EXT4_BOOT_LOADER_INO) {
+ make_bad_inode(inode);
+ } else {
+ ret = -EIO;
+ EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
+ goto bad_inode;
}
- brelse (iloc.bh);
+ brelse(iloc.bh);
ext4_set_inode_flags(inode);
unlock_new_inode(inode);
return inode;
bad_inode:
+ brelse(iloc.bh);
iget_failed(inode);
return ERR_PTR(ret);
}
@@ -2877,46 +4264,36 @@ static int ext4_inode_blocks_set(handle_t *handle,
struct inode *inode = &(ei->vfs_inode);
u64 i_blocks = inode->i_blocks;
struct super_block *sb = inode->i_sb;
- int err = 0;
if (i_blocks <= ~0U) {
/*
- * i_blocks can be represnted in a 32 bit variable
+ * i_blocks can be represented in a 32 bit variable
* as multiple of 512 bytes
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = 0;
- ei->i_flags &= ~EXT4_HUGE_FILE_FL;
- } else if (i_blocks <= 0xffffffffffffULL) {
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ return 0;
+ }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ return -EFBIG;
+
+ if (i_blocks <= 0xffffffffffffULL) {
/*
* i_blocks can be represented in a 48 bit variable
* as multiple of 512 bytes
*/
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
- /* i_block is stored in the split 48 bit fields */
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
} else {
- /*
- * i_blocks should be represented in a 48 bit variable
- * as multiple of file system block size
- */
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
- ei->i_flags |= EXT4_HUGE_FILE_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
/* i_block is stored in file system block size */
i_blocks = i_blocks >> (inode->i_blkbits - 9);
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
}
-err_out:
- return err;
+ return 0;
}
/*
@@ -2933,36 +4310,42 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
+ struct super_block *sb = inode->i_sb;
int err = 0, rc, block;
+ int need_datasync = 0, set_large_file = 0;
+ uid_t i_uid;
+ gid_t i_gid;
+
+ spin_lock(&ei->i_raw_lock);
- /* For fields not not tracking in the in-memory inode,
+ /* For fields not tracked in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT4_STATE_NEW)
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
ext4_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
- if(!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
- if(!ei->i_dtime) {
+ if (!ei->i_dtime) {
raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(inode->i_uid));
+ cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(inode->i_gid));
+ cpu_to_le16(high_16_bits(i_gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
- raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(inode->i_uid));
- raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
@@ -2973,37 +4356,26 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
- if (ext4_inode_blocks_set(handle, raw_inode, ei))
+ if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
+ spin_unlock(&ei->i_raw_lock);
goto out_brelse;
+ }
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags);
- if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
- cpu_to_le32(EXT4_OS_HURD))
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (ei->i_disksize != ext4_isize(raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
- struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV)) {
- /* If this is the first large file
- * created, add a flag to the superblock.
- */
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- goto out_brelse;
- ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
+ cpu_to_le32(EXT4_GOOD_OLD_REV))
+ set_large_file = 1;
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -3017,26 +4389,45 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(new_encode_dev(inode->i_rdev));
raw_inode->i_block[2] = 0;
}
- } else for (block = 0; block < EXT4_N_BLOCKS; block++)
- raw_inode->i_block[block] = ei->i_data[block];
+ } else if (!ext4_has_inline_data(inode)) {
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ raw_inode->i_block[block] = ei->i_data[block];
+ }
- raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(inode->i_version >> 32);
- raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(inode->i_version >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
}
+ ext4_inode_csum_set(inode, raw_inode, ei);
+
+ spin_unlock(&ei->i_raw_lock);
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- rc = ext4_journal_dirty_metadata(handle, bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
- ei->i_state &= ~EXT4_STATE_NEW;
-
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+ if (set_large_file) {
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err)
+ goto out_brelse;
+ ext4_update_dynamic_rev(sb);
+ EXT4_SET_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_handle_sync(handle);
+ err = ext4_handle_dirty_super(handle, sb);
+ }
+ ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
- brelse (bh);
+ brelse(bh);
ext4_std_error(inode->i_sb, err);
return err;
}
@@ -3046,21 +4437,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
- * trasnaction to commit.
+ * transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3072,25 +4462,95 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
-int ext4_write_inode(struct inode *inode, int wait)
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (current->flags & PF_MEMALLOC)
+ int err;
+
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
- if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
- dump_stack();
- return -EIO;
+ if (EXT4_SB(inode->i_sb)->s_journal) {
+ if (ext4_journal_current_handle()) {
+ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ dump_stack();
+ return -EIO;
+ }
+
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext4_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
+ return 0;
+
+ err = ext4_force_commit(inode->i_sb);
+ } else {
+ struct ext4_iloc iloc;
+
+ err = __ext4_get_inode_loc(inode, &iloc, 0);
+ if (err)
+ return err;
+ /*
+ * sync(2) will flush the whole buffer cache. No need to do
+ * it here separately for each inode.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ sync_dirty_buffer(iloc.bh);
+ if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+ EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
+ "IO error syncing inode");
+ err = -EIO;
+ }
+ brelse(iloc.bh);
}
+ return err;
+}
- if (!wait)
- return 0;
+/*
+ * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
+ * buffers that are attached to a page stradding i_size and are undergoing
+ * commit. In that case we have to wait for commit to finish and try again.
+ */
+static void ext4_wait_for_tail_page_commit(struct inode *inode)
+{
+ struct page *page;
+ unsigned offset;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = 0;
+ int ret;
- return ext4_force_commit(inode->i_sb);
+ offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ /*
+ * All buffers in the last page remain valid? Then there's nothing to
+ * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+ * blocksize case
+ */
+ if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+ return;
+ while (1) {
+ page = find_lock_page(inode->i_mapping,
+ inode->i_size >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return;
+ ret = __ext4_journalled_invalidatepage(page, offset,
+ PAGE_CACHE_SIZE - offset);
+ unlock_page(page);
+ page_cache_release(page);
+ if (ret != -EBUSY)
+ return;
+ commit_tid = 0;
+ read_lock(&journal->j_state_lock);
+ if (journal->j_committing_transaction)
+ commit_tid = journal->j_committing_transaction->t_tid;
+ read_unlock(&journal->j_state_lock);
+ if (commit_tid)
+ jbd2_log_wait_commit(journal, commit_tid);
+ }
}
/*
@@ -3108,31 +4568,42 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * we start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
int error, rc = 0;
+ int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
error = inode_change_ok(inode, attr);
if (error)
return error;
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if (is_quota_modification(inode, attr))
+ dquot_initialize(inode);
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
- EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+ (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
}
- error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
ext4_journal_stop(handle);
return error;
@@ -3147,45 +4618,97 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_journal_stop(handle);
}
- if (attr->ia_valid & ATTR_SIZE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ handle_t *handle;
+
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (attr->ia_size > sbi->s_bitmap_maxbytes) {
- error = -EFBIG;
- goto err_out;
- }
+ if (attr->ia_size > sbi->s_bitmap_maxbytes)
+ return -EFBIG;
}
- }
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
- handle_t *handle;
+ if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+ inode_inc_iversion(inode);
- handle = ext4_journal_start(inode, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto err_out;
- }
+ if (S_ISREG(inode->i_mode) &&
+ (attr->ia_size < inode->i_size)) {
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error)
+ goto err_out;
+ }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto err_out;
+ }
+ if (ext4_handle_valid(handle)) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
+ }
+ down_write(&EXT4_I(inode)->i_data_sem);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
+ /*
+ * We have to update i_size under i_data_sem together
+ * with i_disksize to avoid races with writeback code
+ * running ext4_wb_update_i_disksize().
+ */
+ if (!error)
+ i_size_write(inode, attr->ia_size);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+ if (error) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ } else
+ i_size_write(inode, attr->ia_size);
- error = ext4_orphan_add(handle, inode);
- EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
- ext4_journal_stop(handle);
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
+ if (orphan) {
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
+ }
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, inode->i_size);
}
+ /*
+ * We want to call ext4_truncate() even if attr->ia_size ==
+ * inode->i_size for cases like truncation of fallocated space
+ */
+ if (attr->ia_valid & ATTR_SIZE)
+ ext4_truncate(inode);
- rc = inode_setattr(inode, attr);
+ if (!rc) {
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ }
- /* If inode_setattr's call to ext4_truncate failed to get a
- * transaction handle at all, we need to clean up the in-core
- * orphan list manually. */
- if (inode->i_nlink)
+ /*
+ * If the call to ext4_truncate failed to get a transaction handle at
+ * all, we need to clean up the in-core orphan list manually.
+ */
+ if (orphan && inode->i_nlink)
ext4_orphan_del(NULL, inode);
if (!rc && (ia_valid & ATTR_MODE))
- rc = ext4_acl_chmod(inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext4_std_error(inode->i_sb, error);
@@ -3194,67 +4717,142 @@ err_out:
return error;
}
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode;
+ unsigned long long delalloc_blocks;
+
+ inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+
+ /*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others doen't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(ext4_has_inline_data(inode)))
+ stat->blocks += (stat->size + 511) >> 9;
+
+ /*
+ * We can't update i_blocks if the block allocation is delayed
+ * otherwise in the case of system crash before the real block
+ * allocation is done, we will have i_blocks inconsistent with
+ * on-disk file blocks.
+ * We always keep i_blocks updated together with real
+ * allocation. But to not confuse with user, stat
+ * will return the blocks that include the delayed allocation
+ * blocks for this file.
+ */
+ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
+ EXT4_I(inode)->i_reserved_data_blocks);
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
+ return 0;
+}
+
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
+{
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return ext4_ind_trans_blocks(inode, lblocks);
+ return ext4_ext_index_trans_blocks(inode, pextents);
+}
/*
- * How many blocks doth make a writepage()?
- *
- * With N blocks per page, it may be:
- * N data blocks
- * 2 indirect block
- * 2 dindirect
- * 1 tindirect
- * N+5 bitmap blocks (from the above)
- * N+5 group descriptor summary blocks
- * 1 inode block
- * 1 superblock.
- * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
- *
- * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
- *
- * With ordered or writeback data it's the same, less the N data blocks.
- *
- * If the inode's direct blocks can hold an integral number of pages then a
- * page cannot straddle two indirect blocks, and we can only touch one indirect
- * and dindirect block, and the "5" above becomes "3".
- *
- * This still overestimates under most circumstances. If we were to pass the
- * start and end offsets in here as well we could do block_to_path() on each
- * block and work out the exact number of indirects which are touched. Pah.
+ * Account for index blocks, block groups bitmaps and block group
+ * descriptor blocks if modify datablocks and index blocks
+ * worse case, the indexs blocks spread over different block groups
+ *
+ * If datablocks are discontiguous, they are possible to spread over
+ * different block groups too. If they are contiguous, with flexbg,
+ * they could still across block group boundary.
+ *
+ * Also account for superblock, inode, quota and xattr blocks
*/
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
+{
+ ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+ int gdpblocks;
+ int idxblocks;
+ int ret = 0;
+
+ /*
+ * How many index blocks need to touch to map @lblocks logical blocks
+ * to @pextents physical extents?
+ */
+ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
+
+ ret = idxblocks;
+ /*
+ * Now let's see how many group bitmaps and group descriptors need
+ * to account
+ */
+ groups = idxblocks + pextents;
+ gdpblocks = groups;
+ if (groups > ngroups)
+ groups = ngroups;
+ if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
+ gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
+
+ /* bitmaps and block group descriptor blocks */
+ ret += groups + gdpblocks;
+
+ /* Blocks for super block, inode, quota and xattr blocks */
+ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
+
+ return ret;
+}
+
+/*
+ * Calculate the total number of credits to reserve to fit
+ * the modification of a single pages into a single transaction,
+ * which may include multiple chunks of block allocations.
+ *
+ * This could be called via ext4_write_begin()
+ *
+ * We need to consider the worse case, when
+ * one new block per extent.
+ */
int ext4_writepage_trans_blocks(struct inode *inode)
{
int bpp = ext4_journal_blocks_per_page(inode);
- int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
int ret;
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
- return ext4_ext_writepage_trans_blocks(inode, bpp);
+ ret = ext4_meta_trans_blocks(inode, bpp, bpp);
+ /* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
- ret = 3 * (bpp + indirects) + 2;
- else
- ret = 2 * (bpp + indirects) + 2;
-
-#ifdef CONFIG_QUOTA
- /* We know that structure was already allocated during DQUOT_INIT so
- * we will be updating only the data blocks + inodes */
- ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
-
+ ret += bpp;
return ret;
}
/*
+ * Calculate the journal credits for a chunk of data modification.
+ *
+ * This is called from DIO, fallocate or whoever calling
+ * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
+ *
+ * journal buffers for data blocks are not included here, as DIO
+ * and fallocate do no need to journal data buffers.
+ */
+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
+{
+ return ext4_meta_trans_blocks(inode, nrblocks, 1);
+}
+
+/*
* The caller must have previously called ext4_reserve_inode_write().
* Give this, we know that the caller already has write access to iloc->bh.
*/
int ext4_mark_iloc_dirty(handle_t *handle,
- struct inode *inode, struct ext4_iloc *iloc)
+ struct inode *inode, struct ext4_iloc *iloc)
{
int err = 0;
- if (test_opt(inode->i_sb, I_VERSION))
+ if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
/* the do_update_inode consumes one bh->b_count */
@@ -3275,16 +4873,15 @@ int
ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc)
{
- int err = 0;
- if (handle) {
- err = ext4_get_inode_loc(inode, iloc);
- if (!err) {
- BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, iloc->bh);
- if (err) {
- brelse(iloc->bh);
- iloc->bh = NULL;
- }
+ int err;
+
+ err = ext4_get_inode_loc(inode, iloc);
+ if (!err) {
+ BUFFER_TRACE(iloc->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, iloc->bh);
+ if (err) {
+ brelse(iloc->bh);
+ iloc->bh = NULL;
}
}
ext4_std_error(inode->i_sb, err);
@@ -3302,7 +4899,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
{
struct ext4_inode *raw_inode;
struct ext4_xattr_ibody_header *header;
- struct ext4_xattr_entry *entry;
if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
return 0;
@@ -3310,11 +4906,10 @@ static int ext4_expand_extra_isize(struct inode *inode,
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- entry = IFIRST(header);
/* No extended attributes present */
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
- header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
new_extra_isize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -3338,14 +4933,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
* inode out, but prune_icache isn't a user-visible syncing function.
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
*/
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
@@ -3355,9 +4942,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
int err, ret;
might_sleep();
+ trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
- !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+ if (ext4_handle_valid(handle) &&
+ EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+ !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
/*
* We need extra buffer credits since we may write into EA block
* with this same handle. If journal_extend fails, then it will
@@ -3371,10 +4960,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
sbi->s_want_extra_isize,
iloc, handle);
if (ret) {
- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ ext4_set_inode_state(inode,
+ EXT4_STATE_NO_EXPAND);
if (mnt_count !=
le16_to_cpu(sbi->s_es->s_mnt_count)) {
- ext4_warning(inode->i_sb, __FUNCTION__,
+ ext4_warning(inode->i_sb,
"Unable to expand inode %lu. Delete"
" some EAs or run e2fsck.",
inode->i_ino);
@@ -3396,31 +4986,23 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, dquot_alloc_block() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext4_dirty_inode(struct inode *inode)
+void ext4_dirty_inode(struct inode *inode, int flags)
{
- handle_t *current_handle = ext4_journal_current_handle();
handle_t *handle;
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
- if (current_handle &&
- current_handle->h_transaction != handle->h_transaction) {
- /* This task has a transaction open against a different fs */
- printk(KERN_EMERG "%s: transactions do not match!\n",
- __FUNCTION__);
- } else {
- jbd_debug(5, "marking dirty. outer handle=%p\n",
- current_handle);
- ext4_mark_inode_dirty(handle, inode);
- }
+
+ ext4_mark_inode_dirty(handle, inode);
+
ext4_journal_stop(handle);
out:
return;
@@ -3445,8 +5027,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
BUFFER_TRACE(iloc.bh, "get_write_access");
err = jbd2_journal_get_write_access(handle, iloc.bh);
if (!err)
- err = ext4_journal_dirty_metadata(handle,
- iloc.bh);
+ err = ext4_handle_dirty_metadata(handle,
+ NULL,
+ iloc.bh);
brelse(iloc.bh);
}
}
@@ -3472,11 +5055,27 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
*/
journal = EXT4_JOURNAL(inode);
+ if (!journal)
+ return 0;
if (is_journal_aborted(journal))
return -EROFS;
+ /* We have to allocate physical blocks for delalloc blocks
+ * before flushing journal. otherwise delalloc blocks can not
+ * be allocated any more. even more truncate on delalloc blocks
+ * could trigger BUG by flushing delalloc blocks in journal.
+ * There is no delalloc block in non-journal data mode.
+ */
+ if (val && test_opt(inode->i_sb, DELALLOC)) {
+ err = ext4_alloc_da_blocks(inode);
+ if (err < 0)
+ return err;
+ }
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
/*
* OK, there are no updates running now, and all cached data is
@@ -3487,23 +5086,119 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
*/
if (val)
- EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
- else
- EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ else {
+ jbd2_journal_flush(journal);
+ ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ }
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
err = ext4_mark_inode_dirty(handle, inode);
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
ext4_journal_stop(handle);
ext4_std_error(inode->i_sb, err);
return err;
}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ loff_t size;
+ unsigned long len;
+ int ret;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ handle_t *handle;
+ get_block_t *get_block;
+ int retries = 0;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ /* Delalloc case is easy... */
+ if (test_opt(inode->i_sb, DELALLOC) &&
+ !ext4_should_journal_data(inode) &&
+ !ext4_nonda_switch(inode->i_sb)) {
+ do {
+ ret = __block_page_mkwrite(vma, vmf,
+ ext4_da_get_block_prep);
+ } while (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries));
+ goto out_ret;
+ }
+
+ lock_page(page);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (page->mapping != mapping || page_offset(page) > size) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ /*
+ * Return if we have all the buffers mapped. This avoids the need to do
+ * journal_start/journal_stop which can block and take a long time
+ */
+ if (page_has_buffers(page)) {
+ if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ 0, len, NULL,
+ ext4_bh_unmapped)) {
+ /* Wait so that we don't change page under IO */
+ wait_for_stable_page(page);
+ ret = VM_FAULT_LOCKED;
+ goto out;
+ }
+ }
+ unlock_page(page);
+ /* OK, we need to fill the hole... */
+ if (ext4_should_dioread_nolock(inode))
+ get_block = ext4_get_block_write;
+ else
+ get_block = ext4_get_block;
+retry_alloc:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ ret = __block_page_mkwrite(vma, vmf, get_block);
+ if (!ret && ext4_should_journal_data(inode)) {
+ if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ }
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_alloc;
+out_ret:
+ ret = block_page_mkwrite_return(ret);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}