aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c5427
1 files changed, 2378 insertions, 3049 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bdbe6990220..8a064734e6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -12,17 +12,12 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
- * Goal-directed block allocation by Stephen Tweedie
- * (sct@redhat.com), 1993, 1998
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
* 64-bit file support on 64-bit platforms by Jakub Jelinek
* (jj@sunsite.ms.mff.cuni.cz)
*
* Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
*/
-#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
@@ -39,110 +34,124 @@
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
+#include <linux/printk.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/aio.h>
+#include <linux/bitops.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "ext4_extents.h"
+#include "truncate.h"
#include <trace/events/ext4.h>
#define MPAGE_DA_EXTENT_TAIL 0x01
-static inline int ext4_begin_ordered_truncate(struct inode *inode,
- loff_t new_size)
+static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- trace_ext4_begin_ordered_truncate(inode, new_size);
- return jbd2_journal_begin_ordered_truncate(
- EXT4_SB(inode->i_sb)->s_journal,
- &EXT4_I(inode)->jinode,
- new_size);
-}
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u16 csum_lo;
+ __u16 csum_hi = 0;
+ __u32 csum;
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
-static int __ext4_journalled_writepage(struct page *page, unsigned int len);
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+ csum_lo = le16_to_cpu(raw->i_checksum_lo);
+ raw->i_checksum_lo = 0;
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+ csum_hi = le16_to_cpu(raw->i_checksum_hi);
+ raw->i_checksum_hi = 0;
+ }
-/*
- * Test whether an inode is a fast symlink.
- */
-static int ext4_inode_is_fast_symlink(struct inode *inode)
-{
- int ea_blocks = EXT4_I(inode)->i_file_acl ?
- (inode->i_sb->s_blocksize >> 9) : 0;
+ csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
+ EXT4_INODE_SIZE(inode->i_sb));
- return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
+ raw->i_checksum_lo = cpu_to_le16(csum_lo);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = cpu_to_le16(csum_hi);
+
+ return csum;
}
-/*
- * Work out how many blocks we need to proceed with the next chunk of a
- * truncate transaction.
- */
-static unsigned long blocks_for_truncate(struct inode *inode)
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- ext4_lblk_t needed;
-
- needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+ __u32 provided, calculated;
- /* Give ourselves just enough room to cope with inodes in which
- * i_blocks is corrupt: we've seen disk corruptions in the past
- * which resulted in random data in an inode which looked enough
- * like a regular file for ext4 to try to delete it. Things
- * will go a bit crazy if that happens, but at least we should
- * try not to panic the whole kernel. */
- if (needed < 2)
- needed = 2;
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return 1;
- /* But we need to bound the transaction so we don't overflow the
- * journal. */
- if (needed > EXT4_MAX_TRANS_DATA)
- needed = EXT4_MAX_TRANS_DATA;
+ provided = le16_to_cpu(raw->i_checksum_lo);
+ calculated = ext4_inode_csum(inode, raw, ei);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
+ else
+ calculated &= 0xFFFF;
- return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+ return provided == calculated;
}
-/*
- * Truncate transactions can be complex and absolutely huge. So we need to
- * be able to restart the transaction at a conventient checkpoint to make
- * sure we don't overflow the journal.
- *
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
+static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
- handle_t *result;
+ __u32 csum;
- result = ext4_journal_start(inode, blocks_for_truncate(inode));
- if (!IS_ERR(result))
- return result;
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_LINUX) ||
+ !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ return;
- ext4_std_error(inode->i_sb, PTR_ERR(result));
- return result;
+ csum = ext4_inode_csum(inode, raw, ei);
+ raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+ raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ trace_ext4_begin_ordered_truncate(inode, new_size);
+ /*
+ * If jinode is zero, then we never opened the file for
+ * writing, so there's no need to call
+ * jbd2_journal_begin_ordered_truncate() since there's no
+ * outstanding writes we need to flush.
+ */
+ if (!EXT4_I(inode)->jinode)
+ return 0;
+ return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+ EXT4_I(inode)->jinode,
+ new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
+
/*
- * Try to extend this transaction for the purposes of truncation.
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * Test whether an inode is a fast symlink.
*/
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+static int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (!ext4_handle_valid(handle))
- return 0;
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
+ int ea_blocks = EXT4_I(inode)->i_file_acl ?
+ EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
+
+ if (ext4_has_inline_data(inode))
return 0;
- return 1;
+
+ return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
/*
@@ -164,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
up_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+ ret = ext4_journal_restart(handle, nblocks);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
@@ -180,8 +189,38 @@ void ext4_evict_inode(struct inode *inode)
int err;
trace_ext4_evict_inode(inode);
+
if (inode->i_nlink) {
- truncate_inode_pages(&inode->i_data, 0);
+ /*
+ * When journalling data dirty buffers are tracked only in the
+ * journal. So although mm thinks everything is clean and
+ * ready for reaping the inode might still have some pages to
+ * write in the running transaction or waiting to be
+ * checkpointed. Thus calling jbd2_journal_invalidatepage()
+ * (via truncate_inode_pages()) to discard these buffers can
+ * cause data loss. Also even if we did not discard these
+ * buffers, we would have no way to find them after the inode
+ * is reaped and thus user could see stale data if he tries to
+ * read them before the transaction is checkpointed. So be
+ * careful and force everything to disk here... We use
+ * ei->i_datasync_tid to store the newest transaction
+ * containing inode's data.
+ *
+ * Note that directories do not have this problem because they
+ * don't use page cache.
+ */
+ if (ext4_should_journal_data(inode) &&
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_ino != EXT4_JOURNAL_INO) {
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
+
+ jbd2_complete_transaction(journal, commit_tid);
+ filemap_write_and_wait(&inode->i_data);
+ }
+ truncate_inode_pages_final(&inode->i_data);
+
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
}
@@ -190,12 +229,19 @@ void ext4_evict_inode(struct inode *inode)
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
goto no_delete;
- handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
+ /*
+ * Protect us against freezing - iput() caller didn't have to have any
+ * protection against it
+ */
+ sb_start_intwrite(inode->i_sb);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+ ext4_blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -204,6 +250,7 @@ void ext4_evict_inode(struct inode *inode)
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
@@ -235,6 +282,7 @@ void ext4_evict_inode(struct inode *inode)
stop_handle:
ext4_journal_stop(handle);
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
}
@@ -263,786 +311,12 @@ void ext4_evict_inode(struct inode *inode)
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
+ sb_end_intwrite(inode->i_sb);
return;
no_delete:
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
-typedef struct {
- __le32 *p;
- __le32 key;
- struct buffer_head *bh;
-} Indirect;
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
-{
- p->key = *(p->p = v);
- p->bh = bh;
-}
-
-/**
- * ext4_block_to_path - parse the block number into array of offsets
- * @inode: inode in question (we are only interested in its superblock)
- * @i_block: block number to be parsed
- * @offsets: array to store the offsets in
- * @boundary: set this non-zero if the referred-to block is likely to be
- * followed (on disk) by an indirect block.
- *
- * To store the locations of file's data ext4 uses a data structure common
- * for UNIX filesystems - tree of pointers anchored in the inode, with
- * data blocks at leaves and indirect blocks in intermediate nodes.
- * This function translates the block number into path in that tree -
- * return value is the path length and @offsets[n] is the offset of
- * pointer to (n+1)th node in the nth one. If @block is out of range
- * (negative or too large) warning is printed and zero returned.
- *
- * Note: function doesn't find node addresses, so no IO is needed. All
- * we need to know is the capacity of indirect blocks (taken from the
- * inode->i_sb).
- */
-
-/*
- * Portability note: the last comparison (check that we fit into triple
- * indirect block) is spelled differently, because otherwise on an
- * architecture with 32-bit longs and 8Kb pages we might get into trouble
- * if our filesystem had 8Kb blocks. We might use long long, but that would
- * kill us on x86. Oh, well, at least the sign propagation does not matter -
- * i_block would have to be negative in the very beginning, so we would not
- * get there at all.
- */
-
-static int ext4_block_to_path(struct inode *inode,
- ext4_lblk_t i_block,
- ext4_lblk_t offsets[4], int *boundary)
-{
- int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
- const long direct_blocks = EXT4_NDIR_BLOCKS,
- indirect_blocks = ptrs,
- double_blocks = (1 << (ptrs_bits * 2));
- int n = 0;
- int final = 0;
-
- if (i_block < direct_blocks) {
- offsets[n++] = i_block;
- final = direct_blocks;
- } else if ((i_block -= direct_blocks) < indirect_blocks) {
- offsets[n++] = EXT4_IND_BLOCK;
- offsets[n++] = i_block;
- final = ptrs;
- } else if ((i_block -= indirect_blocks) < double_blocks) {
- offsets[n++] = EXT4_DIND_BLOCK;
- offsets[n++] = i_block >> ptrs_bits;
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
- offsets[n++] = EXT4_TIND_BLOCK;
- offsets[n++] = i_block >> (ptrs_bits * 2);
- offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else {
- ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
- i_block + direct_blocks +
- indirect_blocks + double_blocks, inode->i_ino);
- }
- if (boundary)
- *boundary = final - 1 - (i_block & (ptrs - 1));
- return n;
-}
-
-static int __ext4_check_blockref(const char *function, unsigned int line,
- struct inode *inode,
- __le32 *p, unsigned int max)
-{
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
- __le32 *bref = p;
- unsigned int blk;
-
- while (bref < p+max) {
- blk = le32_to_cpu(*bref++);
- if (blk &&
- unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- blk, 1))) {
- es->s_last_error_block = cpu_to_le64(blk);
- ext4_error_inode(inode, function, line, blk,
- "invalid block");
- return -EIO;
- }
- }
- return 0;
-}
-
-
-#define ext4_check_indirect_blockref(inode, bh) \
- __ext4_check_blockref(__func__, __LINE__, inode, \
- (__le32 *)(bh)->b_data, \
- EXT4_ADDR_PER_BLOCK((inode)->i_sb))
-
-#define ext4_check_inode_blockref(inode) \
- __ext4_check_blockref(__func__, __LINE__, inode, \
- EXT4_I(inode)->i_data, \
- EXT4_NDIR_BLOCKS)
-
-/**
- * ext4_get_branch - read the chain of indirect blocks leading to data
- * @inode: inode in question
- * @depth: depth of the chain (1 - direct pointer, etc.)
- * @offsets: offsets of pointers in inode/indirect blocks
- * @chain: place to store the result
- * @err: here we store the error value
- *
- * Function fills the array of triples <key, p, bh> and returns %NULL
- * if everything went OK or the pointer to the last filled triple
- * (incomplete one) otherwise. Upon the return chain[i].key contains
- * the number of (i+1)-th block in the chain (as it is stored in memory,
- * i.e. little-endian 32-bit), chain[i].p contains the address of that
- * number (it points into struct inode for i==0 and into the bh->b_data
- * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
- * block for i>0 and NULL for i==0. In other words, it holds the block
- * numbers of the chain, addresses they were taken from (and where we can
- * verify that chain did not change) and buffer_heads hosting these
- * numbers.
- *
- * Function stops when it stumbles upon zero pointer (absent block)
- * (pointer to last triple returned, *@err == 0)
- * or when it gets an IO error reading an indirect block
- * (ditto, *@err == -EIO)
- * or when it reads all @depth-1 indirect blocks successfully and finds
- * the whole chain, all way to the data (returns %NULL, *err == 0).
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem)
- */
-static Indirect *ext4_get_branch(struct inode *inode, int depth,
- ext4_lblk_t *offsets,
- Indirect chain[4], int *err)
-{
- struct super_block *sb = inode->i_sb;
- Indirect *p = chain;
- struct buffer_head *bh;
-
- *err = 0;
- /* i_data is not going away, no lock needed */
- add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
- if (!p->key)
- goto no_block;
- while (--depth) {
- bh = sb_getblk(sb, le32_to_cpu(p->key));
- if (unlikely(!bh))
- goto failure;
-
- if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
- put_bh(bh);
- goto failure;
- }
- /* validate block references */
- if (ext4_check_indirect_blockref(inode, bh)) {
- put_bh(bh);
- goto failure;
- }
- }
-
- add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
- /* Reader: end */
- if (!p->key)
- goto no_block;
- }
- return NULL;
-
-failure:
- *err = -EIO;
-no_block:
- return p;
-}
-
-/**
- * ext4_find_near - find a place for allocation with sufficient locality
- * @inode: owner
- * @ind: descriptor of indirect block.
- *
- * This function returns the preferred place for block allocation.
- * It is used when heuristic for sequential allocation fails.
- * Rules are:
- * + if there is a block to the left of our position - allocate near it.
- * + if pointer will live in indirect block - allocate near that block.
- * + if pointer will live in inode - allocate in the same
- * cylinder group.
- *
- * In the latter case we colour the starting block by the callers PID to
- * prevent it from clashing with concurrent allocations for a different inode
- * in the same block group. The PID is used here so that functionally related
- * files will be close-by on-disk.
- *
- * Caller must make sure that @ind is valid and will stay that way.
- */
-static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
- __le32 *p;
- ext4_fsblk_t bg_start;
- ext4_fsblk_t last_block;
- ext4_grpblk_t colour;
- ext4_group_t block_group;
- int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
-
- /* Try to find previous block */
- for (p = ind->p - 1; p >= start; p--) {
- if (*p)
- return le32_to_cpu(*p);
- }
-
- /* No such thing, so let's try location of indirect block */
- if (ind->bh)
- return ind->bh->b_blocknr;
-
- /*
- * It is going to be referred to from the inode itself? OK, just put it
- * into the same cylinder group then.
- */
- block_group = ei->i_block_group;
- if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
- block_group &= ~(flex_size-1);
- if (S_ISREG(inode->i_mode))
- block_group++;
- }
- bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
- last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
-
- /*
- * If we are doing delayed allocation, we don't need take
- * colour into account.
- */
- if (test_opt(inode->i_sb, DELALLOC))
- return bg_start;
-
- if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
- colour = (current->pid % 16) *
- (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
- else
- colour = (current->pid % 16) * ((last_block - bg_start) / 16);
- return bg_start + colour;
-}
-
-/**
- * ext4_find_goal - find a preferred place for allocation.
- * @inode: owner
- * @block: block we want
- * @partial: pointer to the last triple within a chain
- *
- * Normally this function find the preferred place for block allocation,
- * returns it.
- * Because this is only used for non-extent files, we limit the block nr
- * to 32 bits.
- */
-static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
- Indirect *partial)
-{
- ext4_fsblk_t goal;
-
- /*
- * XXX need to get goal block from mballoc's data structures
- */
-
- goal = ext4_find_near(inode, partial);
- goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
- return goal;
-}
-
-/**
- * ext4_blks_to_allocate: Look up the block map and count the number
- * of direct blocks need to be allocated for the given branch.
- *
- * @branch: chain of indirect blocks
- * @k: number of blocks need for indirect blocks
- * @blks: number of data blocks to be mapped.
- * @blocks_to_boundary: the offset in the indirect block
- *
- * return the total number of blocks to be allocate, including the
- * direct and indirect blocks.
- */
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
- int blocks_to_boundary)
-{
- unsigned int count = 0;
-
- /*
- * Simple case, [t,d]Indirect block(s) has not allocated yet
- * then it's clear blocks on that path have not allocated
- */
- if (k > 0) {
- /* right now we don't handle cross boundary allocation */
- if (blks < blocks_to_boundary + 1)
- count += blks;
- else
- count += blocks_to_boundary + 1;
- return count;
- }
-
- count++;
- while (count < blks && count <= blocks_to_boundary &&
- le32_to_cpu(*(branch[0].p + count)) == 0) {
- count++;
- }
- return count;
-}
-
-/**
- * ext4_alloc_blocks: multiple allocate blocks needed for a branch
- * @indirect_blks: the number of blocks need to allocate for indirect
- * blocks
- *
- * @new_blocks: on return it will store the new block numbers for
- * the indirect blocks(if needed) and the first direct block,
- * @blks: on return it will store the total number of allocated
- * direct blocks
- */
-static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, ext4_fsblk_t goal,
- int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
-{
- struct ext4_allocation_request ar;
- int target, i;
- unsigned long count = 0, blk_allocated = 0;
- int index = 0;
- ext4_fsblk_t current_block = 0;
- int ret = 0;
-
- /*
- * Here we try to allocate the requested multiple blocks at once,
- * on a best-effort basis.
- * To build a branch, we should allocate blocks for
- * the indirect blocks(if not allocated yet), and at least
- * the first direct block of this branch. That's the
- * minimum number of blocks need to allocate(required)
- */
- /* first we try to allocate the indirect blocks */
- target = indirect_blks;
- while (target > 0) {
- count = target;
- /* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_meta_blocks(handle, inode,
- goal, &count, err);
- if (*err)
- goto failed_out;
-
- if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
- EXT4_ERROR_INODE(inode,
- "current_block %llu + count %lu > %d!",
- current_block, count,
- EXT4_MAX_BLOCK_FILE_PHYS);
- *err = -EIO;
- goto failed_out;
- }
-
- target -= count;
- /* allocate blocks for indirect blocks */
- while (index < indirect_blks && count) {
- new_blocks[index++] = current_block++;
- count--;
- }
- if (count > 0) {
- /*
- * save the new block number
- * for the first direct block
- */
- new_blocks[index] = current_block;
- printk(KERN_INFO "%s returned more blocks than "
- "requested\n", __func__);
- WARN_ON(1);
- break;
- }
- }
-
- target = blks - count ;
- blk_allocated = count;
- if (!target)
- goto allocated;
- /* Now allocate data blocks */
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = target;
- ar.logical = iblock;
- if (S_ISREG(inode->i_mode))
- /* enable in-core preallocation only for regular files */
- ar.flags = EXT4_MB_HINT_DATA;
-
- current_block = ext4_mb_new_blocks(handle, &ar, err);
- if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
- EXT4_ERROR_INODE(inode,
- "current_block %llu + ar.len %d > %d!",
- current_block, ar.len,
- EXT4_MAX_BLOCK_FILE_PHYS);
- *err = -EIO;
- goto failed_out;
- }
-
- if (*err && (target == blks)) {
- /*
- * if the allocation failed and we didn't allocate
- * any blocks before
- */
- goto failed_out;
- }
- if (!*err) {
- if (target == blks) {
- /*
- * save the new block number
- * for the first direct block
- */
- new_blocks[index] = current_block;
- }
- blk_allocated += ar.len;
- }
-allocated:
- /* total number of blocks allocated for direct blocks */
- ret = blk_allocated;
- *err = 0;
- return ret;
-failed_out:
- for (i = 0; i < index; i++)
- ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
- return ret;
-}
-
-/**
- * ext4_alloc_branch - allocate and set up a chain of blocks.
- * @inode: owner
- * @indirect_blks: number of allocated indirect blocks
- * @blks: number of allocated direct blocks
- * @offsets: offsets (in the blocks) to store the pointers to next.
- * @branch: place to store the chain in.
- *
- * This function allocates blocks, zeroes out all but the last one,
- * links them into chain and (if we are synchronous) writes them to disk.
- * In other words, it prepares a branch that can be spliced onto the
- * inode. It stores the information about that chain in the branch[], in
- * the same format as ext4_get_branch() would do. We are calling it after
- * we had read the existing part of chain and partial points to the last
- * triple of that (one with zero ->key). Upon the exit we have the same
- * picture as after the successful ext4_get_block(), except that in one
- * place chain is disconnected - *branch->p is still zero (we did not
- * set the last link), but branch->key contains the number that should
- * be placed into *branch->p to fill that gap.
- *
- * If allocation fails we free all blocks we've allocated (and forget
- * their buffer_heads) and return the error value the from failed
- * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
- * as described above and return 0.
- */
-static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, int indirect_blks,
- int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
-{
- int blocksize = inode->i_sb->s_blocksize;
- int i, n = 0;
- int err = 0;
- struct buffer_head *bh;
- int num;
- ext4_fsblk_t new_blocks[4];
- ext4_fsblk_t current_block;
-
- num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
- *blks, new_blocks, &err);
- if (err)
- return err;
-
- branch[0].key = cpu_to_le32(new_blocks[0]);
- /*
- * metadata blocks and data blocks are allocated.
- */
- for (n = 1; n <= indirect_blks; n++) {
- /*
- * Get buffer_head for parent block, zero it out
- * and set the pointer to new one, then send
- * parent to disk.
- */
- bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
- if (unlikely(!bh)) {
- err = -EIO;
- goto failed;
- }
-
- branch[n].bh = bh;
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
- if (err) {
- /* Don't brelse(bh) here; it's done in
- * ext4_journal_forget() below */
- unlock_buffer(bh);
- goto failed;
- }
-
- memset(bh->b_data, 0, blocksize);
- branch[n].p = (__le32 *) bh->b_data + offsets[n];
- branch[n].key = cpu_to_le32(new_blocks[n]);
- *branch[n].p = branch[n].key;
- if (n == indirect_blks) {
- current_block = new_blocks[n];
- /*
- * End of chain, update the last new metablock of
- * the chain to point to the new allocated
- * data blocks numbers
- */
- for (i = 1; i < num; i++)
- *(branch[n].p + i) = cpu_to_le32(++current_block);
- }
- BUFFER_TRACE(bh, "marking uptodate");
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
-
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- if (err)
- goto failed;
- }
- *blks = num;
- return err;
-failed:
- /* Allocation failed, free what we already allocated */
- ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
- for (i = 1; i <= n ; i++) {
- /*
- * branch[i].bh is newly allocated, so there is no
- * need to revoke the block, which is why we don't
- * need to set EXT4_FREE_BLOCKS_METADATA.
- */
- ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
- EXT4_FREE_BLOCKS_FORGET);
- }
- for (i = n+1; i < indirect_blks; i++)
- ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
-
- ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
-
- return err;
-}
-
-/**
- * ext4_splice_branch - splice the allocated branch onto inode.
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- * ext4_alloc_branch)
- * @where: location of missing link
- * @num: number of indirect blocks we are adding
- * @blks: number of direct blocks we are adding
- *
- * This function fills the missing link and does all housekeeping needed in
- * inode (->i_blocks, etc.). In case of success we end up with the full
- * chain to new block and return 0.
- */
-static int ext4_splice_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, Indirect *where, int num,
- int blks)
-{
- int i;
- int err = 0;
- ext4_fsblk_t current_block;
-
- /*
- * If we're splicing into a [td]indirect block (as opposed to the
- * inode) then we need to get write access to the [td]indirect block
- * before the splice.
- */
- if (where->bh) {
- BUFFER_TRACE(where->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, where->bh);
- if (err)
- goto err_out;
- }
- /* That's it */
-
- *where->p = where->key;
-
- /*
- * Update the host buffer_head or inode to point to more just allocated
- * direct blocks blocks
- */
- if (num == 0 && blks > 1) {
- current_block = le32_to_cpu(where->key) + 1;
- for (i = 1; i < blks; i++)
- *(where->p + i) = cpu_to_le32(current_block++);
- }
-
- /* We are done with atomic stuff, now do the rest of housekeeping */
- /* had we spliced it onto indirect block? */
- if (where->bh) {
- /*
- * If we spliced it onto an indirect block, we haven't
- * altered the inode. Note however that if it is being spliced
- * onto an indirect block at the very end of the file (the
- * file is growing) then we *will* alter the inode to reflect
- * the new i_size. But that is not done here - it is done in
- * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
- */
- jbd_debug(5, "splicing indirect only\n");
- BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, where->bh);
- if (err)
- goto err_out;
- } else {
- /*
- * OK, we spliced it into the inode itself on a direct block.
- */
- ext4_mark_inode_dirty(handle, inode);
- jbd_debug(5, "splicing direct\n");
- }
- return err;
-
-err_out:
- for (i = 1; i <= num; i++) {
- /*
- * branch[i].bh is newly allocated, so there is no
- * need to revoke the block, which is why we don't
- * need to set EXT4_FREE_BLOCKS_METADATA.
- */
- ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
- EXT4_FREE_BLOCKS_FORGET);
- }
- ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
- blks, 0);
-
- return err;
-}
-
-/*
- * The ext4_ind_map_blocks() function handles non-extents inodes
- * (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_map_blocks().
- *
- * Allocation strategy is simple: if we have to allocate something, we will
- * have to go the whole way to leaf. So let's do it before attaching anything
- * to tree, set linkage between the newborn blocks, write them if sync is
- * required, recheck the path, free and repeat if check fails, otherwise
- * set the last missing link (that will protect us from any truncate-generated
- * removals - all blocks on the path are immune now) and possibly force the
- * write on the parent block.
- * That has a nice additional property: no special recovery from the failed
- * allocations is needed - we simply release blocks and do not touch anything
- * reachable from inode.
- *
- * `handle' can be NULL if create == 0.
- *
- * return > 0, # of blocks mapped or allocated.
- * return = 0, if plain lookup failed.
- * return < 0, error case.
- *
- * The ext4_ind_get_blocks() function should be called with
- * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
- * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
- * blocks.
- */
-static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map,
- int flags)
-{
- int err = -EIO;
- ext4_lblk_t offsets[4];
- Indirect chain[4];
- Indirect *partial;
- ext4_fsblk_t goal;
- int indirect_blks;
- int blocks_to_boundary = 0;
- int depth;
- int count = 0;
- ext4_fsblk_t first_block = 0;
-
- J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
- J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
- depth = ext4_block_to_path(inode, map->m_lblk, offsets,
- &blocks_to_boundary);
-
- if (depth == 0)
- goto out;
-
- partial = ext4_get_branch(inode, depth, offsets, chain, &err);
-
- /* Simplest case - block found, no allocation needed */
- if (!partial) {
- first_block = le32_to_cpu(chain[depth - 1].key);
- count++;
- /*map more blocks*/
- while (count < map->m_len && count <= blocks_to_boundary) {
- ext4_fsblk_t blk;
-
- blk = le32_to_cpu(*(chain[depth-1].p + count));
-
- if (blk == first_block + count)
- count++;
- else
- break;
- }
- goto got_it;
- }
-
- /* Next simple case - plain lookup or failed read of indirect block */
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
- goto cleanup;
-
- /*
- * Okay, we need to do block allocation.
- */
- goal = ext4_find_goal(inode, map->m_lblk, partial);
-
- /* the number of blocks need to allocate for [d,t]indirect blocks */
- indirect_blks = (chain + depth) - partial - 1;
-
- /*
- * Next look up the indirect map to count the totoal number of
- * direct blocks to allocate for this branch.
- */
- count = ext4_blks_to_allocate(partial, indirect_blks,
- map->m_len, blocks_to_boundary);
- /*
- * Block out ext4_truncate while we alter the tree
- */
- err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
- &count, goal,
- offsets + (partial - chain), partial);
-
- /*
- * The ext4_splice_branch call will free and forget any buffers
- * on the new chain if there is a failure, but that risks using
- * up transaction credits, especially for bitmaps where the
- * credits cannot be returned. Can we handle this somehow? We
- * may need to return -EAGAIN upwards in the worst case. --sct
- */
- if (!err)
- err = ext4_splice_branch(handle, inode, map->m_lblk,
- partial, indirect_blks, count);
- if (err)
- goto cleanup;
-
- map->m_flags |= EXT4_MAP_NEW;
-
- ext4_update_inode_fsync_trans(handle, inode, 1);
-got_it:
- map->m_flags |= EXT4_MAP_MAPPED;
- map->m_pblk = le32_to_cpu(chain[depth-1].key);
- map->m_len = count;
- if (count > blocks_to_boundary)
- map->m_flags |= EXT4_MAP_BOUNDARY;
- err = count;
- /* Clean up and exit */
- partial = chain + depth - 1; /* the whole chain */
-cleanup:
- while (partial > chain) {
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- partial--;
- }
-out:
- return err;
-}
-
#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
@@ -1052,41 +326,14 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
/*
* Calculate the number of metadata blocks need to reserve
- * to allocate a new block at @lblocks for non extent file based file
- */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode,
- sector_t lblock)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
- int blk_bits;
-
- if (lblock < EXT4_NDIR_BLOCKS)
- return 0;
-
- lblock -= EXT4_NDIR_BLOCKS;
-
- if (ei->i_da_metadata_calc_len &&
- (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
- ei->i_da_metadata_calc_len++;
- return 0;
- }
- ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
- ei->i_da_metadata_calc_len = 1;
- blk_bits = order_base_2(lblock);
- return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
-}
-
-/*
- * Calculate the number of metadata blocks need to reserve
* to allocate a block located at @lblock
*/
-static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return ext4_ext_calc_metadata_amount(inode, lblock);
- return ext4_indirect_calc_metadata_amount(inode, lblock);
+ return ext4_ind_calc_metadata_amount(inode, lblock);
}
/*
@@ -1100,20 +347,31 @@ void ext4_da_update_reserve_space(struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
spin_lock(&ei->i_block_reservation_lock);
- trace_ext4_da_update_reserve_space(inode, used);
+ trace_ext4_da_update_reserve_space(inode, used, quota_claim);
if (unlikely(used > ei->i_reserved_data_blocks)) {
- ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
- "with only %d reserved data blocks\n",
+ ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
+ "with only %d reserved data blocks",
__func__, inode->i_ino, used,
ei->i_reserved_data_blocks);
WARN_ON(1);
used = ei->i_reserved_data_blocks;
}
+ if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
+ ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+ "with only %d reserved metadata blocks "
+ "(releasing %d blocks with reserved %d data blocks)",
+ inode->i_ino, ei->i_allocated_meta_blocks,
+ ei->i_reserved_meta_blocks, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
+ }
+
/* Update per-inode reservations */
ei->i_reserved_data_blocks -= used;
ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
used + ei->i_allocated_meta_blocks);
ei->i_allocated_meta_blocks = 0;
@@ -1123,7 +381,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
* only when we have written all of the delayed
* allocation blocks.
*/
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
@@ -1132,14 +390,14 @@ void ext4_da_update_reserve_space(struct inode *inode,
/* Update quota subsystem for data blocks */
if (quota_claim)
- dquot_claim_block(inode, used);
+ dquot_claim_block(inode, EXT4_C2B(sbi, used));
else {
/*
* We did fallocate with an offset that is already delayed
* allocated. So on delayed allocated writeback we should
* not re-claim the quota for fallocated blocks.
*/
- dquot_release_reservation_block(inode, used);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
}
/*
@@ -1170,65 +428,57 @@ static int __check_block_validity(struct inode *inode, const char *func,
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
- unsigned int max_pages)
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_map_blocks_es_recheck(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *es_map,
+ struct ext4_map_blocks *map,
+ int flags)
{
- struct address_space *mapping = inode->i_mapping;
- pgoff_t index;
- struct pagevec pvec;
- pgoff_t num = 0;
- int i, nr_pages, done = 0;
+ int retval;
- if (max_pages == 0)
- return 0;
- pagevec_init(&pvec, 0);
- while (!done) {
- index = idx;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- (pgoff_t)PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
+ map->m_flags = 0;
+ /*
+ * There is a race window that the result is not the same.
+ * e.g. xfstests #223 when dioread_nolock enables. The reason
+ * is that we lookup a block mapping in extent status tree with
+ * out taking i_data_sem. So at the time the unwritten extent
+ * could be converted.
+ */
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
+ } else {
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
+ }
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
+ /*
+ * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
+ * because it shouldn't be marked in es_map->m_flags.
+ */
+ map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
- lock_page(page);
- if (unlikely(page->mapping != mapping) ||
- !PageDirty(page) ||
- PageWriteback(page) ||
- page->index != idx) {
- done = 1;
- unlock_page(page);
- break;
- }
- if (page_has_buffers(page)) {
- bh = head = page_buffers(page);
- do {
- if (!buffer_delay(bh) &&
- !buffer_unwritten(bh))
- done = 1;
- bh = bh->b_this_page;
- } while (!done && (bh != head));
- }
- unlock_page(page);
- if (done)
- break;
- idx++;
- num++;
- if (num >= max_pages) {
- done = 1;
- break;
- }
- }
- pagevec_release(&pvec);
+ /*
+ * We don't check m_len because extent will be collpased in status
+ * tree. So the m_len might not equal.
+ */
+ if (es_map->m_lblk != map->m_lblk ||
+ es_map->m_flags != map->m_flags ||
+ es_map->m_pblk != map->m_pblk) {
+ printk("ES cache assertion failed for inode: %lu "
+ "es_cached ex [%d/%d/%llu/%x] != "
+ "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
+ inode->i_ino, es_map->m_lblk, es_map->m_len,
+ es_map->m_pblk, es_map->m_flags, map->m_lblk,
+ map->m_len, map->m_pblk, map->m_flags,
+ retval, flags);
}
- return num;
}
+#endif /* ES_AGGRESSIVE_TEST */
/*
* The ext4_map_blocks() function tries to look up the requested blocks,
@@ -1242,39 +492,108 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocate.
- * if create==0 and the blocks are pre-allocated and uninitialized block,
+ * On success, it returns the number of blocks being mapped or allocated.
+ * if create==0 and the blocks are pre-allocated and unwritten block,
* the result buffer head is unmapped. If the create ==1, it will make sure
* the buffer head is mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
- * that casem, buffer head is unmapped
+ * that case, buffer head is unmapped
*
* It returns the error in case of allocation failure.
*/
int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
+ struct extent_status es;
int retval;
+ int ret = 0;
+#ifdef ES_AGGRESSIVE_TEST
+ struct ext4_map_blocks orig_map;
+
+ memcpy(&orig_map, map, sizeof(*map));
+#endif
map->m_flags = 0;
ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+
+ /*
+ * ext4_map_blocks returns an int, and m_len is an unsigned int
+ */
+ if (unlikely(map->m_len > INT_MAX))
+ map->m_len = INT_MAX;
+
+ /* We can handle the block number less than EXT_MAX_BLOCKS */
+ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+ return -EIO;
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lru_add(inode);
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) +
+ map->m_lblk - es.es_lblk;
+ map->m_flags |= ext4_es_is_written(&es) ?
+ EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+ retval = 0;
+ } else {
+ BUG_ON(1);
+ }
+#ifdef ES_AGGRESSIVE_TEST
+ ext4_map_blocks_es_recheck(handle, inode, map,
+ &orig_map, flags);
+#endif
+ goto found;
+ }
+
/*
* Try to see if we can get the block without requesting a new
* file system block.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
} else {
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
}
- up_read((&EXT4_I(inode)->i_data_sem));
+ if (retval > 0) {
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len, map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -1287,31 +606,31 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Returns if the blocks have already allocated
*
* Note that if blocks have been preallocated
- * ext4_ext_get_block() returns th create = 0
+ * ext4_ext_get_block() returns the create = 0
* with buffer head unmapped.
*/
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
- return retval;
+ /*
+ * If we need to convert extent to unwritten
+ * we continue and do the actual work in
+ * ext4_ext_map_blocks()
+ */
+ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+ return retval;
/*
- * When we call get_blocks without the create flag, the
- * BH_Unwritten flag could have gotten set if the blocks
- * requested were part of a uninitialized extent. We need to
- * clear this flag now that we are committed to convert all or
- * part of the uninitialized extent to be an initialized
- * extent. This is because we need to avoid the combination
- * of BH_Unwritten and BH_Mapped flags being simultaneously
- * set on the buffer_head.
+ * Here we clear m_flags because after allocating an new extent,
+ * it will be set again.
*/
- map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+ map->m_flags &= ~EXT4_MAP_FLAGS;
/*
- * New blocks allocate and/or writing to uninitialized extent
+ * New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_blocks()
* with create == 1 flag.
*/
- down_write((&EXT4_I(inode)->i_data_sem));
+ down_write(&EXT4_I(inode)->i_data_sem);
/*
* if the caller is from delayed allocation writeout path
@@ -1320,7 +639,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* avoid double accounting
*/
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- EXT4_I(inode)->i_delalloc_reserved_flag = 1;
+ ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
@@ -1350,11 +669,44 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_da_update_reserve_space(inode, retval, 1);
}
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+ ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
+
+ if (retval > 0) {
+ unsigned int status;
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ /*
+ * If the extent has been zeroed out, we don't need to update
+ * extent status tree.
+ */
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
+ ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ if (ext4_es_is_written(&es))
+ goto has_zeroout;
+ }
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
+
+has_zeroout:
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -1372,15 +724,19 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
int ret = 0, started = 0;
int dio_credits;
+ if (ext4_has_inline_data(inode))
+ return -ERANGE;
+
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
+ if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
/* Direct IO write... */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- handle = ext4_journal_start(inode, dio_credits);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
return ret;
@@ -1390,8 +746,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
+ ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+ set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
@@ -1424,15 +784,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
err = ext4_map_blocks(handle, inode, &map,
create ? EXT4_GET_BLOCKS_CREATE : 0);
+ /* ensure we send some value back into *errp */
+ *errp = 0;
+
+ if (create && err == 0)
+ err = -ENOSPC; /* should never happen */
if (err < 0)
*errp = err;
if (err <= 0)
return NULL;
- *errp = 0;
bh = sb_getblk(inode->i_sb, map.m_pblk);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
return NULL;
}
if (map.m_flags & EXT4_MAP_NEW) {
@@ -1479,7 +843,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return bh;
if (buffer_uptodate(bh))
return bh;
- ll_rw_block(READ_META, 1, &bh);
+ ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1488,13 +852,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL;
}
-static int walk_page_buffers(handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)(handle_t *handle,
- struct buffer_head *bh))
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -1526,11 +890,10 @@ static int walk_page_buffers(handle_t *handle,
* and the commit_write(). So doing the jbd2_journal_start at the start of
* prepare_write() is the right place.
*
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page. So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage(). In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page. So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
*
* By accident, ext4 can be reentered when a transaction is open via
* quota file writes. If we were to commit the transaction while thus
@@ -1544,8 +907,8 @@ static int walk_page_buffers(handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile
* write.
*/
-static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
{
int dirty = buffer_dirty(bh);
int ret;
@@ -1562,23 +925,14 @@ static int do_journal_get_write_access(handle_t *handle,
*/
if (dirty)
clear_buffer_dirty(bh);
+ BUFFER_TRACE(bh, "get write access");
ret = ext4_journal_get_write_access(handle, bh);
if (!ret && dirty)
ret = ext4_handle_dirty_metadata(handle, NULL, bh);
return ret;
}
-/*
- * Truncate blocks that were not used by write. We have to truncate the
- * pagecache as well so that corresponding buffers get properly unmapped.
- */
-static void ext4_truncate_failed_write(struct inode *inode)
-{
- truncate_inode_pages(inode->i_mapping, inode->i_size);
- ext4_truncate(inode);
-}
-
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1602,24 +956,45 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
-retry:
- handle = ext4_journal_start(inode, needed_blocks);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
+ flags, pagep);
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
-
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ page_cache_release(page);
+ return PTR_ERR(handle);
+ }
+
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
ext4_journal_stop(handle);
- ret = -ENOMEM;
- goto out;
+ goto retry_grab;
}
- *pagep = page;
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -1627,13 +1002,13 @@ retry:
ret = __block_write_begin(page, pos, len, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
}
if (ret) {
unlock_page(page);
- page_cache_release(page);
/*
* __block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
@@ -1657,37 +1032,70 @@ retry:
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
}
- }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ page_cache_release(page);
+ return ret;
+ }
+ *pagep = page;
return ret;
}
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
+ int ret;
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
set_buffer_uptodate(bh);
- return ext4_handle_dirty_metadata(handle, NULL, bh);
+ ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+ clear_buffer_meta(bh);
+ clear_buffer_prio(bh);
+ return ret;
}
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list. metadata
+ * buffers are managed internally.
+ */
+static int ext4_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
- int i_size_changed = 0;
- struct inode *inode = mapping->host;
handle_t *handle = ext4_journal_current_handle();
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ int i_size_changed = 0;
+
+ trace_ext4_write_end(inode, pos, len, copied);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto errout;
+ }
+ }
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ if (ret < 0)
+ goto errout;
+ copied = ret;
+ } else
+ copied = block_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
/*
* No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
+ * cannot change under us because we hole i_mutex.
*
* But it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1697,10 +1105,10 @@ static int ext4_generic_write_end(struct file *file,
i_size_changed = 1;
}
- if (pos + copied > EXT4_I(inode)->i_disksize) {
+ if (pos + copied > EXT4_I(inode)->i_disksize) {
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
+ * but greater than i_disksize. (hint delalloc)
*/
ext4_update_i_disksize(inode, (pos + copied));
i_size_changed = 1;
@@ -1717,83 +1125,13 @@ static int ext4_generic_write_end(struct file *file,
if (i_size_changed)
ext4_mark_inode_dirty(handle, inode);
- return copied;
-}
-
-/*
- * We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
- *
- * ext4 never places buffers on inode->i_mapping->private_list. metadata
- * buffers are managed internally.
- */
-static int ext4_ordered_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = mapping->host;
- int ret = 0, ret2;
-
- trace_ext4_ordered_write_end(inode, pos, len, copied);
- ret = ext4_jbd2_file_inode(handle, inode);
-
- if (ret == 0) {
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
- /* if we have allocated more blocks and copied
- * less. We will have blocks allocated outside
- * inode->i_size. So truncate them
- */
- ext4_orphan_add(handle, inode);
- if (ret2 < 0)
- ret = ret2;
- }
- ret2 = ext4_journal_stop(handle);
- if (!ret)
- ret = ret2;
-
- if (pos + len > inode->i_size) {
- ext4_truncate_failed_write(inode);
- /*
- * If truncate failed early the inode might still be
- * on the orphan list; we need to make sure the inode
- * is removed from the orphan list in that case.
- */
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
- }
-
-
- return ret ? ret : copied;
-}
-
-static int ext4_writeback_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = mapping->host;
- int ret = 0, ret2;
-
- trace_ext4_writeback_write_end(inode, pos, len, copied);
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-
- if (ret2 < 0)
- ret = ret2;
-
+errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1828,20 +1166,28 @@ static int ext4_journalled_write_end(struct file *file,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- if (copied < len) {
- if (!PageUptodate(page))
- copied = 0;
- page_zero_new_buffers(page, from+copied, to);
- }
+ BUG_ON(!ext4_handle_valid(handle));
- ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, write_end_fn);
- if (!partial)
- SetPageUptodate(page);
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else {
+ if (copied < len) {
+ if (!PageUptodate(page))
+ copied = 0;
+ page_zero_new_buffers(page, from+copied, to);
+ }
+
+ ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
+ to, &partial, write_end_fn);
+ if (!partial)
+ SetPageUptodate(page);
+ }
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
if (new_i_size > EXT4_I(inode)->i_disksize) {
ext4_update_i_disksize(inode, new_i_size);
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1876,48 +1222,96 @@ static int ext4_journalled_write_end(struct file *file,
}
/*
- * Reserve a single block located at lblock
+ * Reserve a metadata for a single block located at lblock
*/
-static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
+static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
{
- int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long md_needed;
- int ret;
+ unsigned int md_needed;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
/*
* recalculate the amount of metadata blocks to reserve
* in order to allocate nrblocks
* worse case is one extent per block
*/
-repeat:
spin_lock(&ei->i_block_reservation_lock);
- md_needed = ext4_calc_metadata_amount(inode, lblock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
trace_ext4_da_reserve_space(inode, md_needed);
+
+ /*
+ * We do still charge estimated metadata to the sb though;
+ * we cannot afford to run out of free blocks.
+ */
+ if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+ ei->i_reserved_meta_blocks += md_needed;
spin_unlock(&ei->i_block_reservation_lock);
+ return 0; /* success */
+}
+
+/*
+ * Reserve a single cluster located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int md_needed;
+ int ret;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
+
/*
* We will charge metadata quota at writeout time; this saves
* us from metadata over-estimation, though we may go over by
* a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, 1);
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
if (ret)
return ret;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&ei->i_block_reservation_lock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
+ trace_ext4_da_reserve_space(inode, md_needed);
+
/*
* We do still charge estimated metadata to the sb though;
* we cannot afford to run out of free blocks.
*/
- if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
- dquot_release_reservation_block(inode, 1);
- if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
- yield();
- goto repeat;
- }
+ if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
return -ENOSPC;
}
- spin_lock(&ei->i_block_reservation_lock);
ei->i_reserved_data_blocks++;
ei->i_reserved_meta_blocks += md_needed;
spin_unlock(&ei->i_block_reservation_lock);
@@ -1943,9 +1337,9 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
* function is called from invalidate page, it's
* harmless to return without any action.
*/
- ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+ ext4_warning(inode->i_sb, "ext4_da_release_space: "
"ino %lu, to_free %d with only %d reserved "
- "data blocks\n", inode->i_ino, to_free,
+ "data blocks", inode->i_ino, to_free,
ei->i_reserved_data_blocks);
WARN_ON(1);
to_free = ei->i_reserved_data_blocks;
@@ -1957,195 +1351,94 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
* We can release all of the reserved metadata blocks
* only when we have written all of the delayed
* allocation blocks.
+ * Note that in case of bigalloc, i_reserved_meta_blocks,
+ * i_reserved_data_blocks, etc. refer to number of clusters.
*/
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
}
/* update fs dirty data blocks counter */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- dquot_release_reservation_block(inode, to_free);
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}
static void ext4_da_page_release_reservation(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
int to_release = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
+ struct inode *inode = page->mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int stop = offset + length;
+ int num_clusters;
+ ext4_fsblk_t lblk;
+
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
+ if (next_off > stop)
+ break;
+
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
}
curr_off = next_off;
} while ((bh = bh->b_this_page) != head);
- ext4_da_release_space(page->mapping->host, to_release);
+
+ if (to_release) {
+ lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, lblk, to_release);
+ }
+
+ /* If we have released all the blocks belonging to a cluster, then we
+ * need to release the reserved space for that cluster. */
+ num_clusters = EXT4_NUM_B2C(sbi, to_release);
+ while (num_clusters > 0) {
+ lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+ ((num_clusters - 1) << sbi->s_cluster_bits);
+ if (sbi->s_cluster_ratio == 1 ||
+ !ext4_find_delalloc_cluster(inode, lblk))
+ ext4_da_release_space(inode, 1);
+
+ num_clusters--;
+ }
}
/*
* Delayed allocation stuff
*/
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
- struct ext4_map_blocks *map)
-{
- struct pagevec pvec;
- unsigned long index, end;
- int ret = 0, err, nr_pages, i;
- struct inode *inode = mpd->inode;
- struct address_space *mapping = inode->i_mapping;
- loff_t size = i_size_read(inode);
- unsigned int len, block_start;
- struct buffer_head *bh, *page_bufs = NULL;
- int journal_data = ext4_should_journal_data(inode);
- sector_t pblock = 0, cur_logical = 0;
- struct ext4_io_submit io_submit;
+struct mpage_da_data {
+ struct inode *inode;
+ struct writeback_control *wbc;
- BUG_ON(mpd->next_page <= mpd->first_page);
- memset(&io_submit, 0, sizeof(io_submit));
+ pgoff_t first_page; /* The first page to write */
+ pgoff_t next_page; /* Current page to examine */
+ pgoff_t last_page; /* Last page to examine */
/*
- * We need to start from the first_page to the next_page - 1
- * to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->b_blocknr we would only be looking
- * at the currently mapped buffer_heads.
+ * Extent to map - this can be after first_page because that can be
+ * fully mapped. We somewhat abuse m_flags to store whether the extent
+ * is delalloc or unwritten.
*/
- index = mpd->first_page;
- end = mpd->next_page - 1;
-
- pagevec_init(&pvec, 0);
- while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- int commit_write = 0, redirty_page = 0;
- struct page *page = pvec.pages[i];
-
- index = page->index;
- if (index > end)
- break;
-
- if (index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
- if (map) {
- cur_logical = index << (PAGE_CACHE_SHIFT -
- inode->i_blkbits);
- pblock = map->m_pblk + (cur_logical -
- map->m_lblk);
- }
- index++;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- /*
- * If the page does not have buffers (for
- * whatever reason), try to create them using
- * __block_write_begin. If this fails,
- * redirty the page and move on.
- */
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- redirty_page:
- redirty_page_for_writepage(mpd->wbc,
- page);
- unlock_page(page);
- continue;
- }
- commit_write = 1;
- }
-
- bh = page_bufs = page_buffers(page);
- block_start = 0;
- do {
- if (!bh)
- goto redirty_page;
- if (map && (cur_logical >= map->m_lblk) &&
- (cur_logical <= (map->m_lblk +
- (map->m_len - 1)))) {
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock;
- }
- if (buffer_unwritten(bh) ||
- buffer_mapped(bh))
- BUG_ON(bh->b_blocknr != pblock);
- if (map->m_flags & EXT4_MAP_UNINIT)
- set_buffer_uninit(bh);
- clear_buffer_unwritten(bh);
- }
-
- /* redirty page if block allocation undone */
- if (buffer_delay(bh) || buffer_unwritten(bh))
- redirty_page = 1;
- bh = bh->b_this_page;
- block_start += bh->b_size;
- cur_logical++;
- pblock++;
- } while (bh != page_bufs);
-
- if (redirty_page)
- goto redirty_page;
-
- if (commit_write)
- /* mark the buffer_heads as dirty & uptodate */
- block_commit_write(page, 0, len);
-
- /*
- * Delalloc doesn't support data journalling,
- * but eventually maybe we'll lift this
- * restriction.
- */
- if (unlikely(journal_data && PageChecked(page)))
- err = __ext4_journalled_writepage(page, len);
- else
- err = ext4_bio_write_page(&io_submit, page,
- len, mpd->wbc);
-
- if (!err)
- mpd->pages_written++;
- /*
- * In error case, we have to continue because
- * remaining pages are still locked
- */
- if (ret == 0)
- ret = err;
- }
- pagevec_release(&pvec);
- }
- ext4_io_submit(&io_submit);
- return ret;
-}
+ struct ext4_map_blocks map;
+ struct ext4_io_submit io_submit; /* IO submission data */
+};
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
- sector_t logical, long blk_cnt)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+ bool invalidate)
{
int nr_pages, i;
pgoff_t index, end;
@@ -2153,9 +1446,20 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
- end = (logical + blk_cnt - 1) >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ /* This is necessary when next_page == 0. */
+ if (mpd->first_page >= mpd->next_page)
+ return;
+
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+ if (invalidate) {
+ ext4_lblk_t start, last;
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+ }
+
+ pagevec_init(&pvec, 0);
while (index <= end) {
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
if (nr_pages == 0)
@@ -2166,350 +1470,204 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
break;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- block_invalidatepage(page, 0);
- ClearPageUptodate(page);
+ if (invalidate) {
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ ClearPageUptodate(page);
+ }
unlock_page(page);
}
index = pvec.pages[nr_pages - 1]->index + 1;
pagevec_release(&pvec);
}
- return;
}
static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- printk(KERN_CRIT "Total free blocks count %lld\n",
- ext4_count_free_blocks(inode->i_sb));
- printk(KERN_CRIT "Free/Dirty block details\n");
- printk(KERN_CRIT "free_blocks=%lld\n",
- (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
- printk(KERN_CRIT "dirty_blocks=%lld\n",
- (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
- printk(KERN_CRIT "Block reservation details\n");
- printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
- EXT4_I(inode)->i_reserved_data_blocks);
- printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
- EXT4_I(inode)->i_reserved_meta_blocks);
+ struct super_block *sb = inode->i_sb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
+ EXT4_C2B(EXT4_SB(inode->i_sb),
+ ext4_count_free_clusters(sb)));
+ ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
+ ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
+ (long long) EXT4_C2B(EXT4_SB(sb),
+ percpu_counter_sum(&sbi->s_freeclusters_counter)));
+ ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
+ (long long) EXT4_C2B(EXT4_SB(sb),
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+ ext4_msg(sb, KERN_CRIT, "Block reservation details");
+ ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
+ ei->i_reserved_data_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
+ ei->i_reserved_meta_blocks);
+ ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
+ ei->i_allocated_meta_blocks);
return;
}
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
+{
+ return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
+}
+
/*
- * mpage_da_map_and_submit - go through given space, map them
- * if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
+ * This function is grabs code from the very beginning of
+ * ext4_map_blocks, but assumes that the caller is from delayed write
+ * time. This function looks up the requested blocks and sets the
+ * buffer delay bit under the protection of i_data_sem.
*/
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+ struct ext4_map_blocks *map,
+ struct buffer_head *bh)
{
- int err, blks, get_blocks_flags;
- struct ext4_map_blocks map, *mapp = NULL;
- sector_t next = mpd->b_blocknr;
- unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
- loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
- handle_t *handle = NULL;
-
- /*
- * If the blocks are mapped already, or we couldn't accumulate
- * any blocks, then proceed immediately to the submission stage.
- */
- if ((mpd->b_size == 0) ||
- ((mpd->b_state & (1 << BH_Mapped)) &&
- !(mpd->b_state & (1 << BH_Delay)) &&
- !(mpd->b_state & (1 << BH_Unwritten))))
- goto submit_io;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
+ struct extent_status es;
+ int retval;
+ sector_t invalid_block = ~((sector_t) 0xffff);
+#ifdef ES_AGGRESSIVE_TEST
+ struct ext4_map_blocks orig_map;
- /*
- * Call ext4_map_blocks() to allocate any delayed allocation
- * blocks, or to convert an uninitialized extent to be
- * initialized (in the case where we have written into
- * one or more preallocated blocks).
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
- * indicate that we are on the delayed allocation path. This
- * affects functions in many different parts of the allocation
- * call path. This flag exists primarily because we don't
- * want to change *many* call functions, so ext4_map_blocks()
- * will set the magic i_delalloc_reserved_flag once the
- * inode's allocation semaphore is taken.
- *
- * If the blocks in questions were delalloc blocks, set
- * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
- * variables are updated after the blocks have been allocated.
- */
- map.m_lblk = next;
- map.m_len = max_blocks;
- get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
- if (ext4_should_dioread_nolock(mpd->inode))
- get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (mpd->b_state & (1 << BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+ memcpy(&orig_map, map, sizeof(*map));
+#endif
- blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
- if (blks < 0) {
- struct super_block *sb = mpd->inode->i_sb;
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
- err = blks;
- /*
- * If get block returns EAGAIN or ENOSPC and there
- * appears to be free blocks we will call
- * ext4_writepage() for all of the pages which will
- * just redirty the pages.
- */
- if (err == -EAGAIN)
- goto submit_io;
+ map->m_flags = 0;
+ ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, map->m_len,
+ (unsigned long) map->m_lblk);
- if (err == -ENOSPC &&
- ext4_count_free_blocks(sb)) {
- mpd->retval = err;
- goto submit_io;
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, iblock, &es)) {
+ ext4_es_lru_add(inode);
+ if (ext4_es_is_hole(&es)) {
+ retval = 0;
+ down_read(&EXT4_I(inode)->i_data_sem);
+ goto add_delayed;
}
/*
- * get block failure will cause us to loop in
- * writepages, because a_ops->writepage won't be able
- * to make progress. The page will be redirtied by
- * writepage and writepages will again try to write
- * the same.
+ * Delayed extent could be allocated by fallocate.
+ * So we need to check it.
*/
- if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
- ext4_msg(sb, KERN_CRIT,
- "delayed block allocation failed for inode %lu "
- "at logical offset %llu with max blocks %zd "
- "with error %d", mpd->inode->i_ino,
- (unsigned long long) next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- ext4_msg(sb, KERN_CRIT,
- "This should not happen!! Data will be lost\n");
- if (err == -ENOSPC)
- ext4_print_free_blocks(mpd->inode);
+ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
}
- /* invalidate all the pages */
- ext4_da_block_invalidatepages(mpd, next,
- mpd->b_size >> mpd->inode->i_blkbits);
- return;
- }
- BUG_ON(blks == 0);
-
- mapp = &map;
- if (map.m_flags & EXT4_MAP_NEW) {
- struct block_device *bdev = mpd->inode->i_sb->s_bdev;
- int i;
-
- for (i = 0; i < map.m_len; i++)
- unmap_underlying_metadata(bdev, map.m_pblk + i);
- }
-
- if (ext4_should_order_data(mpd->inode)) {
- err = ext4_jbd2_file_inode(handle, mpd->inode);
- if (err)
- /* This only happens if the journal is aborted */
- return;
- }
-
- /*
- * Update on-disk size along with block allocation.
- */
- disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
- if (disksize > i_size_read(mpd->inode))
- disksize = i_size_read(mpd->inode);
- if (disksize > EXT4_I(mpd->inode)->i_disksize) {
- ext4_update_i_disksize(mpd->inode, disksize);
- err = ext4_mark_inode_dirty(handle, mpd->inode);
- if (err)
- ext4_error(mpd->inode->i_sb,
- "Failed to mark inode %lu dirty",
- mpd->inode->i_ino);
- }
-submit_io:
- mpage_da_submit_io(mpd, mapp);
- mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
- (1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @bh - bh of the block (used to access block's state)
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
- sector_t logical, size_t b_size,
- unsigned long b_state)
-{
- sector_t next;
- int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
-
- /*
- * XXX Don't go larger than mballoc is willing to allocate
- * This is a stopgap solution. We eventually need to fold
- * mpage_da_submit_io() into this function and then call
- * ext4_map_blocks() multiple times in a loop
- */
- if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
- goto flush_it;
-
- /* check if thereserved journal credits might overflow */
- if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
- if (nrblocks >= EXT4_MAX_TRANS_DATA) {
- /*
- * With non-extent format we are limited by the journal
- * credit available. Total credit needed to insert
- * nrblocks contiguous blocks is dependent on the
- * nrblocks. So limit nrblocks.
- */
- goto flush_it;
- } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
- EXT4_MAX_TRANS_DATA) {
- /*
- * Adding the new buffer_head would make it cross the
- * allowed limit for which we have journal credit
- * reserved. So limit the new bh->b_size
- */
- b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
- mpd->inode->i_blkbits;
- /* we will do mpage_da_submit_io in the next loop */
- }
- }
- /*
- * First block in the extent
- */
- if (mpd->b_size == 0) {
- mpd->b_blocknr = logical;
- mpd->b_size = b_size;
- mpd->b_state = b_state & BH_FLAGS;
- return;
- }
+ map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+ retval = es.es_len - (iblock - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ if (ext4_es_is_written(&es))
+ map->m_flags |= EXT4_MAP_MAPPED;
+ else if (ext4_es_is_unwritten(&es))
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ else
+ BUG_ON(1);
- next = mpd->b_blocknr + nrblocks;
- /*
- * Can we merge the block to our big extent?
- */
- if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
- mpd->b_size += b_size;
- return;
+#ifdef ES_AGGRESSIVE_TEST
+ ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+ return retval;
}
-flush_it:
- /*
- * We couldn't merge the block to our extent, so we
- * need to flush current extent and start new one
- */
- mpage_da_map_and_submit(mpd);
- return;
-}
-
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
-{
- return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
-}
-
-/*
- * __mpage_da_writepage - finds extent of pages and blocks
- *
- * @page: page to consider
- * @wbc: not used, we just follow rules
- * @data: context
- *
- * The function finds extents of pages and scan them for all blocks.
- */
-static int __mpage_da_writepage(struct page *page,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd)
-{
- struct inode *inode = mpd->inode;
- struct buffer_head *bh, *head;
- sector_t logical;
-
/*
- * Can we merge this page to current extent?
+ * Try to see if we can get the block without requesting a new
+ * file system block.
*/
- if (mpd->next_page != page->index) {
+ down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_has_inline_data(inode)) {
/*
- * Nope, we can't. So, we map non-allocated blocks
- * and start IO on them
+ * We will soon create blocks for this page, and let
+ * us pretend as if the blocks aren't allocated yet.
+ * In case of clusters, we have to handle the work
+ * of mapping from cluster so that the reserved space
+ * is calculated properly.
*/
- if (mpd->next_page != mpd->first_page) {
- mpage_da_map_and_submit(mpd);
- /*
- * skip rest of the page in the page_vec
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return MPAGE_DA_EXTENT_TAIL;
- }
+ if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ retval = 0;
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+ else
+ retval = ext4_ind_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+add_delayed:
+ if (retval == 0) {
+ int ret;
/*
- * Start next extent of pages ...
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
*/
- mpd->first_page = page->index;
-
/*
- * ... and blocks
+ * If the block was allocated from previously allocated cluster,
+ * then we don't need to reserve it again. However we still need
+ * to reserve metadata for every block we're going to write.
*/
- mpd->b_size = 0;
- mpd->b_state = 0;
- mpd->b_blocknr = 0;
- }
+ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+ ret = ext4_da_reserve_space(inode, iblock);
+ if (ret) {
+ /* not enough space to reserve */
+ retval = ret;
+ goto out_unlock;
+ }
+ } else {
+ ret = ext4_da_reserve_metadata(inode, iblock);
+ if (ret) {
+ /* not enough space to reserve */
+ retval = ret;
+ goto out_unlock;
+ }
+ }
- mpd->next_page = page->index + 1;
- logical = (sector_t) page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ ~0, EXTENT_STATUS_DELAYED);
+ if (ret) {
+ retval = ret;
+ goto out_unlock;
+ }
- if (!page_has_buffers(page)) {
- mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
- (1 << BH_Dirty) | (1 << BH_Uptodate));
- if (mpd->io_done)
- return MPAGE_DA_EXTENT_TAIL;
- } else {
- /*
- * Page with regular buffer heads, just add all dirty ones
+ /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
+ * and it should not appear on the bh->b_state.
*/
- head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate
- * unmapped blocks in the same page.
- * Otherwise we won't make progress
- * with the page in ext4_writepage
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_size,
- bh->b_state);
- if (mpd->io_done)
- return MPAGE_DA_EXTENT_TAIL;
- } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
- /*
- * mapped dirty buffer. We need to update
- * the b_state because we look at
- * b_state in mpage_da_map_blocks. We don't
- * update b_size because if we find an
- * unmapped buffer_head later we need to
- * use the b_state flag of that buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state = bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
+ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ } else if (retval > 0) {
+ int ret;
+ unsigned int status;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret != 0)
+ retval = ret;
}
- return 0;
+out_unlock:
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ return retval;
}
/*
@@ -2524,15 +1682,11 @@ static int __mpage_da_writepage(struct page *page,
* We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
* initialized properly.
*/
-static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
int ret = 0;
- sector_t invalid_block = ~((sector_t) 0xffff);
-
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;
BUG_ON(create == 0);
BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -2545,25 +1699,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
+ ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+ if (ret <= 0)
return ret;
- if (ret == 0) {
- if (buffer_delay(bh))
- return 0; /* Not sure this could or should happen */
- /*
- * XXX: __block_write_begin() unmaps passed block, is it OK?
- */
- ret = ext4_da_reserve_space(inode, iblock);
- if (ret)
- /* not enough space to reserve */
- return ret;
-
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
- return 0;
- }
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
@@ -2581,27 +1719,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
-/*
- * This function is used as a standard get_block_t calback function
- * when there is no desire to allocate any blocks. It is used as a
- * callback function for block_write_begin() and block_write_full_page().
- * These functions should only try to map a single block at a time.
- *
- * Since this function doesn't do block allocations even if the caller
- * requests it by passing in create=1, it is critically important that
- * any caller checks to make sure that any buffer heads are returned
- * by this function are either all already mapped or marked for
- * delayed allocation before calling block_write_full_page(). Otherwise,
- * b_blocknr could be left unitialized, and the page write functions will
- * be taken by surprise.
- */
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
- return _ext4_get_block(inode, iblock, bh_result, 0);
-}
-
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
get_bh(bh);
@@ -2619,59 +1736,85 @@ static int __ext4_journalled_writepage(struct page *page,
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
- struct buffer_head *page_bufs;
+ struct buffer_head *page_bufs = NULL;
handle_t *handle = NULL;
- int ret = 0;
- int err;
+ int ret = 0, err = 0;
+ int inline_data = ext4_has_inline_data(inode);
+ struct buffer_head *inode_bh = NULL;
ClearPageChecked(page);
- page_bufs = page_buffers(page);
- BUG_ON(!page_bufs);
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+
+ if (inline_data) {
+ BUG_ON(page->index != 0);
+ BUG_ON(len > ext4_get_max_inline_size(inode));
+ inode_bh = ext4_journalled_write_inline_data(inode, len, page);
+ if (inode_bh == NULL)
+ goto out;
+ } else {
+ page_bufs = page_buffers(page);
+ if (!page_bufs) {
+ BUG();
+ goto out;
+ }
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bget_one);
+ }
/* As soon as we unlock the page, it can go away, but we have
* references to buffers so we are safe */
unlock_page(page);
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
- ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- do_journal_get_write_access);
+ BUG_ON(!ext4_handle_valid(handle));
+
+ if (inline_data) {
+ BUFFER_TRACE(inode_bh, "get write access");
+ ret = ext4_journal_get_write_access(handle, inode_bh);
+
+ err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
+
+ } else {
+ ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ do_journal_get_write_access);
- err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- write_end_fn);
+ err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ write_end_fn);
+ }
if (ret == 0)
ret = err;
+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+ if (!ext4_has_inline_data(inode))
+ ext4_walk_page_buffers(NULL, page_bufs, 0, len,
+ NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
+ brelse(inode_bh);
return ret;
}
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
-
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
* need to file the inode to the transaction's list in ordered mode because if
* we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
+ * we are writing back data modified via mmap(), no one guarantees in which
* transaction the data will hit the disk. In case we are journaling data, we
* cannot start transaction directly because transaction start ranks above page
* lock so we have to do some magic.
*
* This function can get called via...
- * - ext4_da_writepages after taking page lock (have journal handle)
+ * - ext4_writepages after taking page lock (have journal handle)
* - journal_submit_inode_data_buffers (no journal handle)
- * - shrink_page_list via pdflush (no journal handle)
+ * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
* - grab_page_cache when doing write_begin (have journal handle)
*
* We don't do any block allocation in this function. If we have page with
@@ -2682,7 +1825,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
* a[0] = 'a';
* truncate(f, 4096);
* we have in the page first buffer_head mapped via page_mkwrite call back
- * but other bufer_heads would be unmapped but dirty(dirty done via the
+ * but other buffer_heads would be unmapped but dirty (dirty done via the
* do_wp_page). So writepage should write the first block. If we modify
* the mmap area beyond 1024 we will again get a page_fault and the
* page_mkwrite callback will do the block allocation and mark the
@@ -2702,48 +1845,45 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
- int ret = 0, commit_write = 0;
+ int ret = 0;
loff_t size;
unsigned int len;
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
+ struct ext4_io_submit io_submit;
+ bool keep_towrite = false;
- trace_ext4_writepage(inode, page);
+ trace_ext4_writepage(page);
size = i_size_read(inode);
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
+ page_bufs = page_buffers(page);
/*
- * If the page does not have buffers (for whatever reason),
- * try to create them using __block_write_begin. If this
- * fails, redirty the page and move on.
+ * We cannot do block allocation or other extent handling in this
+ * function. If there are buffers needing that, we have to redirty
+ * the page. But we may reach here when we do a journal commit via
+ * journal_submit_inode_data_buffers() and in that case we must write
+ * allocated buffers to achieve data=ordered mode guarantees.
*/
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- redirty_page:
- redirty_page_for_writepage(wbc, page);
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ redirty_page_for_writepage(wbc, page);
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * For memory cleaning there's no point in writing only
+ * some buffers. So just bail out. Warn if we came here
+ * from direct reclaim.
+ */
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+ == PF_MEMALLOC);
unlock_page(page);
return 0;
}
- commit_write = 1;
+ keep_towrite = true;
}
- page_bufs = page_buffers(page);
- if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
- /*
- * We don't want to do block allocation, so redirty
- * the page and return. We may reach here when we do
- * a journal commit via journal_submit_inode_data_buffers.
- * We can also reach here via shrink_page_list
- */
- goto redirty_page;
- }
- if (commit_write)
- /* now mark the buffer_heads as dirty and uptodate */
- block_commit_write(page, 0, len);
if (PageChecked(page) && ext4_should_journal_data(inode))
/*
@@ -2752,82 +1892,456 @@ static int ext4_writepage(struct page *page,
*/
return __ext4_journalled_writepage(page, len);
- if (buffer_uninit(page_bufs)) {
- ext4_set_bh_endio(page_bufs, inode);
- ret = block_write_full_page_endio(page, noalloc_get_block_write,
- wbc, ext4_end_io_buffer_write);
- } else
- ret = block_write_full_page(page, noalloc_get_block_write,
- wbc);
-
+ ext4_io_submit_init(&io_submit, wbc);
+ io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_submit.io_end) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return -ENOMEM;
+ }
+ ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
+ ext4_io_submit(&io_submit);
+ /* Drop io_end reference we got from init */
+ ext4_put_io_end_defer(io_submit.io_end);
return ret;
}
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
/*
- * This is called via ext4_da_writepages() to
- * calulate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks up to full group size.
*/
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @bh - buffer head we want to add to the extent
+ *
+ * The function is used to collect contig. blocks in the same state. If the
+ * buffer doesn't require mapping for writeback and we haven't started the
+ * extent of buffers to map yet, the function returns 'true' immediately - the
+ * caller can write the buffer right away. Otherwise the function returns true
+ * if the block has been added to the extent, false if the block couldn't be
+ * added.
+ */
+static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ struct buffer_head *bh)
{
- int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ struct ext4_map_blocks *map = &mpd->map;
+
+ /* Buffer that doesn't need mapping for writeback? */
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh))) {
+ /* So far no extent to map => we write the buffer right away */
+ if (map->m_len == 0)
+ return true;
+ return false;
+ }
+
+ /* First block in the extent? */
+ if (map->m_len == 0) {
+ map->m_lblk = lblk;
+ map->m_len = 1;
+ map->m_flags = bh->b_state & BH_FLAGS;
+ return true;
+ }
+
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return false;
+ /* Can we merge the block to our big extent? */
+ if (lblk == map->m_lblk + map->m_len &&
+ (bh->b_state & BH_FLAGS) == map->m_flags) {
+ map->m_len++;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * mpage_process_page_bufs - submit page buffers for IO or add them to extent
+ *
+ * @mpd - extent of blocks for mapping
+ * @head - the first buffer in the page
+ * @bh - buffer we should start processing from
+ * @lblk - logical number of the block in the file corresponding to @bh
+ *
+ * Walk through page buffers from @bh upto @head (exclusive) and either submit
+ * the page for IO if all buffers in this page were mapped and there's no
+ * accumulated extent of buffers to map or add buffers in the page to the
+ * extent of buffers to map. The function returns 1 if the caller can continue
+ * by processing the next page, 0 if it should stop adding buffers to the
+ * extent to map because we cannot extend it anymore. It can also return value
+ * < 0 in case of error during IO submission.
+ */
+static int mpage_process_page_bufs(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct inode *inode = mpd->inode;
+ int err;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+
+ do {
+ BUG_ON(buffer_locked(bh));
+
+ if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
+ /* Found extent to map? */
+ if (mpd->map.m_len)
+ return 0;
+ /* Everything mapped so far and we hit EOF */
+ break;
+ }
+ } while (lblk++, (bh = bh->b_this_page) != head);
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, head->b_page);
+ if (err < 0)
+ return err;
+ }
+ return lblk < blocks;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ * submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to unwritten extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+ struct pagevec pvec;
+ int nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *head, *bh;
+ int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ pgoff_t start, end;
+ ext4_lblk_t lblk;
+ sector_t pblock;
+ int err;
+
+ start = mpd->map.m_lblk >> bpp_bits;
+ end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+ lblk = start << bpp_bits;
+ pblock = mpd->map.m_pblk;
+
+ pagevec_init(&pvec, 0);
+ while (start <= end) {
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+ PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+ /* Up to 'end' pages must be contiguous */
+ BUG_ON(page->index != start);
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ /*
+ * FIXME: If dioread_nolock supports
+ * blocksize < pagesize, we need to make
+ * sure we add size mapped so far to
+ * io_end->size as the following call
+ * can submit the page for IO.
+ */
+ err = mpage_process_page_bufs(mpd, head,
+ bh, lblk);
+ pagevec_release(&pvec);
+ if (err > 0)
+ err = 0;
+ return err;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ } while (lblk++, (bh = bh->b_this_page) != head);
+
+ /*
+ * FIXME: This is going to break if dioread_nolock
+ * supports blocksize < pagesize as we will try to
+ * convert potentially unmapped parts of inode.
+ */
+ mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ /* Page fully mapped - let IO run! */
+ err = mpage_submit_page(mpd, page);
+ if (err < 0) {
+ pagevec_release(&pvec);
+ return err;
+ }
+ start++;
+ }
+ pagevec_release(&pvec);
+ }
+ /* Extent fully mapped and matches with page boundary. We are done. */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int get_blocks_flags;
+ int err, dioread_nolock;
+
+ trace_ext4_da_write_pages_extent(inode, map);
/*
- * With non-extent format the journal credit needed to
- * insert nrblocks contiguous block is dependent on
- * number of contiguous block. So we will limit
- * number of contiguous block to a sane value
+ * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+ * to convert an unwritten extent to be initialized (in the case
+ * where we have written into one or more preallocated blocks). It is
+ * possible that we're going to need more metadata blocks than
+ * previously reserved. However we must not fail because we're in
+ * writeback and there is nothing we can do about it so it might result
+ * in data loss. So use reserved blocks to allocate metadata if
+ * possible.
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+ * in question are delalloc blocks. This affects functions in many
+ * different parts of the allocation call path. This flag exists
+ * primarily because we don't want to change *many* call functions, so
+ * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+ * once the inode's allocation semaphore is taken.
*/
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
- (max_blocks > EXT4_MAX_TRANS_DATA))
- max_blocks = EXT4_MAX_TRANS_DATA;
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ dioread_nolock = ext4_should_dioread_nolock(inode);
+ if (dioread_nolock)
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ if (map->m_flags & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+ if (err < 0)
+ return err;
+ if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (!mpd->io_submit.io_end->handle &&
+ ext4_handle_valid(handle)) {
+ mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ }
+
+ BUG_ON(map->m_len == 0);
+ if (map->m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int i;
- return ext4_chunk_trans_blocks(inode, max_blocks);
+ for (i = 0; i < map->m_len; i++)
+ unmap_underlying_metadata(bdev, map->m_pblk + i);
+ }
+ return 0;
}
/*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and call the callback function (which usually writes
- * the pages).
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ * mpd->len and submit pages underlying it for IO
*
- * This is a forked version of write_cache_pages(). Differences:
- * Range cyclic is ignored.
- * no_nrwrite_index_update is always presumed true
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ * @give_up_on_write - we set this to true iff there is a fatal error and there
+ * is no hope of writing the data. The caller should discard
+ * dirty pages to avoid infinite loops.
+ *
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
*/
-static int write_cache_pages_da(struct address_space *mapping,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd,
- pgoff_t *done_index)
+static int mpage_map_and_submit_extent(handle_t *handle,
+ struct mpage_da_data *mpd,
+ bool *give_up_on_write)
{
- int ret = 0;
- int done = 0;
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int err;
+ loff_t disksize;
+
+ mpd->io_submit.io_end->offset =
+ ((loff_t)map->m_lblk) << inode->i_blkbits;
+ do {
+ err = mpage_map_one_extent(handle, mpd);
+ if (err < 0) {
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ goto invalidate_dirty_pages;
+ /*
+ * Let the uper layers retry transient errors.
+ * In the case of ENOSPC, if ext4_count_free_blocks()
+ * is non-zero, a commit should free up blocks.
+ */
+ if ((err == -ENOMEM) ||
+ (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ return err;
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with"
+ " max blocks %u with error %d",
+ inode->i_ino,
+ (unsigned long long)map->m_lblk,
+ (unsigned)map->m_len, -err);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (err == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ invalidate_dirty_pages:
+ *give_up_on_write = true;
+ return err;
+ }
+ /*
+ * Update buffer state, submit mapped pages, and get us new
+ * extent to map
+ */
+ err = mpage_map_and_submit_buffers(mpd);
+ if (err < 0)
+ return err;
+ } while (map->m_len);
+
+ /*
+ * Update on-disk size after IO is submitted. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
+ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ int err2;
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (disksize > i_size)
+ disksize = i_size;
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (err2)
+ ext4_error(inode->i_sb,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
+ if (!err)
+ err = err2;
+ }
+ return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+ int bpp = ext4_journal_blocks_per_page(inode);
+
+ return ext4_meta_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->lblk with length mpd->len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
+ */
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
struct pagevec pvec;
- unsigned nr_pages;
- pgoff_t index;
- pgoff_t end; /* Inclusive */
- long nr_to_write = wbc->nr_to_write;
+ unsigned int nr_pages;
+ long left = mpd->wbc->nr_to_write;
+ pgoff_t index = mpd->first_page;
+ pgoff_t end = mpd->last_page;
int tag;
+ int i, err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct buffer_head *head;
- pagevec_init(&pvec, 0);
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
- *done_index = index;
- while (!done && (index <= end)) {
- int i;
-
+ pagevec_init(&pvec, 0);
+ mpd->map.m_len = 0;
+ mpd->next_page = index;
+ while (index <= end) {
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
- break;
+ goto out;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -2839,102 +2353,91 @@ static int write_cache_pages_da(struct address_space *mapping,
* mapping. However, page->index will not change
* because we have a reference on the page.
*/
- if (page->index > end) {
- done = 1;
- break;
- }
+ if (page->index > end)
+ goto out;
- *done_index = page->index + 1;
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+ goto out;
- lock_page(page);
+ /* If we can't merge this page, we are done. */
+ if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+ goto out;
+ lock_page(page);
/*
- * Page truncated or invalidated. We can freely skip it
- * then, even for data integrity operations: the page
- * has disappeared concurrently, so there could be no
- * real expectation of this data interity operation
- * even if there is now a new, dirty page at the same
- * pagecache address.
+ * If the page is no longer dirty, or its mapping no
+ * longer corresponds to inode we are writing (which
+ * means it has been truncated or invalidated), or the
+ * page is already under writeback and we are not doing
+ * a data integrity writeback, skip the page
*/
- if (unlikely(page->mapping != mapping)) {
-continue_unlock:
+ if (!PageDirty(page) ||
+ (PageWriteback(page) &&
+ (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
+ unlikely(page->mapping != mapping)) {
unlock_page(page);
continue;
}
- if (!PageDirty(page)) {
- /* someone wrote it for us */
- goto continue_unlock;
- }
-
- if (PageWriteback(page)) {
- if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
- else
- goto continue_unlock;
- }
-
+ wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
- if (!clear_page_dirty_for_io(page))
- goto continue_unlock;
-
- ret = __mpage_da_writepage(page, wbc, mpd);
- if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
- unlock_page(page);
- ret = 0;
- } else {
- done = 1;
- break;
- }
- }
- if (nr_to_write > 0) {
- nr_to_write--;
- if (nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
- done = 1;
- break;
- }
- }
+ if (mpd->map.m_len == 0)
+ mpd->first_page = page->index;
+ mpd->next_page = page->index + 1;
+ /* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_CACHE_SHIFT - blkbits);
+ head = page_buffers(page);
+ err = mpage_process_page_bufs(mpd, head, head, lblk);
+ if (err <= 0)
+ goto out;
+ err = 0;
+ left--;
}
pagevec_release(&pvec);
cond_resched();
}
- return ret;
+ return 0;
+out:
+ pagevec_release(&pvec);
+ return err;
}
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = ext4_writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
-static int ext4_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- pgoff_t index;
+ pgoff_t writeback_index = 0;
+ long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
+ int cycled = 1;
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
- int pages_written = 0;
- long pages_skipped;
- unsigned int max_pages;
- int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0;
- long desired_nr_to_write, nr_to_writebump = 0;
- loff_t range_start = wbc->range_start;
+ int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- pgoff_t done_index = 0;
- pgoff_t end;
+ bool done;
+ struct blk_plug plug;
+ bool give_up_on_write = false;
- trace_ext4_da_writepages(inode, wbc);
+ trace_ext4_writepages(inode, wbc);
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2942,7 +2445,16 @@ static int ext4_da_writepages(struct address_space *mapping,
* because that could violate lock ordering on umount
*/
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
+ goto out_writepages;
+
+ if (ext4_should_journal_data(inode)) {
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ goto out_writepages;
+ }
/*
* If the filesystem has aborted, it is read-only, so return
@@ -2950,183 +2462,157 @@ static int ext4_da_writepages(struct address_space *mapping,
* will obscure the real source of the problem. We test
* EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
* the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
+ * read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
- return -EROFS;
+ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ret = -EROFS;
+ goto out_writepages;
+ }
+
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ }
+
+ /*
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
+ */
+ if (ext4_has_inline_data(inode)) {
+ /* Just inode will be modified... */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ ext4_journal_stop(handle);
+ }
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- range_cyclic = wbc->range_cyclic;
if (wbc->range_cyclic) {
- index = mapping->writeback_index;
- if (index)
+ writeback_index = mapping->writeback_index;
+ if (writeback_index)
cycled = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = LLONG_MAX;
- wbc->range_cyclic = 0;
- end = -1;
+ mpd.first_page = writeback_index;
+ mpd.last_page = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
- }
-
- /*
- * This works around two forms of stupidity. The first is in
- * the writeback code, which caps the maximum number of pages
- * written to be 1024 pages. This is wrong on multiple
- * levels; different architectues have a different page size,
- * which changes the maximum amount of data which gets
- * written. Secondly, 4 megabytes is way too small. XFS
- * forces this value to be 16 megabytes by multiplying
- * nr_to_write parameter by four, and then relies on its
- * allocator to allocate larger extents to make them
- * contiguous. Unfortunately this brings us to the second
- * stupidity, which is that ext4's mballoc code only allocates
- * at most 2048 blocks. So we force contiguous writes up to
- * the number of dirty blocks in the inode, or
- * sbi->max_writeback_mb_bump whichever is smaller.
- */
- max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
- if (!range_cyclic && range_whole) {
- if (wbc->nr_to_write == LONG_MAX)
- desired_nr_to_write = wbc->nr_to_write;
- else
- desired_nr_to_write = wbc->nr_to_write * 8;
- } else
- desired_nr_to_write = ext4_num_dirty_pages(inode, index,
- max_pages);
- if (desired_nr_to_write > max_pages)
- desired_nr_to_write = max_pages;
-
- if (wbc->nr_to_write < desired_nr_to_write) {
- nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
- wbc->nr_to_write = desired_nr_to_write;
+ mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
}
+ mpd.inode = inode;
mpd.wbc = wbc;
- mpd.inode = mapping->host;
-
- pages_skipped = wbc->pages_skipped;
-
+ ext4_io_submit_init(&mpd.io_submit, wbc);
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
- tag_pages_for_writeback(mapping, index, end);
-
- while (!ret && wbc->nr_to_write > 0) {
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ done = false;
+ blk_start_plug(&plug);
+ while (!done && mpd.first_page <= mpd.last_page) {
+ /* For each extent of pages we use new io_end */
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ break;
+ }
/*
- * we insert one extent at a time. So we need
- * credit needed for single extent allocation.
- * journalled mode is currently not supported
- * by delalloc
+ * We have two constraints: We find one extent to map and we
+ * must always write out whole page (makes a difference when
+ * blocksize < pagesize) so that we don't block on IO when we
+ * try to write out the rest of the page. Journalled mode is
+ * not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
needed_blocks = ext4_da_writepages_trans_blocks(inode);
- /* start a new transaction*/
- handle = ext4_journal_start(inode, needed_blocks);
+ /* start a new transaction */
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- goto out_writepages;
+ /* Release allocated io_end */
+ ext4_put_io_end(mpd.io_submit.io_end);
+ break;
}
- /*
- * Now call __mpage_da_writepage to find the next
- * contiguous region of logical blocks that need
- * blocks to be allocated by ext4. We don't actually
- * submit the blocks for I/O here, even though
- * write_cache_pages thinks it will, and will set the
- * pages as clean for write before calling
- * __mpage_da_writepage().
- */
- mpd.b_size = 0;
- mpd.b_state = 0;
- mpd.b_blocknr = 0;
- mpd.first_page = 0;
- mpd.next_page = 0;
- mpd.io_done = 0;
- mpd.pages_written = 0;
- mpd.retval = 0;
- ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
- /*
- * If we have a contiguous extent of pages and we
- * haven't done the I/O yet, map the blocks and submit
- * them for I/O.
- */
- if (!mpd.io_done && mpd.next_page != mpd.first_page) {
- mpage_da_map_and_submit(&mpd);
- ret = MPAGE_DA_EXTENT_TAIL;
+ trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+ ret = mpage_prepare_extent_to_map(&mpd);
+ if (!ret) {
+ if (mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
+ &give_up_on_write);
+ else {
+ /*
+ * We scanned the whole range (or exhausted
+ * nr_to_write), submitted what was mapped and
+ * didn't find anything needing mapping. We are
+ * done.
+ */
+ done = true;
+ }
}
- trace_ext4_da_write_pages(inode, &mpd);
- wbc->nr_to_write -= mpd.pages_written;
-
ext4_journal_stop(handle);
-
- if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
- /* commit the transaction which would
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Drop our io_end reference we got from init */
+ ext4_put_io_end(mpd.io_submit.io_end);
+
+ if (ret == -ENOSPC && sbi->s_journal) {
+ /*
+ * Commit the transaction which would
* free blocks released in the transaction
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal);
- wbc->pages_skipped = pages_skipped;
ret = 0;
- } else if (ret == MPAGE_DA_EXTENT_TAIL) {
- /*
- * got one extent now try with
- * rest of the pages
- */
- pages_written += mpd.pages_written;
- wbc->pages_skipped = pages_skipped;
- ret = 0;
- io_done = 1;
- } else if (wbc->nr_to_write)
- /*
- * There is no more writeout needed
- * or we requested for a noblocking writeout
- * and we found the device congested
- */
+ continue;
+ }
+ /* Fatal error - ENOMEM, EIO... */
+ if (ret)
break;
}
- if (!io_done && !cycled) {
+ blk_finish_plug(&plug);
+ if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
- index = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = mapping->writeback_index - 1;
+ mpd.last_page = writeback_index - 1;
+ mpd.first_page = 0;
goto retry;
}
- if (pages_skipped != wbc->pages_skipped)
- ext4_msg(inode->i_sb, KERN_CRIT,
- "This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d",
- __func__, wbc->nr_to_write, ret);
/* Update index */
- wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
- * set the writeback_index so that range_cyclic
+ * Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = done_index;
+ mapping->writeback_index = mpd.first_page;
out_writepages:
- wbc->nr_to_write -= nr_to_writebump;
- wbc->range_start = range_start;
- trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
return ret;
}
-#define FALL_BACK_TO_NONDELALLOC 1
static int ext4_nonda_switch(struct super_block *sb)
{
- s64 free_blocks, dirty_blocks;
+ s64 free_clusters, dirty_clusters;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/*
@@ -3137,23 +2623,24 @@ static int ext4_nonda_switch(struct super_block *sb)
* Delalloc need an accurate free block accounting. So switch
* to non delalloc when we are near to error range.
*/
- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
- if (2 * free_blocks < 3 * dirty_blocks ||
- free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+ free_clusters =
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ dirty_clusters =
+ percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ /*
+ * Start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
+ try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+
+ if (2 * free_clusters < 3 * dirty_clusters ||
+ free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
/*
* free block count is less than 150% of dirty blocks
* or free blocks is less than watermark
*/
return 1;
}
- /*
- * Even if we don't switch but are nearing capacity,
- * start pushing delalloc when 1/2 of free blocks are dirty.
- */
- if (free_blocks < 2 * dirty_blocks)
- writeback_inodes_sb_if_idle(sb);
-
return 0;
}
@@ -3176,35 +2663,58 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
*fsdata = (void *)0;
trace_ext4_da_write_begin(inode, pos, len, flags);
-retry:
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_da_write_inline_data_begin(mapping, inode,
+ pos, len, flags,
+ pagep, fsdata);
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
+ }
+
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
/*
* With delayed allocation, we don't log the i_disksize update
* if there is delayed block allocation. But we still need
* to journalling the i_disksize update if writes to the end
* of file which has an already mapped buffer.
*/
- handle = ext4_journal_start(inode, 1);
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ page_cache_release(page);
+ return PTR_ERR(handle);
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
ext4_journal_stop(handle);
- ret = -ENOMEM;
- goto out;
+ goto retry_grab;
}
- *pagep = page;
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
if (ret < 0) {
unlock_page(page);
ext4_journal_stop(handle);
- page_cache_release(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
@@ -3212,11 +2722,16 @@ retry:
*/
if (pos + len > inode->i_size)
ext4_truncate_failed_write(inode);
+
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+
+ page_cache_release(page);
+ return ret;
}
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ *pagep = page;
return ret;
}
@@ -3255,17 +2770,9 @@ static int ext4_da_write_end(struct file *file,
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
- if (write_mode == FALL_BACK_TO_NONDELALLOC) {
- if (ext4_should_order_data(inode)) {
- return ext4_ordered_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- } else if (ext4_should_writeback_data(inode)) {
- return ext4_writeback_write_end(file, mapping, pos,
- len, copied, page, fsdata);
- } else {
- BUG();
- }
- }
+ if (write_mode == FALL_BACK_TO_NONDELALLOC)
+ return ext4_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
start = pos & (PAGE_CACHE_SIZE - 1);
@@ -3276,22 +2783,13 @@ static int ext4_da_write_end(struct file *file,
* changes. So let's piggyback the i_disksize mark_inode_dirty
* into that.
*/
-
new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- if (ext4_da_should_update_i_disksize(page, end)) {
+ if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_has_inline_data(inode) ||
+ ext4_da_should_update_i_disksize(page, end)) {
down_write(&EXT4_I(inode)->i_data_sem);
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- /*
- * Updating i_disksize when extending file
- * without needing block allocation
- */
- if (ext4_should_order_data(inode))
- ret = ext4_jbd2_file_inode(handle,
- inode);
-
+ if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- }
up_write(&EXT4_I(inode)->i_data_sem);
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
@@ -3300,8 +2798,16 @@ static int ext4_da_write_end(struct file *file,
ext4_mark_inode_dirty(handle, inode);
}
}
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+
+ if (write_mode != CONVERT_INLINE_DATA &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+ ext4_has_inline_data(inode))
+ ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+ page);
+ else
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
+
copied = ret2;
if (ret2 < 0)
ret = ret2;
@@ -3312,7 +2818,8 @@ static int ext4_da_write_end(struct file *file,
return ret ? ret : copied;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* Drop reserved blocks
@@ -3321,10 +2828,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
if (!page_has_buffers(page))
goto out;
- ext4_da_page_release_reservation(page, offset);
+ ext4_da_page_release_reservation(page, offset, length);
out:
- ext4_invalidatepage(page, offset);
+ ext4_invalidatepage(page, offset, length);
return;
}
@@ -3347,7 +2854,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
* laptop_mode, not even desirable). However, to do otherwise
* would require replicating code paths in:
*
- * ext4_da_writepages() ->
+ * ext4_writepages() ->
* write_cache_pages() ---> (via passed in callback function)
* __mpage_da_writepage() -->
* mpage_add_bh_to_extent()
@@ -3359,10 +2866,10 @@ int ext4_alloc_da_blocks(struct inode *inode)
* doing I/O at all.
*
* We could call write_cache_pages(), and then redirty all of
- * the pages by calling redirty_page_for_writeback() but that
+ * the pages by calling redirty_page_for_writepage() but that
* would be ugly in the extreme. So instead we would need to
* replicate parts of the code in the above functions,
- * simplifying them becuase we wouldn't actually intend to
+ * simplifying them because we wouldn't actually intend to
* write out the pages, but rather only collect contiguous
* logical block extents, call the multi-block allocator, and
* then update the buffer heads with the block allocations.
@@ -3394,6 +2901,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ /*
+ * We can get here for an inline file via the FIBMAP ioctl
+ */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
test_opt(inode->i_sb, DELALLOC)) {
/*
@@ -3439,63 +2952,77 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
static int ext4_readpage(struct file *file, struct page *page)
{
- return mpage_readpage(page, ext4_get_block);
+ int ret = -EAGAIN;
+ struct inode *inode = page->mapping->host;
+
+ trace_ext4_readpage(page);
+
+ if (ext4_has_inline_data(inode))
+ ret = ext4_readpage_inline(inode, page);
+
+ if (ret == -EAGAIN)
+ return mpage_readpage(page, ext4_get_block);
+
+ return ret;
}
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
+ struct inode *inode = mapping->host;
+
+ /* If the file has inline data, no need to do readpages. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- struct buffer_head *head, *bh;
- unsigned int curr_off = 0;
+ trace_ext4_invalidatepage(page, offset, length);
- if (!page_has_buffers(page))
- return;
- head = bh = page_buffers(page);
- do {
- if (offset <= curr_off && test_clear_buffer_uninit(bh)
- && bh->b_private) {
- ext4_free_io_end(bh->b_private);
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- }
- curr_off = curr_off + bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
+ /* No journalling happens on data buffers when this function is used */
+ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+ block_invalidatepage(page, offset, length);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static int __ext4_journalled_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- /*
- * free any io_end structure allocated for buffers to be discarded
- */
- if (ext4_should_dioread_nolock(page->mapping->host))
- ext4_invalidatepage_free_endio(page, offset);
+ trace_ext4_journalled_invalidatepage(page, offset, length);
+
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- if (journal)
- jbd2_journal_invalidatepage(journal, page, offset);
- else
- block_invalidatepage(page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset, length);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- WARN_ON(PageChecked(page));
- if (!page_has_buffers(page))
+ trace_ext4_releasepage(page);
+
+ /* Page has dirty journalled data -> cannot release */
+ if (PageChecked(page))
return 0;
if (journal)
return jbd2_journal_try_to_free_buffers(journal, page, wait);
@@ -3504,119 +3031,11 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
/*
- * O_DIRECT for ext3 (or indirect map) based files
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- handle_t *handle;
- ssize_t ret;
- int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
- int retries = 0;
-
- if (rw == WRITE) {
- loff_t final_size = offset + count;
-
- if (final_size > inode->i_size) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ei->i_disksize = inode->i_size;
- ext4_journal_stop(handle);
- }
- }
-
-retry:
- if (rw == READ && ext4_should_dioread_nolock(inode))
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block, NULL, NULL, 0);
- else {
- ret = blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block, NULL);
-
- if (unlikely((rw & WRITE) && ret < 0)) {
- loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
-
- if (end > isize)
- vmtruncate(inode, isize);
- }
- }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
- if (orphan) {
- int err;
-
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- /* This is really bad luck. We've written the data
- * but cannot extend i_size. Bail out and pretend
- * the write failed... */
- ret = PTR_ERR(handle);
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
-
- goto out;
- }
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size) {
- ei->i_disksize = end;
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- }
- err = ext4_journal_stop(handle);
- if (ret == 0)
- ret = err;
- }
-out:
- return ret;
-}
-
-/*
* ext4_get_block used when preparing for a DIO write or buffer write.
* We allocate an uinitialized extent if blocks haven't been allocated.
* The extent will be converted to initialized after the IO is complete.
*/
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
@@ -3625,116 +3044,33 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private, int ret,
- bool is_async)
+ ssize_t size, void *private)
{
ext4_io_end_t *io_end = iocb->private;
- struct workqueue_struct *wq;
- unsigned long flags;
- struct ext4_inode_info *ei;
- /* if not async direct IO or dio with 0 bytes write, just return */
- if (!io_end || !size)
- goto out;
+ /* if not async direct IO just return */
+ if (!io_end)
+ return;
- ext_debug("ext4_end_io_dio(): io_end 0x%p"
- "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ ext_debug("ext4_end_io_dio(): io_end 0x%p "
+ "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
iocb->private, io_end->inode->i_ino, iocb, offset,
size);
- /* if not aio dio with unwritten extents, just free io and return */
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
- iocb->private = NULL;
-out:
- if (is_async)
- aio_complete(iocb, ret, 0);
- return;
- }
-
- io_end->offset = offset;
- io_end->size = size;
- if (is_async) {
- io_end->iocb = iocb;
- io_end->result = ret;
- }
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
- /* Add the io_end to per-inode completed aio dio list*/
- ei = EXT4_I(io_end->inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
iocb->private = NULL;
-}
-
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
-{
- ext4_io_end_t *io_end = bh->b_private;
- struct workqueue_struct *wq;
- struct inode *inode;
- unsigned long flags;
-
- if (!test_clear_buffer_uninit(bh) || !io_end)
- goto out;
-
- if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
- printk("sb umounted, discard end_io request for inode %lu\n",
- io_end->inode->i_ino);
- ext4_free_io_end(io_end);
- goto out;
- }
-
- io_end->flag = EXT4_IO_END_UNWRITTEN;
- inode = io_end->inode;
-
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
-out:
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- clear_buffer_uninit(bh);
- end_buffer_async_write(bh, uptodate);
-}
-
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
-{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
- loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
- size_t size = bh->b_size;
-
-retry:
- io_end = ext4_init_io_end(inode, GFP_ATOMIC);
- if (!io_end) {
- if (printk_ratelimit())
- printk(KERN_WARNING "%s: allocation fail\n", __func__);
- schedule();
- goto retry;
- }
io_end->offset = offset;
io_end->size = size;
- /*
- * We need to hold a reference to the page to make sure it
- * doesn't get evicted before ext4_end_io_work() has a chance
- * to convert the extent from written to unwritten.
- */
- io_end->page = page;
- get_page(io_end->page);
-
- bh->b_private = io_end;
- bh->b_end_io = ext4_end_io_buffer_write;
- return 0;
+ ext4_put_io_end(io_end);
}
/*
@@ -3742,13 +3078,13 @@ retry:
* preallocated extents, and those write extend the file, no need to
* fall back to buffered IO.
*
- * For holes, we fallocate those blocks, mark them as unintialized
- * If those blocks were preallocated, we mark sure they are splited, but
- * still keep the range to write as unintialized.
+ * For holes, we fallocate those blocks, mark them as unwritten
+ * If those blocks were preallocated, we mark sure they are split, but
+ * still keep the range to write as unwritten.
*
- * The unwrritten extents will be converted to written when DIO is completed.
+ * The unwritten extents will be converted to written when DIO is completed.
* For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the convertion
+ * set up an end_io call back function, which will do the conversion
* when async direct IO completed.
*
* If the O_DIRECT write will extend the file then add this inode to the
@@ -3757,107 +3093,166 @@ retry:
*
*/
static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- size_t count = iov_length(iov, nr_segs);
-
+ size_t count = iov_iter_count(iter);
+ int overwrite = 0;
+ get_block_t *get_block_func = NULL;
+ int dio_flags = 0;
loff_t final_size = offset + count;
- if (rw == WRITE && final_size <= inode->i_size) {
- /*
- * We could direct write to holes and fallocate.
- *
- * Allocated blocks to fill the hole are marked as uninitialized
- * to prevent paralel buffered read to expose the stale data
- * before DIO complete the data IO.
- *
- * As to previously fallocated extents, ext4 get_block
- * will just simply mark the buffer mapped but still
- * keep the extents uninitialized.
- *
- * for non AIO case, we will convert those unwritten extents
- * to written after return back from blockdev_direct_IO.
- *
- * for async DIO, the conversion needs to be defered when
- * the IO is completed. The ext4 end_io callback function
- * will be called to take care of the conversion work.
- * Here for async case, we allocate an io_end structure to
- * hook to the iocb.
- */
- iocb->private = NULL;
- EXT4_I(inode)->cur_aio_dio = NULL;
- if (!is_sync_kiocb(iocb)) {
- iocb->private = ext4_init_io_end(inode, GFP_NOFS);
- if (!iocb->private)
- return -ENOMEM;
- /*
- * we save the io structure for current async
- * direct IO, so that later ext4_map_blocks()
- * could flag the io structure whether there
- * is a unwritten extents needs to be converted
- * when IO is completed.
- */
- EXT4_I(inode)->cur_aio_dio = iocb->private;
+ ext4_io_end_t *io_end = NULL;
+
+ /* Use the old path for reads and writes beyond i_size. */
+ if (rw != WRITE || final_size > inode->i_size)
+ return ext4_ind_direct_IO(rw, iocb, iter, offset);
+
+ BUG_ON(iocb->private == NULL);
+
+ /*
+ * Make all waiters for direct IO properly wait also for extent
+ * conversion. This also disallows race between truncate() and
+ * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ */
+ if (rw == WRITE)
+ atomic_inc(&inode->i_dio_count);
+
+ /* If we do a overwrite dio, i_mutex locking can be released */
+ overwrite = *((int *)iocb->private);
+
+ if (overwrite) {
+ down_read(&EXT4_I(inode)->i_data_sem);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as
+ * unwritten to prevent parallel buffered read to expose
+ * the stale data before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block will
+ * just simply mark the buffer mapped but still keep the
+ * extents unwritten.
+ *
+ * For non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * For async DIO, the conversion needs to be deferred when the
+ * IO is completed. The ext4 end_io callback function will be
+ * called to take care of the conversion work. Here for async
+ * case, we allocate an io_end structure to hook to the iocb.
+ */
+ iocb->private = NULL;
+ ext4_inode_aio_set(inode, NULL);
+ if (!is_sync_kiocb(iocb)) {
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end) {
+ ret = -ENOMEM;
+ goto retake_lock;
}
+ /*
+ * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
+ /*
+ * we save the io structure for current async direct
+ * IO, so that later ext4_map_blocks() could flag the
+ * io structure whether there is a unwritten extents
+ * needs to be converted when IO is completed.
+ */
+ ext4_inode_aio_set(inode, io_end);
+ }
- ret = blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block_write,
- ext4_end_io_dio);
- if (iocb->private)
- EXT4_I(inode)->cur_aio_dio = NULL;
+ if (overwrite) {
+ get_block_func = ext4_get_block_write_nolock;
+ } else {
+ get_block_func = ext4_get_block_write;
+ dio_flags = DIO_LOCKING;
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter,
+ offset,
+ get_block_func,
+ ext4_end_io_dio,
+ NULL,
+ dio_flags);
+
+ /*
+ * Put our reference to io_end. This can free the io_end structure e.g.
+ * in sync IO case or in case of error. It can even perform extent
+ * conversion if all bios we submitted finished before we got here.
+ * Note that in that case iocb->private can be already set to NULL
+ * here.
+ */
+ if (io_end) {
+ ext4_inode_aio_set(inode, NULL);
+ ext4_put_io_end(io_end);
/*
- * The io_end structure takes a reference to the inode,
- * that structure needs to be destroyed and the
- * reference to the inode need to be dropped, when IO is
- * complete, even with 0 byte write, or failed.
- *
- * In the successful AIO DIO case, the io_end structure will be
- * desctroyed and the reference to the inode will be dropped
- * after the end_io call back function is called.
- *
- * In the case there is 0 byte write, or error case, since
- * VFS direct IO won't invoke the end_io call back function,
- * we need to free the end_io structure here.
+ * When no IO was submitted ext4_end_io_dio() was not
+ * called so we have to put iocb's reference.
*/
- if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
- ext4_free_io_end(iocb->private);
+ if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+ WARN_ON(iocb->private != io_end);
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ ext4_put_io_end(io_end);
iocb->private = NULL;
- } else if (ret > 0 && ext4_test_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN)) {
- int err;
- /*
- * for non AIO case, since the IO is already
- * completed, we could do the convertion right here
- */
- err = ext4_convert_unwritten_extents(inode,
- offset, ret);
- if (err < 0)
- ret = err;
- ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
- return ret;
+ }
+ if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the conversion right here
+ */
+ err = ext4_convert_unwritten_extents(NULL, inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
- /* for write the the end of file case, we fall back to old way */
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+retake_lock:
+ if (rw == WRITE)
+ inode_dio_done(inode);
+ /* take i_mutex locking again if we do a ovewrite dio */
+ if (overwrite) {
+ up_read(&EXT4_I(inode)->i_data_sem);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ return ret;
}
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+ /*
+ * If we are doing data journalling we don't support O_DIRECT
+ */
+ if (ext4_should_journal_data(inode))
+ return 0;
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+ /* Let buffer I/O handle the inline data case. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
+ trace_ext4_direct_IO_enter(inode, offset, count, rw);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
+ else
+ ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+ trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
+ return ret;
}
/*
@@ -3879,29 +3274,13 @@ static int ext4_journalled_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
-static const struct address_space_operations ext4_ordered_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_ordered_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
-};
-
-static const struct address_space_operations ext4_writeback_aops = {
+static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .sync_page = block_sync_page,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
- .write_end = ext4_writeback_write_end,
+ .write_end = ext4_write_end,
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
@@ -3915,13 +3294,14 @@ static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .sync_page = block_sync_page,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -3930,8 +3310,7 @@ static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .writepages = ext4_da_writepages,
- .sync_page = block_sync_page,
+ .writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
.bmap = ext4_bmap,
@@ -3945,32 +3324,38 @@ static const struct address_space_operations ext4_da_aops = {
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode) &&
- test_opt(inode->i_sb, DELALLOC))
- inode->i_mapping->a_ops = &ext4_da_aops;
- else if (ext4_should_order_data(inode))
- inode->i_mapping->a_ops = &ext4_ordered_aops;
- else if (ext4_should_writeback_data(inode) &&
- test_opt(inode->i_sb, DELALLOC))
+ switch (ext4_inode_journal_mode(inode)) {
+ case EXT4_INODE_ORDERED_DATA_MODE:
+ ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+ break;
+ case EXT4_INODE_WRITEBACK_DATA_MODE:
+ ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+ break;
+ case EXT4_INODE_JOURNAL_DATA_MODE:
+ inode->i_mapping->a_ops = &ext4_journalled_aops;
+ return;
+ default:
+ BUG();
+ }
+ if (test_opt(inode->i_sb, DELALLOC))
inode->i_mapping->a_ops = &ext4_da_aops;
- else if (ext4_should_writeback_data(inode))
- inode->i_mapping->a_ops = &ext4_writeback_aops;
else
- inode->i_mapping->a_ops = &ext4_journalled_aops;
+ inode->i_mapping->a_ops = &ext4_aops;
}
/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
*/
-int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from)
+static int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, length, pos;
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3980,10 +3365,18 @@ int ext4_block_truncate_page(handle_t *handle,
page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
mapping_gfp_mask(mapping) & ~__GFP_FS);
if (!page)
- return -EINVAL;
+ return -ENOMEM;
blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
+ max = blocksize - (offset & (blocksize - 1));
+
+ /*
+ * correct length if it does not fall between
+ * 'from' and the end of the block
+ */
+ if (length > max || length < 0)
+ length = max;
+
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
@@ -3997,13 +3390,10 @@ int ext4_block_truncate_page(handle_t *handle,
iblock++;
pos += blocksize;
}
-
- err = 0;
if (buffer_freed(bh)) {
BUFFER_TRACE(bh, "freed: skip");
goto unlock;
}
-
if (!buffer_mapped(bh)) {
BUFFER_TRACE(bh, "unmapped");
ext4_get_block(inode, iblock, bh, 0);
@@ -4026,25 +3416,22 @@ int ext4_block_truncate_page(handle_t *handle,
if (!buffer_uptodate(bh))
goto unlock;
}
-
if (ext4_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
goto unlock;
}
-
zero_user(page, offset, length);
-
BUFFER_TRACE(bh, "zeroed end of block");
- err = 0;
if (ext4_should_journal_data(inode)) {
err = ext4_handle_dirty_metadata(handle, inode, bh);
} else {
- if (ext4_should_order_data(inode))
- err = ext4_jbd2_file_inode(handle, inode);
+ err = 0;
mark_buffer_dirty(bh);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+ err = ext4_jbd2_file_inode(handle, inode);
}
unlock:
@@ -4054,372 +3441,234 @@ unlock:
}
/*
- * Probably it should be a library function... search for first non-zero word
- * or memcmp with zero_page, whatever is better for particular architecture.
- * Linus?
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
*/
-static inline int all_zeroes(__le32 *p, __le32 *q)
+static int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
{
- while (p < q)
- if (*p++)
- return 0;
- return 1;
-}
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
+ struct inode *inode = mapping->host;
-/**
- * ext4_find_shared - find the indirect blocks for partial truncation.
- * @inode: inode in question
- * @depth: depth of the affected branch
- * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
- * @chain: place to store the pointers to partial indirect blocks
- * @top: place to the (detached) top of branch
- *
- * This is a helper function used by ext4_truncate().
- *
- * When we do truncate() we may have to clean the ends of several
- * indirect blocks but leave the blocks themselves alive. Block is
- * partially truncated if some data below the new i_size is refered
- * from it (and it is on the path to the first completely truncated
- * data block, indeed). We have to free the top of that path along
- * with everything to the right of the path. Since no allocation
- * past the truncation point is possible until ext4_truncate()
- * finishes, we may safely do the latter, but top of branch may
- * require special attention - pageout below the truncation point
- * might try to populate it.
- *
- * We atomically detach the top of branch from the tree, store the
- * block number of its root in *@top, pointers to buffer_heads of
- * partially truncated blocks - in @chain[].bh and pointers to
- * their last elements that should not be removed - in
- * @chain[].p. Return value is the pointer to last filled element
- * of @chain.
- *
- * The work left to caller to do the actual freeing of subtrees:
- * a) free the subtree starting from *@top
- * b) free the subtrees whose roots are stored in
- * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
- * c) free the subtrees growing from the inode past the @chain[0].
- * (no partially truncated stuff there). */
-
-static Indirect *ext4_find_shared(struct inode *inode, int depth,
- ext4_lblk_t offsets[4], Indirect chain[4],
- __le32 *top)
-{
- Indirect *partial, *p;
- int k, err;
-
- *top = 0;
- /* Make k index the deepest non-null offset + 1 */
- for (k = depth; k > 1 && !offsets[k-1]; k--)
- ;
- partial = ext4_get_branch(inode, k, offsets, chain, &err);
- /* Writer: pointers */
- if (!partial)
- partial = chain + k-1;
- /*
- * If the branch acquired continuation since we've looked at it -
- * fine, it should all survive and (new) top doesn't belong to us.
- */
- if (!partial->key && *partial->p)
- /* Writer: end */
- goto no_top;
- for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
- ;
- /*
- * OK, we've found the last block that must survive. The rest of our
- * branch should be detached before unlocking. However, if that rest
- * of branch is all ours and does not grow immediately from the inode
- * it's easier to cheat and just decrement partial->p.
- */
- if (p == chain + k - 1 && p > chain) {
- p->p--;
- } else {
- *top = *p->p;
- /* Nope, don't do this in ext4. Must leave the tree intact */
-#if 0
- *p->p = 0;
-#endif
- }
- /* Writer: end */
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
- while (partial > p) {
- brelse(partial->bh);
- partial--;
- }
-no_top:
- return partial;
+ return ext4_block_zero_page_range(handle, mapping, from, length);
}
-/*
- * Zero a number of block pointers in either an inode or an indirect block.
- * If we restart the transaction we must again get write access to the
- * indirect block for further modification.
- *
- * We release `count' blocks on disk, but (last - first) may be greater
- * than `count' because there can be holes in there.
- */
-static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh,
- ext4_fsblk_t block_to_free,
- unsigned long count, __le32 *first,
- __le32 *last)
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t length)
{
- __le32 *p;
- int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned partial_start, partial_end;
+ ext4_fsblk_t start, end;
+ loff_t byte_end = (lstart + length - 1);
+ int err = 0;
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ partial_start = lstart & (sb->s_blocksize - 1);
+ partial_end = byte_end & (sb->s_blocksize - 1);
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
- count)) {
- EXT4_ERROR_INODE(inode, "attempt to clear invalid "
- "blocks %llu len %lu",
- (unsigned long long) block_to_free, count);
- return 1;
- }
+ start = lstart >> sb->s_blocksize_bits;
+ end = byte_end >> sb->s_blocksize_bits;
- if (try_to_extend_transaction(handle, inode)) {
- if (bh) {
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle, inode, bh);
- }
- ext4_mark_inode_dirty(handle, inode);
- ext4_truncate_restart_trans(handle, inode,
- blocks_for_truncate(inode));
- if (bh) {
- BUFFER_TRACE(bh, "retaking write access");
- ext4_journal_get_write_access(handle, bh);
- }
+ /* Handle partial zero within the single block */
+ if (start == end &&
+ (partial_start || (partial_end != sb->s_blocksize - 1))) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, length);
+ return err;
}
+ /* Handle partial zero out on the start of the range */
+ if (partial_start) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, sb->s_blocksize);
+ if (err)
+ return err;
+ }
+ /* Handle partial zero out on the end of the range */
+ if (partial_end != sb->s_blocksize - 1)
+ err = ext4_block_zero_page_range(handle, mapping,
+ byte_end - partial_end,
+ partial_end + 1);
+ return err;
+}
- for (p = first; p < last; p++)
- *p = 0;
-
- ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
+int ext4_can_truncate(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
return 0;
}
-/**
- * ext4_free_data - free a list of data blocks
- * @handle: handle for this transaction
- * @inode: inode we are dealing with
- * @this_bh: indirect buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: points immediately past the end of array
- *
- * We are freeing all blocks refered from that array (numbers are stored as
- * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+/*
+ * ext4_punch_hole: punches a hole in a file by releaseing the blocks
+ * associated with the given offset and length
*
- * We accumulate contiguous runs of blocks to free. Conveniently, if these
- * blocks are contiguous then releasing them at one time will only affect one
- * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
- * actually use a lot of journal space.
+ * @inode: File inode
+ * @offset: The offset where the hole will begin
+ * @len: The length of the hole
*
- * @this_bh will be %NULL if @first and @last point into the inode's direct
- * block pointers.
+ * Returns: 0 on success or negative on failure
*/
-static void ext4_free_data(handle_t *handle, struct inode *inode,
- struct buffer_head *this_bh,
- __le32 *first, __le32 *last)
+
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
{
- ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
- unsigned long count = 0; /* Number of blocks in the run */
- __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
- corresponding to
- block_to_free */
- ext4_fsblk_t nr; /* Current block # */
- __le32 *p; /* Pointer into inode/ind
- for current block */
- int err;
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t first_block, stop_block;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t first_block_offset, last_block_offset;
+ handle_t *handle;
+ unsigned int credits;
+ int ret = 0;
- if (this_bh) { /* For indirect block */
- BUFFER_TRACE(this_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, this_bh);
- /* Important: if we can't update the indirect pointers
- * to the blocks, we can't free them. */
- if (err)
- return;
- }
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
- for (p = first; p < last; p++) {
- nr = le32_to_cpu(*p);
- if (nr) {
- /* accumulate blocks to free if they're contiguous */
- if (count == 0) {
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- } else if (nr == block_to_free + count) {
- count++;
- } else {
- if (ext4_clear_blocks(handle, inode, this_bh,
- block_to_free, count,
- block_to_free_p, p))
- break;
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- }
- }
+ trace_ext4_punch_hole(inode, offset, length, 0);
+
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + length - 1);
+ if (ret)
+ return ret;
}
- if (count > 0)
- ext4_clear_blocks(handle, inode, this_bh, block_to_free,
- count, block_to_free_p, p);
+ mutex_lock(&inode->i_mutex);
- if (this_bh) {
- BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ goto out_mutex;
+ /*
+ * If the hole extends beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
+ }
+
+ if (offset & (sb->s_blocksize - 1) ||
+ (offset + length) & (sb->s_blocksize - 1)) {
/*
- * The buffer head should have an attached journal head at this
- * point. However, if the data is corrupted and an indirect
- * block pointed to itself, it would have been detached when
- * the block was cleared. Check for this instead of OOPSing.
+ * Attach jinode to inode for jbd2 if we do any zeroing of
+ * partial block
*/
- if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
- ext4_handle_dirty_metadata(handle, inode, this_bh);
- else
- EXT4_ERROR_INODE(inode,
- "circular indirect block detected at "
- "block %llu",
- (unsigned long long) this_bh->b_blocknr);
+ ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ goto out_mutex;
+
}
-}
-/**
- * ext4_free_branches - free an array of branches
- * @handle: JBD handle for this transaction
- * @inode: inode we are dealing with
- * @parent_bh: the buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: pointer immediately past the end of array
- * @depth: depth of the branches to free
- *
- * We are freeing all blocks refered from these branches (numbers are
- * stored as little-endian 32-bit) and updating @inode->i_blocks
- * appropriately.
- */
-static void ext4_free_branches(handle_t *handle, struct inode *inode,
- struct buffer_head *parent_bh,
- __le32 *first, __le32 *last, int depth)
-{
- ext4_fsblk_t nr;
- __le32 *p;
+ first_block_offset = round_up(offset, sb->s_blocksize);
+ last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
- if (ext4_handle_is_aborted(handle))
- return;
+ /* Now release the pages and zero block aligned part of pages*/
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
- if (depth--) {
- struct buffer_head *bh;
- int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- p = last;
- while (--p >= first) {
- nr = le32_to_cpu(*p);
- if (!nr)
- continue; /* A hole */
-
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- nr, 1)) {
- EXT4_ERROR_INODE(inode,
- "invalid indirect mapped "
- "block %lu (level %d)",
- (unsigned long) nr, depth);
- break;
- }
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
- /* Go read the buffer for the next level down */
- bh = sb_bread(inode->i_sb, nr);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(sb, ret);
+ goto out_dio;
+ }
- /*
- * A read failure? Report error and clear slot
- * (should be rare).
- */
- if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, nr,
- "Read failure");
- continue;
- }
+ ret = ext4_zero_partial_blocks(handle, inode, offset,
+ length);
+ if (ret)
+ goto out_stop;
- /* This zaps the entire block. Bottom up. */
- BUFFER_TRACE(bh, "free child branches");
- ext4_free_branches(handle, inode, bh,
- (__le32 *) bh->b_data,
- (__le32 *) bh->b_data + addr_per_block,
- depth);
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
- /*
- * Everything below this this pointer has been
- * released. Now let this top-of-subtree go.
- *
- * We want the freeing of this indirect block to be
- * atomic in the journal with the updating of the
- * bitmap block which owns it. So make some room in
- * the journal.
- *
- * We zero the parent pointer *after* freeing its
- * pointee in the bitmaps, so if extend_transaction()
- * for some reason fails to put the bitmap changes and
- * the release into the same transaction, recovery
- * will merely complain about releasing a free block,
- * rather than leaking blocks.
- */
- if (ext4_handle_is_aborted(handle))
- return;
- if (try_to_extend_transaction(handle, inode)) {
- ext4_mark_inode_dirty(handle, inode);
- ext4_truncate_restart_trans(handle, inode,
- blocks_for_truncate(inode));
- }
+ /* If there are no blocks to remove, return now */
+ if (first_block >= stop_block)
+ goto out_stop;
- /*
- * The forget flag here is critical because if
- * we are journaling (and not doing data
- * journaling), we have to make sure a revoke
- * record is written to prevent the journal
- * replay from overwriting the (former)
- * indirect block if it gets reallocated as a
- * data block. This must happen in the same
- * transaction where the data blocks are
- * actually freed.
- */
- ext4_free_blocks(handle, inode, 0, nr, 1,
- EXT4_FREE_BLOCKS_METADATA|
- EXT4_FREE_BLOCKS_FORGET);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
- if (parent_bh) {
- /*
- * The block which we have just freed is
- * pointed to by an indirect block: journal it
- */
- BUFFER_TRACE(parent_bh, "get_write_access");
- if (!ext4_journal_get_write_access(handle,
- parent_bh)){
- *p = 0;
- BUFFER_TRACE(parent_bh,
- "call ext4_handle_dirty_metadata");
- ext4_handle_dirty_metadata(handle,
- inode,
- parent_bh);
- }
- }
- }
- } else {
- /* We have reached the bottom of the tree. */
- BUFFER_TRACE(parent_bh, "free data blocks");
- ext4_free_data(handle, inode, parent_bh, first, last);
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
}
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_remove_space(inode, first_block,
+ stop_block - 1);
+ else
+ ret = ext4_free_hole_blocks(handle, inode, first_block,
+ stop_block);
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ /* Now release the pages again to reduce race window */
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
-int ext4_can_truncate(struct inode *inode)
+int ext4_inode_attach_jinode(struct inode *inode)
{
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct jbd2_inode *jinode;
+
+ if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
return 0;
- if (S_ISREG(inode->i_mode))
- return 1;
- if (S_ISDIR(inode->i_mode))
- return 1;
- if (S_ISLNK(inode->i_mode))
- return !ext4_inode_is_fast_symlink(inode);
+
+ jinode = jbd2_alloc_inode(GFP_KERNEL);
+ spin_lock(&inode->i_lock);
+ if (!ei->jinode) {
+ if (!jinode) {
+ spin_unlock(&inode->i_lock);
+ return -ENOMEM;
+ }
+ ei->jinode = jinode;
+ jbd2_journal_init_jbd_inode(ei->jinode, inode);
+ jinode = NULL;
+ }
+ spin_unlock(&inode->i_lock);
+ if (unlikely(jinode != NULL))
+ jbd2_free_inode(jinode);
return 0;
}
@@ -4430,7 +3679,7 @@ int ext4_can_truncate(struct inode *inode)
* transaction, and VFS/VM ensures that ext4_truncate() cannot run
* simultaneously on behalf of the same inode.
*
- * As we work through the truncate and commmit bits of it to the journal there
+ * As we work through the truncate and commit bits of it to the journal there
* is one core, guiding principle: the file's tree must always be consistent on
* disk. We must be able to restart the truncate after a crash.
*
@@ -4453,18 +3702,19 @@ int ext4_can_truncate(struct inode *inode)
*/
void ext4_truncate(struct inode *inode)
{
- handle_t *handle;
struct ext4_inode_info *ei = EXT4_I(inode);
- __le32 *i_data = ei->i_data;
- int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ unsigned int credits;
+ handle_t *handle;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t offsets[4];
- Indirect chain[4];
- Indirect *partial;
- __le32 nr = 0;
- int n;
- ext4_lblk_t last_block;
- unsigned blocksize = inode->i_sb->s_blocksize;
+
+ /*
+ * There is a possibility that we're either freeing the inode
+ * or it's a completely new inode. In those cases we might not
+ * have i_mutex locked because it's not necessary.
+ */
+ if (!(inode->i_state & (I_NEW|I_FREEING)))
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
+ trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
return;
@@ -4474,31 +3724,39 @@ void ext4_truncate(struct inode *inode)
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ext4_ext_truncate(inode);
- return;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ ext4_inline_data_truncate(inode, &has_inline);
+ if (has_inline)
+ return;
}
- handle = start_transaction(inode);
- if (IS_ERR(handle))
- return; /* AKPM: return what? */
+ /* If we zero-out tail of the page, we have to create jinode for jbd2 */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
+ if (ext4_inode_attach_jinode(inode) < 0)
+ return;
+ }
- last_block = (inode->i_size + blocksize-1)
- >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+ credits = ext4_blocks_for_truncate(inode);
- if (inode->i_size & (blocksize - 1))
- if (ext4_block_truncate_page(handle, mapping, inode->i_size))
- goto out_stop;
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
+ return;
+ }
- n = ext4_block_to_path(inode, last_block, offsets, NULL);
- if (n == 0)
- goto out_stop; /* error */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
/*
- * OK. This truncate is going to happen. We add the inode to the
- * orphan list, so that if this truncate spans multiple transactions,
- * and we crash, we will resume the truncate when the filesystem
- * recovers. It also marks the inode dirty, to catch the new size.
+ * We add the inode to the orphan list, so that if this
+ * truncate spans multiple transactions, and we crash, we will
+ * resume the truncate when the filesystem recovers. It also
+ * marks the inode dirty, to catch the new size.
*
* Implication: the file must always be in a sane, consistent
* truncatable state while each transaction commits.
@@ -4506,96 +3764,23 @@ void ext4_truncate(struct inode *inode)
if (ext4_orphan_add(handle, inode))
goto out_stop;
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- /*
- * The orphan list entry will now protect us from any crash which
- * occurs before the truncate completes, so it is now safe to propagate
- * the new, shorter inode size (held for now in i_size) into the
- * on-disk inode. We do this via i_disksize, which is the value which
- * ext4 *really* writes onto the disk inode.
- */
- ei->i_disksize = inode->i_size;
-
- if (n == 1) { /* direct blocks */
- ext4_free_data(handle, inode, NULL, i_data+offsets[0],
- i_data + EXT4_NDIR_BLOCKS);
- goto do_indirects;
- }
-
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
- /* Kill the top of shared branch (not detached) */
- if (nr) {
- if (partial == chain) {
- /* Shared branch grows from the inode */
- ext4_free_branches(handle, inode, NULL,
- &nr, &nr+1, (chain+n-1) - partial);
- *partial->p = 0;
- /*
- * We mark the inode dirty prior to restart,
- * and prior to stop. No need for it here.
- */
- } else {
- /* Shared branch grows from an indirect block */
- BUFFER_TRACE(partial->bh, "get_write_access");
- ext4_free_branches(handle, inode, partial->bh,
- partial->p,
- partial->p+1, (chain+n-1) - partial);
- }
- }
- /* Clear the ends of indirect blocks on the shared branch */
- while (partial > chain) {
- ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
- (__le32*)partial->bh->b_data+addr_per_block,
- (chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- partial--;
- }
-do_indirects:
- /* Kill the remaining (whole) subtrees */
- switch (offsets[0]) {
- default:
- nr = i_data[EXT4_IND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
- i_data[EXT4_IND_BLOCK] = 0;
- }
- case EXT4_IND_BLOCK:
- nr = i_data[EXT4_DIND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
- i_data[EXT4_DIND_BLOCK] = 0;
- }
- case EXT4_DIND_BLOCK:
- nr = i_data[EXT4_TIND_BLOCK];
- if (nr) {
- ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
- i_data[EXT4_TIND_BLOCK] = 0;
- }
- case EXT4_TIND_BLOCK:
- ;
- }
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ext4_ext_truncate(handle, inode);
+ else
+ ext4_ind_truncate(handle, inode);
up_write(&ei->i_data_sem);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- /*
- * In a multi-transaction truncate, we only make the final transaction
- * synchronous
- */
if (IS_SYNC(inode))
ext4_handle_sync(handle);
+
out_stop:
/*
- * If this was a simple ftruncate(), and the file will remain alive
+ * If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
* However, if this was a real unlink then we were called by
* ext4_delete_inode(), and we allow that function to clean up the
@@ -4604,7 +3789,11 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+
+ trace_ext4_truncate_exit(inode);
}
/*
@@ -4634,18 +3823,15 @@ static int __ext4_get_inode_loc(struct inode *inode,
/*
* Figure out the offset within the block group inode table
*/
- inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
bh = sb_getblk(sb, block);
- if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, block,
- "unable to read itable block");
- return -EIO;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -4677,7 +3863,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
/* Is the inode bitmap in cache? */
bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
@@ -4713,16 +3899,16 @@ make_io:
if (EXT4_SB(sb)->s_inode_readahead_blks) {
ext4_fsblk_t b, end, table;
unsigned num;
+ __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
table = ext4_inode_table(sb, gdp);
/* s_inode_readahead_blks is always a power of 2 */
- b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+ b = block & ~((ext4_fsblk_t) ra_blks - 1);
if (table > b)
b = table;
- end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+ end = b + ra_blks;
num = EXT4_INODES_PER_GROUP(sb);
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ if (ext4_has_group_desc_csum(sb))
num -= ext4_itable_unused_count(sb, gdp);
table += num / inodes_per_block;
if (end > table)
@@ -4736,9 +3922,10 @@ make_io:
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
+ trace_ext4_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ_META, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
EXT4_ERROR_INODE_BLOCK(inode, block,
@@ -4762,18 +3949,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
void ext4_set_inode_flags(struct inode *inode)
{
unsigned int flags = EXT4_I(inode)->i_flags;
+ unsigned int new_fl = 0;
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT4_SYNC_FL)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (flags & EXT4_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & EXT4_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+ inode_set_flags(inode, new_fl,
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4824,6 +4013,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
}
}
+static inline void ext4_iget_extra_inode(struct inode *inode,
+ struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_find_inline_data_nolock(inode);
+ } else
+ EXT4_I(inode)->i_inline_off = 0;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -4833,6 +4035,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
+ uid_t i_uid;
+ gid_t i_gid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -4841,22 +4045,58 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
- iloc.bh = 0;
+ iloc.bh = NULL;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+ EXT4_INODE_SIZE(inode->i_sb)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ ret = -EIO;
+ goto bad_inode;
+ }
+ } else
+ ei->i_extra_isize = 0;
+
+ /* Precompute checksum seed for inode metadata */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 csum;
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = raw_inode->i_generation;
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+ sizeof(gen));
+ }
+
+ if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+ EXT4_ERROR_INODE(inode, "checksum invalid");
+ ret = -EIO;
+ goto bad_inode;
+ }
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
if (!(test_opt(inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
- ei->i_state_flags = 0;
+ ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+ ei->i_inline_off = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -4865,8 +4105,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0) {
- if (inode->i_mode == 0 ||
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ if ((inode->i_mode == 0 ||
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+ ino != EXT4_BOOT_LOADER_INO) {
/* this inode is deleted */
ret = -ESTALE;
goto bad_inode;
@@ -4874,7 +4115,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
/* The only unlinked inodes we let through here have
* valid i_mode and are being read by the orphan
* recovery code: that's fine, we're about to complete
- * the process of deleting those. */
+ * the process of deleting those.
+ * OR it is the EXT4_BOOT_LOADER_INO which is
+ * not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
@@ -4924,36 +4167,27 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
- EXT4_INODE_SIZE(inode->i_sb)) {
- ret = -EIO;
- goto bad_inode;
- }
if (ei->i_extra_isize == 0) {
/* The extra space is currently unused. Use it. */
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
- __le32 *magic = (void *)raw_inode +
- EXT4_GOOD_OLD_INODE_SIZE +
- ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_iget_extra_inode(inode, raw_inode, ei);
}
- } else
- ei->i_extra_isize = 0;
+ }
EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
- inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- inode->i_version |=
- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ inode->i_version |=
+ (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ }
}
ret = 0;
@@ -4963,17 +4197,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_file_acl);
ret = -EIO;
goto bad_inode;
- } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode)))
- /* Validate extent which is part of inode */
- ret = ext4_ext_check_inode(inode);
- } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode))) {
- /* Validate block references which are part of inode */
- ret = ext4_check_inode_blockref(inode);
+ } else if (!ext4_has_inline_data(inode)) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))))
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_ind_check_inode(inode);
+ }
}
if (ret)
goto bad_inode;
@@ -5003,6 +4239,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else if (ino == EXT4_BOOT_LOADER_INO) {
+ make_bad_inode(inode);
} else {
ret = -EIO;
EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
@@ -5029,7 +4267,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
if (i_blocks <= ~0U) {
/*
- * i_blocks can be represnted in a 32 bit variable
+ * i_blocks can be represented in a 32 bit variable
* as multiple of 512 bytes
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -5072,36 +4310,42 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
+ struct super_block *sb = inode->i_sb;
int err = 0, rc, block;
+ int need_datasync = 0, set_large_file = 0;
+ uid_t i_uid;
+ gid_t i_gid;
- /* For fields not not tracking in the in-memory inode,
+ spin_lock(&ei->i_raw_lock);
+
+ /* For fields not tracked in the in-memory inode,
* initialise them to zero for new inodes. */
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
ext4_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
if (!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
if (!ei->i_dtime) {
raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(inode->i_uid));
+ cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(inode->i_gid));
+ cpu_to_le16(high_16_bits(i_gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
- raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(inode->i_uid));
- raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
@@ -5112,37 +4356,26 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
- if (ext4_inode_blocks_set(handle, raw_inode, ei))
+ if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
+ spin_unlock(&ei->i_raw_lock);
goto out_brelse;
+ }
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags);
- if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
- cpu_to_le32(EXT4_OS_HURD))
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (ei->i_disksize != ext4_isize(raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
- struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV)) {
- /* If this is the first large file
- * created, add a flag to the superblock.
- */
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- goto out_brelse;
- ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
- sb->s_dirt = 1;
- ext4_handle_sync(handle);
- err = ext4_handle_dirty_metadata(handle, NULL,
- EXT4_SB(sb)->s_sbh);
- }
+ cpu_to_le32(EXT4_GOOD_OLD_REV))
+ set_large_file = 1;
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -5156,25 +4389,43 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(new_encode_dev(inode->i_rdev));
raw_inode->i_block[2] = 0;
}
- } else
+ } else if (!ext4_has_inline_data(inode)) {
for (block = 0; block < EXT4_N_BLOCKS; block++)
raw_inode->i_block[block] = ei->i_data[block];
+ }
- raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(inode->i_version >> 32);
- raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(inode->i_version >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
}
+ ext4_inode_csum_set(inode, raw_inode, ei);
+
+ spin_unlock(&ei->i_raw_lock);
+
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
-
- ext4_update_inode_fsync_trans(handle, inode, 0);
+ if (set_large_file) {
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err)
+ goto out_brelse;
+ ext4_update_dynamic_rev(sb);
+ EXT4_SET_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_handle_sync(handle);
+ err = ext4_handle_dirty_super(handle, sb);
+ }
+ ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -5186,21 +4437,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
- * trasnaction to commit.
+ * transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -5212,15 +4462,15 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (EXT4_SB(inode->i_sb)->s_journal) {
@@ -5230,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
return -EIO;
}
- if (wbc->sync_mode != WB_SYNC_ALL)
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext4_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
err = ext4_force_commit(inode->i_sb);
@@ -5240,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
err = __ext4_get_inode_loc(inode, &iloc, 0);
if (err)
return err;
- if (wbc->sync_mode == WB_SYNC_ALL)
+ /*
+ * sync(2) will flush the whole buffer cache. No need to do
+ * it here separately for each inode.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
@@ -5253,6 +4512,48 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
+ * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
+ * buffers that are attached to a page stradding i_size and are undergoing
+ * commit. In that case we have to wait for commit to finish and try again.
+ */
+static void ext4_wait_for_tail_page_commit(struct inode *inode)
+{
+ struct page *page;
+ unsigned offset;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = 0;
+ int ret;
+
+ offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ /*
+ * All buffers in the last page remain valid? Then there's nothing to
+ * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+ * blocksize case
+ */
+ if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+ return;
+ while (1) {
+ page = find_lock_page(inode->i_mapping,
+ inode->i_size >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return;
+ ret = __ext4_journalled_invalidatepage(page, offset,
+ PAGE_CACHE_SIZE - offset);
+ unlock_page(page);
+ page_cache_release(page);
+ if (ret != -EBUSY)
+ return;
+ commit_tid = 0;
+ read_lock(&journal->j_state_lock);
+ if (journal->j_committing_transaction)
+ commit_tid = journal->j_committing_transaction->t_tid;
+ read_unlock(&journal->j_state_lock);
+ if (commit_tid)
+ jbd2_log_wait_commit(journal, commit_tid);
+ }
+}
+
+/*
* ext4_setattr()
*
* Called from notify_change.
@@ -5289,14 +4590,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (is_quota_modification(inode, attr))
dquot_initialize(inode);
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
- EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+ (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -5316,60 +4618,82 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_journal_stop(handle);
}
- if (attr->ia_valid & ATTR_SIZE) {
+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ handle_t *handle;
+
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (attr->ia_size > sbi->s_bitmap_maxbytes)
return -EFBIG;
}
- }
-
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE &&
- (attr->ia_size < inode->i_size ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
- handle_t *handle;
- handle = ext4_journal_start(inode, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto err_out;
- }
- if (ext4_handle_valid(handle)) {
- error = ext4_orphan_add(handle, inode);
- orphan = 1;
- }
- EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
- ext4_journal_stop(handle);
+ if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+ inode_inc_iversion(inode);
- if (ext4_should_order_data(inode)) {
- error = ext4_begin_ordered_truncate(inode,
+ if (S_ISREG(inode->i_mode) &&
+ (attr->ia_size < inode->i_size)) {
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
attr->ia_size);
- if (error) {
- /* Do as much error cleanup as possible */
- handle = ext4_journal_start(inode, 3);
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
+ if (error)
goto err_out;
- }
- ext4_orphan_del(handle, inode);
- orphan = 0;
- ext4_journal_stop(handle);
+ }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
goto err_out;
}
+ if (ext4_handle_valid(handle)) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
+ }
+ down_write(&EXT4_I(inode)->i_data_sem);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
+ /*
+ * We have to update i_size under i_data_sem together
+ * with i_disksize to avoid races with writeback code
+ * running ext4_wb_update_i_disksize().
+ */
+ if (!error)
+ i_size_write(inode, attr->ia_size);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+ if (error) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ } else
+ i_size_write(inode, attr->ia_size);
+
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
+ if (orphan) {
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
}
- /* ext4_truncate will clear the flag */
- if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
- ext4_truncate(inode);
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, inode->i_size);
}
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode))
- rc = vmtruncate(inode, attr->ia_size);
+ /*
+ * We want to call ext4_truncate() even if attr->ia_size ==
+ * inode->i_size for cases like truncation of fallocated space
+ */
+ if (attr->ia_valid & ATTR_SIZE)
+ ext4_truncate(inode);
if (!rc) {
setattr_copy(inode, attr);
@@ -5384,7 +4708,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_orphan_del(NULL, inode);
if (!rc && (ia_valid & ATTR_MODE))
- rc = ext4_acl_chmod(inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext4_std_error(inode->i_sb, error);
@@ -5397,12 +4721,21 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode;
- unsigned long delalloc_blocks;
+ unsigned long long delalloc_blocks;
inode = dentry->d_inode;
generic_fillattr(inode, stat);
/*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others doen't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(ext4_has_inline_data(inode)))
+ stat->blocks += (stat->size + 511) >> 9;
+
+ /*
* We can't update i_blocks if the block allocation is delayed
* otherwise in the case of system crash before the real block
* allocation is done, we will have i_blocks inconsistent with
@@ -5412,42 +4745,18 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
* will return the blocks that include the delayed allocation
* blocks for this file.
*/
- delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-
- stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
+ EXT4_I(inode)->i_reserved_data_blocks);
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
return 0;
}
-static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
- int chunk)
-{
- int indirects;
-
- /* if nrblocks are contiguous */
- if (chunk) {
- /*
- * With N contiguous data blocks, it need at most
- * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
- * 2 dindirect blocks
- * 1 tindirect block
- */
- indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
- return indirects + 3;
- }
- /*
- * if nrblocks are not contiguous, worse case, each block touch
- * a indirect block, and each indirect block touch a double indirect
- * block, plus a triple indirect block
- */
- indirects = nrblocks * 2 + 1;
- return indirects;
-}
-
-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
- return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+ return ext4_ind_trans_blocks(inode, lblocks);
+ return ext4_ext_index_trans_blocks(inode, pextents);
}
/*
@@ -5456,12 +4765,13 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* worse case, the indexs blocks spread over different block groups
*
* If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiuguous, with flexbg,
+ * different block groups too. If they are contiguous, with flexbg,
* they could still across block group boundary.
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -5469,14 +4779,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
int ret = 0;
/*
- * How many index blocks need to touch to modify nrblocks?
- * The "Chunk" flag indicating whether the nrblocks is
- * physically contiguous on disk
- *
- * For Direct IO and fallocate, they calls get_block to allocate
- * one single extent at a time, so they could set the "Chunk" flag
+ * How many index blocks need to touch to map @lblocks logical blocks
+ * to @pextents physical extents?
*/
- idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
ret = idxblocks;
@@ -5484,12 +4790,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks;
- if (chunk)
- groups += 1;
- else
- groups += nrblocks;
-
+ groups = idxblocks + pextents;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -5506,7 +4807,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
}
/*
- * Calulate the total number of credits to reserve to fit
+ * Calculate the total number of credits to reserve to fit
* the modification of a single pages into a single transaction,
* which may include multiple chunks of block allocations.
*
@@ -5520,7 +4821,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
int bpp = ext4_journal_blocks_per_page(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, 0);
+ ret = ext4_meta_trans_blocks(inode, bpp, bpp);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
@@ -5551,7 +4852,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (test_opt(inode->i_sb, I_VERSION))
+ if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
/* the do_update_inode consumes one bh->b_count */
@@ -5632,14 +4933,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
* inode out, but prune_icache isn't a user-visible syncing function.
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
*/
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
@@ -5700,11 +4993,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext4_dirty_inode(struct inode *inode)
+void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
@@ -5766,9 +5059,23 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return 0;
if (is_journal_aborted(journal))
return -EROFS;
+ /* We have to allocate physical blocks for delalloc blocks
+ * before flushing journal. otherwise delalloc blocks can not
+ * be allocated any more. even more truncate on delalloc blocks
+ * could trigger BUG by flushing delalloc blocks in journal.
+ * There is no delalloc block in non-journal data mode.
+ */
+ if (val && test_opt(inode->i_sb, DELALLOC)) {
+ err = ext4_alloc_da_blocks(inode);
+ if (err < 0)
+ return err;
+ }
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
/*
* OK, there are no updates running now, and all cached data is
@@ -5780,15 +5087,18 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
- else
+ else {
+ jbd2_journal_flush(journal);
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ }
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -5810,66 +5120,85 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct page *page = vmf->page;
loff_t size;
unsigned long len;
- int ret = -EINVAL;
- void *fsdata;
+ int ret;
struct file *file = vma->vm_file;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
+ handle_t *handle;
+ get_block_t *get_block;
+ int retries = 0;
- /*
- * Get i_alloc_sem to stop truncates messing with the inode. We cannot
- * get i_mutex because we are already holding mmap_sem.
- */
- down_read(&inode->i_alloc_sem);
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ /* Delalloc case is easy... */
+ if (test_opt(inode->i_sb, DELALLOC) &&
+ !ext4_should_journal_data(inode) &&
+ !ext4_nonda_switch(inode->i_sb)) {
+ do {
+ ret = __block_page_mkwrite(vma, vmf,
+ ext4_da_get_block_prep);
+ } while (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries));
+ goto out_ret;
+ }
+
+ lock_page(page);
size = i_size_read(inode);
- if (page->mapping != mapping || size <= page_offset(page)
- || !PageUptodate(page)) {
- /* page got truncated from under us? */
- goto out_unlock;
+ /* Page got truncated from under us? */
+ if (page->mapping != mapping || page_offset(page) > size) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
}
- ret = 0;
- if (PageMappedToDisk(page))
- goto out_unlock;
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
-
- lock_page(page);
/*
- * return if we have all the buffers mapped. This avoid
- * the need to call write_begin/write_end which does a
- * journal_start/journal_stop which can block and take
- * long time
+ * Return if we have all the buffers mapped. This avoids the need to do
+ * journal_start/journal_stop which can block and take a long time
*/
if (page_has_buffers(page)) {
- if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped)) {
- unlock_page(page);
- goto out_unlock;
+ if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ 0, len, NULL,
+ ext4_bh_unmapped)) {
+ /* Wait so that we don't change page under IO */
+ wait_for_stable_page(page);
+ ret = VM_FAULT_LOCKED;
+ goto out;
}
}
unlock_page(page);
- /*
- * OK, we need to fill the hole... Do write_begin write_end
- * to do block allocation/reservation.We are not holding
- * inode.i__mutex here. That allow * parallel write_begin,
- * write_end call. lock_page prevent this from happening
- * on the same page though
- */
- ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
- len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
- if (ret < 0)
- goto out_unlock;
- ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
- len, len, page, fsdata);
- if (ret < 0)
- goto out_unlock;
- ret = 0;
-out_unlock:
- if (ret)
+ /* OK, we need to fill the hole... */
+ if (ext4_should_dioread_nolock(inode))
+ get_block = ext4_get_block_write;
+ else
+ get_block = ext4_get_block;
+retry_alloc:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
ret = VM_FAULT_SIGBUS;
- up_read(&inode->i_alloc_sem);
+ goto out;
+ }
+ ret = __block_page_mkwrite(vma, vmf, get_block);
+ if (!ret && ext4_should_journal_data(inode)) {
+ if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ }
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_alloc;
+out_ret:
+ ret = block_page_mkwrite_return(ret);
+out:
+ sb_end_pagefault(inode->i_sb);
return ret;
}