aboutsummaryrefslogtreecommitdiff
path: root/fs/ext3/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext3/inode.c')
-rw-r--r--fs/ext3/inode.c1127
1 files changed, 696 insertions, 431 deletions
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index eb95670a27e..2c6ccc49ba2 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -22,24 +22,18 @@
* Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/ext3_jbd.h>
-#include <linux/jbd.h>
#include <linux/highuid.h>
-#include <linux/pagemap.h>
#include <linux/quotaops.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
-#include <linux/uio.h>
-#include <linux/bio.h>
+#include <linux/namei.h>
+#include <linux/aio.h>
+#include "ext3.h"
#include "xattr.h"
#include "acl.h"
static int ext3_writepage_trans_blocks(struct inode *inode);
+static int ext3_block_truncate_page(struct inode *inode, loff_t from);
/*
* Test whether an inode is a fast symlink.
@@ -68,6 +62,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
might_sleep();
+ trace_ext3_forget(inode, is_metadata, blocknr);
BUFFER_TRACE(bh, "enter");
jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
@@ -95,7 +90,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
BUFFER_TRACE(bh, "call ext3_journal_revoke");
err = ext3_journal_revoke(handle, blocknr, bh);
if (err)
- ext3_abort(inode->i_sb, __FUNCTION__,
+ ext3_abort(inode->i_sb, __func__,
"error %d when attempting revoke", err);
BUFFER_TRACE(bh, "exit");
return err;
@@ -170,22 +165,78 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
{
+ int ret;
+
jbd_debug(2, "restarting handle %p\n", handle);
- return ext3_journal_restart(handle, blocks_for_truncate(inode));
+ /*
+ * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
+ * At this moment, get_block can be called only for blocks inside
+ * i_size since page cache has been already dropped and writes are
+ * blocked by i_mutex. So we can safely drop the truncate_mutex.
+ */
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+ ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
+ return ret;
}
/*
- * Called at the last iput() if i_nlink is zero.
+ * Called at inode eviction from icache
*/
-void ext3_delete_inode (struct inode * inode)
+void ext3_evict_inode (struct inode *inode)
{
+ struct ext3_inode_info *ei = EXT3_I(inode);
+ struct ext3_block_alloc_info *rsv;
handle_t *handle;
+ int want_delete = 0;
+
+ trace_ext3_evict_inode(inode);
+ if (!inode->i_nlink && !is_bad_inode(inode)) {
+ dquot_initialize(inode);
+ want_delete = 1;
+ }
+
+ /*
+ * When journalling data dirty buffers are tracked only in the journal.
+ * So although mm thinks everything is clean and ready for reaping the
+ * inode might still have some pages to write in the running
+ * transaction or waiting to be checkpointed. Thus calling
+ * journal_invalidatepage() (via truncate_inode_pages()) to discard
+ * these buffers can cause data loss. Also even if we did not discard
+ * these buffers, we would have no way to find them after the inode
+ * is reaped and thus user could see stale data if he tries to read
+ * them before the transaction is checkpointed. So be careful and
+ * force everything to disk here... We use ei->i_datasync_tid to
+ * store the newest transaction containing inode's data.
+ *
+ * Note that directories do not have this problem because they don't
+ * use page cache.
+ *
+ * The s_journal check handles the case when ext3_get_journal() fails
+ * and puts the journal inode.
+ */
+ if (inode->i_nlink && ext3_should_journal_data(inode) &&
+ EXT3_SB(inode->i_sb)->s_journal &&
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+ inode->i_ino != EXT3_JOURNAL_INO) {
+ tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
+ journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
+
+ log_start_commit(journal, commit_tid);
+ log_wait_commit(journal, commit_tid);
+ filemap_write_and_wait(&inode->i_data);
+ }
+ truncate_inode_pages_final(&inode->i_data);
- truncate_inode_pages(&inode->i_data, 0);
+ ext3_discard_reservation(inode);
+ rsv = ei->i_block_alloc_info;
+ ei->i_block_alloc_info = NULL;
+ if (unlikely(rsv))
+ kfree(rsv);
- if (is_bad_inode(inode))
+ if (!want_delete)
goto no_delete;
handle = start_transaction(inode);
@@ -205,15 +256,13 @@ void ext3_delete_inode (struct inode * inode)
if (inode->i_blocks)
ext3_truncate(inode);
/*
- * Kill off the orphan record which ext3_truncate created.
- * AKPM: I think this can be inside the above `if'.
- * Note that ext3_orphan_del() has to be able to cope with the
- * deletion of a non-existent orphan - this is because we don't
- * know if ext3_truncate() actually created an orphan record.
- * (Well, we could do this if we need to, but heck - it works)
+ * Kill off the orphan record created when the inode lost the last
+ * link. Note that ext3_orphan_del() has to be able to cope with the
+ * deletion of a non-existent orphan - ext3_truncate() could
+ * have removed the record.
*/
ext3_orphan_del(handle, inode);
- EXT3_I(inode)->i_dtime = get_seconds();
+ ei->i_dtime = get_seconds();
/*
* One subtle ordering requirement: if anything has gone wrong
@@ -222,15 +271,22 @@ void ext3_delete_inode (struct inode * inode)
* having errors), but we can't free the inode if the mark_dirty
* fails.
*/
- if (ext3_mark_inode_dirty(handle, inode))
- /* If that failed, just do the required in-core inode clear. */
+ if (ext3_mark_inode_dirty(handle, inode)) {
+ /* If that failed, just dquot_drop() and be done with that */
+ dquot_drop(inode);
+ clear_inode(inode);
+ } else {
+ ext3_xattr_delete_inode(handle, inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
clear_inode(inode);
- else
ext3_free_inode(handle, inode);
+ }
ext3_journal_stop(handle);
return;
no_delete:
- clear_inode(inode); /* We must guarantee clearing of inode... */
+ clear_inode(inode);
+ dquot_drop(inode);
}
typedef struct {
@@ -392,7 +448,7 @@ no_block:
* @inode: owner
* @ind: descriptor of indirect block.
*
- * This function returns the prefered place for block allocation.
+ * This function returns the preferred place for block allocation.
* It is used when heuristic for sequential allocation fails.
* Rules are:
* + if there is a block to the left of our position - allocate near it.
@@ -436,12 +492,12 @@ static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
}
/**
- * ext3_find_goal - find a prefered place for allocation.
+ * ext3_find_goal - find a preferred place for allocation.
* @inode: owner
* @block: block we want
* @partial: pointer to the last triple within a chain
*
- * Normally this function find the prefered place for block allocation,
+ * Normally this function find the preferred place for block allocation,
* returns it.
*/
@@ -465,7 +521,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
}
/**
- * ext3_blks_to_allocate: Look up the block map and count the number
+ * ext3_blks_to_allocate - Look up the block map and count the number
* of direct blocks need to be allocated for the given branch.
*
* @branch: chain of indirect blocks
@@ -503,14 +559,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
}
/**
- * ext3_alloc_blocks: multiple allocate blocks needed for a branch
+ * ext3_alloc_blocks - multiple allocate blocks needed for a branch
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @goal: preferred place for allocation
* @indirect_blks: the number of blocks need to allocate for indirect
* blocks
- *
+ * @blks: number of blocks need to allocated for direct blocks
* @new_blocks: on return it will store the new block numbers for
* the indirect blocks(if needed) and the first direct block,
- * @blks: on return it will store the total number of allocated
- * direct blocks
+ * @err: here we store the error value
+ *
+ * return the number of direct blocks allocated
*/
static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
ext3_fsblk_t goal, int indirect_blks, int blks,
@@ -565,9 +625,11 @@ failed_out:
/**
* ext3_alloc_branch - allocate and set up a chain of blocks.
+ * @handle: handle for this transaction
* @inode: owner
* @indirect_blks: number of allocated indirect blocks
* @blks: number of allocated direct blocks
+ * @goal: preferred place for allocation
* @offsets: offsets (in the blocks) to store the pointers to next.
* @branch: place to store the chain in.
*
@@ -616,6 +678,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
branch[n].bh = bh;
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
@@ -657,7 +723,7 @@ failed:
BUFFER_TRACE(branch[i].bh, "call journal_forget");
ext3_journal_forget(handle, branch[i].bh);
}
- for (i = 0; i <indirect_blks; i++)
+ for (i = 0; i < indirect_blks; i++)
ext3_free_blocks(handle, inode, new_blocks[i], 1);
ext3_free_blocks(handle, inode, new_blocks[i], num);
@@ -667,10 +733,9 @@ failed:
/**
* ext3_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
* @inode: owner
* @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- * ext3_alloc_branch)
* @where: location of missing link
* @num: number of indirect blocks we are adding
* @blks: number of direct blocks we are adding
@@ -686,8 +751,10 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
int err = 0;
struct ext3_block_alloc_info *block_i;
ext3_fsblk_t current_block;
+ struct ext3_inode_info *ei = EXT3_I(inode);
+ struct timespec now;
- block_i = EXT3_I(inode)->i_block_alloc_info;
+ block_i = ei->i_block_alloc_info;
/*
* If we're splicing into a [td]indirect block (as opposed to the
* inode) then we need to get write access to the [td]indirect block
@@ -725,9 +792,13 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
}
/* We are done with atomic stuff, now do the rest of housekeeping */
-
- inode->i_ctime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, inode);
+ now = CURRENT_TIME_SEC;
+ if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
+ inode->i_ctime = now;
+ ext3_mark_inode_dirty(handle, inode);
+ }
+ /* ext3_mark_inode_dirty already updated i_sync_tid */
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
/* had we spliced it onto indirect block? */
if (where->bh) {
@@ -786,7 +857,7 @@ err_out:
int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
sector_t iblock, unsigned long maxblocks,
struct buffer_head *bh_result,
- int create, int extend_disksize)
+ int create)
{
int err = -EIO;
int offsets[4];
@@ -801,6 +872,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
ext3_fsblk_t first_block = 0;
+ trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
J_ASSERT(handle != NULL || create == 0);
depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
@@ -818,7 +890,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
while (count < maxblocks && count <= blocks_to_boundary) {
ext3_fsblk_t blk;
- if (!verify_chain(chain, partial)) {
+ if (!verify_chain(chain, chain + depth - 1)) {
/*
* Indirect block might be removed by
* truncate while we were reading it.
@@ -845,6 +917,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
if (!create || err == -EIO)
goto cleanup;
+ /*
+ * Block out ext3_truncate while we alter the tree
+ */
mutex_lock(&ei->truncate_mutex);
/*
@@ -893,9 +968,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
*/
count = ext3_blks_to_allocate(partial, indirect_blks,
maxblocks, blocks_to_boundary);
- /*
- * Block out ext3_truncate while we alter the tree
- */
err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
offsets + (partial - chain), partial);
@@ -909,13 +981,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
if (!err)
err = ext3_splice_branch(handle, inode, iblock,
partial, indirect_blks, count);
- /*
- * i_disksize growing is protected by truncate_mutex. Don't forget to
- * protect it if you're about to implement concurrent
- * ext3_get_block() -bzzz
- */
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
mutex_unlock(&ei->truncate_mutex);
if (err)
goto cleanup;
@@ -936,6 +1001,9 @@ cleanup:
}
BUFFER_TRACE(bh_result, "returned");
out:
+ trace_ext3_get_blocks_exit(inode, iblock,
+ depth ? le32_to_cpu(chain[depth-1].key) : 0,
+ count, err);
return err;
}
@@ -961,7 +1029,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
if (max_blocks > DIO_MAX_BLOCKS)
max_blocks = DIO_MAX_BLOCKS;
handle = ext3_journal_start(inode, DIO_CREDITS +
- 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb));
+ EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -970,7 +1038,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
}
ret = ext3_get_blocks_handle(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ max_blocks, bh_result, create);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -981,6 +1049,13 @@ out:
return ret;
}
+int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ return generic_block_fiemap(inode, fieinfo, start, len,
+ ext3_get_block);
+}
+
/*
* `handle' can be NULL if create is zero
*/
@@ -996,22 +1071,21 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext3_get_blocks_handle(handle, inode, block, 1,
- &dummy, create, 1);
+ &dummy, create);
/*
* ext3_get_blocks_handle() returns number of blocks
* mapped. 0 in case of a HOLE.
*/
if (err > 0) {
- if (err > 1)
- WARN_ON(1);
+ WARN_ON(err > 1);
err = 0;
}
*errp = err;
if (!err && buffer_mapped(&dummy)) {
struct buffer_head *bh;
bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
goto err;
}
if (buffer_new(&dummy)) {
@@ -1059,9 +1133,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
bh = ext3_getblk(handle, inode, block, create, err);
if (!bh)
return bh;
- if (buffer_uptodate(bh))
+ if (bh_uptodate_or_lock(bh))
return bh;
- ll_rw_block(READ_META, 1, &bh);
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1130,9 +1206,45 @@ static int walk_page_buffers( handle_t *handle,
static int do_journal_get_write_access(handle_t *handle,
struct buffer_head *bh)
{
+ int dirty = buffer_dirty(bh);
+ int ret;
+
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
- return ext3_journal_get_write_access(handle, bh);
+ /*
+ * __block_prepare_write() could have dirtied some buffers. Clean
+ * the dirty bit as jbd2_journal_get_write_access() could complain
+ * otherwise about fs integrity issues. Setting of the dirty bit
+ * by __block_prepare_write() isn't a real problem here as we clear
+ * the bit before releasing a page lock and thus writeback cannot
+ * ever write the buffer.
+ */
+ if (dirty)
+ clear_buffer_dirty(bh);
+ ret = ext3_journal_get_write_access(handle, bh);
+ if (!ret && dirty)
+ ret = ext3_journal_dirty_metadata(handle, bh);
+ return ret;
+}
+
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext3_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext3_truncate(inode);
+}
+
+/*
+ * Truncate blocks that were not used by direct IO write. We have to zero out
+ * the last file block as well because direct IO might have written to it.
+ */
+static void ext3_truncate_failed_direct_write(struct inode *inode)
+{
+ ext3_block_truncate_page(inode, inode->i_size);
+ ext3_truncate(inode);
}
static int ext3_write_begin(struct file *file, struct address_space *mapping,
@@ -1140,19 +1252,24 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+ int ret;
handle_t *handle;
int retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
+ /* Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason */
+ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
+
+ trace_ext3_write_begin(inode, pos, len, flags);
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -1164,8 +1281,7 @@ retry:
ret = PTR_ERR(handle);
goto out;
}
- ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext3_get_block);
+ ret = __block_write_begin(page, pos, len, ext3_get_block);
if (ret)
goto write_begin_failed;
@@ -1175,9 +1291,22 @@ retry:
}
write_begin_failed:
if (ret) {
+ /*
+ * block_write_begin may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ *
+ * Add inode to orphan list in case we crash before truncate
+ * finishes. Do this only if ext3_can_truncate() agrees so
+ * that orphan processing code is happy.
+ */
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
+ ext3_orphan_add(handle, inode);
ext3_journal_stop(handle);
unlock_page(page);
page_cache_release(page);
+ if (pos + len > inode->i_size)
+ ext3_truncate_failed_write(inode);
}
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -1190,11 +1319,23 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
int err = journal_dirty_data(handle, bh);
if (err)
- ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
+ ext3_journal_abort_handle(__func__, __func__,
bh, handle, err);
return err;
}
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+ /*
+ * Write could have mapped the buffer but it didn't copy the data in
+ * yet. So avoid filing such buffer into a transaction.
+ */
+ if (buffer_mapped(bh) && buffer_uptodate(bh))
+ return ext3_journal_dirty_data(handle, bh);
+ return 0;
+}
+
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1205,26 +1346,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}
/*
- * Generic write_end handler for ordered and writeback ext3 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
- * after block_write_end.
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
*/
-static int ext3_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
+ /* What matters to us is i_disksize. We don't write i_size anywhere */
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+ if (pos + copied > EXT3_I(inode)->i_disksize) {
+ EXT3_I(inode)->i_disksize = pos + copied;
mark_inode_dirty(inode);
}
-
- return copied;
}
/*
@@ -1244,34 +1379,30 @@ static int ext3_ordered_write_end(struct file *file,
unsigned from, to;
int ret = 0, ret2;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
+ trace_ext3_ordered_write_end(inode, pos, len, copied);
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + copied;
ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext3_journal_dirty_data);
-
- if (ret == 0) {
- /*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
- */
- loff_t new_i_size;
+ from, to, NULL, journal_dirty_data_fn);
- new_i_size = pos + copied;
- if (new_i_size > EXT3_I(inode)->i_disksize)
- EXT3_I(inode)->i_disksize = new_i_size;
- copied = ext3_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- if (copied < 0)
- ret = copied;
- }
+ if (ret == 0)
+ update_file_sizes(inode, pos, copied);
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
+ ext3_orphan_add(handle, inode);
ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
unlock_page(page);
page_cache_release(page);
+ if (pos + len > inode->i_size)
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}
@@ -1282,24 +1413,23 @@ static int ext3_writeback_write_end(struct file *file,
{
handle_t *handle = ext3_journal_current_handle();
struct inode *inode = file->f_mapping->host;
- int ret = 0, ret2;
- loff_t new_i_size;
-
- new_i_size = pos + copied;
- if (new_i_size > EXT3_I(inode)->i_disksize)
- EXT3_I(inode)->i_disksize = new_i_size;
-
- copied = ext3_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- if (copied < 0)
- ret = copied;
+ int ret;
- ret2 = ext3_journal_stop(handle);
- if (!ret)
- ret = ret2;
+ trace_ext3_writeback_write_end(inode, pos, len, copied);
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ update_file_sizes(inode, pos, copied);
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
+ ext3_orphan_add(handle, inode);
+ ret = ext3_journal_stop(handle);
unlock_page(page);
page_cache_release(page);
+ if (pos + len > inode->i_size)
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}
@@ -1310,28 +1440,39 @@ static int ext3_journalled_write_end(struct file *file,
{
handle_t *handle = ext3_journal_current_handle();
struct inode *inode = mapping->host;
+ struct ext3_inode_info *ei = EXT3_I(inode);
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
+ trace_ext3_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
if (copied < len) {
if (!PageUptodate(page))
copied = 0;
- page_zero_new_buffers(page, from+copied, to);
+ page_zero_new_buffers(page, from + copied, to);
+ to = from + copied;
}
ret = walk_page_buffers(handle, page_buffers(page), from,
to, &partial, write_end_fn);
if (!partial)
SetPageUptodate(page);
- if (pos+copied > inode->i_size)
- i_size_write(inode, pos+copied);
- EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
- if (inode->i_size > EXT3_I(inode)->i_disksize) {
- EXT3_I(inode)->i_disksize = inode->i_size;
+
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
+ ext3_orphan_add(handle, inode);
+ ext3_set_inode_state(inode, EXT3_STATE_JDATA);
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
+ if (inode->i_size > ei->i_disksize) {
+ ei->i_disksize = inode->i_size;
ret2 = ext3_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
@@ -1343,6 +1484,8 @@ static int ext3_journalled_write_end(struct file *file,
unlock_page(page);
page_cache_release(page);
+ if (pos + len > inode->i_size)
+ ext3_truncate_failed_write(inode);
return ret ? ret : copied;
}
@@ -1366,7 +1509,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
- if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
+ if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
/*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
@@ -1385,7 +1528,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
* everything they get.
*/
- EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
+ ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
journal = EXT3_JOURNAL(inode);
journal_lock_updates(journal);
err = journal_flush(journal);
@@ -1410,64 +1553,23 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
{
- if (buffer_mapped(bh))
- return ext3_journal_dirty_data(handle, bh);
- return 0;
+ return !buffer_mapped(bh);
}
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext3_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
+ * Note that whenever we need to map blocks we start a transaction even if
+ * we're not journalling data. This is to preserve ordering: any hole
+ * instantiation within __block_write_full_page -> ext3_get_block() should be
+ * journalled along with the data so we don't crash and then get metadata which
* refers to old data.
*
* In all journalling modes block_write_full_page() will start the I/O.
*
- * Problem:
- *
- * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- * ext3_writepage()
- *
- * Similar for:
- *
- * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext3_get_block(). We will deadlock on various things like
- * lock_journal and i_truncate_mutex.
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- * non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- * In journalled data mode, a data buffer may be metadata against the
- * current transaction. But the same file is part of a shared mapping
- * and someone does a writepage() on it.
- *
- * We will move the buffer onto the async_data list, but *after* it has
- * been dirtied. So there's a small window where we have dirty data on
- * BJ_Metadata.
- *
- * Note that this only applies to the last partial page in the file. The
- * bit which block_write_full_page() uses prepare/commit for. (That's
- * broken code anyway: it's wrong for msync()).
- *
- * It's a rare case: affects the final partial page, for journalled data
- * where the file is subject to bith write() and writepage() in the same
- * transction. To fix it we'll need a custom block_write_full_page().
- * We'll probably need that anyway for journalling writepage() output.
- *
* We don't honour synchronous mounts for writepage(). That would be
* disastrous. Any write() or metadata operation will sync the fs for
* us.
- *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
static int ext3_ordered_writepage(struct page *page,
struct writeback_control *wbc)
@@ -1479,6 +1581,13 @@ static int ext3_ordered_writepage(struct page *page,
int err;
J_ASSERT(PageLocked(page));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
/*
* We give up here if we're reentered, because it might be for a
@@ -1487,6 +1596,20 @@ static int ext3_ordered_writepage(struct page *page,
if (ext3_journal_current_handle())
goto out_fail;
+ trace_ext3_ordered_writepage(page);
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, inode->i_sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ page_bufs = page_buffers(page);
+ } else {
+ page_bufs = page_buffers(page);
+ if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
+ NULL, buffer_unmapped)) {
+ /* Provide NULL get_block() to catch bugs if buffers
+ * weren't really mapped */
+ return block_write_full_page(page, NULL, wbc);
+ }
+ }
handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
@@ -1494,11 +1617,6 @@ static int ext3_ordered_writepage(struct page *page,
goto out_fail;
}
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
- }
- page_bufs = page_buffers(page);
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bget_one);
@@ -1516,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page,
* block_write_full_page() succeeded. Otherwise they are unmapped,
* and generally junk.
*/
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+ if (ret == 0)
+ ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
NULL, journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bput_one);
err = ext3_journal_stop(handle);
@@ -1543,19 +1658,35 @@ static int ext3_writeback_writepage(struct page *page,
int ret = 0;
int err;
+ J_ASSERT(PageLocked(page));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
+
if (ext3_journal_current_handle())
goto out_fail;
+ trace_ext3_writeback_writepage(page);
+ if (page_has_buffers(page)) {
+ if (!walk_page_buffers(NULL, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+ /* Provide NULL get_block() to catch bugs if buffers
+ * weren't really mapped */
+ return block_write_full_page(page, NULL, wbc);
+ }
+ }
+
handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_fail;
}
- if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
- ret = nobh_writepage(page, ext3_get_block, wbc);
- else
- ret = block_write_full_page(page, ext3_get_block, wbc);
+ ret = block_write_full_page(page, ext3_get_block, wbc);
err = ext3_journal_stop(handle);
if (!ret)
@@ -1576,23 +1707,33 @@ static int ext3_journalled_writepage(struct page *page,
int ret = 0;
int err;
- if (ext3_journal_current_handle())
- goto no_write;
-
- handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto no_write;
- }
+ J_ASSERT(PageLocked(page));
+ /*
+ * We don't want to warn for emergency remount. The condition is
+ * ordered to avoid dereferencing inode->i_sb in non-error case to
+ * avoid slow-downs.
+ */
+ WARN_ON_ONCE(IS_RDONLY(inode) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
+ trace_ext3_journalled_writepage(page);
if (!page_has_buffers(page) || PageChecked(page)) {
+ if (ext3_journal_current_handle())
+ goto no_write;
+
+ handle = ext3_journal_start(inode,
+ ext3_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto no_write;
+ }
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext3_get_block);
+ ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
+ ext3_get_block);
if (ret != 0) {
ext3_journal_stop(handle);
goto out_unlock;
@@ -1604,19 +1745,22 @@ static int ext3_journalled_writepage(struct page *page,
PAGE_CACHE_SIZE, NULL, write_end_fn);
if (ret == 0)
ret = err;
- EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
+ ext3_set_inode_state(inode, EXT3_STATE_JDATA);
+ atomic_set(&EXT3_I(inode)->i_datasync_tid,
+ handle->h_transaction->t_tid);
unlock_page(page);
+ err = ext3_journal_stop(handle);
+ if (!ret)
+ ret = err;
} else {
/*
- * It may be a page full of checkpoint-mode buffers. We don't
- * really know unless we go poke around in the buffer_heads.
- * But block_write_full_page will do the right thing.
+ * It is a page full of checkpoint-mode buffers. Go and write
+ * them. They should have been already mapped when they went
+ * to the journal so provide NULL get_block function to catch
+ * errors.
*/
- ret = block_write_full_page(page, ext3_get_block, wbc);
+ ret = block_write_full_page(page, NULL, wbc);
}
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
out:
return ret;
@@ -1629,6 +1773,7 @@ out_unlock:
static int ext3_readpage(struct file *file, struct page *page)
{
+ trace_ext3_readpage(page);
return mpage_readpage(page, ext3_get_block);
}
@@ -1639,23 +1784,27 @@ ext3_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+ trace_ext3_invalidatepage(page, offset, length);
+
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- journal_invalidatepage(journal, page, offset);
+ journal_invalidatepage(journal, page, offset, length);
}
static int ext3_releasepage(struct page *page, gfp_t wait)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+ trace_ext3_releasepage(page);
WARN_ON(PageChecked(page));
if (!page_has_buffers(page))
return 0;
@@ -1672,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
* VFS code falls back into buffered path in that case so we are safe.
*/
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -1681,7 +1829,10 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
handle_t *handle;
ssize_t ret;
int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
+ int retries = 0;
+
+ trace_ext3_direct_IO_enter(inode, offset, count, rw);
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -1704,9 +1855,21 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
}
}
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext3_get_block, NULL);
+retry:
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
+ /*
+ * In case of error extending write may have instantiated a few
+ * blocks outside i_size. Trim these off again.
+ */
+ if (unlikely((rw & WRITE) && ret < 0)) {
+ loff_t isize = i_size_read(inode);
+ loff_t end = offset + count;
+
+ if (end > isize)
+ ext3_truncate_failed_direct_write(inode);
+ }
+ if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (orphan) {
int err;
@@ -1715,9 +1878,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
handle = ext3_journal_start(inode, 2);
if (IS_ERR(handle)) {
/* This is really bad luck. We've written the data
- * but cannot extend i_size. Bail out and pretend
- * the write failed... */
+ * but cannot extend i_size. Truncate allocated blocks
+ * and pretend the write failed... */
+ ext3_truncate_failed_direct_write(inode);
ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext3_orphan_del(NULL, inode);
goto out;
}
if (inode->i_nlink)
@@ -1742,6 +1908,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
ret = err;
}
out:
+ trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
@@ -1765,44 +1932,48 @@ static int ext3_journalled_set_page_dirty(struct page *page)
}
static const struct address_space_operations ext3_ordered_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_ordered_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext3_write_begin,
- .write_end = ext3_ordered_write_end,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
- .direct_IO = ext3_direct_IO,
- .migratepage = buffer_migrate_page,
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_ordered_writepage,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_ordered_write_end,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .direct_IO = ext3_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext3_writeback_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_writeback_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext3_write_begin,
- .write_end = ext3_writeback_write_end,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
- .direct_IO = ext3_direct_IO,
- .migratepage = buffer_migrate_page,
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_writeback_writepage,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_writeback_write_end,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .direct_IO = ext3_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext3_journalled_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_journalled_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext3_write_begin,
- .write_end = ext3_journalled_write_end,
- .set_page_dirty = ext3_journalled_set_page_dirty,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_journalled_writepage,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_journalled_write_end,
+ .set_page_dirty = ext3_journalled_set_page_dirty,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void ext3_set_aops(struct inode *inode)
@@ -1821,31 +1992,27 @@ void ext3_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
- struct address_space *mapping, loff_t from)
+static int ext3_block_truncate_page(struct inode *inode, loff_t from)
{
ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned offset = from & (PAGE_CACHE_SIZE - 1);
unsigned blocksize, iblock, length, pos;
- struct inode *inode = mapping->host;
+ struct page *page;
+ handle_t *handle = NULL;
struct buffer_head *bh;
int err = 0;
+ /* Truncated on block boundary - nothing to do */
blocksize = inode->i_sb->s_blocksize;
+ if ((from & (blocksize - 1)) == 0)
+ return 0;
+
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page)
+ return -ENOMEM;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
- /*
- * For "nobh" option, we can only work if we don't need to
- * read-in the page - otherwise we create buffers to do the IO.
- */
- if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
- ext3_should_writeback_data(inode) && PageUptodate(page)) {
- zero_user(page, offset, length);
- set_page_dirty(page);
- goto unlock;
- }
-
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
@@ -1878,20 +2045,30 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
if (PageUptodate(page))
set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
+ if (!bh_uptodate_or_lock(bh)) {
+ err = bh_submit_read(bh);
/* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ if (err)
+ goto unlock;
+ }
+
+ /* data=writeback mode doesn't need transaction to zero-out data */
+ if (!ext3_should_writeback_data(inode)) {
+ /* We journal at most one block */
+ handle = ext3_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ clear_highpage(page);
+ flush_dcache_page(page);
+ err = PTR_ERR(handle);
goto unlock;
+ }
}
if (ext3_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
err = ext3_journal_get_write_access(handle, bh);
if (err)
- goto unlock;
+ goto stop;
}
zero_user(page, offset, length);
@@ -1905,6 +2082,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
err = ext3_journal_dirty_data(handle, bh);
mark_buffer_dirty(bh);
}
+stop:
+ if (handle)
+ ext3_journal_stop(handle);
unlock:
unlock_page(page);
@@ -1937,7 +2117,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
*
* When we do truncate() we may have to clean the ends of several
* indirect blocks but leave the blocks themselves alive. Block is
- * partially truncated if some data below the new i_size is refered
+ * partially truncated if some data below the new i_size is referred
* from it (and it is on the path to the first completely truncated
* data block, indeed). We have to free the top of that path along
* with everything to the right of the path. Since no allocation
@@ -1967,7 +2147,7 @@ static Indirect *ext3_find_shared(struct inode *inode, int depth,
int k, err;
*top = 0;
- /* Make k index the deepest non-null offest + 1 */
+ /* Make k index the deepest non-null offset + 1 */
for (k = depth; k > 1 && !offsets[k-1]; k--)
;
partial = ext3_get_branch(inode, k, offsets, chain, &err);
@@ -2024,13 +2204,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, bh);
+ if (ext3_journal_dirty_metadata(handle, bh))
+ return;
}
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
- ext3_journal_get_write_access(handle, bh);
+ if (ext3_journal_get_write_access(handle, bh))
+ return;
}
}
@@ -2064,7 +2246,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
* @first: array of block numbers
* @last: points immediately past the end of array
*
- * We are freeing all blocks refered from that array (numbers are stored as
+ * We are freeing all blocks referred from that array (numbers are stored as
* little-endian 32-bit) and updating @inode->i_blocks appropriately.
*
* We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -2125,7 +2307,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext3_journal_dirty_metadata(handle, this_bh);
+ else
+ ext3_error(inode->i_sb, "ext3_free_data",
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long)this_bh->b_blocknr);
}
}
@@ -2138,7 +2334,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
* @last: pointer immediately past the end of array
* @depth: depth of the branches to free
*
- * We are freeing all blocks refered from these branches (numbers are
+ * We are freeing all blocks referred from these branches (numbers are
* stored as little-endian 32-bit) and updating @inode->i_blocks
* appropriately.
*/
@@ -2183,27 +2379,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
depth);
/*
- * We've probably journalled the indirect block several
- * times during the truncate. But it's no longer
- * needed and we now drop it from the transaction via
- * journal_revoke().
- *
- * That's easy if it's exclusively part of this
- * transaction. But if it's part of the committing
- * transaction then journal_forget() will simply
- * brelse() it. That means that if the underlying
- * block is reallocated in ext3_get_block(),
- * unmap_underlying_metadata() will find this block
- * and will try to get rid of it. damn, damn.
- *
- * If this block has already been committed to the
- * journal, a revoke record will be written. And
- * revoke records must be emitted *before* clearing
- * this block's bit in the bitmaps.
- */
- ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
-
- /*
* Everything below this this pointer has been
* released. Now let this top-of-subtree go.
*
@@ -2223,9 +2398,34 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
}
+ /*
+ * We've probably journalled the indirect block several
+ * times during the truncate. But it's no longer
+ * needed and we now drop it from the transaction via
+ * journal_revoke().
+ *
+ * That's easy if it's exclusively part of this
+ * transaction. But if it's part of the committing
+ * transaction then journal_forget() will simply
+ * brelse() it. That means that if the underlying
+ * block is reallocated in ext3_get_block(),
+ * unmap_underlying_metadata() will find this block
+ * and will try to get rid of it. damn, damn. Thus
+ * we don't allow a block to be reallocated until
+ * a transaction freeing it has fully committed.
+ *
+ * We also have to make sure journal replay after a
+ * crash does not overwrite non-journaled data blocks
+ * with old metadata when the block got reallocated for
+ * data. Thus we have to store a revoke record for a
+ * block in the same transaction in which we free the
+ * block.
+ */
+ ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+
ext3_free_blocks(handle, inode, nr, 1);
if (parent_bh) {
@@ -2251,6 +2451,17 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
}
}
+int ext3_can_truncate(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext3_inode_is_fast_symlink(inode);
+ return 0;
+}
+
/*
* ext3_truncate()
*
@@ -2258,7 +2469,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
* transaction, and VFS/VM ensures that ext3_truncate() cannot run
* simultaneously on behalf of the same inode.
*
- * As we work through the truncate and commmit bits of it to the journal there
+ * As we work through the truncate and commit bits of it to the journal there
* is one core, guiding principle: the file's tree must always be consistent on
* disk. We must be able to restart the truncate after a crash.
*
@@ -2285,7 +2496,6 @@ void ext3_truncate(struct inode *inode)
struct ext3_inode_info *ei = EXT3_I(inode);
__le32 *i_data = ei->i_data;
int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
- struct address_space *mapping = inode->i_mapping;
int offsets[4];
Indirect chain[4];
Indirect *partial;
@@ -2293,47 +2503,21 @@ void ext3_truncate(struct inode *inode)
int n;
long last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext3_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
+ trace_ext3_truncate_enter(inode);
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
+ if (!ext3_can_truncate(inode))
+ goto out_notrans;
+
+ if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+ ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
- return; /* AKPM: return what? */
- }
+ if (IS_ERR(handle))
+ goto out_notrans;
last_block = (inode->i_size + blocksize-1)
>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
-
- if (page)
- ext3_block_truncate_page(handle, page, mapping, inode->i_size);
-
n = ext3_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
goto out_stop; /* error */
@@ -2385,7 +2569,6 @@ void ext3_truncate(struct inode *inode)
*/
} else {
/* Shared branch grows from an indirect block */
- BUFFER_TRACE(partial->bh, "get_write_access");
ext3_free_branches(handle, inode, partial->bh,
partial->p,
partial->p+1, (chain+n-1) - partial);
@@ -2442,23 +2625,32 @@ out_stop:
* If this was a simple ftruncate(), and the file will remain alive
* then we need to clear up the orphan record which we created above.
* However, if this was a real unlink then we were called by
- * ext3_delete_inode(), and we allow that function to clean up the
+ * ext3_evict_inode(), and we allow that function to clean up the
* orphan info for us.
*/
if (inode->i_nlink)
ext3_orphan_del(handle, inode);
ext3_journal_stop(handle);
+ trace_ext3_truncate_exit(inode);
+ return;
+out_notrans:
+ /*
+ * Delete the inode from orphan list so that it doesn't stay there
+ * forever and trigger assertion on umount.
+ */
+ if (inode->i_nlink)
+ ext3_orphan_del(NULL, inode);
+ trace_ext3_truncate_exit(inode);
}
static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
unsigned long ino, struct ext3_iloc *iloc)
{
- unsigned long desc, group_desc, block_group;
+ unsigned long block_group;
unsigned long offset;
ext3_fsblk_t block;
- struct buffer_head *bh;
- struct ext3_group_desc * gdp;
+ struct ext3_group_desc *gdp;
if (!ext3_valid_inum(sb, ino)) {
/*
@@ -2470,27 +2662,15 @@ static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
}
block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
- if (block_group >= EXT3_SB(sb)->s_groups_count) {
- ext3_error(sb,"ext3_get_inode_block","group >= groups count");
- return 0;
- }
- smp_rmb();
- group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
- desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
- bh = EXT3_SB(sb)->s_group_desc[group_desc];
- if (!bh) {
- ext3_error (sb, "ext3_get_inode_block",
- "Descriptor not loaded");
+ gdp = ext3_get_group_desc(sb, block_group, NULL);
+ if (!gdp)
return 0;
- }
-
- gdp = (struct ext3_group_desc *)bh->b_data;
/*
* Figure out the offset within the block group inode table
*/
offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
EXT3_INODE_SIZE(sb);
- block = le32_to_cpu(gdp[desc].bg_inode_table) +
+ block = le32_to_cpu(gdp->bg_inode_table) +
(offset >> EXT3_BLOCK_SIZE_BITS(sb));
iloc->block_group = block_group;
@@ -2515,15 +2695,25 @@ static int __ext3_get_inode_loc(struct inode *inode,
return -EIO;
bh = sb_getblk(inode->i_sb, block);
- if (!bh) {
+ if (unlikely(!bh)) {
ext3_error (inode->i_sb, "ext3_get_inode_loc",
"unable to read inode block - "
"inode=%lu, block="E3FSBLK,
inode->i_ino, block);
- return -EIO;
+ return -ENOMEM;
}
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
+
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out another inode in the same block. In this
+ * case, we don't have to read the block because we may
+ * read the old inode data successfully.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+
if (buffer_uptodate(bh)) {
/* someone brought it uptodate while we waited */
unlock_buffer(bh);
@@ -2559,7 +2749,7 @@ static int __ext3_get_inode_loc(struct inode *inode,
bitmap_bh = sb_getblk(inode->i_sb,
le32_to_cpu(desc->bg_inode_bitmap));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
@@ -2593,9 +2783,10 @@ make_io:
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
+ trace_ext3_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ_META, bh);
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ext3_error(inode->i_sb, "ext3_get_inode_loc",
@@ -2615,7 +2806,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext3_get_inode_loc(inode, iloc,
- !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
+ !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
}
void ext3_set_inode_flags(struct inode *inode)
@@ -2661,8 +2852,12 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
struct ext3_inode_info *ei;
struct buffer_head *bh;
struct inode *inode;
+ journal_t *journal = EXT3_SB(sb)->s_journal;
+ transaction_t *transaction;
long ret;
int block;
+ uid_t i_uid;
+ gid_t i_gid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -2671,10 +2866,6 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT3_I(inode);
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- ei->i_acl = EXT3_ACL_NOT_CACHED;
- ei->i_default_acl = EXT3_ACL_NOT_CACHED;
-#endif
ei->i_block_alloc_info = NULL;
ret = __ext3_get_inode_loc(inode, &iloc, 0);
@@ -2683,20 +2874,22 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
bh = iloc.bh;
raw_inode = ext3_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
if(!(test_opt (inode->i_sb, NO_UID32))) {
- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ i_uid_write(inode, i_uid);
+ i_gid_write(inode, i_gid);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
- ei->i_state = 0;
+ ei->i_state_flags = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -2742,6 +2935,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ spin_unlock(&journal->j_state_lock);
+ atomic_set(&ei->i_sync_tid, tid);
+ atomic_set(&ei->i_datasync_tid, tid);
+ }
+
if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
/*
@@ -2765,7 +2982,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
EXT3_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize;
if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
- ei->i_state |= EXT3_STATE_XATTR;
+ ext3_set_inode_state(inode, EXT3_STATE_XATTR);
}
} else
ei->i_extra_isize = 0;
@@ -2778,9 +2995,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext3_dir_inode_operations;
inode->i_fop = &ext3_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext3_inode_is_fast_symlink(inode))
+ if (ext3_inode_is_fast_symlink(inode)) {
inode->i_op = &ext3_fast_symlink_inode_operations;
- else {
+ nd_terminate_link(ei->i_data, inode->i_size,
+ sizeof(ei->i_data) - 1);
+ } else {
inode->i_op = &ext3_symlink_inode_operations;
ext3_set_aops(inode);
}
@@ -2818,40 +3037,54 @@ static int ext3_do_update_inode(handle_t *handle,
struct ext3_inode_info *ei = EXT3_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+ int need_datasync = 0;
+ __le32 disksize;
+ uid_t i_uid;
+ gid_t i_gid;
+
+again:
+ /* we can't allow multiple procs in here at once, its a bit racey */
+ lock_buffer(bh);
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT3_STATE_NEW)
+ if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
ext3_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
if(!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
if(!ei->i_dtime) {
raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(inode->i_uid));
+ cpu_to_le16(high_16_bits(i_uid));
raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(inode->i_gid));
+ cpu_to_le16(high_16_bits(i_gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(inode->i_uid));
+ cpu_to_le16(fs_high2lowuid(i_uid));
raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ cpu_to_le16(fs_high2lowgid(i_gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
- raw_inode->i_size = cpu_to_le32(ei->i_disksize);
+ disksize = cpu_to_le32(ei->i_disksize);
+ if (disksize != raw_inode->i_size) {
+ need_datasync = 1;
+ raw_inode->i_size = disksize;
+ }
raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -2867,8 +3100,11 @@ static int ext3_do_update_inode(handle_t *handle,
if (!S_ISREG(inode->i_mode)) {
raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
} else {
- raw_inode->i_size_high =
- cpu_to_le32(ei->i_disksize >> 32);
+ disksize = cpu_to_le32(ei->i_disksize >> 32);
+ if (disksize != raw_inode->i_size_high) {
+ raw_inode->i_size_high = disksize;
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
struct super_block *sb = inode->i_sb;
if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -2878,17 +3114,20 @@ static int ext3_do_update_inode(handle_t *handle,
/* If this is the first large file
* created, add a flag to the superblock.
*/
+ unlock_buffer(bh);
err = ext3_journal_get_write_access(handle,
EXT3_SB(sb)->s_sbh);
if (err)
goto out_brelse;
+
ext3_update_dynamic_rev(sb);
EXT3_SET_RO_COMPAT_FEATURE(sb,
EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
- sb->s_dirt = 1;
handle->h_sync = 1;
err = ext3_journal_dirty_metadata(handle,
EXT3_SB(sb)->s_sbh);
+ /* get our lock and start over */
+ goto again;
}
}
}
@@ -2911,11 +3150,15 @@ static int ext3_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ unlock_buffer(bh);
rc = ext3_journal_dirty_metadata(handle, bh);
if (!err)
err = rc;
- ei->i_state &= ~EXT3_STATE_NEW;
+ ext3_clear_inode_state(inode, EXT3_STATE_NEW);
+ atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
+ if (need_datasync)
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
out_brelse:
brelse (bh);
ext3_std_error(inode->i_sb, err);
@@ -2927,21 +3170,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
- * trasnaction to commit.
+ * transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (for sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -2953,13 +3195,13 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
-int ext3_write_inode(struct inode *inode, int wait)
+int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (ext3_journal_current_handle()) {
@@ -2968,7 +3210,12 @@ int ext3_write_inode(struct inode *inode, int wait)
return -EIO;
}
- if (!wait)
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext3_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
return ext3_force_commit(inode->i_sb);
@@ -3001,19 +3248,21 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if (is_quota_modification(inode, attr))
+ dquot_initialize(inode);
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
- EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+ EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
}
- error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
ext3_journal_stop(handle);
return error;
@@ -3028,6 +3277,9 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
ext3_journal_stop(handle);
}
+ if (attr->ia_valid & ATTR_SIZE)
+ inode_dio_wait(inode);
+
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
handle_t *handle;
@@ -3039,23 +3291,43 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
}
error = ext3_orphan_add(handle, inode);
+ if (error) {
+ ext3_journal_stop(handle);
+ goto err_out;
+ }
EXT3_I(inode)->i_disksize = attr->ia_size;
- rc = ext3_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+ error = ext3_mark_inode_dirty(handle, inode);
ext3_journal_stop(handle);
+ if (error) {
+ /* Some hard fs error must have happened. Bail out. */
+ ext3_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ rc = ext3_block_truncate_page(inode, attr->ia_size);
+ if (rc) {
+ /* Cleanup orphan list and exit */
+ handle = ext3_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext3_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext3_orphan_del(handle, inode);
+ ext3_journal_stop(handle);
+ goto err_out;
+ }
}
- rc = inode_setattr(inode, attr);
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ attr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, attr->ia_size);
+ ext3_truncate(inode);
+ }
- /* If inode_setattr's call to ext3_truncate failed to get a
- * transaction handle at all, we need to clean up the in-core
- * orphan list manually. */
- if (inode->i_nlink)
- ext3_orphan_del(NULL, inode);
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
- if (!rc && (ia_valid & ATTR_MODE))
- rc = ext3_acl_chmod(inode);
+ if (ia_valid & ATTR_MODE)
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext3_std_error(inode->i_sb, error);
@@ -3101,12 +3373,12 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
if (ext3_should_journal_data(inode))
ret = 3 * (bpp + indirects) + 2;
else
- ret = 2 * (bpp + indirects) + 2;
+ ret = 2 * (bpp + indirects) + indirects + 2;
#ifdef CONFIG_QUOTA
- /* We know that structure was already allocated during DQUOT_INIT so
+ /* We know that structure was already allocated during dquot_initialize so
* we will be updating only the data blocks + inodes */
- ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
#endif
return ret;
@@ -3167,14 +3439,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
* inode out, but prune_icache isn't a user-visible syncing function.
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
*/
int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
@@ -3182,6 +3446,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
int err;
might_sleep();
+ trace_ext3_mark_inode_dirty(inode, _RET_IP_);
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (!err)
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
@@ -3195,14 +3460,14 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, dquot_alloc_space() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext3_dirty_inode(struct inode *inode)
+void ext3_dirty_inode(struct inode *inode, int flags)
{
handle_t *current_handle = ext3_journal_current_handle();
handle_t *handle;
@@ -3214,7 +3479,7 @@ void ext3_dirty_inode(struct inode *inode)
current_handle->h_transaction != handle->h_transaction) {
/* This task has a transaction open against a different fs */
printk(KERN_EMERG "%s: transactions do not match!\n",
- __FUNCTION__);
+ __func__);
} else {
jbd_debug(5, "marking dirty. outer handle=%p\n",
current_handle);