diff options
Diffstat (limited to 'fs/ext3/inode.c')
| -rw-r--r-- | fs/ext3/inode.c | 2031 |
1 files changed, 1264 insertions, 767 deletions
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 8824e84f8a5..2c6ccc49ba2 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -13,63 +13,56 @@ * Copyright (C) 1991, 1992 Linus Torvalds * * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 + * (sct@redhat.com), 1993, 1998 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * 64-bit file support on 64-bit platforms by Jakub Jelinek - * (jj@sunsite.ms.mff.cuni.cz) + * (jj@sunsite.ms.mff.cuni.cz) * * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 */ -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/ext3_jbd.h> -#include <linux/jbd.h> -#include <linux/smp_lock.h> #include <linux/highuid.h> -#include <linux/pagemap.h> #include <linux/quotaops.h> -#include <linux/string.h> -#include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/mpage.h> -#include <linux/uio.h> +#include <linux/namei.h> +#include <linux/aio.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); /* * Test whether an inode is a fast symlink. */ -static inline int ext3_inode_is_fast_symlink(struct inode *inode) +static int ext3_inode_is_fast_symlink(struct inode *inode) { int ea_blocks = EXT3_I(inode)->i_file_acl ? (inode->i_sb->s_blocksize >> 9) : 0; - return (S_ISLNK(inode->i_mode) && - inode->i_blocks - ea_blocks == 0); + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } -/* The ext3 forget function must perform a revoke if we are freeing data +/* + * The ext3 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. + * revoked in all cases. * * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record * still needs to be revoked. */ - -int ext3_forget(handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - int blocknr) +int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr) { int err; might_sleep(); + trace_ext3_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -97,18 +90,17 @@ int ext3_forget(handle_t *handle, int is_metadata, BUFFER_TRACE(bh, "call ext3_journal_revoke"); err = ext3_journal_revoke(handle, blocknr, bh); if (err) - ext3_abort(inode->i_sb, __FUNCTION__, + ext3_abort(inode->i_sb, __func__, "error %d when attempting revoke", err); BUFFER_TRACE(bh, "exit"); return err; } /* - * Work out how many blocks we need to progress with the next chunk of a + * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ - -static unsigned long blocks_for_truncate(struct inode *inode) +static unsigned long blocks_for_truncate(struct inode *inode) { unsigned long needed; @@ -125,13 +117,13 @@ static unsigned long blocks_for_truncate(struct inode *inode) /* But we need to bound the transaction so we don't overflow the * journal. */ - if (needed > EXT3_MAX_TRANS_DATA) + if (needed > EXT3_MAX_TRANS_DATA) needed = EXT3_MAX_TRANS_DATA; return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; } -/* +/* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. @@ -139,10 +131,9 @@ static unsigned long blocks_for_truncate(struct inode *inode) * start_transaction gets us a new handle for a truncate transaction, * and extend_transaction tries to extend the existing one a bit. If * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct + * transaction in the top-level truncate loop. --sct */ - -static handle_t *start_transaction(struct inode *inode) +static handle_t *start_transaction(struct inode *inode) { handle_t *result; @@ -174,29 +165,87 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ -static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) +static int truncate_restart_transaction(handle_t *handle, struct inode *inode) { + int ret; + jbd_debug(2, "restarting handle %p\n", handle); - return ext3_journal_restart(handle, blocks_for_truncate(inode)); + /* + * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle + * At this moment, get_block can be called only for blocks inside + * i_size since page cache has been already dropped and writes are + * blocked by i_mutex. So we can safely drop the truncate_mutex. + */ + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); + mutex_lock(&EXT3_I(inode)->truncate_mutex); + return ret; } /* - * Called at the last iput() if i_nlink is zero. + * Called at inode eviction from icache */ -void ext3_delete_inode (struct inode * inode) +void ext3_evict_inode (struct inode *inode) { + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *rsv; handle_t *handle; + int want_delete = 0; + + trace_ext3_evict_inode(inode); + if (!inode->i_nlink && !is_bad_inode(inode)) { + dquot_initialize(inode); + want_delete = 1; + } + + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. + * + * The s_journal check handles the case when ext3_get_journal() fails + * and puts the journal inode. + */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + EXT3_SB(inode->i_sb)->s_journal && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT3_JOURNAL_INO) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } + truncate_inode_pages_final(&inode->i_data); - truncate_inode_pages(&inode->i_data, 0); + ext3_discard_reservation(inode); + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; + if (unlikely(rsv)) + kfree(rsv); - if (is_bad_inode(inode)) + if (!want_delete) goto no_delete; handle = start_transaction(inode); if (IS_ERR(handle)) { - /* If we're going to skip the normal cleanup, we still - * need to make sure that the in-core orphan linked list - * is properly cleaned up. */ + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ ext3_orphan_del(NULL, inode); goto no_delete; } @@ -207,44 +256,39 @@ void ext3_delete_inode (struct inode * inode) if (inode->i_blocks) ext3_truncate(inode); /* - * Kill off the orphan record which ext3_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext3_truncate() actually created an orphan record. - * (Well, we could do this if we need to, but heck - it works) + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. */ ext3_orphan_del(handle, inode); - EXT3_I(inode)->i_dtime = get_seconds(); + ei->i_dtime = get_seconds(); - /* + /* * One subtle ordering requirement: if anything has gone wrong * (transaction abort, IO errors, whatever), then we can still * do these next steps (the fs will already have been marked as * having errors), but we can't free the inode if the mark_dirty - * fails. + * fails. */ - if (ext3_mark_inode_dirty(handle, inode)) - /* If that failed, just do the required in-core inode clear. */ + if (ext3_mark_inode_dirty(handle, inode)) { + /* If that failed, just dquot_drop() and be done with that */ + dquot_drop(inode); + clear_inode(inode); + } else { + ext3_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); clear_inode(inode); - else ext3_free_inode(handle, inode); + } ext3_journal_stop(handle); return; no_delete: - clear_inode(inode); /* We must guarantee clearing of inode... */ -} - -static int ext3_alloc_block (handle_t *handle, - struct inode * inode, unsigned long goal, int *err) -{ - unsigned long result; - - result = ext3_new_block(handle, inode, goal, err); - return result; + clear_inode(inode); + dquot_drop(inode); } - typedef struct { __le32 *p; __le32 key; @@ -257,7 +301,7 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) p->bh = bh; } -static inline int verify_chain(Indirect *from, Indirect *to) +static int verify_chain(Indirect *from, Indirect *to) { while (from <= to && from->key == *from->p) from++; @@ -327,10 +371,10 @@ static int ext3_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big"); + ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big"); } if (boundary) - *boundary = (i_block & (ptrs - 1)) == (final - 1); + *boundary = final - 1 - (i_block & (ptrs - 1)); return n; } @@ -404,13 +448,13 @@ no_block: * @inode: owner * @ind: descriptor of indirect block. * - * This function returns the prefered place for block allocation. + * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same - * cylinder group. + * cylinder group. * * In the latter case we colour the starting block by the callers PID to * prevent it from clashing with concurrent allocations for a different inode @@ -419,51 +463,50 @@ no_block: * * Caller must make sure that @ind is valid and will stay that way. */ - -static unsigned long ext3_find_near(struct inode *inode, Indirect *ind) +static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) { struct ext3_inode_info *ei = EXT3_I(inode); __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; __le32 *p; - unsigned long bg_start; - unsigned long colour; + ext3_fsblk_t bg_start; + ext3_grpblk_t colour; /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) + for (p = ind->p - 1; p >= start; p--) { if (*p) return le32_to_cpu(*p); + } /* No such thing, so let's try location of indirect block */ if (ind->bh) return ind->bh->b_blocknr; /* - * It is going to be refered from inode itself? OK, just put it into - * the same cylinder group then. + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. */ - bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group); colour = (current->pid % 16) * (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); return bg_start + colour; } /** - * ext3_find_goal - find a prefered place for allocation. + * ext3_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want - * @chain: chain of indirect blocks * @partial: pointer to the last triple within a chain - * @goal: place to store the result. * - * Normally this function find the prefered place for block allocation, - * stores it in *@goal and returns zero. + * Normally this function find the preferred place for block allocation, + * returns it. */ -static unsigned long ext3_find_goal(struct inode *inode, long block, - Indirect chain[4], Indirect *partial) +static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, + Indirect *partial) { - struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; + + block_i = EXT3_I(inode)->i_block_alloc_info; /* * try the heuristic for sequential allocation, @@ -478,13 +521,119 @@ static unsigned long ext3_find_goal(struct inode *inode, long block, } /** + * ext3_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext3_alloc_blocks - multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: owner + * @goal: preferred place for allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of blocks need to allocated for direct blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: here we store the error value + * + * return the number of direct blocks allocated + */ +static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int indirect_blks, int blks, + ext3_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext3_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext3_new_blocks(handle,inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; + } + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i <index; i++) + ext3_free_blocks(handle, inode, new_blocks[i], 1); + return ret; +} + +/** * ext3_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction * @inode: owner - * @num: depth of the chain (number of blocks to allocate) + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * - * This function allocates @num blocks, zeroes out all but the last one, + * This function allocates blocks, zeroes out all but the last one, * links them into chain and (if we are synchronous) writes them to disk. * In other words, it prepares a branch that can be spliced onto the * inode. It stores the information about that chain in the branch[], in @@ -501,97 +650,111 @@ static unsigned long ext3_find_goal(struct inode *inode, long block, * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ - static int ext3_alloc_branch(handle_t *handle, struct inode *inode, - int num, - unsigned long goal, - int *offsets, - Indirect *branch) + int indirect_blks, int *blks, ext3_fsblk_t goal, + int *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; - int n = 0, keys = 0; + int i, n = 0; int err = 0; - int i; - int parent = ext3_alloc_block(handle, inode, goal, &err); - - branch[0].key = cpu_to_le32(parent); - if (parent) { - for (n = 1; n < num; n++) { - struct buffer_head *bh; - /* Allocate the next block */ - int nr = ext3_alloc_block(handle, inode, parent, &err); - if (!nr) - break; - branch[n].key = cpu_to_le32(nr); + struct buffer_head *bh; + int num; + ext3_fsblk_t new_blocks[4]; + ext3_fsblk_t current_block; - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, parent); - if (!bh) - break; - keys = n+1; - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext3_journal_get_create_access(handle, bh); - if (err) { - unlock_buffer(bh); - brelse(bh); - break; - } + num = ext3_alloc_blocks(handle, inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32*) bh->b_data + offsets[n]; - *branch[n].p = branch[n].key; - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3_journal_get_create_access(handle, bh); + if (err) { unlock_buffer(bh); + brelse(bh); + goto failed; + } - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) - break; - - parent = nr; + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); } - } - if (n == num) - return 0; + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: /* Allocation failed, free what we already allocated */ - for (i = 1; i < keys; i++) { + for (i = 1; i <= n ; i++) { BUFFER_TRACE(branch[i].bh, "call journal_forget"); ext3_journal_forget(handle, branch[i].bh); } - for (i = 0; i < keys; i++) - ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); + for (i = 0; i < indirect_blks; i++) + ext3_free_blocks(handle, inode, new_blocks[i], 1); + + ext3_free_blocks(handle, inode, new_blocks[i], num); + return err; } /** - * ext3_splice_branch - splice the allocated branch onto inode. - * @inode: owner - * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext3_alloc_branch) - * @where: location of missing link - * @num: number of blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. + * ext3_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. */ - -static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, - Indirect chain[4], Indirect *where, int num) +static int ext3_splice_branch(handle_t *handle, struct inode *inode, + long block, Indirect *where, int num, int blks) { int i; int err = 0; - struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; + ext3_fsblk_t current_block; + struct ext3_inode_info *ei = EXT3_I(inode); + struct timespec now; + block_i = ei->i_block_alloc_info; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block @@ -608,24 +771,39 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, *where->p = where->key; /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* * update the most recently allocated logical & physical block * in i_block_alloc_info, to assist find the proper goal block for next * allocation */ if (block_i) { - block_i->last_alloc_logical_block = block; - block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key); + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; } /* We are done with atomic stuff, now do the rest of housekeeping */ - - inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); + now = CURRENT_TIME_SEC; + if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { + inode->i_ctime = now; + ext3_mark_inode_dirty(handle, inode); + } + /* ext3_mark_inode_dirty already updated i_sync_tid */ + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); /* had we spliced it onto indirect block? */ if (where->bh) { /* - * akpm: If we spliced it onto an indirect block, we haven't + * If we spliced it onto an indirect block, we haven't * altered the inode. Note however that if it is being spliced * onto an indirect block at the very end of the file (the * file is growing) then we *will* alter the inode to reflect @@ -635,7 +813,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, jbd_debug(5, "splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, where->bh); - if (err) + if (err) goto err_out; } else { /* @@ -647,10 +825,13 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, return err; err_out: - for (i = 1; i < num; i++) { + for (i = 1; i <= num; i++) { BUFFER_TRACE(where[i].bh, "call journal_forget"); ext3_journal_forget(handle, where[i].bh); + ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); } + ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); + return err; } @@ -666,26 +847,34 @@ err_out: * allocations is needed - we simply release blocks and do not touch anything * reachable from inode. * - * akpm: `handle' can be NULL if create == 0. + * `handle' can be NULL if create == 0. * * The BKL may not be held on entry here. Be sure to take it early. + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. */ - -static int -ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int extend_disksize) +int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create) { int err = -EIO; int offsets[4]; Indirect chain[4]; Indirect *partial; - unsigned long goal; - int left; - int boundary = 0; - const int depth = ext3_block_to_path(inode, iblock, offsets, &boundary); + ext3_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; struct ext3_inode_info *ei = EXT3_I(inode); + int count = 0; + ext3_fsblk_t first_block = 0; + + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); J_ASSERT(handle != NULL || create == 0); + depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); if (depth == 0) goto out; @@ -694,15 +883,44 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock, /* Simplest case - block found, no allocation needed */ if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); clear_buffer_new(bh_result); - goto got_it; + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext3_fsblk_t blk; + + if (!verify_chain(chain, chain + depth - 1)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now. Flag the err as EAGAIN, so it + * will reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; } /* Next simple case - plain lookup or failed read of indirect block */ if (!create || err == -EIO) goto cleanup; - down(&ei->truncate_sem); + /* + * Block out ext3_truncate while we alter the tree + */ + mutex_lock(&ei->truncate_mutex); /* * If the indirect block is missing while we are reading @@ -723,7 +941,8 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock, } partial = ext3_get_branch(inode, depth, offsets, chain, &err); if (!partial) { - up(&ei->truncate_sem); + count++; + mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; clear_buffer_new(bh_result); @@ -738,14 +957,18 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock, if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) ext3_init_block_alloc_info(inode); - goal = ext3_find_goal(inode, iblock, chain, partial); + goal = ext3_find_goal(inode, iblock, partial); - left = (chain + depth) - partial; + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; /* - * Block out ext3_truncate while we alter the tree + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. */ - err = ext3_alloc_branch(handle, inode, left, goal, + count = ext3_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, offsets + (partial - chain), partial); /* @@ -756,24 +979,18 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock, * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) - err = ext3_splice_branch(handle, inode, iblock, chain, - partial, left); - /* - * i_disksize growing is protected by truncate_sem. Don't forget to - * protect it if you're about to implement concurrent - * ext3_get_block() -bzzz - */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; - up(&ei->truncate_sem); + err = ext3_splice_branch(handle, inode, iblock, + partial, indirect_blks, count); + mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; set_buffer_new(bh_result); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); - if (boundary) + if (count > blocks_to_boundary) set_buffer_boundary(bh_result); + err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ cleanup: @@ -784,75 +1001,66 @@ cleanup: } BUFFER_TRACE(bh_result, "returned"); out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); return err; } +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 +/* + * Number of credits we need for writing DIO_MAX_BLOCKS: + * We need sb + group descriptor + bitmap + inode -> 4 + * For B blocks with A block pointers per block we need: + * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). + * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. + */ +#define DIO_CREDITS 25 + static int ext3_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle = NULL; - int ret; + handle_t *handle = ext3_journal_current_handle(); + int ret = 0, started = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + if (create && !handle) { /* Direct IO write... */ + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + handle = ext3_journal_start(inode, DIO_CREDITS + + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + started = 1; + } - if (create) { - handle = ext3_journal_current_handle(); - J_ASSERT(handle != 0); + ret = ext3_get_blocks_handle(handle, inode, iblock, + max_blocks, bh_result, create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; } - ret = ext3_get_block_handle(handle, inode, iblock, - bh_result, create, 1); + if (started) + ext3_journal_stop(handle); +out: return ret; } -#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) - -static int -ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, - int create) +int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) { - handle_t *handle = journal_current_handle(); - int ret = 0; - - if (!handle) - goto get_block; /* A read */ - - if (handle->h_transaction->t_state == T_LOCKED) { - /* - * Huge direct-io writes can hold off commits for long - * periods of time. Let this commit run. - */ - ext3_journal_stop(handle); - handle = ext3_journal_start(inode, DIO_CREDITS); - if (IS_ERR(handle)) - ret = PTR_ERR(handle); - goto get_block; - } - - if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) { - /* - * Getting low on buffer credits... - */ - ret = ext3_journal_extend(handle, DIO_CREDITS); - if (ret > 0) { - /* - * Couldn't extend the transaction. Start a new one. - */ - ret = ext3_journal_restart(handle, DIO_CREDITS); - } - } - -get_block: - if (ret == 0) - ret = ext3_get_block_handle(handle, inode, iblock, - bh_result, create, 0); - bh_result->b_size = (1 << inode->i_blkbits); - return ret; + return generic_block_fiemap(inode, fieinfo, start, len, + ext3_get_block); } /* * `handle' can be NULL if create is zero */ -struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, - long block, int create, int * errp) +struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, + long block, int create, int *errp) { struct buffer_head dummy; int fatal = 0, err; @@ -862,29 +1070,40 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); - if (!*errp && buffer_mapped(&dummy)) { + err = ext3_get_blocks_handle(handle, inode, block, 1, + &dummy, create); + /* + * ext3_get_blocks_handle() returns number of blocks + * mapped. 0 in case of a HOLE. + */ + if (err > 0) { + WARN_ON(err > 1); + err = 0; + } + *errp = err; + if (!err && buffer_mapped(&dummy)) { struct buffer_head *bh; bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (!bh) { - *errp = -EIO; + if (unlikely(!bh)) { + *errp = -ENOMEM; goto err; } if (buffer_new(&dummy)) { J_ASSERT(create != 0); - J_ASSERT(handle != 0); - - /* Now that we do not always journal data, we - should keep in mind whether this should - always journal the new buffer as metadata. - For now, regular file writes use - ext3_get_block instead, so it's not a - problem. */ + J_ASSERT(handle != NULL); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext3_get_block instead, so it's not a + * problem. + */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); fatal = ext3_journal_get_create_access(handle, bh); if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data, 0, inode->i_sb->s_blocksize); + memset(bh->b_data,0,inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); @@ -906,7 +1125,7 @@ err: return NULL; } -struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, +struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, int block, int create, int *err) { struct buffer_head * bh; @@ -914,9 +1133,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, bh = ext3_getblk(handle, inode, block, create, err); if (!bh) return bh; - if (buffer_uptodate(bh)) + if (bh_uptodate_or_lock(bh)) return bh; - ll_rw_block(READ, 1, &bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -941,7 +1162,7 @@ static int walk_page_buffers( handle_t *handle, for ( bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) + block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; @@ -980,63 +1201,143 @@ static int walk_page_buffers( handle_t *handle, * So what we do is to rely on the fact that journal_stop/journal_start * will _not_ run commit under these circumstances because handle->h_ref * is elevated. We'll still have enough credits for the tiny quotafile - * write. + * write. */ - -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) { + int dirty = buffer_dirty(bh); + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - return ext3_journal_get_write_access(handle, bh); + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext3_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext3_journal_dirty_metadata(handle, bh); + return ret; } -static int ext3_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext3_truncate_failed_write(struct inode *inode) { - struct inode *inode = page->mapping->host; - int ret, needed_blocks = ext3_writepage_trans_blocks(inode); + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext3_truncate(inode); +} + +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); +} + +static int ext3_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret; handle_t *handle; int retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + /* Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason */ + int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + + trace_ext3_write_begin(inode, pos, len, flags); + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; retry: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + handle = ext3_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { + unlock_page(page); + page_cache_release(page); ret = PTR_ERR(handle); goto out; } - if (test_opt(inode->i_sb, NOBH)) - ret = nobh_prepare_write(page, from, to, ext3_get_block); - else - ret = block_prepare_write(page, from, to, ext3_get_block); + ret = __block_write_begin(page, pos, len, ext3_get_block); if (ret) - goto prepare_write_failed; + goto write_begin_failed; if (ext3_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), from, to, NULL, do_journal_get_write_access); } -prepare_write_failed: - if (ret) +write_begin_failed: + if (ret) { + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before truncate + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; out: return ret; } -int -ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) + +int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { int err = journal_dirty_data(handle, bh); if (err) - ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__, - bh, handle,err); + ext3_journal_abort_handle(__func__, __func__, + bh, handle, err); return err; } -/* For commit_write() in data=journal mode */ -static int commit_write_fn(handle_t *handle, struct buffer_head *bh) +/* For ordered writepage and write_end functions */ +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + /* + * Write could have mapped the buffer but it didn't copy the data in + * yet. So avoid filing such buffer into a transaction. + */ + if (buffer_mapped(bh) && buffer_uptodate(bh)) + return ext3_journal_dirty_data(handle, bh); + return 0; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; @@ -1045,99 +1346,150 @@ static int commit_write_fn(handle_t *handle, struct buffer_head *bh) } /* + * This is nasty and subtle: ext3_write_begin() could have allocated blocks + * for the whole page but later we failed to copy the data in. Update inode + * size according to what we managed to copy. The rest is going to be + * truncated in write_end function. + */ +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) +{ + /* What matters to us is i_disksize. We don't write i_size anywhere */ + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + if (pos + copied > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = pos + copied; + mark_inode_dirty(inode); + } +} + +/* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * * ext3 never places buffers on inode->i_mapping->private_list. metadata * buffers are managed internally. */ - -static int ext3_ordered_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ext3_ordered_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = file->f_mapping->host; + unsigned from, to; int ret = 0, ret2; - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext3_journal_dirty_data); + trace_ext3_ordered_write_end(inode, pos, len, copied); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - if (ret == 0) { - /* - * generic_commit_write() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ - loff_t new_i_size; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + copied; + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, journal_dirty_data_fn); - new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - if (new_i_size > EXT3_I(inode)->i_disksize) - EXT3_I(inode)->i_disksize = new_i_size; - ret = generic_commit_write(file, page, from, to); - } + if (ret == 0) + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); ret2 = ext3_journal_stop(handle); if (!ret) ret = ret2; - return ret; + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; } -static int ext3_writeback_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ext3_writeback_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; - int ret = 0, ret2; - loff_t new_i_size; - - new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - if (new_i_size > EXT3_I(inode)->i_disksize) - EXT3_I(inode)->i_disksize = new_i_size; + struct inode *inode = file->f_mapping->host; + int ret; - if (test_opt(inode->i_sb, NOBH)) - ret = nobh_commit_write(file, page, from, to); - else - ret = generic_commit_write(file, page, from, to); + trace_ext3_writeback_write_end(inode, pos, len, copied); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ret = ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); - ret2 = ext3_journal_stop(handle); - if (!ret) - ret = ret2; - return ret; + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; } -static int ext3_journalled_commit_write(struct file *file, - struct page *page, unsigned from, unsigned to) +static int ext3_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); int ret = 0, ret2; int partial = 0; - loff_t pos; + unsigned from, to; - /* - * Here we duplicate the generic_commit_write() functionality - */ - pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + trace_ext3_journalled_write_end(inode, pos, len, copied); + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from + copied, to); + to = from + copied; + } ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, commit_write_fn); + to, &partial, write_end_fn); if (!partial) SetPageUptodate(page); - if (pos > inode->i_size) - i_size_write(inode, pos); - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; - if (inode->i_size > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = inode->i_size; + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); - if (!ret) + if (!ret) ret = ret2; } + ret2 = ext3_journal_stop(handle); if (!ret) ret = ret2; - return ret; + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; } -/* +/* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * @@ -1146,10 +1498,10 @@ static int ext3_journalled_commit_write(struct file *file, * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zero's written out previously to the journal and - * awaiting writeback in the kernel's buffer cache. + * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, - * take extra steps to flush any blocks which might be in the cache. + * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) { @@ -1157,17 +1509,17 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; - if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { - /* + if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { + /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. + * do we expect this to happen. * * (bmap requires CAP_SYS_RAWIO so this does not * represent an unprivileged user DOS attack --- we'd be * in trouble if mortal users could trigger this path at - * will.) + * will.) * * NB. EXT3_STATE_JDATA is not set on files other than * regular files. If somebody wants to bmap a directory @@ -1176,7 +1528,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; + ext3_clear_inode_state(inode, EXT3_STATE_JDATA); journal = EXT3_JOURNAL(inode); journal_lock_updates(journal); err = journal_flush(journal); @@ -1201,67 +1553,26 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) { - if (buffer_mapped(bh)) - return ext3_journal_dirty_data(handle, bh); - return 0; + return !buffer_mapped(bh); } /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext3_get_block() should be journalled - * along with the data so we don't crash and then get metadata which + * Note that whenever we need to map blocks we start a transaction even if + * we're not journalling data. This is to preserve ordering: any hole + * instantiation within __block_write_full_page -> ext3_get_block() should be + * journalled along with the data so we don't crash and then get metadata which * refers to old data. * * In all journalling modes block_write_full_page() will start the I/O. * - * Problem: - * - * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext3_writepage() - * - * Similar for: - * - * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext3_get_block(). We will deadlock on various things like - * lock_journal and i_truncate_sem. - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * * We don't honour synchronous mounts for writepage(). That would be * disastrous. Any write() or metadata operation will sync the fs for * us. - * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ static int ext3_ordered_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct buffer_head *page_bufs; @@ -1270,6 +1581,13 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); /* * We give up here if we're reentered, because it might be for a @@ -1278,6 +1596,20 @@ static int ext3_ordered_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_ordered_writepage(page); + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { @@ -1285,11 +1617,6 @@ static int ext3_ordered_writepage(struct page *page, goto out_fail; } - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - page_bufs = page_buffers(page); walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, bget_one); @@ -1303,16 +1630,13 @@ static int ext3_ordered_writepage(struct page *page, */ /* - * And attach them to the current transaction. But only if + * And attach them to the current transaction. But only if * block_write_full_page() succeeded. Otherwise they are unmapped, * and generally junk. */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + if (ret == 0) + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_data_fn); - if (!ret) - ret = err; - } walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, bput_one); err = ext3_journal_stop(handle); @@ -1334,19 +1658,35 @@ static int ext3_writeback_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_writeback_writepage(page); + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_fail; } - if (test_opt(inode->i_sb, NOBH)) - ret = nobh_writepage(page, ext3_get_block, wbc); - else - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, ext3_get_block, wbc); err = ext3_journal_stop(handle); if (!ret) @@ -1367,23 +1707,33 @@ static int ext3_journalled_writepage(struct page *page, int ret = 0; int err; - if (ext3_journal_current_handle()) - goto no_write; - - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; - } + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + trace_ext3_journalled_writepage(page); if (!page_has_buffers(page) || PageChecked(page)) { + if (ext3_journal_current_handle()) + goto no_write; + + handle = ext3_journal_start(inode, + ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext3_get_block); + ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, + ext3_get_block); if (ret != 0) { ext3_journal_stop(handle); goto out_unlock; @@ -1392,22 +1742,25 @@ static int ext3_journalled_writepage(struct page *page, PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, commit_write_fn); + PAGE_CACHE_SIZE, NULL, write_end_fn); if (ret == 0) ret = err; - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); unlock_page(page); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; } else { /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. + * It is a page full of checkpoint-mode buffers. Go and write + * them. They should have been already mapped when they went + * to the journal so provide NULL get_block function to catch + * errors. */ - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, NULL, wbc); } - err = ext3_journal_stop(handle); - if (!ret) - ret = err; out: return ret; @@ -1420,6 +1773,7 @@ out_unlock: static int ext3_readpage(struct file *file, struct page *page) { + trace_ext3_readpage(page); return mpage_readpage(page, ext3_get_block); } @@ -1430,23 +1784,27 @@ ext3_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); } -static int ext3_invalidatepage(struct page *page, unsigned long offset) +static void ext3_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_invalidatepage(page, offset, length); + /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - return journal_invalidatepage(journal, page, offset); + journal_invalidatepage(journal, page, offset, length); } static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_releasepage(page); WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -1459,54 +1817,78 @@ static int ext3_releasepage(struct page *page, gfp_t wait) * if the machine crashes during the write. * * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. */ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct ext3_inode_info *ei = EXT3_I(inode); - handle_t *handle = NULL; + handle_t *handle; ssize_t ret; int orphan = 0; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); + int retries = 0; + + trace_ext3_direct_IO_enter(inode, offset, count, rw); if (rw == WRITE) { loff_t final_size = offset + count; - handle = ext3_journal_start(inode, DIO_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } ret = ext3_orphan_add(handle, inode); - if (ret) - goto out_stop; + if (ret) { + ext3_journal_stop(handle); + goto out; + } orphan = 1; ei->i_disksize = inode->i_size; + ext3_journal_stop(handle); } } - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext3_direct_io_get_blocks, NULL); - +retry: + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block); /* - * Reacquire the handle: ext3_direct_io_get_block() can restart the - * transaction + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. */ - handle = journal_current_handle(); + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; -out_stop: - if (handle) { + if (end > isize) + ext3_truncate_failed_direct_write(inode); + } + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { int err; - if (orphan && inode->i_nlink) + /* Credits for sb + inode write */ + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Truncate allocated blocks + * and pretend the write failed... */ + ext3_truncate_failed_direct_write(inode); + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + goto out; + } + if (inode->i_nlink) ext3_orphan_del(handle, inode); - if (orphan && ret > 0) { + if (ret > 0) { loff_t end = offset + ret; if (end > inode->i_size) { ei->i_disksize = end; @@ -1526,6 +1908,7 @@ out_stop: ret = err; } out: + trace_ext3_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -1548,43 +1931,49 @@ static int ext3_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static struct address_space_operations ext3_ordered_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_ordered_writepage, - .sync_page = block_sync_page, - .prepare_write = ext3_prepare_write, - .commit_write = ext3_ordered_commit_write, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, +static const struct address_space_operations ext3_ordered_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_ordered_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_ordered_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .is_dirty_writeback = buffer_check_dirty_writeback, + .error_remove_page = generic_error_remove_page, }; -static struct address_space_operations ext3_writeback_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_writeback_writepage, - .sync_page = block_sync_page, - .prepare_write = ext3_prepare_write, - .commit_write = ext3_writeback_commit_write, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, +static const struct address_space_operations ext3_writeback_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_writeback_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_writeback_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; -static struct address_space_operations ext3_journalled_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_journalled_writepage, - .sync_page = block_sync_page, - .prepare_write = ext3_prepare_write, - .commit_write = ext3_journalled_commit_write, - .set_page_dirty = ext3_journalled_set_page_dirty, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, +static const struct address_space_operations ext3_journalled_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_journalled_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_journalled_write_end, + .set_page_dirty = ext3_journalled_set_page_dirty, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; void ext3_set_aops(struct inode *inode) @@ -1603,36 +1992,27 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from) { - unsigned long index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, length, pos; - struct inode *inode = mapping->host; + struct page *page; + handle_t *handle = NULL; struct buffer_head *bh; int err = 0; - void *kaddr; + /* Truncated on block boundary - nothing to do */ blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - /* - * For "nobh" option, we can only work if we don't need to - * read-in the page - otherwise we create buffers to do the IO. - */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) { - if (PageUptodate(page)) { - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, length); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - set_page_dirty(page); - goto unlock; - } - } - if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -1665,27 +2045,33 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) + if (err) goto unlock; } + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); + goto unlock; + } + } + if (ext3_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext3_journal_get_write_access(handle, bh); if (err) - goto unlock; + goto stop; } - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, length); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - + zero_user(page, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); err = 0; @@ -1696,6 +2082,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, err = ext3_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } +stop: + if (handle) + ext3_journal_stop(handle); unlock: unlock_page(page); @@ -1728,7 +2117,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q) * * When we do truncate() we may have to clean the ends of several * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is refered + * partially truncated if some data below the new i_size is referred * from it (and it is on the path to the first completely truncated * data block, indeed). We have to free the top of that path along * with everything to the right of the path. Since no allocation @@ -1751,17 +2140,14 @@ static inline int all_zeroes(__le32 *p, __le32 *q) * c) free the subtrees growing from the inode past the @chain[0]. * (no partially truncated stuff there). */ -static Indirect *ext3_find_shared(struct inode *inode, - int depth, - int offsets[4], - Indirect chain[4], - __le32 *top) +static Indirect *ext3_find_shared(struct inode *inode, int depth, + int offsets[4], Indirect chain[4], __le32 *top) { Indirect *partial, *p; int k, err; *top = 0; - /* Make k index the deepest non-null offest + 1 */ + /* Make k index the deepest non-null offset + 1 */ for (k = depth; k > 1 && !offsets[k-1]; k--) ; partial = ext3_get_branch(inode, k, offsets, chain, &err); @@ -1794,8 +2180,7 @@ static Indirect *ext3_find_shared(struct inode *inode, } /* Writer: end */ - while(partial > p) - { + while(partial > p) { brelse(partial->bh); partial--; } @@ -1811,22 +2196,23 @@ no_top: * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. */ -static void -ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, - unsigned long block_to_free, unsigned long count, - __le32 *first, __le32 *last) +static void ext3_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t block_to_free, + unsigned long count, __le32 *first, __le32 *last) { __le32 *p; if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + if (ext3_journal_dirty_metadata(handle, bh)) + return; } ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); + truncate_restart_transaction(handle, inode); if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ext3_journal_get_write_access(handle, bh); + if (ext3_journal_get_write_access(handle, bh)) + return; } } @@ -1860,7 +2246,7 @@ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, * @first: array of block numbers * @last: points immediately past the end of array * - * We are freeing all blocks refered from that array (numbers are stored as + * We are freeing all blocks referred from that array (numbers are stored as * little-endian 32-bit) and updating @inode->i_blocks appropriately. * * We accumulate contiguous runs of blocks to free. Conveniently, if these @@ -1875,12 +2261,12 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, struct buffer_head *this_bh, __le32 *first, __le32 *last) { - unsigned long block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ + ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ - unsigned long nr; /* Current block # */ + ext3_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ int err; @@ -1905,7 +2291,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, } else if (nr == block_to_free + count) { count++; } else { - ext3_clear_blocks(handle, inode, this_bh, + ext3_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); block_to_free = nr; @@ -1921,7 +2307,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, if (this_bh) { BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, this_bh); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext3_journal_dirty_metadata(handle, this_bh); + else + ext3_error(inode->i_sb, "ext3_free_data", + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)this_bh->b_blocknr); } } @@ -1934,7 +2334,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, * @last: pointer immediately past the end of array * @depth: depth of the branches to free * - * We are freeing all blocks refered from these branches (numbers are + * We are freeing all blocks referred from these branches (numbers are * stored as little-endian 32-bit) and updating @inode->i_blocks * appropriately. */ @@ -1942,7 +2342,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, __le32 *first, __le32 *last, int depth) { - unsigned long nr; + ext3_fsblk_t nr; __le32 *p; if (is_handle_aborted(handle)) @@ -1966,7 +2366,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, */ if (!bh) { ext3_error(inode->i_sb, "ext3_free_branches", - "Read failure, inode=%ld, block=%ld", + "Read failure, inode=%lu, block="E3FSBLK, inode->i_ino, nr); continue; } @@ -1979,27 +2379,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, depth); /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext3_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. - * - * If this block has already been committed to the - * journal, a revoke record will be written. And - * revoke records must be emitted *before* clearing - * this block's bit in the bitmaps. - */ - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - - /* * Everything below this this pointer has been * released. Now let this top-of-subtree go. * @@ -2019,9 +2398,34 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, return; if (try_to_extend_transaction(handle, inode)) { ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); + truncate_restart_transaction(handle, inode); } + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. Thus + * we don't allow a block to be reallocated until + * a transaction freeing it has fully committed. + * + * We also have to make sure journal replay after a + * crash does not overwrite non-journaled data blocks + * with old metadata when the block got reallocated for + * data. Thus we have to store a revoke record for a + * block in the same transaction in which we free the + * block. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + ext3_free_blocks(handle, inode, nr, 1); if (parent_bh) { @@ -2035,7 +2439,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, *p = 0; BUFFER_TRACE(parent_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, + ext3_journal_dirty_metadata(handle, parent_bh); } } @@ -2047,6 +2451,17 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, } } +int ext3_can_truncate(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext3_inode_is_fast_symlink(inode); + return 0; +} + /* * ext3_truncate() * @@ -2054,7 +2469,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, * transaction, and VFS/VM ensures that ext3_truncate() cannot run * simultaneously on behalf of the same inode. * - * As we work through the truncate and commmit bits of it to the journal there + * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * @@ -2075,14 +2490,12 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, * that's fine - as long as they are linked from the inode, the post-crash * ext3_truncate() run will find them and release them. */ - -void ext3_truncate(struct inode * inode) +void ext3_truncate(struct inode *inode) { handle_t *handle; struct ext3_inode_info *ei = EXT3_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -2090,47 +2503,21 @@ void ext3_truncate(struct inode * inode) int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (ext3_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; + trace_ext3_truncate_enter(inode); - /* - * We have to lock the EOF page here, because lock_page() nests - * outside journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - return; - } + if (!ext3_can_truncate(inode)) + goto out_notrans; + + if (inode->i_size == 0 && ext3_should_writeback_data(inode)) + ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } - return; /* AKPM: return what? */ - } + if (IS_ERR(handle)) + goto out_notrans; last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - - if (page) - ext3_block_truncate_page(handle, page, mapping, inode->i_size); - n = ext3_block_to_path(inode, last_block, offsets, NULL); if (n == 0) goto out_stop; /* error */ @@ -2160,7 +2547,7 @@ void ext3_truncate(struct inode * inode) * From here we block out all ext3_get_block() callers who want to * modify the block allocation tree. */ - down(&ei->truncate_sem); + mutex_lock(&ei->truncate_mutex); if (n == 1) { /* direct blocks */ ext3_free_data(handle, inode, NULL, i_data+offsets[0], @@ -2182,7 +2569,6 @@ void ext3_truncate(struct inode * inode) */ } else { /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); ext3_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); @@ -2200,39 +2586,38 @@ void ext3_truncate(struct inode * inode) do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { - default: - nr = i_data[EXT3_IND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, - &nr, &nr+1, 1); - i_data[EXT3_IND_BLOCK] = 0; - } - case EXT3_IND_BLOCK: - nr = i_data[EXT3_DIND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, - &nr, &nr+1, 2); - i_data[EXT3_DIND_BLOCK] = 0; - } - case EXT3_DIND_BLOCK: - nr = i_data[EXT3_TIND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, - &nr, &nr+1, 3); - i_data[EXT3_TIND_BLOCK] = 0; - } - case EXT3_TIND_BLOCK: - ; + default: + nr = i_data[EXT3_IND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT3_IND_BLOCK] = 0; + } + case EXT3_IND_BLOCK: + nr = i_data[EXT3_DIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT3_DIND_BLOCK] = 0; + } + case EXT3_DIND_BLOCK: + nr = i_data[EXT3_TIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT3_TIND_BLOCK] = 0; + } + case EXT3_TIND_BLOCK: + ; } ext3_discard_reservation(inode); - up(&ei->truncate_sem); + mutex_unlock(&ei->truncate_mutex); inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); - /* In a multi-transaction truncate, we only make the final - * transaction synchronous */ + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ if (IS_SYNC(inode)) handle->h_sync = 1; out_stop: @@ -2240,57 +2625,52 @@ out_stop: * If this was a simple ftruncate(), and the file will remain alive * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by - * ext3_delete_inode(), and we allow that function to clean up the + * ext3_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); + return; +out_notrans: + /* + * Delete the inode from orphan list so that it doesn't stay there + * forever and trigger assertion on umount. + */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); } -static unsigned long ext3_get_inode_block(struct super_block *sb, +static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, unsigned long ino, struct ext3_iloc *iloc) { - unsigned long desc, group_desc, block_group; - unsigned long offset, block; - struct buffer_head *bh; - struct ext3_group_desc * gdp; - + unsigned long block_group; + unsigned long offset; + ext3_fsblk_t block; + struct ext3_group_desc *gdp; - if ((ino != EXT3_ROOT_INO && - ino != EXT3_JOURNAL_INO && - ino != EXT3_RESIZE_INO && - ino < EXT3_FIRST_INO(sb)) || - ino > le32_to_cpu( - EXT3_SB(sb)->s_es->s_inodes_count)) { - ext3_error (sb, "ext3_get_inode_block", - "bad inode number: %lu", ino); + if (!ext3_valid_inum(sb, ino)) { + /* + * This error is already checked for in namei.c unless we are + * looking at an NFS filehandle, in which case no error + * report is needed + */ return 0; } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); - if (block_group >= EXT3_SB(sb)->s_groups_count) { - ext3_error (sb, "ext3_get_inode_block", - "group >= groups count"); - return 0; - } - smp_rmb(); - group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); - desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); - bh = EXT3_SB(sb)->s_group_desc[group_desc]; - if (!bh) { - ext3_error (sb, "ext3_get_inode_block", - "Descriptor not loaded"); + gdp = ext3_get_group_desc(sb, block_group, NULL); + if (!gdp) return 0; - } - - gdp = (struct ext3_group_desc *) bh->b_data; /* * Figure out the offset within the block group inode table */ offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * EXT3_INODE_SIZE(sb); - block = le32_to_cpu(gdp[desc].bg_inode_table) + + block = le32_to_cpu(gdp->bg_inode_table) + (offset >> EXT3_BLOCK_SIZE_BITS(sb)); iloc->block_group = block_group; @@ -2307,7 +2687,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb, static int __ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc, int in_mem) { - unsigned long block; + ext3_fsblk_t block; struct buffer_head *bh; block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); @@ -2315,14 +2695,25 @@ static int __ext3_get_inode_loc(struct inode *inode, return -EIO; bh = sb_getblk(inode->i_sb, block); - if (!bh) { + if (unlikely(!bh)) { ext3_error (inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " - "inode=%lu, block=%lu", inode->i_ino, block); - return -EIO; + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); + return -ENOMEM; } if (!buffer_uptodate(bh)) { lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); @@ -2358,7 +2749,7 @@ static int __ext3_get_inode_loc(struct inode *inode, bitmap_bh = sb_getblk(inode->i_sb, le32_to_cpu(desc->bg_inode_bitmap)); - if (!bitmap_bh) + if (unlikely(!bitmap_bh)) goto make_io; /* @@ -2392,14 +2783,15 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext3_error(inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " - "inode=%lu, block=%lu", + "inode=%lu, block="E3FSBLK, inode->i_ino, block); brelse(bh); return -EIO; @@ -2414,7 +2806,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext3_get_inode_loc(inode, iloc, - !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); + !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); } void ext3_set_inode_flags(struct inode *inode) @@ -2434,39 +2826,70 @@ void ext3_set_inode_flags(struct inode *inode) inode->i_flags |= S_DIRSYNC; } -void ext3_read_inode(struct inode * inode) +/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ +void ext3_get_inode_flags(struct ext3_inode_info *ei) +{ + unsigned int flags = ei->vfs_inode.i_flags; + + ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| + EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT3_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT3_APPEND_FL; + if (flags & S_IMMUTABLE) + ei->i_flags |= EXT3_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT3_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT3_DIRSYNC_FL; +} + +struct inode *ext3_iget(struct super_block *sb, unsigned long ino) { struct ext3_iloc iloc; struct ext3_inode *raw_inode; - struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_inode_info *ei; struct buffer_head *bh; + struct inode *inode; + journal_t *journal = EXT3_SB(sb)->s_journal; + transaction_t *transaction; + long ret; int block; + uid_t i_uid; + gid_t i_gid; -#ifdef CONFIG_EXT3_FS_POSIX_ACL - ei->i_acl = EXT3_ACL_NOT_CACHED; - ei->i_default_acl = EXT3_ACL_NOT_CACHED; -#endif + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT3_I(inode); ei->i_block_alloc_info = NULL; - if (__ext3_get_inode_loc(inode, &iloc, 0)) + ret = __ext3_get_inode_loc(inode, &iloc, 0); + if (ret < 0) goto bad_inode; bh = iloc.bh; raw_inode = ext3_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if(!(test_opt (inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); - inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); - inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; - ei->i_state = 0; + ei->i_state_flags = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -2479,6 +2902,7 @@ void ext3_read_inode(struct inode * inode) !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { /* this inode is deleted */ brelse (bh); + ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have @@ -2486,9 +2910,6 @@ void ext3_read_inode(struct inode * inode) * recovery code: that's fine, we're about to complete * the process of deleting those. */ } - inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size - * (for stat), not the fs block - * size */ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); ei->i_flags = le32_to_cpu(raw_inode->i_flags); #ifdef EXT3_FRAGMENTS @@ -2514,6 +2935,30 @@ void ext3_read_inode(struct inode * inode) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + atomic_set(&ei->i_sync_tid, tid); + atomic_set(&ei->i_datasync_tid, tid); + } + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { /* @@ -2523,8 +2968,11 @@ void ext3_read_inode(struct inode * inode) */ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT3_INODE_SIZE(inode->i_sb)) + EXT3_INODE_SIZE(inode->i_sb)) { + brelse (bh); + ret = -EIO; goto bad_inode; + } if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ ei->i_extra_isize = sizeof(struct ext3_inode) - @@ -2534,7 +2982,7 @@ void ext3_read_inode(struct inode * inode) EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) - ei->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -2547,9 +2995,11 @@ void ext3_read_inode(struct inode * inode) inode->i_op = &ext3_dir_inode_operations; inode->i_fop = &ext3_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext3_inode_is_fast_symlink(inode)) + if (ext3_inode_is_fast_symlink(inode)) { inode->i_op = &ext3_fast_symlink_inode_operations; - else { + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); } @@ -2558,17 +3008,18 @@ void ext3_read_inode(struct inode * inode) if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); - else + else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } brelse (iloc.bh); ext3_set_inode_flags(inode); - return; + unlock_new_inode(inode); + return inode; bad_inode: - make_bad_inode(inode); - return; + iget_failed(inode); + return ERR_PTR(ret); } /* @@ -2578,47 +3029,62 @@ bad_inode: * * The caller must have write access to iloc->bh. */ -static int ext3_do_update_inode(handle_t *handle, - struct inode *inode, +static int ext3_do_update_inode(handle_t *handle, + struct inode *inode, struct ext3_iloc *iloc) { struct ext3_inode *raw_inode = ext3_raw_inode(iloc); struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + int need_datasync = 0; + __le32 disksize; + uid_t i_uid; + gid_t i_gid; + +again: + /* we can't allow multiple procs in here at once, its a bit racey */ + lock_buffer(bh); /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT3_STATE_NEW) + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); + ext3_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); if(!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ if(!ei->i_dtime) { raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); + cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); + cpu_to_le16(high_16_bits(i_gid)); } else { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); + cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); + cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size = cpu_to_le32(ei->i_disksize); + disksize = cpu_to_le32(ei->i_disksize); + if (disksize != raw_inode->i_size) { + need_datasync = 1; + raw_inode->i_size = disksize; + } raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); @@ -2634,8 +3100,11 @@ static int ext3_do_update_inode(handle_t *handle, if (!S_ISREG(inode->i_mode)) { raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); } else { - raw_inode->i_size_high = - cpu_to_le32(ei->i_disksize >> 32); + disksize = cpu_to_le32(ei->i_disksize >> 32); + if (disksize != raw_inode->i_size_high) { + raw_inode->i_size_high = disksize; + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { struct super_block *sb = inode->i_sb; if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, @@ -2645,17 +3114,20 @@ static int ext3_do_update_inode(handle_t *handle, /* If this is the first large file * created, add a flag to the superblock. */ + unlock_buffer(bh); err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); if (err) goto out_brelse; + ext3_update_dynamic_rev(sb); EXT3_SET_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_LARGE_FILE); - sb->s_dirt = 1; handle->h_sync = 1; err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + /* get our lock and start over */ + goto again; } } } @@ -2678,11 +3150,15 @@ static int ext3_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + unlock_buffer(bh); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) err = rc; - ei->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); + if (need_datasync) + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); out_brelse: brelse (bh); ext3_std_error(inode->i_sb, err); @@ -2694,21 +3170,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running - * trasnaction to commit. + * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (for sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -2720,22 +3195,27 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ -int ext3_write_inode(struct inode *inode, int wait) +int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (ext3_journal_current_handle()) { - jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } - if (!wait) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext3_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; return ext3_force_commit(inode->i_sb); @@ -2754,7 +3234,7 @@ int ext3_write_inode(struct inode *inode, int wait) * commit will leave the blocks being flushed in an unused state on * disk. (On recovery, the inode will get truncated and the blocks will * be freed, so we have a strong guarantee that no future commit will - * leave these blocks visible to the user.) + * leave these blocks visible to the user.) * * Called with inode->sem down. */ @@ -2768,19 +3248,21 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } - error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext3_journal_stop(handle); return error; @@ -2795,6 +3277,9 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) ext3_journal_stop(handle); } + if (attr->ia_valid & ATTR_SIZE) + inode_dio_wait(inode); + if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { handle_t *handle; @@ -2806,23 +3291,43 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) } error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } EXT3_I(inode)->i_disksize = attr->ia_size; - rc = ext3_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } } - rc = inode_setattr(inode, attr); + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); + } - /* If inode_setattr's call to ext3_truncate failed to get a - * transaction handle at all, we need to clean up the in-core - * orphan list manually. */ - if (inode->i_nlink) - ext3_orphan_del(NULL, inode); + setattr_copy(inode, attr); + mark_inode_dirty(inode); - if (!rc && (ia_valid & ATTR_MODE)) - rc = ext3_acl_chmod(inode); + if (ia_valid & ATTR_MODE) + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext3_std_error(inode->i_sb, error); @@ -2833,7 +3338,7 @@ err_out: /* - * akpm: how many blocks doth make a writepage()? + * How many blocks doth make a writepage()? * * With N blocks per page, it may be: * N data blocks @@ -2868,12 +3373,12 @@ static int ext3_writepage_trans_blocks(struct inode *inode) if (ext3_should_journal_data(inode)) ret = 3 * (bpp + indirects) + 2; else - ret = 2 * (bpp + indirects) + 2; + ret = 2 * (bpp + indirects) + indirects + 2; #ifdef CONFIG_QUOTA - /* We know that structure was already allocated during DQUOT_INIT so + /* We know that structure was already allocated during dquot_initialize so * we will be updating only the data blocks + inodes */ - ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif return ret; @@ -2897,13 +3402,13 @@ int ext3_mark_iloc_dirty(handle_t *handle, return err; } -/* +/* * On success, We end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. + * iloc->bh. This _must_ be cleaned up later. */ int -ext3_reserve_inode_write(handle_t *handle, struct inode *inode, +ext3_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext3_iloc *iloc) { int err = 0; @@ -2923,8 +3428,8 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode, } /* - * akpm: What we do here is to mark the in-core inode as clean - * with respect to inode dirtiness (it may still be data-dirty). + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). * This means that the in-core inode may be reaped by prune_icache * without having to perform any I/O. This is a very good thing, * because *any* task may call prune_icache - even ones which @@ -2934,14 +3439,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode, * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. - * - * Is this efficient/effective? Well, we're being nice to the system - * by cleaning up our inodes proactively so they can be reaped - * without I/O. But we are potentially leaving up to five seconds' - * worth of inodes floating about which prune_icache wants us to - * write out. One way to fix that would be to get prune_icache() - * to do a write_super() to free up some memory. It has the desired - * effect. */ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) { @@ -2949,6 +3446,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) int err; might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); @@ -2956,20 +3454,20 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) } /* - * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() + * ext3_dirty_inode() is called from __mark_inode_dirty() * * We're really interested in the case where a file is being extended. * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks + * Also, dquot_alloc_space() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ -void ext3_dirty_inode(struct inode *inode) +void ext3_dirty_inode(struct inode *inode, int flags) { handle_t *current_handle = ext3_journal_current_handle(); handle_t *handle; @@ -2981,7 +3479,7 @@ void ext3_dirty_inode(struct inode *inode) current_handle->h_transaction != handle->h_transaction) { /* This task has a transaction open against a different fs */ printk(KERN_EMERG "%s: transactions do not match!\n", - __FUNCTION__); + __func__); } else { jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); @@ -2992,16 +3490,15 @@ out: return; } -#ifdef AKPM -/* +#if 0 +/* * Bind an inode's backing buffer_head into this transaction, to prevent * it from being flushed to disk early. Unlike * ext3_reserve_inode_write, this leaves behind no bh reference and * returns no iloc structure, so the caller needs to repeat the iloc * lookup to mark the inode dirty later. */ -static inline int -ext3_pin_inode(handle_t *handle, struct inode *inode) +static int ext3_pin_inode(handle_t *handle, struct inode *inode) { struct ext3_iloc iloc; @@ -3012,7 +3509,7 @@ ext3_pin_inode(handle_t *handle, struct inode *inode) BUFFER_TRACE(iloc.bh, "get_write_access"); err = journal_get_write_access(handle, iloc.bh); if (!err) - err = ext3_journal_dirty_metadata(handle, + err = ext3_journal_dirty_metadata(handle, iloc.bh); brelse(iloc.bh); } @@ -3039,7 +3536,7 @@ int ext3_change_inode_journal_flag(struct inode *inode, int val) */ journal = EXT3_JOURNAL(inode); - if (is_journal_aborted(journal) || IS_RDONLY(inode)) + if (is_journal_aborted(journal)) return -EROFS; journal_lock_updates(journal); |
