diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/ext3/inode.c |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'fs/ext3/inode.c')
-rw-r--r-- | fs/ext3/inode.c | 3132 |
1 files changed, 3132 insertions, 0 deletions
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c new file mode 100644 index 00000000000..040eb288bb1 --- /dev/null +++ b/fs/ext3/inode.c @@ -0,0 +1,3132 @@ +/* + * linux/fs/ext3/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/ext3_jbd.h> +#include <linux/jbd.h> +#include <linux/smp_lock.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/mpage.h> +#include <linux/uio.h> +#include "xattr.h" +#include "acl.h" + +static int ext3_writepage_trans_blocks(struct inode *inode); + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ext3_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT3_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && + inode->i_blocks - ea_blocks == 0); +} + +/* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ + +int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr) +{ + int err; + + might_sleep(); + + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %lx\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext3_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call journal_forget"); + return ext3_journal_forget(handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext3_journal_revoke"); + err = ext3_journal_revoke(handle, blocknr, bh); + if (err) + ext3_abort(inode->i_sb, __FUNCTION__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + +/* + * Work out how many blocks we need to progress with the next chunk of a + * truncate transaction. + */ + +static unsigned long blocks_for_truncate(struct inode *inode) +{ + unsigned long needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext3 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + + return EXT3_DATA_TRANS_BLOCKS + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ + +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext3_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext3_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) + return 0; + if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) +{ + jbd_debug(2, "restarting handle %p\n", handle); + return ext3_journal_restart(handle, blocks_for_truncate(inode)); +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext3_delete_inode (struct inode * inode) +{ + handle_t *handle; + + if (is_bad_inode(inode)) + goto no_delete; + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + /* If we're going to skip the normal cleanup, we still + * need to make sure that the in-core orphan linked list + * is properly cleaned up. */ + ext3_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + if (inode->i_blocks) + ext3_truncate(inode); + /* + * Kill off the orphan record which ext3_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext3_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext3_orphan_del(handle, inode); + EXT3_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext3_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + clear_inode(inode); + else + ext3_free_inode(handle, inode); + ext3_journal_stop(handle); + return; +no_delete: + clear_inode(inode); /* We must guarantee clearing of inode... */ +} + +static int ext3_alloc_block (handle_t *handle, + struct inode * inode, unsigned long goal, int *err) +{ + unsigned long result; + + result = ext3_new_block(handle, inode, goal, err); + return result; +} + + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext3_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext3 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext3_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT3_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT3_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT3_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT3_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big"); + } + if (boundary) + *boundary = (i_block & (ptrs - 1)) == (final - 1); + return n; +} + +/** + * ext3_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples <key, p, bh> and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + */ +static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext3_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the prefered place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ + +static unsigned long ext3_find_near(struct inode *inode, Indirect *ind) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *p; + unsigned long bg_start; + unsigned long colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) + if (*p) + return le32_to_cpu(*p); + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be refered from inode itself? OK, just put it into + * the same cylinder group then. + */ + bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext3_find_goal - find a prefered place for allocation. + * @inode: owner + * @block: block we want + * @chain: chain of indirect blocks + * @partial: pointer to the last triple within a chain + * @goal: place to store the result. + * + * Normally this function find the prefered place for block allocation, + * stores it in *@goal and returns zero. If the branch had been changed + * under us we return -EAGAIN. + */ + +static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], + Indirect *partial, unsigned long *goal) +{ + struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + *goal = block_i->last_alloc_physical_block + 1; + return 0; + } + + if (verify_chain(chain, partial)) { + *goal = ext3_find_near(inode, partial); + return 0; + } + return -EAGAIN; +} + +/** + * ext3_alloc_branch - allocate and set up a chain of blocks. + * @inode: owner + * @num: depth of the chain (number of blocks to allocate) + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates @num blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext3_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext3_get_block(), excpet that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ + +static int ext3_alloc_branch(handle_t *handle, struct inode *inode, + int num, + unsigned long goal, + int *offsets, + Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int n = 0, keys = 0; + int err = 0; + int i; + int parent = ext3_alloc_block(handle, inode, goal, &err); + + branch[0].key = cpu_to_le32(parent); + if (parent) { + for (n = 1; n < num; n++) { + struct buffer_head *bh; + /* Allocate the next block */ + int nr = ext3_alloc_block(handle, inode, parent, &err); + if (!nr) + break; + branch[n].key = cpu_to_le32(nr); + keys = n+1; + + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, parent); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + break; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + break; + + parent = nr; + } + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < keys; i++) { + BUFFER_TRACE(branch[i].bh, "call journal_forget"); + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) + ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); + return err; +} + +/** + * ext3_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext3_alloc_branch) + * @where: location of missing link + * @num: number of blocks we are adding + * + * This function verifies that chain (up to the missing link) had not + * changed, fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. Otherwise (== chain had been changed) + * we free the new blocks (forgetting their buffer_heads, indeed) and + * return -EAGAIN. + */ + +static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, + Indirect chain[4], Indirect *where, int num) +{ + int i; + int err = 0; + struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* Verify that place we are splicing to is still there and vacant */ + + if (!verify_chain(chain, where-1) || *where->p) + /* Writer: end */ + goto changed; + + /* That's it */ + + *where->p = where->key; + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block; + block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * akpm: If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + +changed: + /* + * AKPM: if where[i].bh isn't part of the current updating + * transaction then we explode nastily. Test this code path. + */ + jbd_debug(1, "the chain changed: try again\n"); + err = -EAGAIN; + +err_out: + for (i = 1; i < num; i++) { + BUFFER_TRACE(where[i].bh, "call journal_forget"); + ext3_journal_forget(handle, where[i].bh); + } + /* For the normal collision cleanup case, we free up the blocks. + * On genuine filesystem errors we don't even think about doing + * that. */ + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, + le32_to_cpu(where[i].key), 1); + return err; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * akpm: `handle' can be NULL if create == 0. + * + * The BKL may not be held on entry here. Be sure to take it early. + */ + +static int +ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, int extend_disksize) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + unsigned long goal; + int left; + int boundary = 0; + int depth = ext3_block_to_path(inode, iblock, offsets, &boundary); + struct ext3_inode_info *ei = EXT3_I(inode); + + J_ASSERT(handle != NULL || create == 0); + + if (depth == 0) + goto out; + +reread: + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + clear_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (boundary) + set_buffer_boundary(bh_result); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. + */ + if (err == -EAGAIN) + goto changed; + + goal = 0; + down(&ei->truncate_sem); + + /* lazy initialize the block allocation info here if necessary */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) { + ext3_init_block_alloc_info(inode); + } + + if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) { + up(&ei->truncate_sem); + goto changed; + } + + left = (chain + depth) - partial; + + /* + * Block out ext3_truncate while we alter the tree + */ + err = ext3_alloc_branch(handle, inode, left, goal, + offsets+(partial-chain), partial); + + /* The ext3_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct */ + if (!err) + err = ext3_splice_branch(handle, inode, iblock, chain, + partial, left); + /* i_disksize growing is protected by truncate_sem + * don't forget to protect it if you're about to implement + * concurrent ext3_get_block() -bzzz */ + if (!err && extend_disksize && inode->i_size > ei->i_disksize) + ei->i_disksize = inode->i_size; + up(&ei->truncate_sem); + if (err == -EAGAIN) + goto changed; + if (err) + goto cleanup; + + set_buffer_new(bh_result); + goto got_it; + +changed: + while (partial > chain) { + jbd_debug(1, "buffer chain changed, retrying\n"); + BUFFER_TRACE(partial->bh, "brelsing"); + brelse(partial->bh); + partial--; + } + goto reread; +} + +static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = NULL; + int ret; + + if (create) { + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } + ret = ext3_get_block_handle(handle, inode, iblock, + bh_result, create, 1); + return ret; +} + +#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) + +static int +ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock, + unsigned long max_blocks, struct buffer_head *bh_result, + int create) +{ + handle_t *handle = journal_current_handle(); + int ret = 0; + + if (!handle) + goto get_block; /* A read */ + + if (handle->h_transaction->t_state == T_LOCKED) { + /* + * Huge direct-io writes can hold off commits for long + * periods of time. Let this commit run. + */ + ext3_journal_stop(handle); + handle = ext3_journal_start(inode, DIO_CREDITS); + if (IS_ERR(handle)) + ret = PTR_ERR(handle); + goto get_block; + } + + if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) { + /* + * Getting low on buffer credits... + */ + ret = ext3_journal_extend(handle, DIO_CREDITS); + if (ret > 0) { + /* + * Couldn't extend the transaction. Start a new one. + */ + ret = ext3_journal_restart(handle, DIO_CREDITS); + } + } + +get_block: + if (ret == 0) + ret = ext3_get_block_handle(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +} + +static int ext3_writepages_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return ext3_direct_io_get_blocks(inode, iblock, 1, bh, create); +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, + long block, int create, int * errp) +{ + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != 0); + + /* Now that we do not always journal data, we + should keep in mind whether this should + always journal the new buffer as metadata. + For now, regular file writes use + ext3_get_block instead, so it's not a + problem. */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext3_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } + return NULL; +} + +struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, + int block, int create, int *err) +{ + struct buffer_head * bh; + + bh = ext3_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext3_get_block() + * and the commit_write(). So doing the journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext3_writepage() -> + * block_write_full_page(). In that case, we *know* that ext3_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext3 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ + +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + return ext3_journal_get_write_access(handle, bh); +} + +static int ext3_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + int ret, needed_blocks = ext3_writepage_trans_blocks(inode); + handle_t *handle; + int retries = 0; + +retry: + handle = ext3_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + if (test_opt(inode->i_sb, NOBH)) + ret = nobh_prepare_write(page, from, to, ext3_get_block); + else + ret = block_prepare_write(page, from, to, ext3_get_block); + if (ret) + goto prepare_write_failed; + + if (ext3_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } +prepare_write_failed: + if (ret) + ext3_journal_stop(handle); + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +int +ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_data(handle, bh); + if (err) + ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__, + bh, handle,err); + return err; +} + +/* For commit_write() in data=journal mode */ +static int commit_write_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext3_journal_dirty_metadata(handle, bh); +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext3 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ + +static int ext3_ordered_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, ext3_journal_dirty_data); + + if (ret == 0) { + /* + * generic_commit_write() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + loff_t new_i_size; + + new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; |