diff options
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 3233 |
1 files changed, 3233 insertions, 0 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c new file mode 100644 index 00000000000..0a60ec5a16d --- /dev/null +++ b/fs/ext4/inode.c @@ -0,0 +1,3233 @@ +/* + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/ext4_jbd2.h> +#include <linux/jbd2.h> +#include <linux/smp_lock.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/mpage.h> +#include <linux/uio.h> +#include <linux/bio.h> +#include "xattr.h" +#include "acl.h" + +/* + * Test whether an inode is a fast symlink. + */ +static int ext4_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT4_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ +int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %lx\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + return ext4_journal_forget(handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext4_journal_revoke"); + err = ext4_journal_revoke(handle, blocknr, bh); + if (err) + ext4_abort(inode->i_sb, __FUNCTION__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static unsigned long blocks_for_truncate(struct inode *inode) +{ + unsigned long needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + return 0; + if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) +{ + jbd_debug(2, "restarting handle %p\n", handle); + return ext4_journal_restart(handle, blocks_for_truncate(inode)); +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext4_delete_inode (struct inode * inode) +{ + handle_t *handle; + + truncate_inode_pages(&inode->i_data, 0); + + if (is_bad_inode(inode)) + goto no_delete; + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext4_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + if (inode->i_blocks) + ext4_truncate(inode); + /* + * Kill off the orphan record which ext4_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext4_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext4_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext4_orphan_del(handle, inode); + EXT4_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext4_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + clear_inode(inode); + else + ext4_free_inode(handle, inode); + ext4_journal_stop(handle); + return; +no_delete: + clear_inode(inode); /* We must guarantee clearing of inode... */ +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext4_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big"); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples <key, p, bh> and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the prefered place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *p; + ext4_fsblk_t bg_start; + ext4_grpblk_t colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext4_find_goal - find a prefered place for allocation. + * @inode: owner + * @block: block we want + * @chain: chain of indirect blocks + * @partial: pointer to the last triple within a chain + * @goal: place to store the result. + * + * Normally this function find the prefered place for block allocation, + * stores it in *@goal and returns zero. + */ + +static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block, + Indirect chain[4], Indirect *partial) +{ + struct ext4_block_alloc_info *block_i; + + block_i = EXT4_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + return block_i->last_alloc_physical_block + 1; + } + + return ext4_find_near(inode, partial); +} + +/** + * ext4_blks_to_allocate: Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @blks: on return it will store the total number of allocated + * direct blocks + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_blocks(handle,inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; + } + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i <index; i++) + ext4_free_blocks(handle, inode, new_blocks[i], 1); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + int indirect_blks, int *blks, ext4_fsblk_t goal, + int *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + for (i = 1; i <= n ; i++) { + BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); + ext4_journal_forget(handle, branch[i].bh); + } + for (i = 0; i <indirect_blks; i++) + ext4_free_blocks(handle, inode, new_blocks[i], 1); + + ext4_free_blocks(handle, inode, new_blocks[i], num); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + long block, Indirect *where, int num, int blks) +{ + int i; + int err = 0; + struct ext4_block_alloc_info *block_i; + ext4_fsblk_t current_block; + + block_i = EXT4_I(inode)->i_block_alloc_info; + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME_SEC; + ext4_mark_inode_dirty(handle, inode); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); + ext4_journal_forget(handle, where[i].bh); + ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); + } + ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); + + return err; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * The BKL may not be held on entry here. Be sure to take it early. + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + */ +int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + struct ext4_inode_info *ei = EXT4_I(inode); + int count = 0; + ext4_fsblk_t first_block = 0; + + + J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); + J_ASSERT(handle != NULL || create == 0); + depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + clear_buffer_new(bh_result); + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + if (!verify_chain(chain, partial)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now. Flag the err as EAGAIN, so it + * will reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) + goto cleanup; + + mutex_lock(&ei->truncate_mutex); + + /* + * If the indirect block is missing while we are reading + * the chain(ext4_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } + + /* + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) + ext4_init_block_alloc_info(inode); + + goal = ext4_find_goal(inode, iblock, chain, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, iblock, + partial, indirect_blks, count); + /* + * i_disksize growing is protected by truncate_mutex. Don't forget to + * protect it if you're about to implement concurrent + * ext4_get_block() -bzzz + */ + if (!err && extend_disksize && inode->i_size > ei->i_disksize) + ei->i_disksize = inode->i_size; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + + set_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); +out: + return err; +} + +#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) + +static int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = journal_current_handle(); + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + if (!create) + goto get_block; /* A read */ + + if (max_blocks == 1) + goto get_block; /* A single block get */ + + if (handle->h_transaction->t_state == T_LOCKED) { + /* + * Huge direct-io writes can hold off commits for long + * periods of time. Let this commit run. + */ + ext4_journal_stop(handle); + handle = ext4_journal_start(inode, DIO_CREDITS); + if (IS_ERR(handle)) + ret = PTR_ERR(handle); + goto get_block; + } + + if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) { + /* + * Getting low on buffer credits... + */ + ret = ext4_journal_extend(handle, DIO_CREDITS); + if (ret > 0) { + /* + * Couldn't extend the transaction. Start a new one. + */ + ret = ext4_journal_restart(handle, DIO_CREDITS); + } + } + +get_block: + if (ret == 0) { + ret = ext4_get_blocks_wrap(handle, inode, iblock, + max_blocks, bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + } + return ret; +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, + long block, int create, int *errp) +{ + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + err = ext4_get_blocks_wrap(handle, inode, block, 1, + &dummy, create, 1); + /* + * ext4_get_blocks_handle() returns number of blocks + * mapped. 0 in case of a HOLE. + */ + if (err > 0) { + if (err > 1) + WARN_ON(1); + err = 0; + } + *errp = err; + if (!err && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (!bh) { + *errp = -EIO; + goto err; + } + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != 0); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. + */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext4_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data,0,inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } +err: + return NULL; +} + +struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, + int block, int create, int *err) +{ + struct buffer_head * bh; + + bh = ext4_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the commit_write(). So doing the jbd2_journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext4_writepage() -> + * block_write_fu |