diff options
author | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-04-27 10:29:56 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-04-27 10:29:56 -0700 |
commit | ea6db58f3ea55f413c882095d2afaea8137f4f8c (patch) | |
tree | 9f7509b5dfe0fdd422b3e2b3a98ed8321d796c66 /fs | |
parent | c58b8e4a25a1ba347a0e5d21984c97bd296f1691 (diff) | |
parent | 83418978827324918a8cd25ce5227312de1d4468 (diff) |
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (27 commits)
ocfs2: Cache extent records
ocfs2: Remember rw lock level during direct io
ocfs2: Fix up i_blocks calculation to know about holes
ocfs2: Fix extent lookup to return true size of holes
ocfs2: Read from an unwritten extent returns zeros
ocfs2: make room for unwritten extents flag
ocfs2: Use own splice write actor
ocfs2: Use do_sync_mapping_range() in ocfs2_zero_tail_for_truncate()
[PATCH] Turn do_sync_file_range() into do_sync_mapping_range()
ocfs2: zero tail of sparse files on truncate
ocfs2: Teach ocfs2_get_block() about holes
ocfs2: remove ocfs2_prepare_write() and ocfs2_commit_write()
ocfs2: teach ocfs2_file_aio_write() about sparse files
ocfs2: Turn off shared writeable mmap for local files systems with holes.
ocfs2: abstract out allocation locking
ocfs2: teach extend/truncate about sparse files
ocfs2: temporarily remove extent map caching
ocfs2: sparse b-tree support
ocfs2: small cleanup of ocfs2_request_delete()
ocfs2: remove unused code
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ocfs2/alloc.c | 3037 | ||||
-rw-r--r-- | fs/ocfs2/alloc.h | 27 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 1011 | ||||
-rw-r--r-- | fs/ocfs2/aops.h | 77 | ||||
-rw-r--r-- | fs/ocfs2/cluster/quorum.c | 5 | ||||
-rw-r--r-- | fs/ocfs2/cluster/tcp_internal.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/dir.c | 15 | ||||
-rw-r--r-- | fs/ocfs2/dlm/dlmdomain.c | 5 | ||||
-rw-r--r-- | fs/ocfs2/dlm/dlmrecovery.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.c | 143 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.h | 3 | ||||
-rw-r--r-- | fs/ocfs2/extent_map.c | 1233 | ||||
-rw-r--r-- | fs/ocfs2/extent_map.h | 39 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 637 | ||||
-rw-r--r-- | fs/ocfs2/file.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 199 | ||||
-rw-r--r-- | fs/ocfs2/inode.h | 23 | ||||
-rw-r--r-- | fs/ocfs2/journal.c | 24 | ||||
-rw-r--r-- | fs/ocfs2/journal.h | 2 | ||||
-rw-r--r-- | fs/ocfs2/mmap.c | 7 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 23 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 55 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_fs.h | 31 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_lockid.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/slot_map.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/suballoc.c | 3 | ||||
-rw-r--r-- | fs/ocfs2/super.c | 7 | ||||
-rw-r--r-- | fs/ocfs2/vote.c | 289 | ||||
-rw-r--r-- | fs/ocfs2/vote.h | 3 | ||||
-rw-r--r-- | fs/sync.c | 8 |
30 files changed, 4690 insertions, 2235 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f27e5378caf..a0c8667caa7 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -27,6 +27,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/swap.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -34,6 +35,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "aops.h" #include "dlmglue.h" #include "extent_map.h" #include "inode.h" @@ -47,63 +49,243 @@ #include "buffer_head_io.h" -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno); +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); -static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - int wanted, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head *bhs[]); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; -static int ocfs2_add_branch(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head *eb_bh, - struct buffer_head *last_eb_bh, - struct ocfs2_alloc_context *meta_ac); +#define OCFS2_MAX_PATH_DEPTH 5 -static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head **ret_new_eb_bh); +struct ocfs2_path { + int p_tree_depth; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 blkno, - u32 new_clusters); +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) -static int ocfs2_find_branch_target(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head **target_bh); +/* + * Reset the actual path elements so that we can re-use the structure + * to build another path. Generally, this involves freeing the buffer + * heads. + */ +static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +{ + int i, start = 0, depth = 0; + struct ocfs2_path_item *node; -static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_dinode *fe, - unsigned int new_i_clusters, - struct buffer_head *old_last_eb, - struct buffer_head **new_last_eb); + if (keep_root) + start = 1; + + for(i = start; i < path_num_items(path); i++) { + node = &path->p_node[i]; + + brelse(node->bh); + node->bh = NULL; + node->el = NULL; + } + + /* + * Tree depth may change during truncate, or insert. If we're + * keeping the root extent list, then make sure that our path + * structure reflects the proper depth. + */ + if (keep_root) + depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + + path->p_tree_depth = depth; +} + +static void ocfs2_free_path(struct ocfs2_path *path) +{ + if (path) { + ocfs2_reinit_path(path, 0); + kfree(path); + } +} + +/* + * Make the *dest path the same as src and re-initialize src path to + * have a root only. + */ +static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + brelse(dest->p_node[i].bh); + + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + src->p_node[i].bh = NULL; + src->p_node[i].el = NULL; + } +} + +/* + * Insert an extent block at given index. + * + * This will not take an additional reference on eb_bh. + */ +static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, + struct buffer_head *eb_bh) +{ + struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Right now, no root bh is an extent block, so this helps + * catch code errors with dinode trees. The assertion can be + * safely removed if we ever need to insert extent block + * structures at the root. + */ + BUG_ON(index == 0); + + path->p_node[index].bh = eb_bh; + path->p_node[index].el = &eb->h_list; +} + +static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, + struct ocfs2_extent_list *root_el) +{ + struct ocfs2_path *path; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); + + path = kzalloc(sizeof(*path), GFP_NOFS); + if (path) { + path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); + get_bh(root_bh); + path_root_bh(path) = root_bh; + path_root_el(path) = root_el; + } + + return path; +} + +/* + * Allocate and initialize a new path based on a disk inode tree. + */ +static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *el = &di->id2.i_list; + + return ocfs2_new_path(di_bh, el); +} + +/* + * Convenience function to journal all components in a path. + */ +static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, + struct ocfs2_path *path) +{ + int i, ret = 0; + + if (!path) + goto out; + + for(i = 0; i < path_num_items(path); i++) { + ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + +out: + return ret; +} + +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT +}; -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno) +/* + * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and + * ocfs2_extent_contig only work properly against leaf nodes! + */ +static int ocfs2_block_extent_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + u64 blkno) +{ + u64 blk_end = le64_to_cpu(ext->e_blkno); + + blk_end += ocfs2_clusters_to_blocks(sb, + le16_to_cpu(ext->e_leaf_clusters)); + + return blkno == blk_end; +} + +static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, + struct ocfs2_extent_rec *right) +{ + u32 left_range; + + left_range = le32_to_cpu(left->e_cpos) + + le16_to_cpu(left->e_leaf_clusters); + + return (left_range == le32_to_cpu(right->e_cpos)); +} + +static enum ocfs2_contig_type + ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) { - return blkno == (le64_to_cpu(ext->e_blkno) + - ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(ext->e_clusters))); + u64 blkno = le64_to_cpu(insert_rec->e_blkno); + + if (ocfs2_extents_adjacent(ext, insert_rec) && + ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) + return CONTIG_RIGHT; + + blkno = le64_to_cpu(ext->e_blkno); + if (ocfs2_extents_adjacent(insert_rec, ext) && + ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) + return CONTIG_LEFT; + + return CONTIG_NONE; } /* + * NOTE: We can have pretty much any combination of contiguousness and + * appending. + * + * The usefulness of APPEND_TAIL is more in that it lets us know that + * we'll have to update the path to that leaf. + */ +enum ocfs2_append_type { + APPEND_NONE = 0, + APPEND_TAIL, +}; + +struct ocfs2_insert_type { + enum ocfs2_append_type ins_appending; + enum ocfs2_contig_type ins_contig; + int ins_contig_index; + int ins_free_records; + int ins_tree_depth; +}; + +/* * How many free extents have we got before we need more meta data? */ int ocfs2_num_free_extents(struct ocfs2_super *osb, @@ -242,6 +424,28 @@ bail: } /* + * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). + * + * Returns the sum of the rightmost extent rec logical offset and + * cluster count. + * + * ocfs2_add_branch() uses this to determine what logical cluster + * value should be populated into the leftmost new branch records. + * + * ocfs2_shift_tree_depth() uses this to determine the # clusters + * value for the new topmost tree record. + */ +static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) +{ + int i; + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + return le32_to_cpu(el->l_recs[i].e_cpos) + + ocfs2_rec_clusters(el, &el->l_recs[i]); +} + +/* * Add an entire tree branch to our inode. eb_bh is the extent block * to start at, if we don't want to start the branch at the dinode * structure. @@ -250,7 +454,7 @@ bail: * for the new last extent block. * * the new branch will be 'empty' in the sense that every block will - * contain a single record with e_clusters == 0. + * contain a single record with cluster count == 0. */ static int ocfs2_add_branch(struct ocfs2_super *osb, handle_t *handle, @@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; + u32 new_cpos; mlog_entry_void(); @@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } + eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. * conversly, new_eb_bhs[0] is the new bottommost leaf. @@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, eb->h_next_leaf_blk = 0; eb_el->l_tree_depth = cpu_to_le16(i); eb_el->l_next_free_rec = cpu_to_le16(1); - eb_el->l_recs[0].e_cpos = fe->i_clusters; + /* + * This actually counts as an empty extent as + * c_clusters == 0 + */ + eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); - eb_el->l_recs[0].e_clusters = cpu_to_le32(0); + /* + * eb_el isn't always an interior node, but even leaf + * nodes want a zero'd flags and reserved field so + * this gets the whole 32 bits regardless of use. + */ + eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); if (!eb_el->l_tree_depth) new_last_eb_blk = le64_to_cpu(eb->h_blkno); @@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * either be on the fe, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); - el->l_recs[i].e_cpos = fe->i_clusters; - el->l_recs[i].e_clusters = 0; + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + el->l_recs[i].e_int_clusters = 0; le16_add_cpu(&el->l_next_free_rec, 1); /* fe needs a new last extent block pointer, as does the @@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, struct buffer_head **ret_new_eb_bh) { int status, i; + u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; @@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, /* copy the fe data into the new extent block */ eb_el->l_tree_depth = fe_el->l_tree_depth; eb_el->l_next_free_rec = fe_el->l_next_free_rec; - for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { - eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; - eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; - eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; - } + for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) + eb_el->l_recs[i] = fe_el->l_recs[i]; status = ocfs2_journal_dirty(handle, new_eb_bh); if (status < 0) { @@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } + new_clusters = ocfs2_sum_rightmost_rec(eb_el); + /* update fe now */ le16_add_cpu(&fe_el->l_tree_depth, 1); fe_el->l_recs[0].e_cpos = 0; fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_clusters = fe->i_clusters; - for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { - fe_el->l_recs[i].e_cpos = 0; - fe_el->l_recs[i].e_clusters = 0; - fe_el->l_recs[i].e_blkno = 0; - } + fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) + memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); fe_el->l_next_free_rec = cpu_to_le16(1); /* If this is our 1st tree depth shift, then last_eb_blk @@ -515,199 +729,6 @@ bail: } /* - * Expects the tree to already have room in the rightmost leaf for the - * extent. Updates all the extent blocks (and the dinode) on the way - * down. - */ -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters) -{ - int status, i, num_bhs = 0; - u64 next_blkno; - u16 next_free; - struct buffer_head **eb_bhs = NULL; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; - - mlog_entry_void(); - - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; - if (el->l_tree_depth) { - /* This is another operation where we want to be - * careful about our tree updates. An error here means - * none of the previous changes we made should roll - * forward. As a result, we have to record the buffers - * for this part of the tree in an array and reserve a - * journal write to them before making any changes. */ - num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); - eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), - GFP_KERNEL); - if (!eb_bhs) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - i = 0; - while(el->l_tree_depth) { - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); - - BUG_ON(i >= num_bhs); - status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], - OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, - eb); - status = -EIO; - goto bail; - } - - status = ocfs2_journal_access(handle, inode, eb_bhs[i], - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - el = &eb->h_list; - i++; - /* When we leave this loop, eb_bhs[num_bhs - 1] will - * hold the bottom-most leaf extent block. */ - } - BUG_ON(el->l_tree_depth); - - el = &fe->id2.i_list; - /* If we have tree depth, then the fe update is - * trivial, and we want to switch el out for the - * bottom-most leaf in order to update it with the - * actual extent data below. */ - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - /* (num_bhs - 1) to avoid the leaf */ - for(i = 0; i < (num_bhs - 1); i++) { - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - el = &eb->h_list; - - /* finally, make our actual change to the - * intermediate extent blocks. */ - next_free = le16_to_cpu(el->l_next_free_rec); - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - - status = ocfs2_journal_dirty(handle, eb_bhs[i]); - if (status < 0) - mlog_errno(status); - } - BUG_ON(i != (num_bhs - 1)); - /* note that the leaf block wasn't touched in - * the loop above */ - eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; - el = &eb->h_list; - BUG_ON(el->l_tree_depth); - } - - /* yay, we can finally add the actual extent now! */ - i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le16_to_cpu(el->l_next_free_rec) && - ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { - le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); - } else if (le16_to_cpu(el->l_next_free_rec) && - (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { - /* having an empty extent at eof is legal. */ - if (el->l_recs[i].e_cpos != fe->i_clusters) { - ocfs2_error(inode->i_sb, - "Dinode %llu trailing extent is bad: " - "cpos (%u) != number of clusters (%u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - le32_to_cpu(el->l_recs[i].e_cpos), - le32_to_cpu(fe->i_clusters)); - status = -EIO; - goto bail; - } - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - } else { - /* No contiguous record, or no empty record at eof, so - * we add a new one. */ - - BUG_ON(le16_to_cpu(el->l_next_free_rec) >= - le16_to_cpu(el->l_count)); - i = le16_to_cpu(el->l_next_free_rec); - - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - el->l_recs[i].e_cpos = fe->i_clusters; - le16_add_cpu(&el->l_next_free_rec, 1); - } - - /* - * extent_map errors are not fatal, so they are ignored outside - * of flushing the thing. - */ - status = ocfs2_extent_map_append(inode, &el->l_recs[i], - new_clusters); - if (status) { - mlog_errno(status); - ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) - mlog_errno(status); - if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); - if (status < 0) - mlog_errno(status); - } - - status = 0; -bail: - if (eb_bhs) { - for (i = 0; i < num_bhs; i++) - if (eb_bhs[i]) - brelse(eb_bhs[i]); - kfree(eb_bhs); - } - - mlog_exit(status); - return status; -} - -/* * Should only be called when there is no space left in any of the * leaf nodes. What we want to do is find the lowest tree depth * non-leaf extent block with room for new records. There are three @@ -807,53 +828,1548 @@ bail: return status; } -/* the caller needs to update fe->i_clusters */ -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters, - struct ocfs2_alloc_context *meta_ac) +/* + * This is only valid for leaf nodes, which are the only ones that can + * have empty extents anyway. + */ +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) { - int status, i, shift; - struct buffer_head *last_eb_bh = NULL; + return !rec->e_leaf_clusters; +} + +/* + * This function will discard the rightmost extent record. + */ +static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + int count = le16_to_cpu(el->l_count); + unsigned int num_bytes; + + BUG_ON(!next_free); + /* This will cause us to go off the end of our extent list. */ + BUG_ON(next_free >= count); + + num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; + + memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); +} + +static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i, insert_index, next_free, has_empty, num_bytes; + u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); + + BUG_ON(!next_free); + + /* The tree code before us didn't allow enough room in the leaf. */ + if (el->l_next_free_rec == el->l_count && !has_empty) + BUG(); + + /* + * The easiest way to approach this is to just remove the + * empty extent and temporarily decrement next_free. + */ + if (has_empty) { + /* + * If next_free was 1 (only an empty extent), this + * loop won't execute, which is fine. We still want + * the decrement above to happen. + */ + for(i = 0; i < (next_free - 1); i++) + el->l_recs[i] = el->l_recs[i+1]; + + next_free--; + } + + /* + * Figure out what the new record index should be. + */ + for(i = 0; i < next_free; i++) { + rec = &el->l_recs[i]; + + if (insert_cpos < le32_to_cpu(rec->e_cpos)) + break; + } + insert_index = i; + + mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", + insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); + + BUG_ON(insert_index < 0); + BUG_ON(insert_index >= le16_to_cpu(el->l_count)); + BUG_ON(insert_index > next_free); + + /* + * No need to memmove if we're just adding to the tail. + */ + if (insert_index != next_free) { + BUG_ON(next_free >= le16_to_cpu(el->l_count)); + + num_bytes = next_free - insert_index; + num_bytes *= sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[insert_index + 1], + &el->l_recs[insert_index], + num_bytes); + } + + /* + * Either we had an empty extent, and need to re-increment or + * there was no empty extent on a non full rightmost leaf node, + * in which case we still need to increment. + */ + next_free++; + el->l_next_free_rec = cpu_to_le16(next_free); + /* + * Make sure none of the math above just messed up our tree. + */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); + + el->l_recs[insert_index] = *insert_rec; + +} + +/* + * Create an empty extent record . + * + * l_next_free_rec may be updated. + * + * If an empty extent already exists do nothing. + */ +static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (next_free == 0) + goto set_and_inc; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) + return; + + mlog_bug_on_msg(el->l_count == el->l_next_free_rec, + "Asked to create an empty extent in a full list:\n" + "count = %u, tree depth = %u", + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_tree_depth)); + + ocfs2_shift_records_right(el); + +set_and_inc: + le16_add_cpu(&el->l_next_free_rec, 1); + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); +} + +/* + * For a rotation which involves two leaf nodes, the "root node" is + * the lowest level tree node which contains a path to both leafs. This + * resulting set of information can be used to form a complete "subtree" + * + * This function is passed two full paths from the dinode down to a + * pair of adjacent leaves. It's task is to figure out which path + * index contains the subtree root - this can be the root index itself + * in a worst-case rotation. + * + * The array index of the subtree root is passed back. + */ +static int ocfs2_find_subtree_root(struct inode *inode, + struct ocfs2_path *left, + struct ocfs2_path *right) +{ + int i = 0; + + /* + * Check that the caller passed in two paths from the same tree. + */ + BUG_ON(path_root_bh(left) != path_root_bh(right)); + + do { + i++; + + /* + * The caller didn't pass two adjacent paths. + */ + mlog_bug_on_msg(i > left->p_tree_depth, + "Inode %lu, left depth %u, right depth %u\n" + "left leaf blk %llu, right leaf blk %llu\n", + inode->i_ino, left->p_tree_depth, + right->p_tree_depth, + (unsigned long long)path_leaf_bh(left)->b_blocknr, + (unsigned long long)path_leaf_bh(right)->b_blocknr); + } while (left->p_node[i].bh->b_blocknr == + right->p_node[i].bh->b_blocknr); + + return i - 1; +} + +typedef void (path_insert_t)(void *, struct buffer_head *); + +/* + * Traverse a btree path in search of cpos, starting at root_el. + * + * This code can be called with a cpos larger than the tree, in which + * case it will return the rightmost path. + */ +static int __ocfs2_find_path(struct inode *inode, + struct ocfs2_extent_list *root_el, u32 cpos, + path_insert_t *func, void *data) +{ + int i, ret = 0; + u32 range; + u64 blkno; struct buffer_head *bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct ocfs2_inode_info *oi = OCFS2_I(inode); - mlog_entry_void(); + el = root_el; + while (el->l_tree_depth) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent list at " + "depth %u\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; - mlog(0, "add %u clusters starting at block %llu to inode %llu\n", - new_clusters, (unsigned long long)start_blk, - (unsigned long long)OCFS2_I(inode)->ip_blkno); + } - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { + |