diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2007-01-16 11:32:23 -0800 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2007-04-26 14:44:03 -0700 |
commit | dcd0538ff4e854fa9d7f4630b359ca8fdb5cb5a8 (patch) | |
tree | 226d725f8199907cea2433d1d183b01e51d9bc55 /fs/ocfs2 | |
parent | 6f16bf655c5795586dd2ac96a7c70e0b9a378746 (diff) |
ocfs2: sparse b-tree support
Introduce tree rotations into the b-tree code. This will allow ocfs2 to
support sparse files. Much of the added code is designed to be generic (in
the ocfs2 sense) so that it can later be re-used to implement large
extended attributes.
This patch only adds the rotation code and does minimal updates to callers
of the extent api.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/alloc.c | 2446 | ||||
-rw-r--r-- | fs/ocfs2/alloc.h | 3 | ||||
-rw-r--r-- | fs/ocfs2/dir.c | 6 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 23 | ||||
-rw-r--r-- | fs/ocfs2/file.h | 1 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 5 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 7 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_fs.h | 8 |
8 files changed, 2002 insertions, 497 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f27e5378caf..a9669686757 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -47,62 +47,230 @@ #include "buffer_head_io.h" -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno); +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); -static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - int wanted, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head *bhs[]); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; -static int ocfs2_add_branch(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head *eb_bh, - struct buffer_head *last_eb_bh, - struct ocfs2_alloc_context *meta_ac); +#define OCFS2_MAX_PATH_DEPTH 5 -static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head **ret_new_eb_bh); +struct ocfs2_path { + int p_tree_depth; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 blkno, - u32 new_clusters); +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) -static int ocfs2_find_branch_target(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head **target_bh); +/* + * Reset the actual path elements so that we can re-use the structure + * to build another path. Generally, this involves freeing the buffer + * heads. + */ +static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +{ + int i, start = 0, depth = 0; + struct ocfs2_path_item *node; -static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_dinode *fe, - unsigned int new_i_clusters, - struct buffer_head *old_last_eb, - struct buffer_head **new_last_eb); + if (keep_root) + start = 1; -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); + for(i = start; i < path_num_items(path); i++) { + node = &path->p_node[i]; -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno) + brelse(node->bh); + node->bh = NULL; + node->el = NULL; + } + + /* + * Tree depth may change during truncate, or insert. If we're + * keeping the root extent list, then make sure that our path + * structure reflects the proper depth. + */ + if (keep_root) + depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + + path->p_tree_depth = depth; +} + +static void ocfs2_free_path(struct ocfs2_path *path) +{ + if (path) { + ocfs2_reinit_path(path, 0); + kfree(path); + } +} + +/* + * Make the *dest path the same as src and re-initialize src path to + * have a root only. + */ +static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + brelse(dest->p_node[i].bh); + + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + src->p_node[i].bh = NULL; + src->p_node[i].el = NULL; + } +} + +/* + * Insert an extent block at given index. + * + * This will not take an additional reference on eb_bh. + */ +static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, + struct buffer_head *eb_bh) +{ + struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Right now, no root bh is an extent block, so this helps + * catch code errors with dinode trees. The assertion can be + * safely removed if we ever need to insert extent block + * structures at the root. + */ + BUG_ON(index == 0); + + path->p_node[index].bh = eb_bh; + path->p_node[index].el = &eb->h_list; +} + +static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, + struct ocfs2_extent_list *root_el) +{ + struct ocfs2_path *path; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); + + path = kzalloc(sizeof(*path), GFP_NOFS); + if (path) { + path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); + get_bh(root_bh); + path_root_bh(path) = root_bh; + path_root_el(path) = root_el; + } + + return path; +} + +/* + * Allocate and initialize a new path based on a disk inode tree. + */ +static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *el = &di->id2.i_list; + + return ocfs2_new_path(di_bh, el); +} + +/* + * Convenience function to journal all components in a path. + */ +static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, + struct ocfs2_path *path) +{ + int i, ret = 0; + + if (!path) + goto out; + + for(i = 0; i < path_num_items(path); i++) { + ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + +out: + return ret; +} + +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT +}; + +static int ocfs2_block_extent_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + u64 blkno) { return blkno == (le64_to_cpu(ext->e_blkno) + - ocfs2_clusters_to_blocks(inode->i_sb, + ocfs2_clusters_to_blocks(sb, le32_to_cpu(ext->e_clusters))); } +static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, + struct ocfs2_extent_rec *right) +{ + return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) == + le32_to_cpu(right->e_cpos)); +} + +static enum ocfs2_contig_type + ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + u64 blkno = le64_to_cpu(insert_rec->e_blkno); + + if (ocfs2_extents_adjacent(ext, insert_rec) && + ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) + return CONTIG_RIGHT; + + blkno = le64_to_cpu(ext->e_blkno); + if (ocfs2_extents_adjacent(insert_rec, ext) && + ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) + return CONTIG_LEFT; + + return CONTIG_NONE; +} + +/* + * NOTE: We can have pretty much any combination of contiguousness and + * appending. + * + * The usefulness of APPEND_TAIL is more in that it lets us know that + * we'll have to update the path to that leaf. + */ +enum ocfs2_append_type { + APPEND_NONE = 0, + APPEND_TAIL, +}; + +struct ocfs2_insert_type { + enum ocfs2_append_type ins_appending; + enum ocfs2_contig_type ins_contig; + int ins_contig_index; + int ins_free_records; + int ins_tree_depth; +}; + /* * How many free extents have we got before we need more meta data? */ @@ -242,6 +410,28 @@ bail: } /* + * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). + * + * Returns the sum of the rightmost extent rec logical offset and + * cluster count. + * + * ocfs2_add_branch() uses this to determine what logical cluster + * value should be populated into the leftmost new branch records. + * + * ocfs2_shift_tree_depth() uses this to determine the # clusters + * value for the new topmost tree record. + */ +static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) +{ + int i; + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + return le32_to_cpu(el->l_recs[i].e_cpos) + + le32_to_cpu(el->l_recs[i].e_clusters); +} + +/* * Add an entire tree branch to our inode. eb_bh is the extent block * to start at, if we don't want to start the branch at the dinode * structure. @@ -268,6 +458,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; + u32 new_cpos; mlog_entry_void(); @@ -302,6 +493,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } + eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. * conversly, new_eb_bhs[0] is the new bottommost leaf. @@ -330,7 +524,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, eb->h_next_leaf_blk = 0; eb_el->l_tree_depth = cpu_to_le16(i); eb_el->l_next_free_rec = cpu_to_le16(1); - eb_el->l_recs[0].e_cpos = fe->i_clusters; + /* + * This actually counts as an empty extent as + * c_clusters == 0 + */ + eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); eb_el->l_recs[0].e_clusters = cpu_to_le32(0); if (!eb_el->l_tree_depth) @@ -376,7 +574,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * either be on the fe, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); - el->l_recs[i].e_cpos = fe->i_clusters; + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); el->l_recs[i].e_clusters = 0; le16_add_cpu(&el->l_next_free_rec, 1); @@ -425,6 +623,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, struct buffer_head **ret_new_eb_bh) { int status, i; + u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; @@ -480,11 +679,13 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } + new_clusters = ocfs2_sum_rightmost_rec(eb_el); + /* update fe now */ le16_add_cpu(&fe_el->l_tree_depth, 1); fe_el->l_recs[0].e_cpos = 0; fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_clusters = fe->i_clusters; + fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters); for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { fe_el->l_recs[i].e_cpos = 0; fe_el->l_recs[i].e_clusters = 0; @@ -515,199 +716,6 @@ bail: } /* - * Expects the tree to already have room in the rightmost leaf for the - * extent. Updates all the extent blocks (and the dinode) on the way - * down. - */ -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters) -{ - int status, i, num_bhs = 0; - u64 next_blkno; - u16 next_free; - struct buffer_head **eb_bhs = NULL; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; - - mlog_entry_void(); - - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; - if (el->l_tree_depth) { - /* This is another operation where we want to be - * careful about our tree updates. An error here means - * none of the previous changes we made should roll - * forward. As a result, we have to record the buffers - * for this part of the tree in an array and reserve a - * journal write to them before making any changes. */ - num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); - eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), - GFP_KERNEL); - if (!eb_bhs) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - i = 0; - while(el->l_tree_depth) { - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); - - BUG_ON(i >= num_bhs); - status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], - OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, - eb); - status = -EIO; - goto bail; - } - - status = ocfs2_journal_access(handle, inode, eb_bhs[i], - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - el = &eb->h_list; - i++; - /* When we leave this loop, eb_bhs[num_bhs - 1] will - * hold the bottom-most leaf extent block. */ - } - BUG_ON(el->l_tree_depth); - - el = &fe->id2.i_list; - /* If we have tree depth, then the fe update is - * trivial, and we want to switch el out for the - * bottom-most leaf in order to update it with the - * actual extent data below. */ - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - /* (num_bhs - 1) to avoid the leaf */ - for(i = 0; i < (num_bhs - 1); i++) { - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - el = &eb->h_list; - - /* finally, make our actual change to the - * intermediate extent blocks. */ - next_free = le16_to_cpu(el->l_next_free_rec); - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - - status = ocfs2_journal_dirty(handle, eb_bhs[i]); - if (status < 0) - mlog_errno(status); - } - BUG_ON(i != (num_bhs - 1)); - /* note that the leaf block wasn't touched in - * the loop above */ - eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; - el = &eb->h_list; - BUG_ON(el->l_tree_depth); - } - - /* yay, we can finally add the actual extent now! */ - i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le16_to_cpu(el->l_next_free_rec) && - ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { - le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); - } else if (le16_to_cpu(el->l_next_free_rec) && - (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { - /* having an empty extent at eof is legal. */ - if (el->l_recs[i].e_cpos != fe->i_clusters) { - ocfs2_error(inode->i_sb, - "Dinode %llu trailing extent is bad: " - "cpos (%u) != number of clusters (%u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - le32_to_cpu(el->l_recs[i].e_cpos), - le32_to_cpu(fe->i_clusters)); - status = -EIO; - goto bail; - } - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - } else { - /* No contiguous record, or no empty record at eof, so - * we add a new one. */ - - BUG_ON(le16_to_cpu(el->l_next_free_rec) >= - le16_to_cpu(el->l_count)); - i = le16_to_cpu(el->l_next_free_rec); - - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - el->l_recs[i].e_cpos = fe->i_clusters; - le16_add_cpu(&el->l_next_free_rec, 1); - } - - /* - * extent_map errors are not fatal, so they are ignored outside - * of flushing the thing. - */ - status = ocfs2_extent_map_append(inode, &el->l_recs[i], - new_clusters); - if (status) { - mlog_errno(status); - ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) - mlog_errno(status); - if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); - if (status < 0) - mlog_errno(status); - } - - status = 0; -bail: - if (eb_bhs) { - for (i = 0; i < num_bhs; i++) - if (eb_bhs[i]) - brelse(eb_bhs[i]); - kfree(eb_bhs); - } - - mlog_exit(status); - return status; -} - -/* * Should only be called when there is no space left in any of the * leaf nodes. What we want to do is find the lowest tree depth * non-leaf extent block with room for new records. There are three @@ -807,53 +815,1523 @@ bail: return status; } -/* the caller needs to update fe->i_clusters */ -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters, - struct ocfs2_alloc_context *meta_ac) +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) { - int status, i, shift; - struct buffer_head *last_eb_bh = NULL; + return !rec->e_clusters; +} + +/* + * This function will discard the rightmost extent record. + */ +static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + int count = le16_to_cpu(el->l_count); + unsigned int num_bytes; + + BUG_ON(!next_free); + /* This will cause us to go off the end of our extent list. */ + BUG_ON(next_free >= count); + + num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; + + memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); +} + +static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i, insert_index, next_free, has_empty, num_bytes; + u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); + + BUG_ON(!next_free); + + /* The tree code before us didn't allow enough room in the leaf. */ + if (el->l_next_free_rec == el->l_count && !has_empty) + BUG(); + + /* + * The easiest way to approach this is to just remove the + * empty extent and temporarily decrement next_free. + */ + if (has_empty) { + /* + * If next_free was 1 (only an empty extent), this + * loop won't execute, which is fine. We still want + * the decrement above to happen. + */ + for(i = 0; i < (next_free - 1); i++) + el->l_recs[i] = el->l_recs[i+1]; + + next_free--; + } + + /* + * Figure out what the new record index should be. + */ + for(i = 0; i < next_free; i++) { + rec = &el->l_recs[i]; + + if (insert_cpos < le32_to_cpu(rec->e_cpos)) + break; + } + insert_index = i; + + mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", + insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); + + BUG_ON(insert_index < 0); + BUG_ON(insert_index >= le16_to_cpu(el->l_count)); + BUG_ON(insert_index > next_free); + + /* + * No need to memmove if we're just adding to the tail. + */ + if (insert_index != next_free) { + BUG_ON(next_free >= le16_to_cpu(el->l_count)); + + num_bytes = next_free - insert_index; + num_bytes *= sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[insert_index + 1], + &el->l_recs[insert_index], + num_bytes); + } + + /* + * Either we had an empty extent, and need to re-increment or + * there was no empty extent on a non full rightmost leaf node, + * in which case we still need to increment. + */ + next_free++; + el->l_next_free_rec = cpu_to_le16(next_free); + /* + * Make sure none of the math above just messed up our tree. + */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); + + el->l_recs[insert_index] = *insert_rec; + +} + +/* + * Create an empty extent record . + * + * l_next_free_rec may be updated. + * + * If an empty extent already exists do nothing. + */ +static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + + if (next_free == 0) + goto set_and_inc; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) + return; + + mlog_bug_on_msg(el->l_count == el->l_next_free_rec, + "Asked to create an empty extent in a full list:\n" + "count = %u, tree depth = %u", + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_tree_depth)); + + ocfs2_shift_records_right(el); + +set_and_inc: + le16_add_cpu(&el->l_next_free_rec, 1); + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); +} + +/* + * For a rotation which involves two leaf nodes, the "root node" is + * the lowest level tree node which contains a path to both leafs. This + * resulting set of information can be used to form a complete "subtree" + * + * This function is passed two full paths from the dinode down to a + * pair of adjacent leaves. It's task is to figure out which path + * index contains the subtree root - this can be the root index itself + * in a worst-case rotation. + * + * The array index of the subtree root is passed back. + */ +static int ocfs2_find_subtree_root(struct inode *inode, + struct ocfs2_path *left, + struct ocfs2_path *right) +{ + int i = 0; + + /* + * Check that the caller passed in two paths from the same tree. + */ + BUG_ON(path_root_bh(left) != path_root_bh(right)); + + do { + i++; + + /* + * The caller didn't pass two adjacent paths. + */ + mlog_bug_on_msg(i > left->p_tree_depth, + "Inode %lu, left depth %u, right depth %u\n" + "left leaf blk %llu, right leaf blk %llu\n", + inode->i_ino, left->p_tree_depth, + right->p_tree_depth, + (unsigned long long)path_leaf_bh(left)->b_blocknr, + (unsigned long long)path_leaf_bh(right)->b_blocknr); + } while (left->p_node[i].bh->b_blocknr == + right->p_node[i].bh->b_blocknr); + + return i - 1; +} + +typedef void (path_insert_t)(void *, struct buffer_head *); + +/* + * Traverse a btree path in search of cpos, starting at root_el. + * + * This code can be called with a cpos larger than the tree, in which + * case it will return the rightmost path. + */ +static int __ocfs2_find_path(struct inode *inode, + struct ocfs2_extent_list *root_el, u32 cpos, + path_insert_t *func, void *data) +{ + int i, ret = 0; + u32 range; + u64 blkno; struct buffer_head *bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct ocfs2_inode_info *oi = OCFS2_I(inode); - mlog_entry_void(); + el = root_el; + while (el->l_tree_depth) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent list at " + "depth %u\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; - mlog(0, "add %u clusters starting at block %llu to inode %llu\n", - new_clusters, (unsigned long long)start_blk, - (unsigned long long)OCFS2_I(inode)->ip_blkno); + } - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { + rec = &el->l_recs[i]; + + /* + * In the case that cpos is off the allocation + * tree, this should just wind up returning the + * rightmost record. + */ + range = le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + break; + } - if (el->l_tree_depth) { - /* jump to end of tree */ - status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_exit(status); - goto bail; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (blkno == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has bad blkno in extent list " + "at depth %u (index %d)\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(el->l_tree_depth), i); + ret = -EROFS; + goto out; } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + + brelse(bh); + bh = NULL; + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, + &bh, OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EIO; + goto out; + } + + if (le16_to_cpu(el->l_next_free_rec) > + le16_to_cpu(el->l_count)) { + ocfs2_error(inode->i_sb, + "Inode %llu has bad count in extent list " + "at block %llu (next free=%u, count=%u)\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + + if (func) + func(data, bh); + } + +out: + /* + * Catch any trailing bh that the loop didn't handle. + */ + brelse(bh); + + return ret; +} + +/* + * Given an initialized path (that is, it has a valid root extent + * list), this function will traverse the btree in search of the path + * which would contain cpos. + * + * The path traveled is recorded in the path structure. + * + * Note that this will not do any comparisons on leaf node extent + * records, so it will work fine in the case that we just added a tree + * branch. + */ +struct find_path_data { + int index; + struct ocfs2_path *path; +}; +static void find_path_ins(void *data, struct buffer_head *bh) +{ + struct find_path_data *fp = data; + + get_bh(bh); + ocfs2_path_insert_eb(fp->path, fp->index, bh); + fp->index++; +} +static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, + u32 cpos) +{ + struct find_path_data data; + + data.index = 1; + data.path = path; + return __ocfs2_find_path(inode, path_root_el(path), cpos, + find_path_ins, &data); +} + +static void find_leaf_ins(void *data, struct buffer_head *bh) +{ + struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; + struct ocfs2_extent_list *el = &eb->h_list; + struct buffer_head **ret = data; + + /* We want to retain only the leaf block. */ + if (le16_to_cpu(el->l_tree_depth) == 0) { + get_bh(bh); + *ret = bh; + } +} +/* + * Find the leaf block in the tree which would contain cpos. No + * checking of the actual leaf is done. + * + * Some paths want to call this instead of allocating a path structure + * and calling ocfs2_find_path(). + * + * This function doesn't handle non btree extent lists. + */ +static int ocfs2_find_leaf(struct inode *inode, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *leaf_bh = bh; +out: + return ret; +} + +/* + * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. + * + * Basically, we've moved stuff around at the bottom of the tree and + * we need to fix up the extent records above the changes to reflect + * the new changes. + * + * left_rec: the record on the left. + * left_child_el: is the child list pointed to by left_rec + * right_rec: the record to the right of left_rec + * right_child_el: is the child list pointed to by right_rec + * + * By definition, this only works on interior nodes. + */ +static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, + struct ocfs2_extent_list *left_child_el, + struct ocfs2_extent_rec *right_rec, + struct ocfs2_extent_list *right_child_el) +{ + u32 left_clusters, right_end; + + /* + * Interior nodes never have holes. Their cpos is the cpos of + * the leftmost record in their child list. Their cluster + * count covers the full theoretical range of their child list + * - the range between their cpos and the cpos of the record + * immediately to their right. + */ + left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); + left_clusters -= le32_to_cpu(left_rec->e_cpos); + left_rec->e_clusters = cpu_to_le32(left_clusters); + + /* + * Calculate the rightmost cluster count boundary before + * moving cpos - we will need to adjust e_clusters after + * updating e_cpos to keep the same highest cluster count. + */ + right_end = le32_to_cpu(right_rec->e_cpos); + right_end += le32_to_cpu(right_rec->e_clusters); + + right_rec->e_cpos = left_rec->e_cpos; + le32_add_cpu(&right_rec->e_cpos, left_clusters); + + right_end -= le32_to_cpu(right_rec->e_cpos); + right_rec->e_clusters = cpu_to_le32(right_end); +} + +/* + * Adjust the adjacent root node records involved in a + * rotation. left_el_blkno is passed in as a key so that we can easily + * find it's index in the root list. + */ +static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, + struct ocfs2_extent_list *left_el, + struct ocfs2_extent_list *right_el, + u64 left_el_blkno) +{ + int i; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= + le16_to_cpu(left_el->l_tree_depth)); + + for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { + if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) + break; + } + + /* + * The path walking code should have never returned a root and + * two paths which are not adjacent. + */ + BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); + + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + &root_el->l_recs[i + 1], right_el); +} + +/* + * We've changed a leaf block (in right_path) and need to reflect that + * change back up the subtree. + * + * This happens in multiple places: + * - When we've moved an extent record from the left path leaf to the right + * path leaf to make room for an empty extent in the left path leaf. + * - When our insert into the right path leaf is at the leftmost edge + * and requires an update of the path immediately to it's left. This + * can occur at the end of some types of rotation and appending inserts. + */ +static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i, idx; + struct ocfs2_extent_list *el, *left_el, *right_el; + struct ocfs2_extent_rec *left_rec, *right_rec; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + + /* + * Update the counts and position values within all the + * interior nodes to reflect the leaf rotation we just did. + * + * The root node is handled below the loop. + * + * We begin the loop with right_el and left_el pointing to the + * leaf lists and work our way up. + * + * NOTE: within this loop, left_el and right_el always refer + * to the *child* lists. + */ + left_el = path_leaf_el(left_path); + right_el = path_leaf_el(right_path); + for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { + mlog(0, "Adjust records at index %u\n", i); + + /* + * One nice property of knowing that all of these + * nodes are below the root is that we only deal with + * the leftmost right node record and the rightmost + * left node record. + */ + el = left_path-&g |