diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2007-06-18 10:48:04 -0700 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2007-07-10 17:32:00 -0700 |
commit | 328d5752e1259dfb29b7e65f6c2d145fddbaa750 (patch) | |
tree | 08198271a0382cafcc4c0de2fc1efcf35cb400af /fs/ocfs2 | |
parent | c3afcbb34426a9291e4c038540129053a72c3cd8 (diff) |
ocfs2: btree changes for unwritten extents
Writes to a region marked as unwritten might result in a record split or
merge. We can support splits by making minor changes to the existing insert
code. Merges require left rotations which mostly re-use right rotation
support functions.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/alloc.c | 1799 | ||||
-rw-r--r-- | fs/ocfs2/alloc.h | 6 | ||||
-rw-r--r-- | fs/ocfs2/endian.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/extent_map.c | 31 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 13 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_fs.h | 7 |
6 files changed, 1770 insertions, 91 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a02d026cb0e..0db6a1f724e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -119,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path) } /* + * All the elements of src into dest. After this call, src could be freed + * without affecting dest. + * + * Both paths should have the same root. Any non-root elements of dest + * will be freed. + */ +static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_el(dest) != path_root_el(src)); + + ocfs2_reinit_path(dest, 1); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + if (dest->p_node[i].bh) + get_bh(dest->p_node[i].bh); + } +} + +/* * Make the *dest path the same as src and re-initialize src path to * have a root only. */ @@ -214,10 +239,41 @@ out: return ret; } +/* + * Return the index of the extent record which contains cluster #v_cluster. + * -1 is returned if it was not found. + * + * Should work fine on interior and exterior nodes. + */ +int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) +{ + int ret = -1; + int i; + struct ocfs2_extent_rec *rec; + u32 rec_end, rec_start, clusters; + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + rec_start = le32_to_cpu(rec->e_cpos); + clusters = ocfs2_rec_clusters(el, rec); + + rec_end = rec_start + clusters; + + if (v_cluster >= rec_start && v_cluster < rec_end) { + ret = i; + break; + } + } + + return ret; +} + enum ocfs2_contig_type { CONTIG_NONE = 0, CONTIG_LEFT, - CONTIG_RIGHT + CONTIG_RIGHT, + CONTIG_LEFTRIGHT, }; @@ -255,6 +311,14 @@ static enum ocfs2_contig_type { u64 blkno = le64_to_cpu(insert_rec->e_blkno); + /* + * Refuse to coalesce extent records with different flag + * fields - we don't want to mix unwritten extents with user + * data. + */ + if (ext->e_flags != insert_rec->e_flags) + return CONTIG_NONE; + if (ocfs2_extents_adjacent(ext, insert_rec) && ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) return CONTIG_RIGHT; @@ -279,7 +343,14 @@ enum ocfs2_append_type { APPEND_TAIL, }; +enum ocfs2_split_type { + SPLIT_NONE = 0, + SPLIT_LEFT, + SPLIT_RIGHT, +}; + struct ocfs2_insert_type { + enum ocfs2_split_type ins_split; enum ocfs2_append_type ins_appending; enum ocfs2_contig_type ins_contig; int ins_contig_index; @@ -287,6 +358,13 @@ struct ocfs2_insert_type { int ins_tree_depth; }; +struct ocfs2_merge_ctxt { + enum ocfs2_contig_type c_contig_type; + int c_has_empty_extent; + int c_split_covers_rec; + int c_used_tail_recs; +}; + /* * How many free extents have we got before we need more meta data? */ @@ -457,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, struct buffer_head *eb_bh, - struct buffer_head *last_eb_bh, + struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int status, new_blocks, i; @@ -472,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, mlog_entry_void(); - BUG_ON(!last_eb_bh); + BUG_ON(!last_eb_bh || !*last_eb_bh); fe = (struct ocfs2_dinode *) fe_bh->b_data; @@ -503,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } - eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; + eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be @@ -564,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * journal_dirty erroring as it won't unless we've aborted the * handle (in which case we would never be here) so reserving * the write with journal_access is all we need to do. */ - status = ocfs2_journal_access(handle, inode, last_eb_bh, + status = ocfs2_journal_access(handle, inode, *last_eb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -597,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * next_leaf on the previously last-extent-block. */ fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); - status = ocfs2_journal_dirty(handle, last_eb_bh); + status = ocfs2_journal_dirty(handle, *last_eb_bh); if (status < 0) mlog_errno(status); status = ocfs2_journal_dirty(handle, fe_bh); @@ -612,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, mlog_errno(status); } + /* + * Some callers want to track the rightmost leaf so pass it + * back here. + */ + brelse(*last_eb_bh); + get_bh(new_eb_bhs[0]); + *last_eb_bh = new_eb_bhs[0]; + status = 0; bail: if (new_eb_bhs) { @@ -831,10 +917,12 @@ bail: * be considered invalid. * * Tree depth after the grow is returned via *final_depth. + * + * *last_eb_bh will be updated by ocfs2_add_branch(). */ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, int *final_depth, - struct buffer_head *last_eb_bh, + struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int ret, shift; @@ -869,10 +957,21 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, goto out; } depth++; - /* Special case: we have room now if we shifted from - * tree_depth 0 */ - if (depth == 1) + if (depth == 1) { + /* + * Special case: we have room now if we shifted from + * tree_depth 0, so no more work needs to be done. + * + * We won't be calling add_branch, so pass + * back *last_eb_bh as the new leaf. At depth + * zero, it should always be null so there's + * no reason to brelse. + */ + BUG_ON(*last_eb_bh); + get_bh(bh); + *last_eb_bh = bh; goto out; + } } /* call ocfs2_add_branch to add the final part of the tree with @@ -998,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, } +static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el) +{ + int size, num_recs = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(num_recs == 0); + + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + num_recs--; + size = num_recs * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[0], &el->l_recs[1], size); + memset(&el->l_recs[num_recs], 0, + sizeof(struct ocfs2_extent_rec)); + el->l_next_free_rec = cpu_to_le16(num_recs); + } +} + /* * Create an empty extent record . * @@ -1275,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, * immediately to their right. */ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); + if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { + BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); + left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); + } left_clusters -= le32_to_cpu(left_rec->e_cpos); left_rec->e_int_clusters = cpu_to_le32(left_clusters); @@ -1595,10 +1714,16 @@ out: return ret; } +/* + * Extend the transaction by enough credits to complete the rotation, + * and still leave at least the original number of credits allocated + * to this transaction. + */ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, + int op_credits, struct ocfs2_path *path) { - int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; + int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; if (handle->h_buffer_credits < credits) return ocfs2_extend_trans(handle, credits); @@ -1632,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, return 0; } +static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + unsigned int range; + struct ocfs2_extent_rec *rec; + + if (next_free == 0) + return 0; + + rec = &el->l_recs[0]; + if (ocfs2_is_empty_extent(rec)) { + /* Empty list. */ + if (next_free == 1) + return 0; + rec = &el->l_recs[1]; + } + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + return 1; + return 0; +} + /* * Rotate all the records in a btree right one record, starting at insert_cpos. * @@ -1650,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, */ static int ocfs2_rotate_tree_right(struct inode *inode, handle_t *handle, + enum ocfs2_split_type split, u32 insert_cpos, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) { - int ret, start; + int ret, start, orig_credits = handle->h_buffer_credits; u32 cpos; struct ocfs2_path *left_path = NULL; @@ -1721,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode, (unsigned long long) path_leaf_bh(left_path)->b_blocknr); - if (ocfs2_rotate_requires_path_adjustment(left_path, + if (split == SPLIT_NONE && + ocfs2_rotate_requires_path_adjustment(left_path, insert_cpos)) { - mlog(0, "Path adjustment required\n"); /* * We've rotated the tree as much as we @@ -1751,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, right_path->p_tree_depth); ret = ocfs2_extend_rotate_transaction(handle, start, - right_path); + orig_credits, right_path); if (ret) { mlog_errno(ret); goto out; @@ -1764,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out; } + if (split != SPLIT_NONE && + ocfs2_leftmost_rec_contains(path_leaf_el(right_path), + insert_cpos)) { + /* + * A rotate moves the rightmost left leaf + * record over to the leftmost right leaf + * slot. If we're doing an extent split + * instead of a real insert, then we have to + * check that the extent to be split wasn't + * just moved over. If it was, then we can + * exit here, passing left_path back - + * ocfs2_split_extent() is smart enough to + * search both leaves. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + /* * There is no need to re-read the next right path * as we know that it'll be our current left @@ -1786,6 +1953,1031 @@ out_ret_path: return ret; } +static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, + struct ocfs2_path *path) +{ + int i, idx; + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + u32 range; + + /* Path should always be rightmost. */ + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + BUG_ON(eb->h_next_leaf_blk != 0ULL); + + el = &eb->h_list; + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + for (i = 0; i < path->p_tree_depth; i++) { + el = path->p_node[i].el; + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; + + rec->e_int_clusters = cpu_to_le32(range); + le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); + + ocfs2_journal_dirty(handle, path->p_node[i].bh); + } +} + +static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path *path, int unlink_start) +{ + int ret, i; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh; + + for(i = unlink_start; i < path_num_items(path); i++) { + bh = path->p_node[i].bh; + + eb = (struct ocfs2_extent_block *)bh->b_data; + /* + * Not all nodes might have had their final count + * decremented by the caller - handle this here. + */ + el = &eb->h_list; + if (le16_to_cpu(el->l_next_free_rec) > 1) { + mlog(ML_ERROR, + "Inode %llu, attempted to remove extent block " + "%llu with %u records\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(el->l_next_free_rec)); + + ocfs2_journal_dirty(handle, bh); + ocfs2_remove_from_cache(inode, bh); + continue; + } + + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + ocfs2_journal_dirty(handle, bh); + + ret = ocfs2_cache_extent_block_free(dealloc, eb); + if (ret) + mlog_errno(ret); + + ocfs2_remove_from_cache(inode, bh); + } +} + +static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int i; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + + el = path_leaf_el(left_path); + + eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; + + for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + if (root_el->l_recs[i].e_blkno == eb->h_blkno) + break; + + BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); + + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&root_el->l_next_free_rec, -1); + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + eb->h_next_leaf_blk = 0; + + ocfs2_journal_dirty(handle, root_bh); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + + ocfs2_unlink_path(inode, handle, dealloc, right_path, + subtree_index + 1); +} + +static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int *deleted) +{ + int ret, i, del_right_subtree = 0, right_has_empty = 0; + struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; + struct ocfs2_extent_block *eb; + + *deleted = 0; + + right_leaf_el = path_leaf_el(right_path); + left_leaf_el = path_leaf_el(left_path); + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) + return 0; + + eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; + if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { + /* + * It's legal for us to proceed if the right leaf is + * the rightmost one and it has an empty extent. There + * are two cases to handle - whether the leaf will be + * empty after removal or not. If the leaf isn't empty + * then just remove the empty extent up front. The + * next block will handle empty leaves by flagging + * them for unlink. + * + * Non rightmost leaves will throw -EAGAIN and the + * caller can manually move the subtree and retry. + */ + + if (eb->h_next_leaf_blk != 0ULL) + return -EAGAIN; + + if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { + ret = ocfs2_journal_access(handle, inode, + path_leaf_bh(right_path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_empty_extent(right_leaf_el); + } else + right_has_empty = 1; + } + + if (eb->h_next_leaf_blk == 0ULL && + le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { + /* + * We have to update i_last_eb_blk during the meta + * data delete. + */ + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + del_right_subtree = 1; + } + + /* + * Getting here with an empty extent in the right path implies + * that it's the rightmost path and will be deleted. + */ + BUG_ON(right_has_empty && !del_right_subtree); + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_journal_access(handle, inode, + right_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, + left_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (!right_has_empty) { + /* + * Only do this if we're moving a real + * record. Otherwise, the action is delayed until + * after removal of the right path in which case we + * can do a simple shift to remove the empty extent. + */ + ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); + memset(&right_leaf_el->l_recs[0], 0, + sizeof(struct ocfs2_extent_rec)); + } + if (eb->h_next_leaf_blk == 0ULL) { + /* + * Move recs over to get rid of empty extent, decrease + * next_free. This is allowed to remove the last + * extent in our leaf (setting l_next_free_rec to + * zero) - the delete code below won't care. + */ + ocfs2_remove_empty_extent(right_leaf_el); + } + + ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + if (ret) + mlog_errno(ret); + ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + if (ret) + mlog_errno(ret); + + if (del_right_subtree) { + ocfs2_unlink_subtree(inode, handle, left_path, right_path, + subtree_index, dealloc); + ocfs2_update_edge_lengths(inode, handle, left_path); + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + di->i_last_eb_blk = eb->h_blkno; + + /* + * Removal of the extent in the left leaf was skipped + * above so we could delete the right path + * 1st. + */ + if (right_has_empty) + ocfs2_remove_empty_extent(left_leaf_el); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) + mlog_errno(ret); + + *deleted = 1; + } else + ocfs2_complete_edge_insert(inode, handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the right of the current one. + * + * Will return zero if the path passed in is already the rightmost path. + * + * This looks similar, but is subtly different to + * ocfs2_find_cpos_for_left_leaf(). + */ +static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + *cpos = 0; + + if (path->p_tree_depth == 0) + return 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. */ + i = path->p_tree_depth - 1; + while (i >= 0) { + int next_free; + + el = path->p_node[i].el; + + /* + * Find the extent record just after the one in our + * path. + */ + next_free = le16_to_cpu(el->l_next_free_rec); + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == (next_free - 1)) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the rightmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The rightmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. + */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, + handle_t *handle, + struct buffer_head *bh, + struct ocfs2_extent_list *el) +{ + int ret; + + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_empty_extent(el); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int __ocfs2_rotate_tree_left(struct inode *inode, + handle_t *handle, int orig_credits, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path **empty_extent_path) +{ + int ret, subtree_root, deleted; + u32 right_cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_path *right_path = NULL; + + BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + + *empty_extent_path = NULL; + + ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, + &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + left_path = ocfs2_new_path(path_root_bh(path), + path_root_el(path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_cp_path(left_path, path); + + right_path = ocfs2_new_path(path_root_bh(path), + path_root_el(path)); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (right_cpos) { + ret = ocfs2_find_path(inode, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(inode, left_path, + right_path); + + mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", + subtree_root, + (unsigned long long) + right_path->p_node[subtree_root].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + orig_credits, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_left(inode, handle, left_path, + right_path, subtree_root, + dealloc, &deleted); + if (ret == -EAGAIN) { + /* + * The rotation has to temporarily stop due to + * the right subtree having an empty + * extent. Pass it back to the caller for a + * fixup. + */ + *empty_extent_path = right_path; + right_path = NULL; + goto out; + } + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The subtree rotate might have removed records on + * the rightmost edge. If so, then rotation is + * complete. + */ + if (deleted) + break; + + ocfs2_mv_path(left_path, right_path); + + ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, + &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(right_path); + ocfs2_free_path(left_path); + + return ret; +} + +static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, subtree_index; + u32 cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_dinode *di; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + /* + * XXX: This code assumes that the root is an inode, which is + * true for now but may change as tree code gets generic. + */ + di = (struct ocfs2_dinode *)path_root_bh(path)->b_data; + if (!OCFS2_IS_VALID_DINODE(di)) { + ret = -EIO; + ocfs2_error(inode->i_sb, + "Inode %llu has invalid path root", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto out; + } + + /* + * There's two ways we handle this depending on + * whether path is the only existing one. + */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos) { + /* + * We have a path to the left of this one - it needs + * an update too. + */ + left_path = ocfs2_new_path(path_root_bh(path), + path_root_el(path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_index = ocfs2_find_subtree_root(inode, left_path, path); + + ocfs2_unlink_subtree(inode, handle, left_path, path, + subtree_index, dealloc); + ocfs2_update_edge_lengths(inode, handle, left_path); + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + di->i_last_eb_blk = eb->h_blkno; + } else { + /* + * 'path' is also the leftmost path which + * means it must be the only one. This gets + * handled differently because we want to + * revert the inode back to having extents + * in-line. + */ + ocfs2_unlink_path(inode, handle, dealloc, path, 1); + + el = &di->id2.i_list; + el->l_tree_depth = 0; + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + di->i_last_eb_blk = 0; + } + + ocfs2_journal_dirty(handle, path_root_bh(path)); + +out: + ocfs2_free_path(left_path); + return ret; +} + +/* + * Left rotation of btree records. + * + * In many ways, this is (unsurprisingly) the opposite of right + * rotation. We start at some non-rightmost path containing an empty + * extent in the leaf block. The code works its way to the rightmost + * path by rotating records to the left in every subtree. + * + * This is used by any code which reduces the number of extent records + * in a leaf. After removal, an empty record should be placed in the + * leftmost list position. + * + * This won't handle a length update of the rightmost path records if + * the rightmost tree leaf record is removed so the caller is + * responsible for detecting and correcting that. + */ +static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, orig_credits = handle->h_buffer_credits; + struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + el = path_leaf_el(path); + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + if (path->p_tree_depth == 0) { +rightmost_no_delete: + /* + * In-inode extents. This is trivially handled, so do + * it up front. + */ + ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, + path_leaf_bh(path), + path_leaf_el(path)); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Handle rightmost branch now. There's several cases: + * 1) simple rotation leaving records in there. That's trivial. + * 2) rotation requiring a branch delete - there's no more + * records left. Two cases of this: + * a) There are branches to the left. + * b) This is also the leftmost (the only) branch. + * + * 1) is handled via ocfs2_rotate_rightmost_leaf_left() + * 2a) we need the left branch so that we can update it with the unlink + * 2b) we need to bring the inode back to inline extents. + */ + + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + el = &eb->h_list; + if (eb->h_next_leaf_blk == 0) { + /* + * This gets a bit tricky if we're going to delete the + * rightmost path. Get the other cases out of the way + * 1st. + */ + if (le16_to_cpu(el->l_next_free_rec) > 1) + goto rightmost_no_delete; + + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ret = -EIO; + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent block at %llu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto out; + } + + /* + * XXX: The caller can not trust "path" any more after + * this as it will have been deleted. What do we do? + * + * In theory the rotate-for-merge code will never get + * here because it'll always ask for a rotate in a + * nonempty list. + */ + + ret = ocfs2_remove_rightmost_path(inode, handle, path, + dealloc); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Now we can loop, remembering the path we get from -EAGAIN + * and restarting from there. + */ +try_rotate: + ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, + dealloc, &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + while (ret == -EAGAIN) { + tmp_path = restart_path; + restart_path = NULL; + + ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, + tmp_path, dealloc, + &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + ocfs2_free_path(tmp_path); + tmp_path = NULL; + + if (ret == 0) + goto try_rotate; + } + +out: + ocfs2_free_path(tmp_path); + ocfs2_free_path(restart_path); + return ret; +} + +static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, + int index) +{ + struct ocfs2_extent_rec *rec = &el->l_recs[index]; + unsigned int size; + + if (rec->e_leaf_clusters == 0) { + /* + * We consumed all of the merged-from record. An empty + * extent cannot exist anywhere but the 1st array + * position, so move things over if the merged-from + * record doesn't occupy that position. + * + * This creates a new empty extent so the caller + * should be smart enough to have removed any existing + * ones. + */ + if (index > 0) { + BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); + size = index * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[1], &el->l_recs[0], size); + } + + /* + * Always memset - the caller doesn't check whether it + * created an empty extent, so there could be junk in + * the other fields. + */ + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + } +} + +/* + * Remove split_rec clusters from the record at index and merge them + * onto the beginning of the record at index + 1. + */ +static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, + handle_t *handle, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_extent_list *el, int index) +{ + int ret; + unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); + struct ocfs2_extent_rec *left_rec; + struct ocfs2_extent_rec *right_rec; + + BUG_ON(ind |