aboutsummaryrefslogtreecommitdiff
path: root/fs/ocfs2
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-16 10:52:55 -0700
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-16 10:52:55 -0700
commitadd096909da63ef32d6766f6771c07c9f16c6ee5 (patch)
tree58594bcf68cbb6f777d5270d098ab8ca69cbaee3 /fs/ocfs2
parente245befce7af0a1e1347079ed62695b059594bd4 (diff)
parent54c57dc3b6578356c0a428c767d4bf080254a2ee (diff)
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (32 commits) [PATCH] ocfs2: zero_user_page conversion ocfs2: Support xfs style space reservation ioctls ocfs2: support for removing file regions ocfs2: update truncate handling of partial clusters ocfs2: btree support for removal of arbirtrary extents ocfs2: Support creation of unwritten extents ocfs2: support writing of unwritten extents ocfs2: small cleanup of ocfs2_write_begin_nolock() ocfs2: btree changes for unwritten extents ocfs2: abstract btree growing calls ocfs2: use all extent block suballocators ocfs2: plug truncate into cached dealloc routines ocfs2: simplify deallocation locking ocfs2: harden buffer check during mapping of page blocks ocfs2: shared writeable mmap ocfs2: factor out write aops into nolock variants ocfs2: rework ocfs2_buffered_write_cluster() ocfs2: take ip_alloc_sem during entire truncate ocfs2: Add "preferred slot" mount option [KJ PATCH] Replacing memset(<addr>,0,PAGE_SIZE) with clear_page() in fs/ocfs2/dlm/dlmrecovery.c ...
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/alloc.c2676
-rw-r--r--fs/ocfs2/alloc.h43
-rw-r--r--fs/ocfs2/aops.c1015
-rw-r--r--fs/ocfs2/aops.h61
-rw-r--r--fs/ocfs2/cluster/heartbeat.c96
-rw-r--r--fs/ocfs2/cluster/heartbeat.h6
-rw-r--r--fs/ocfs2/cluster/nodemanager.c42
-rw-r--r--fs/ocfs2/cluster/nodemanager.h5
-rw-r--r--fs/ocfs2/cluster/tcp.c21
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c8
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c40
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c79
-rw-r--r--fs/ocfs2/dlmglue.c6
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/extent_map.c41
-rw-r--r--fs/ocfs2/file.c702
-rw-r--r--fs/ocfs2/file.h10
-rw-r--r--fs/ocfs2/heartbeat.c10
-rw-r--r--fs/ocfs2/ioctl.c15
-rw-r--r--fs/ocfs2/journal.c6
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/mmap.c167
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h14
-rw-r--r--fs/ocfs2/ocfs2_fs.h33
-rw-r--r--fs/ocfs2/slot_map.c12
-rw-r--r--fs/ocfs2/suballoc.c46
-rw-r--r--fs/ocfs2/suballoc.h17
-rw-r--r--fs/ocfs2/super.c27
-rw-r--r--fs/ocfs2/super.h2
31 files changed, 4231 insertions, 980 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7d145..f5e11f4fa95 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
#include "buffer_head_io.h"
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ struct ocfs2_extent_block *eb);
/*
* Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
}
/*
+ * All the elements of src into dest. After this call, src could be freed
+ * without affecting dest.
+ *
+ * Both paths should have the same root. Any non-root elements of dest
+ * will be freed.
+ */
+static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+ int i;
+
+ BUG_ON(path_root_bh(dest) != path_root_bh(src));
+ BUG_ON(path_root_el(dest) != path_root_el(src));
+
+ ocfs2_reinit_path(dest, 1);
+
+ for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+ dest->p_node[i].bh = src->p_node[i].bh;
+ dest->p_node[i].el = src->p_node[i].el;
+
+ if (dest->p_node[i].bh)
+ get_bh(dest->p_node[i].bh);
+ }
+}
+
+/*
* Make the *dest path the same as src and re-initialize src path to
* have a root only.
*/
@@ -212,10 +239,41 @@ out:
return ret;
}
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
+{
+ int ret = -1;
+ int i;
+ struct ocfs2_extent_rec *rec;
+ u32 rec_end, rec_start, clusters;
+
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+
+ rec_start = le32_to_cpu(rec->e_cpos);
+ clusters = ocfs2_rec_clusters(el, rec);
+
+ rec_end = rec_start + clusters;
+
+ if (v_cluster >= rec_start && v_cluster < rec_end) {
+ ret = i;
+ break;
+ }
+ }
+
+ return ret;
+}
+
enum ocfs2_contig_type {
CONTIG_NONE = 0,
CONTIG_LEFT,
- CONTIG_RIGHT
+ CONTIG_RIGHT,
+ CONTIG_LEFTRIGHT,
};
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
{
u64 blkno = le64_to_cpu(insert_rec->e_blkno);
+ /*
+ * Refuse to coalesce extent records with different flag
+ * fields - we don't want to mix unwritten extents with user
+ * data.
+ */
+ if (ext->e_flags != insert_rec->e_flags)
+ return CONTIG_NONE;
+
if (ocfs2_extents_adjacent(ext, insert_rec) &&
ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
APPEND_TAIL,
};
+enum ocfs2_split_type {
+ SPLIT_NONE = 0,
+ SPLIT_LEFT,
+ SPLIT_RIGHT,
+};
+
struct ocfs2_insert_type {
+ enum ocfs2_split_type ins_split;
enum ocfs2_append_type ins_appending;
enum ocfs2_contig_type ins_contig;
int ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
int ins_tree_depth;
};
+struct ocfs2_merge_ctxt {
+ enum ocfs2_contig_type c_contig_type;
+ int c_has_empty_extent;
+ int c_split_covers_rec;
+ int c_used_tail_recs;
+};
+
/*
* How many free extents have we got before we need more meta data?
*/
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
eb->h_blkno = cpu_to_le64(first_blkno);
eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
-
-#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
- /* we always use slot zero's suballocator */
- eb->h_suballoc_slot = 0;
-#else
eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
-#endif
eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
eb->h_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct buffer_head *eb_bh,
- struct buffer_head *last_eb_bh,
+ struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
mlog_entry_void();
- BUG_ON(!last_eb_bh);
+ BUG_ON(!last_eb_bh || !*last_eb_bh);
fe = (struct ocfs2_dinode *) fe_bh->b_data;
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
goto bail;
}
- eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+ eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* journal_dirty erroring as it won't unless we've aborted the
* handle (in which case we would never be here) so reserving
* the write with journal_access is all we need to do. */
- status = ocfs2_journal_access(handle, inode, last_eb_bh,
+ status = ocfs2_journal_access(handle, inode, *last_eb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* next_leaf on the previously last-extent-block. */
fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
- status = ocfs2_journal_dirty(handle, last_eb_bh);
+ status = ocfs2_journal_dirty(handle, *last_eb_bh);
if (status < 0)
mlog_errno(status);
status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
mlog_errno(status);
}
+ /*
+ * Some callers want to track the rightmost leaf so pass it
+ * back here.
+ */
+ brelse(*last_eb_bh);
+ get_bh(new_eb_bhs[0]);
+ *last_eb_bh = new_eb_bhs[0];
+
status = 0;
bail:
if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
}
/*
+ * Grow a b-tree so that it has more records.
+ *
+ * We might shift the tree depth in which case existing paths should
+ * be considered invalid.
+ *
+ * Tree depth after the grow is returned via *final_depth.
+ *
+ * *last_eb_bh will be updated by ocfs2_add_branch().
+ */
+static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
+ struct buffer_head *di_bh, int *final_depth,
+ struct buffer_head **last_eb_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret, shift;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *bh = NULL;
+
+ BUG_ON(meta_ac == NULL);
+
+ shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+ if (shift < 0) {
+ ret = shift;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* We traveled all the way to the bottom of the allocation tree
+ * and didn't find room for any more extents - we need to add
+ * another tree level */
+ if (shift) {
+ BUG_ON(bh);
+ mlog(0, "need to shift tree depth (current = %d)\n", depth);
+
+ /* ocfs2_shift_tree_depth will return us a buffer with
+ * the new extent block (so we can pass that to
+ * ocfs2_add_branch). */
+ ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+ meta_ac, &bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ depth++;
+ if (depth == 1) {
+ /*
+ * Special case: we have room now if we shifted from
+ * tree_depth 0, so no more work needs to be done.
+ *
+ * We won't be calling add_branch, so pass
+ * back *last_eb_bh as the new leaf. At depth
+ * zero, it should always be null so there's
+ * no reason to brelse.
+ */
+ BUG_ON(*last_eb_bh);
+ get_bh(bh);
+ *last_eb_bh = bh;
+ goto out;
+ }
+ }
+
+ /* call ocfs2_add_branch to add the final part of the tree with
+ * the new data. */
+ mlog(0, "add branch. bh = %p\n", bh);
+ ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+ meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (final_depth)
+ *final_depth = depth;
+ brelse(bh);
+ return ret;
+}
+
+/*
* This is only valid for leaf nodes, which are the only ones that can
* have empty extents anyway.
*/
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
}
+static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
+{
+ int size, num_recs = le16_to_cpu(el->l_next_free_rec);
+
+ BUG_ON(num_recs == 0);
+
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ num_recs--;
+ size = num_recs * sizeof(struct ocfs2_extent_rec);
+ memmove(&el->l_recs[0], &el->l_recs[1], size);
+ memset(&el->l_recs[num_recs], 0,
+ sizeof(struct ocfs2_extent_rec));
+ el->l_next_free_rec = cpu_to_le16(num_recs);
+ }
+}
+
/*
* Create an empty extent record .
*
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
* immediately to their right.
*/
left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+ if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+ BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
+ left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
+ }
left_clusters -= le32_to_cpu(left_rec->e_cpos);
left_rec->e_int_clusters = cpu_to_le32(left_clusters);
@@ -1531,10 +1714,16 @@ out:
return ret;
}
+/*
+ * Extend the transaction by enough credits to complete the rotation,
+ * and still leave at least the original number of credits allocated
+ * to this transaction.
+ */
static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
+ int op_credits,
struct ocfs2_path *path)
{
- int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
+ int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
if (handle->h_buffer_credits < credits)
return ocfs2_extend_trans(handle, credits);
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
return 0;
}
+static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
+{
+ int next_free = le16_to_cpu(el->l_next_free_rec);
+ unsigned int range;
+ struct ocfs2_extent_rec *rec;
+
+ if (next_free == 0)
+ return 0;
+
+ rec = &el->l_recs[0];
+ if (ocfs2_is_empty_extent(rec)) {
+ /* Empty list. */
+ if (next_free == 1)
+ return 0;
+ rec = &el->l_recs[1];
+ }
+
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+ if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+ return 1;
+ return 0;
+}
+
/*
* Rotate all the records in a btree right one record, starting at insert_cpos.
*
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
*/
static int ocfs2_rotate_tree_right(struct inode *inode,
handle_t *handle,
+ enum ocfs2_split_type split,
u32 insert_cpos,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
- int ret, start;
+ int ret, start, orig_credits = handle->h_buffer_credits;
u32 cpos;
struct ocfs2_path *left_path = NULL;
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
(unsigned long long)
path_leaf_bh(left_path)->b_blocknr);
- if (ocfs2_rotate_requires_path_adjustment(left_path,
+ if (split == SPLIT_NONE &&
+ ocfs2_rotate_requires_path_adjustment(left_path,
insert_cpos)) {
- mlog(0, "Path adjustment required\n");
/*
* We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
right_path->p_tree_depth);
ret = ocfs2_extend_rotate_transaction(handle, start,
- right_path);
+ orig_credits, right_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
goto out;
}
+ if (split != SPLIT_NONE &&
+ ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
+ insert_cpos)) {
+ /*
+ * A rotate moves the rightmost left leaf
+ * record over to the leftmost right leaf
+ * slot. If we're doing an extent split
+ * instead of a real insert, then we have to
+ * check that the extent to be split wasn't
+ * just moved over. If it was, then we can
+ * exit here, passing left_path back -
+ * ocfs2_split_extent() is smart enough to
+ * search both leaves.
+ */
+ *ret_left_path = left_path;
+ goto out_ret_path;
+ }
+
/*
* There is no need to re-read the next right path
* as we know that it'll be our current left
@@ -1722,6 +1953,1031 @@ out_ret_path:
return ret;
}
+static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *path)
+{
+ int i, idx;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_block *eb;
+ u32 range;
+
+ /* Path should always be rightmost. */
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+ BUG_ON(eb->h_next_leaf_blk != 0ULL);
+
+ el = &eb->h_list;
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+ idx = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[idx];
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ for (i = 0; i < path->p_tree_depth; i++) {
+ el = path->p_node[i].el;
+ idx = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[idx];
+
+ rec->e_int_clusters = cpu_to_le32(range);
+ le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
+
+ ocfs2_journal_dirty(handle, path->p_node[i].bh);
+ }
+}
+
+static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_path *path, int unlink_start)
+{
+ int ret, i;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+ struct buffer_head *bh;
+
+ for(i = unlink_start; i < path_num_items(path); i++) {
+ bh = path->p_node[i].bh;
+
+ eb = (struct ocfs2_extent_block *)bh->b_data;
+ /*
+ * Not all nodes might have had their final count
+ * decremented by the caller - handle this here.
+ */
+ el = &eb->h_list;
+ if (le16_to_cpu(el->l_next_free_rec) > 1) {
+ mlog(ML_ERROR,
+ "Inode %llu, attempted to remove extent block "
+ "%llu with %u records\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ ocfs2_journal_dirty(handle, bh);
+ ocfs2_remove_from_cache(inode, bh);
+ continue;
+ }
+
+ el->l_next_free_rec = 0;
+ memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+ ocfs2_journal_dirty(handle, bh);
+
+ ret = ocfs2_cache_extent_block_free(dealloc, eb);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_remove_from_cache(inode, bh);
+ }
+}
+
+static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int i;
+ struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+ struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_block *eb;
+
+ el = path_leaf_el(left_path);
+
+ eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
+
+ for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+ if (root_el->l_recs[i].e_blkno == eb->h_blkno)
+ break;
+
+ BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
+
+ memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&root_el->l_next_free_rec, -1);
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+ eb->h_next_leaf_blk = 0;
+
+ ocfs2_journal_dirty(handle, root_bh);
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+
+ ocfs2_unlink_path(inode, handle, dealloc, right_path,
+ subtree_index + 1);
+}
+
+static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int *deleted)
+{
+ int ret, i, del_right_subtree = 0, right_has_empty = 0;
+ struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
+ struct ocfs2_extent_block *eb;
+
+ *deleted = 0;
+
+ right_leaf_el = path_leaf_el(right_path);
+ left_leaf_el = path_leaf_el(left_path);
+ root_bh = left_path->p_node[subtree_index].bh;
+ BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+ if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
+ return 0;
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
+ if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
+ /*
+ * It's legal for us to proceed if the right leaf is
+ * the rightmost one and it has an empty extent. There
+ * are two cases to handle - whether the leaf will be
+ * empty after removal or not. If the leaf isn't empty
+ * then just remove the empty extent up front. The
+ * next block will handle empty leaves by flagging
+ * them for unlink.
+ *
+ * Non rightmost leaves will throw -EAGAIN and the
+ * caller can manually move the subtree and retry.
+ */
+
+ if (eb->h_next_leaf_blk != 0ULL)
+ return -EAGAIN;
+
+ if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
+ ret = ocfs2_journal_access(handle, inode,
+ path_leaf_bh(right_path),
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_remove_empty_extent(right_leaf_el);
+ } else
+ right_has_empty = 1;
+ }
+
+ if (eb->h_next_leaf_blk == 0ULL &&
+ le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
+ /*
+ * We have to update i_last_eb_blk during the meta
+ * data delete.
+ */
+ ret = ocfs2_journal_access(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ del_right_subtree = 1;
+ }
+
+ /*
+ * Getting here with an empty extent in the right path implies
+ * that it's the rightmost path and will be deleted.
+ */
+ BUG_ON(right_has_empty && !del_right_subtree);
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+ ret = ocfs2_journal_access(handle, inode,
+ right_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode,
+ left_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (!right_has_empty) {
+ /*
+ * Only do this if we're moving a real
+ * record. Otherwise, the action is delayed until
+ * after removal of the right path in which case we
+ * can do a simple shift to remove the empty extent.
+ */
+ ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
+ memset(&right_leaf_el->l_recs[0], 0,
+ sizeof(struct ocfs2_extent_rec));
+ }
+ if (eb->h_next_leaf_blk == 0ULL) {
+ /*
+ * Move recs over to get rid of empty extent, decrease
+ * next_free. This is allowed to remove the last
+ * extent in our leaf (setting l_next_free_rec to
+ * zero) - the delete code below won't care.
+ */
+ ocfs2_remove_empty_extent(right_leaf_el);
+ }
+
+ ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+ if (ret)
+ mlog_errno(ret);
+ ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+ if (ret)
+ mlog_errno(ret);
+
+ if (del_right_subtree) {
+ ocfs2_unlink_subtree(inode, handle, left_path, right_path,
+ subtree_index, dealloc);
+ ocfs2_update_edge_lengths(inode, handle, left_path);
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+ di->i_last_eb_blk = eb->h_blkno;
+
+ /*
+ * Removal of the extent in the left leaf was skipped
+ * above so we could delete the right path
+ * 1st.
+ */
+ if (right_has_empty)
+ ocfs2_remove_empty_extent(left_leaf_el);
+
+ ret = ocfs2_journal_dirty(handle, di_bh);
+ if (ret)
+ mlog_errno(ret);
+
+ *deleted = 1;
+ } else
+ ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+ subtree_index);
+
+out:
+ return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the right of the current one.
+ *
+ * Will return zero if the path passed in is already the rightmost path.
+ *
+ * This looks similar, but is subtly different to
+ * ocfs2_find_cpos_for_left_leaf().
+ */
+static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
+{
+ int i, j, ret = 0;
+ u64 blkno;
+ struct ocfs2_extent_list *el;
+
+ *cpos = 0;
+
+ if (path->p_tree_depth == 0)
+ return 0;
+
+ blkno = path_leaf_bh(path)->b_blocknr;
+
+ /* Start at the tree node just above the leaf and work our way up. */
+ i = path->p_tree_depth - 1;
+ while (i >= 0) {
+ int next_free;
+
+ el = path->p_node[i].el;
+
+ /*
+ * Find the extent record just after the one in our
+ * path.
+ */
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+ if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+ if (j == (next_free - 1)) {
+ if (i == 0) {
+ /*
+ * We've determined that the
+ * path specified is already
+ * the rightmost one - return a
+ * cpos of zero.
+ */
+ goto out;
+ }
+ /*
+ * The rightmost record points to our
+ * leaf - we need to travel up the
+ * tree one level.
+ */
+ goto next_node;
+ }
+
+ *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
+ goto out;
+ }
+ }
+
+ /*
+ * If we got here, we never found a valid node where
+ * the tree indicated one should be.
+ */
+ ocfs2_error(sb,
+ "Invalid extent tree at extent block %llu\n",
+ (unsigned long long)blkno);
+ ret = -EROFS;
+ goto out;
+
+next_node:
+ blkno = path->p_node[i].bh->b_blocknr;
+ i--;
+ }
+
+out:
+ return ret;
+}
+
+static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *bh,
+ struct ocfs2_extent_list *el)
+{
+ int ret;
+
+ if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+ return 0;
+
+ ret = ocfs2_journal_access(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_remove_empty_extent(el);
+
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int __ocfs2_rotate_tree_left(struct inode *inode,
+ handle_t *handle, int orig_credits,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_path **empty_extent_path)
+{
+ int ret, subtree_root, deleted;
+ u32 right_cpos;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_path *right_path = NULL;
+
+ BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+
+ *empty_extent_path = NULL;
+
+ ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
+ &right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ left_path = ocfs2_new_path(path_root_bh(path),
+ path_root_el(path));
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_cp_path(left_path, path);
+
+ right_path = ocfs2_new_path(path_root_bh(path),
+ path_root_el(path));
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (right_cpos) {
+ ret = ocfs2_find_path(inode, right_path, right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ subtree_root = ocfs2_find_subtree_root(inode, left_path,
+ right_path);
+
+ mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
+ subtree_root,
+ (unsigned long long)
+ right_path->p_node[subtree_root].bh->b_blocknr,
+ right_path->p_tree_depth);
+
+ ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ orig_credits, left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
+ right_path, subtree_root,
+ dealloc, &deleted);
+ if (ret == -EAGAIN) {
+ /*
+ * The rotation has to temporarily stop due to
+ * the right subtree having an empty
+ * extent. Pass it back to the caller for a
+ * fixup.
+ */
+ *empty_extent_path = right_path;
+ right_path = NULL;
+ goto out;
+ }
+ if (ret