Diffstat (limited to 'fs/btrfs/ctree.c')
-rw-r--r--   fs/btrfs/ctree.c   834
1 file changed, 542 insertions, 292 deletions
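Most of the diff below reworks the tree modification log: the split major/minor sequence counter is collapsed into a single 64-bit counter, log elements are allocated before the write lock is taken instead of under it, and the BUG_ON() calls on insertion failure give way to error returns that callers propagate (aborting the transaction where recovery is impossible). The counter change in isolation, as a user-space sketch with C11 atomics standing in for the kernel's atomic64_t API:

	#include <stdatomic.h>
	#include <stdint.h>

	static _Atomic uint64_t tree_mod_seq;

	/* Removed scheme: the upper 32 bits were a "major" seq bumped when a
	 * blocker registered, the lower 32 bits a "minor" seq bumped once per
	 * logged modification; the read-modify-write below is why it needed
	 * tree_mod_seq_lock held around it. */
	static uint64_t inc_tree_mod_seq_major(void)
	{
		uint64_t seq = atomic_load(&tree_mod_seq);
		seq &= 0xffffffff00000000ull;	/* zero the minor half */
		seq += 1ull << 32;		/* bump the major half */
		atomic_store(&tree_mod_seq, seq);
		return seq;
	}

	/* New scheme: one flat counter, bumped with a single atomic
	 * increment, so no spinlock is needed around the update itself. */
	static uint64_t inc_tree_mod_seq(void)
	{
		return atomic_fetch_add(&tree_mod_seq, 1) + 1;
	}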
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 61b5bcd57b7..aeab453b8e2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -39,9 +39,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,  			      struct extent_buffer *src_buf);  static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,  		    int level, int slot); -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,  				 struct extent_buffer *eb); -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);  struct btrfs_path *btrfs_alloc_path(void)  { @@ -225,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)  static void add_root_to_dirty_list(struct btrfs_root *root)  {  	spin_lock(&root->fs_info->trans_lock); -	if (root->track_dirty && list_empty(&root->dirty_list)) { +	if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && +	    list_empty(&root->dirty_list)) {  		list_add(&root->dirty_list,  			 &root->fs_info->dirty_cowonly_roots);  	} @@ -247,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	int level;  	struct btrfs_disk_key disk_key; -	WARN_ON(root->ref_cows && trans->transid != -		root->fs_info->running_transaction->transid); -	WARN_ON(root->ref_cows && trans->transid != root->last_trans); +	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && +		trans->transid != root->fs_info->running_transaction->transid); +	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && +		trans->transid != root->last_trans);  	level = btrfs_header_level(buf);  	if (level == 0) @@ -274,7 +275,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,  	else  		btrfs_set_header_owner(cow, new_root_objectid); -	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), +	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),  			    BTRFS_FSID_SIZE);  	WARN_ON(btrfs_header_generation(buf) > trans->transid); @@ -355,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)  }  /* - * Increment the upper half of tree_mod_seq, set lower half zero. - * - * Must be called with fs_info->tree_mod_seq_lock held. + * Pull a new tree mod seq number for our operation.   */ -static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) -{ -	u64 seq = atomic64_read(&fs_info->tree_mod_seq); -	seq &= 0xffffffff00000000ull; -	seq += 1ull << 32; -	atomic64_set(&fs_info->tree_mod_seq, seq); -	return seq; -} - -/* - * Increment the lower half of tree_mod_seq. - * - * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers - * are generated should not technically require a spin lock here. (Rationale: - * incrementing the minor while incrementing the major seq number is between its - * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it - * just returns a unique sequence number as usual.) We have decided to leave - * that requirement in here and rethink it once we notice it really imposes a - * problem on some workload. 
- */ -static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)  {  	return atomic64_inc_return(&fs_info->tree_mod_seq);  }  /* - * return the last minor in the previous major tree_mod_seq number - */ -u64 btrfs_tree_mod_seq_prev(u64 seq) -{ -	return (seq & 0xffffffff00000000ull) - 1ull; -} - -/*   * This adds a new blocker to the tree mod log's blocker list if the @elem   * passed does not already have a sequence number set. So when a caller expects   * to record tree modifications, it should ensure to set elem->seq to zero @@ -403,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)  u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,  			   struct seq_list *elem)  { -	u64 seq; -  	tree_mod_log_write_lock(fs_info);  	spin_lock(&fs_info->tree_mod_seq_lock);  	if (!elem->seq) { -		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); +		elem->seq = btrfs_inc_tree_mod_seq(fs_info);  		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);  	} -	seq = btrfs_inc_tree_mod_seq_minor(fs_info);  	spin_unlock(&fs_info->tree_mod_seq_lock);  	tree_mod_log_write_unlock(fs_info); -	return seq; +	return elem->seq;  }  void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -475,6 +443,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,   * the index is the shifted logical of the *new* root node for root replace   * operations, or the shifted logical of the affected block for all other   * operations. + * + * Note: must be called with write lock (tree_mod_log_write_lock).   */  static noinline int  __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -483,27 +453,10 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)  	struct rb_node **new;  	struct rb_node *parent = NULL;  	struct tree_mod_elem *cur; -	int ret = 0;  	BUG_ON(!tm); -	tree_mod_log_write_lock(fs_info); -	if (list_empty(&fs_info->tree_mod_seq_list)) { -		tree_mod_log_write_unlock(fs_info); -		/* -		 * Ok we no longer care about logging modifications, free up tm -		 * and return 0.  Any callers shouldn't be using tm after -		 * calling tree_mod_log_insert, but if they do we can just -		 * change this to return a special error code to let the callers -		 * do their own thing. 
-		 */ -		kfree(tm); -		return 0; -	} - -	spin_lock(&fs_info->tree_mod_seq_lock); -	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); -	spin_unlock(&fs_info->tree_mod_seq_lock); +	tm->seq = btrfs_inc_tree_mod_seq(fs_info);  	tm_root = &fs_info->tree_mod_log;  	new = &tm_root->rb_node; @@ -518,18 +471,13 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)  			new = &((*new)->rb_left);  		else if (cur->seq > tm->seq)  			new = &((*new)->rb_right); -		else { -			ret = -EEXIST; -			kfree(tm); -			goto out; -		} +		else +			return -EEXIST;  	}  	rb_link_node(&tm->node, parent, new);  	rb_insert_color(&tm->node, tm_root); -out: -	tree_mod_log_write_unlock(fs_info); -	return ret; +	return 0;  }  /* @@ -545,19 +493,38 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,  		return 1;  	if (eb && btrfs_header_level(eb) == 0)  		return 1; + +	tree_mod_log_write_lock(fs_info); +	if (list_empty(&(fs_info)->tree_mod_seq_list)) { +		tree_mod_log_write_unlock(fs_info); +		return 1; +	} +  	return 0;  } -static inline int -__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, -			  struct extent_buffer *eb, int slot, -			  enum mod_log_op op, gfp_t flags) +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, +				    struct extent_buffer *eb) +{ +	smp_mb(); +	if (list_empty(&(fs_info)->tree_mod_seq_list)) +		return 0; +	if (eb && btrfs_header_level(eb) == 0) +		return 0; + +	return 1; +} + +static struct tree_mod_elem * +alloc_tree_mod_elem(struct extent_buffer *eb, int slot, +		    enum mod_log_op op, gfp_t flags)  {  	struct tree_mod_elem *tm;  	tm = kzalloc(sizeof(*tm), flags);  	if (!tm) -		return -ENOMEM; +		return NULL;  	tm->index = eb->start >> PAGE_CACHE_SHIFT;  	if (op != MOD_LOG_KEY_ADD) { @@ -567,8 +534,9 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,  	tm->op = op;  	tm->slot = slot;  	tm->generation = btrfs_node_ptr_generation(eb, slot); +	RB_CLEAR_NODE(&tm->node); -	return __tree_mod_log_insert(fs_info, tm); +	return tm;  }  static noinline int @@ -576,10 +544,27 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,  			struct extent_buffer *eb, int slot,  			enum mod_log_op op, gfp_t flags)  { -	if (tree_mod_dont_log(fs_info, eb)) +	struct tree_mod_elem *tm; +	int ret; + +	if (!tree_mod_need_log(fs_info, eb))  		return 0; -	return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); +	tm = alloc_tree_mod_elem(eb, slot, op, flags); +	if (!tm) +		return -ENOMEM; + +	if (tree_mod_dont_log(fs_info, eb)) { +		kfree(tm); +		return 0; +	} + +	ret = __tree_mod_log_insert(fs_info, tm); +	tree_mod_log_write_unlock(fs_info); +	if (ret) +		kfree(tm); + +	return ret;  }  static noinline int @@ -587,53 +572,95 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,  			 struct extent_buffer *eb, int dst_slot, int src_slot,  			 int nr_items, gfp_t flags)  { -	struct tree_mod_elem *tm; -	int ret; +	struct tree_mod_elem *tm = NULL; +	struct tree_mod_elem **tm_list = NULL; +	int ret = 0;  	int i; +	int locked = 0; -	if (tree_mod_dont_log(fs_info, eb)) +	if (!tree_mod_need_log(fs_info, eb))  		return 0; +	tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); +	if (!tm_list) +		return -ENOMEM; + +	tm = kzalloc(sizeof(*tm), flags); +	if (!tm) { +		ret = -ENOMEM; +		goto free_tms; +	} + +	tm->index = eb->start >> PAGE_CACHE_SHIFT; +	tm->slot = src_slot; +	tm->move.dst_slot = dst_slot; +	tm->move.nr_items = nr_items; +	tm->op = 
MOD_LOG_MOVE_KEYS; + +	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { +		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, +		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); +		if (!tm_list[i]) { +			ret = -ENOMEM; +			goto free_tms; +		} +	} + +	if (tree_mod_dont_log(fs_info, eb)) +		goto free_tms; +	locked = 1; +  	/*  	 * When we override something during the move, we log these removals.  	 * This can only happen when we move towards the beginning of the  	 * buffer, i.e. dst_slot < src_slot.  	 */  	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { -		ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, -				MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); -		BUG_ON(ret < 0); +		ret = __tree_mod_log_insert(fs_info, tm_list[i]); +		if (ret) +			goto free_tms;  	} -	tm = kzalloc(sizeof(*tm), flags); -	if (!tm) -		return -ENOMEM; +	ret = __tree_mod_log_insert(fs_info, tm); +	if (ret) +		goto free_tms; +	tree_mod_log_write_unlock(fs_info); +	kfree(tm_list); -	tm->index = eb->start >> PAGE_CACHE_SHIFT; -	tm->slot = src_slot; -	tm->move.dst_slot = dst_slot; -	tm->move.nr_items = nr_items; -	tm->op = MOD_LOG_MOVE_KEYS; +	return 0; +free_tms: +	for (i = 0; i < nr_items; i++) { +		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) +			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); +		kfree(tm_list[i]); +	} +	if (locked) +		tree_mod_log_write_unlock(fs_info); +	kfree(tm_list); +	kfree(tm); -	return __tree_mod_log_insert(fs_info, tm); +	return ret;  } -static inline void -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static inline int +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +		       struct tree_mod_elem **tm_list, +		       int nritems)  { -	int i; -	u32 nritems; +	int i, j;  	int ret; -	if (btrfs_header_level(eb) == 0) -		return; - -	nritems = btrfs_header_nritems(eb);  	for (i = nritems - 1; i >= 0; i--) { -		ret = __tree_mod_log_insert_key(fs_info, eb, i, -				MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); -		BUG_ON(ret < 0); +		ret = __tree_mod_log_insert(fs_info, tm_list[i]); +		if (ret) { +			for (j = nritems - 1; j > i; j--) +				rb_erase(&tm_list[j]->node, +					 &fs_info->tree_mod_log); +			return ret; +		}  	} + +	return 0;  }  static noinline int @@ -642,17 +669,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,  			 struct extent_buffer *new_root, gfp_t flags,  			 int log_removal)  { -	struct tree_mod_elem *tm; +	struct tree_mod_elem *tm = NULL; +	struct tree_mod_elem **tm_list = NULL; +	int nritems = 0; +	int ret = 0; +	int i; -	if (tree_mod_dont_log(fs_info, NULL)) +	if (!tree_mod_need_log(fs_info, NULL))  		return 0; -	if (log_removal) -		__tree_mod_log_free_eb(fs_info, old_root); +	if (log_removal && btrfs_header_level(old_root) > 0) { +		nritems = btrfs_header_nritems(old_root); +		tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), +				  flags); +		if (!tm_list) { +			ret = -ENOMEM; +			goto free_tms; +		} +		for (i = 0; i < nritems; i++) { +			tm_list[i] = alloc_tree_mod_elem(old_root, i, +			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); +			if (!tm_list[i]) { +				ret = -ENOMEM; +				goto free_tms; +			} +		} +	}  	tm = kzalloc(sizeof(*tm), flags); -	if (!tm) -		return -ENOMEM; +	if (!tm) { +		ret = -ENOMEM; +		goto free_tms; +	}  	tm->index = new_root->start >> PAGE_CACHE_SHIFT;  	tm->old_root.logical = old_root->start; @@ -660,7 +708,30 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,  	tm->generation = btrfs_header_generation(old_root);  	tm->op = 
MOD_LOG_ROOT_REPLACE; -	return __tree_mod_log_insert(fs_info, tm); +	if (tree_mod_dont_log(fs_info, NULL)) +		goto free_tms; + +	if (tm_list) +		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); +	if (!ret) +		ret = __tree_mod_log_insert(fs_info, tm); + +	tree_mod_log_write_unlock(fs_info); +	if (ret) +		goto free_tms; +	kfree(tm_list); + +	return ret; + +free_tms: +	if (tm_list) { +		for (i = 0; i < nritems; i++) +			kfree(tm_list[i]); +		kfree(tm_list); +	} +	kfree(tm); + +	return ret;  }  static struct tree_mod_elem * @@ -729,31 +800,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)  	return __tree_mod_log_search(fs_info, start, min_seq, 0);  } -static noinline void +static noinline int  tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,  		     struct extent_buffer *src, unsigned long dst_offset,  		     unsigned long src_offset, int nr_items)  { -	int ret; +	int ret = 0; +	struct tree_mod_elem **tm_list = NULL; +	struct tree_mod_elem **tm_list_add, **tm_list_rem;  	int i; +	int locked = 0; -	if (tree_mod_dont_log(fs_info, NULL)) -		return; +	if (!tree_mod_need_log(fs_info, NULL)) +		return 0;  	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) -		return; +		return 0; +	tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), +			  GFP_NOFS); +	if (!tm_list) +		return -ENOMEM; + +	tm_list_add = tm_list; +	tm_list_rem = tm_list + nr_items;  	for (i = 0; i < nr_items; i++) { -		ret = __tree_mod_log_insert_key(fs_info, src, -						i + src_offset, -						MOD_LOG_KEY_REMOVE, GFP_NOFS); -		BUG_ON(ret < 0); -		ret = __tree_mod_log_insert_key(fs_info, dst, -						     i + dst_offset, -						     MOD_LOG_KEY_ADD, -						     GFP_NOFS); -		BUG_ON(ret < 0); +		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, +		    MOD_LOG_KEY_REMOVE, GFP_NOFS); +		if (!tm_list_rem[i]) { +			ret = -ENOMEM; +			goto free_tms; +		} + +		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, +		    MOD_LOG_KEY_ADD, GFP_NOFS); +		if (!tm_list_add[i]) { +			ret = -ENOMEM; +			goto free_tms; +		} +	} + +	if (tree_mod_dont_log(fs_info, NULL)) +		goto free_tms; +	locked = 1; + +	for (i = 0; i < nr_items; i++) { +		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); +		if (ret) +			goto free_tms; +		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); +		if (ret) +			goto free_tms; +	} + +	tree_mod_log_write_unlock(fs_info); +	kfree(tm_list); + +	return 0; + +free_tms: +	for (i = 0; i < nr_items * 2; i++) { +		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) +			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); +		kfree(tm_list[i]);  	} +	if (locked) +		tree_mod_log_write_unlock(fs_info); +	kfree(tm_list); + +	return ret;  }  static inline void @@ -772,18 +887,58 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,  {  	int ret; -	ret = __tree_mod_log_insert_key(fs_info, eb, slot, +	ret = tree_mod_log_insert_key(fs_info, eb, slot,  					MOD_LOG_KEY_REPLACE,  					atomic ? 
GFP_ATOMIC : GFP_NOFS);  	BUG_ON(ret < 0);  } -static noinline void +static noinline int  tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)  { +	struct tree_mod_elem **tm_list = NULL; +	int nritems = 0; +	int i; +	int ret = 0; + +	if (btrfs_header_level(eb) == 0) +		return 0; + +	if (!tree_mod_need_log(fs_info, NULL)) +		return 0; + +	nritems = btrfs_header_nritems(eb); +	tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), +			  GFP_NOFS); +	if (!tm_list) +		return -ENOMEM; + +	for (i = 0; i < nritems; i++) { +		tm_list[i] = alloc_tree_mod_elem(eb, i, +		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); +		if (!tm_list[i]) { +			ret = -ENOMEM; +			goto free_tms; +		} +	} +  	if (tree_mod_dont_log(fs_info, eb)) -		return; -	__tree_mod_log_free_eb(fs_info, eb); +		goto free_tms; + +	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); +	tree_mod_log_write_unlock(fs_info); +	if (ret) +		goto free_tms; +	kfree(tm_list); + +	return 0; + +free_tms: +	for (i = 0; i < nritems; i++) +		kfree(tm_list[i]); +	kfree(tm_list); + +	return ret;  }  static noinline void @@ -809,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,  	 * snapshot and the block was not allocated by tree relocation,  	 * we know the block is not shared.  	 */ -	if (root->ref_cows && +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&  	    buf != root->node && buf != root->commit_root &&  	    (btrfs_header_generation(buf) <=  	     btrfs_root_last_snapshot(&root->root_item) ||  	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))  		return 1;  #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -	if (root->ref_cows && +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&  	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)  		return 1;  #endif @@ -958,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	btrfs_assert_tree_locked(buf); -	WARN_ON(root->ref_cows && trans->transid != -		root->fs_info->running_transaction->transid); -	WARN_ON(root->ref_cows && trans->transid != root->last_trans); +	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && +		trans->transid != root->fs_info->running_transaction->transid); +	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && +		trans->transid != root->last_trans);  	level = btrfs_header_level(buf); @@ -996,7 +1152,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  	else  		btrfs_set_header_owner(cow, root->root_key.objectid); -	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), +	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),  			    BTRFS_FSID_SIZE);  	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); @@ -1005,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  		return ret;  	} -	if (root->ref_cows) { +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {  		ret = btrfs_reloc_cow_block(trans, root, buf, cow);  		if (ret)  			return ret; @@ -1041,8 +1197,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,  		btrfs_set_node_ptr_generation(parent, parent_slot,  					      trans->transid);  		btrfs_mark_buffer_dirty(parent); -		if (last_ref) -			tree_mod_log_free_eb(root->fs_info, buf); +		if (last_ref) { +			ret = tree_mod_log_free_eb(root->fs_info, buf); +			if (ret) { +				btrfs_abort_transaction(trans, root, ret); +				return ret; +			} +		}  		btrfs_free_tree_block(trans, root, buf, parent_start,  				      last_ref);  	} @@ -1285,11 +1446,10 @@ get_old_root(struct 
btrfs_root *root, u64 time_seq)  		free_extent_buffer(eb_root);  		blocksize = btrfs_level_size(root, old_root->level);  		old = read_tree_block(root, logical, blocksize, 0); -		if (!old || !extent_buffer_uptodate(old)) { +		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {  			free_extent_buffer(old); -			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", -				logical); -			WARN_ON(1); +			btrfs_warn(root->fs_info, +				"failed to read tree block %llu from get_old_root", logical);  		} else {  			eb = btrfs_clone_extent_buffer(old);  			free_extent_buffer(old); @@ -1346,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root,  				   struct extent_buffer *buf)  { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return 0; +#endif  	/* ensure we can see the force_cow */  	smp_rmb(); @@ -1364,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,  	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&  	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&  	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && -	    !root->force_cow) +	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))  		return 0;  	return 1;  } @@ -2463,6 +2627,49 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,  	return 0;  } +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, +		u64 iobjectid, u64 ioff, u8 key_type, +		struct btrfs_key *found_key) +{ +	int ret; +	struct btrfs_key key; +	struct extent_buffer *eb; +	struct btrfs_path *path; + +	key.type = key_type; +	key.objectid = iobjectid; +	key.offset = ioff; + +	if (found_path == NULL) { +		path = btrfs_alloc_path(); +		if (!path) +			return -ENOMEM; +	} else +		path = found_path; + +	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); +	if ((ret < 0) || (found_key == NULL)) { +		if (path != found_path) +			btrfs_free_path(path); +		return ret; +	} + +	eb = path->nodes[0]; +	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { +		ret = btrfs_next_leaf(fs_root, path); +		if (ret) +			return ret; +		eb = path->nodes[0]; +	} + +	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); +	if (found_key->type != key.type || +			found_key->objectid != key.objectid) +		return 1; + +	return 0; +} +  /*   * look for key in the tree.  path is filled in with nodes along the way   * if key is found, we return zero and you can find the item in the leaf @@ -2496,6 +2703,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root  	lowest_level = p->lowest_level;  	WARN_ON(lowest_level && ins_len > 0);  	WARN_ON(p->nodes[0] != NULL); +	BUG_ON(!cow && ins_len);  	if (ins_len < 0) {  		lowest_unlock = 2; @@ -2533,9 +2741,13 @@ again:  		 * the commit roots are read only  		 * so we always do read locks  		 */ +		if (p->need_commit_sem) +			down_read(&root->fs_info->commit_root_sem);  		b = root->commit_root;  		extent_buffer_get(b);  		level = btrfs_header_level(b); +		if (p->need_commit_sem) +			up_read(&root->fs_info->commit_root_sem);  		if (!p->skip_locking)  			btrfs_tree_read_lock(b);  	} else { @@ -2604,8 +2816,6 @@ again:  			}  		}  cow_done: -		BUG_ON(!cow && ins_len); -  		p->nodes[level] = b;  		btrfs_clear_path_blocking(p, NULL, 0); @@ -2615,13 +2825,19 @@ cow_done:  		 * It is safe to drop the lock on our parent before we  		 * go through the expensive btree search on b.  		 
* -		 * If cow is true, then we might be changing slot zero, -		 * which may require changing the parent.  So, we can't -		 * drop the lock until after we know which slot we're -		 * operating on. +		 * If we're inserting or deleting (ins_len != 0), then we might +		 * be changing slot zero, which may require changing the parent. +		 * So, we can't drop the lock until after we know which slot +		 * we're operating on.  		 */ -		if (!cow) -			btrfs_unlock_up_safe(p, level + 1); +		if (!ins_len && !p->keep_locks) { +			int u = level + 1; + +			if (u < BTRFS_MAX_LEVEL && p->locks[u]) { +				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); +				p->locks[u] = 0; +			} +		}  		ret = key_search(b, key, level, &prev_cmp, &slot); @@ -2649,7 +2865,7 @@ cow_done:  			 * which means we must have a write lock  			 * on the parent  			 */ -			if (slot == 0 && cow && +			if (slot == 0 && ins_len &&  			    write_lock_level < level + 1) {  				write_lock_level = level + 1;  				btrfs_release_path(p); @@ -2758,7 +2974,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,  	int level;  	int lowest_unlock = 1;  	u8 lowest_level = 0; -	int prev_cmp; +	int prev_cmp = -1;  	lowest_level = p->lowest_level;  	WARN_ON(p->nodes[0] != NULL); @@ -2769,7 +2985,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,  	}  again: -	prev_cmp = -1;  	b = get_old_root(root, time_seq);  	level = btrfs_header_level(b);  	p->locks[level] = BTRFS_READ_LOCK; @@ -2787,6 +3002,11 @@ again:  		 */  		btrfs_unlock_up_safe(p, level + 1); +		/* +		 * Since we can unwind eb's we want to do a real search every +		 * time. +		 */ +		prev_cmp = -1;  		ret = key_search(b, key, level, &prev_cmp, &slot);  		if (level != 0) { @@ -2898,7 +3118,9 @@ again:  			if (ret < 0)  				return ret;  			if (!ret) { -				p->slots[0] = btrfs_header_nritems(leaf) - 1; +				leaf = p->nodes[0]; +				if (p->slots[0] == btrfs_header_nritems(leaf)) +					p->slots[0]--;  				return 0;  			}  			if (!return_any) @@ -3019,8 +3241,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,  	} else  		push_items = min(src_nritems - 8, push_items); -	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, -			     push_items); +	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, +				   push_items); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		return ret; +	}  	copy_extent_buffer(dst, src,  			   btrfs_node_key_ptr_offset(dst_nritems),  			   btrfs_node_key_ptr_offset(0), @@ -3090,8 +3316,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,  				      (dst_nritems) *  				      sizeof(struct btrfs_key_ptr)); -	tree_mod_log_eb_copy(root->fs_info, dst, src, 0, -			     src_nritems - push_items, push_items); +	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, +				   src_nritems - push_items, push_items); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		return ret; +	}  	copy_extent_buffer(dst, src,  			   btrfs_node_key_ptr_offset(0),  			   btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3148,7 +3378,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,  	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);  	btrfs_set_header_owner(c, root->root_key.objectid); -	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c), +	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),  			    BTRFS_FSID_SIZE);  	write_extent_buffer(c, root->fs_info->chunk_tree_uuid, @@ -3287,12 +3517,17 @@ static 
noinline int split_node(struct btrfs_trans_handle *trans,  	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);  	btrfs_set_header_owner(split, root->root_key.objectid);  	write_extent_buffer(split, root->fs_info->fsid, -			    btrfs_header_fsid(split), BTRFS_FSID_SIZE); +			    btrfs_header_fsid(), BTRFS_FSID_SIZE);  	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,  			    btrfs_header_chunk_tree_uuid(split),  			    BTRFS_UUID_SIZE); -	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); +	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, +				   mid, c_nritems - mid); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		return ret; +	}  	copy_extent_buffer(split, c,  			   btrfs_node_key_ptr_offset(0),  			   btrfs_node_key_ptr_offset(mid), @@ -3337,8 +3572,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)  	if (!nr)  		return 0;  	btrfs_init_map_token(&token); -	start_item = btrfs_item_nr(l, start); -	end_item = btrfs_item_nr(l, end); +	start_item = btrfs_item_nr(start); +	end_item = btrfs_item_nr(end);  	data_len = btrfs_token_item_offset(l, start_item, &token) +  		btrfs_token_item_size(l, start_item, &token);  	data_len = data_len - btrfs_token_item_offset(l, end_item, &token); @@ -3359,8 +3594,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,  	int ret;  	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);  	if (ret < 0) { -		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " -		       "used %d nritems %d\n", +		btrfs_crit(root->fs_info, +			"leaf free space ret %d, leaf data size %lu, used %d nritems %d",  		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),  		       leaf_space_used(leaf, 0, nritems), nritems);  	} @@ -3406,7 +3641,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,  	slot = path->slots[1];  	i = left_nritems - 1;  	while (i >= nr) { -		item = btrfs_item_nr(left, i); +		item = btrfs_item_nr(i);  		if (!empty && push_items > 0) {  			if (path->slots[0] > i) @@ -3470,7 +3705,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,  	btrfs_set_header_nritems(right, right_nritems);  	push_space = BTRFS_LEAF_DATA_SIZE(root);  	for (i = 0; i < right_nritems; i++) { -		item = btrfs_item_nr(right, i); +		item = btrfs_item_nr(i);  		push_space -= btrfs_token_item_size(right, item, &token);  		btrfs_set_token_item_offset(right, item, push_space, &token);  	} @@ -3568,6 +3803,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root  	if (left_nritems == 0)  		goto out_unlock; +	if (path->slots[0] == left_nritems && !empty) { +		/* Key greater than all keys in the leaf, right neighbor has +		 * enough room for it and we're not emptying our leaf to delete +		 * it, therefore use right neighbor to insert the new item and +		 * no need to touch/dirty our left leaft. 
*/ +		btrfs_tree_unlock(left); +		free_extent_buffer(left); +		path->nodes[0] = right; +		path->slots[0] = 0; +		path->slots[1]++; +		return 0; +	} +  	return __push_leaf_right(trans, root, path, min_data_size, empty,  				right, free_space, left_nritems, min_slot);  out_unlock: @@ -3612,7 +3860,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,  		nr = min(right_nritems - 1, max_slot);  	for (i = 0; i < nr; i++) { -		item = btrfs_item_nr(right, i); +		item = btrfs_item_nr(i);  		if (!empty && push_items > 0) {  			if (path->slots[0] < i) @@ -3639,8 +3887,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,  		ret = 1;  		goto out;  	} -	if (!empty && push_items == btrfs_header_nritems(right)) -		WARN_ON(1); +	WARN_ON(!empty && push_items == btrfs_header_nritems(right));  	/* push data from right to left */  	copy_extent_buffer(left, right, @@ -3663,7 +3910,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,  	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {  		u32 ioff; -		item = btrfs_item_nr(left, i); +		item = btrfs_item_nr(i);  		ioff = btrfs_token_item_offset(left, item, &token);  		btrfs_set_token_item_offset(left, item, @@ -3694,7 +3941,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,  	btrfs_set_header_nritems(right, right_nritems);  	push_space = BTRFS_LEAF_DATA_SIZE(root);  	for (i = 0; i < right_nritems; i++) { -		item = btrfs_item_nr(right, i); +		item = btrfs_item_nr(i);  		push_space = push_space - btrfs_token_item_size(right,  								item, &token); @@ -3835,7 +4082,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,  		      btrfs_item_end_nr(l, mid);  	for (i = 0; i < nritems; i++) { -		struct btrfs_item *item = btrfs_item_nr(right, i); +		struct btrfs_item *item = btrfs_item_nr(i);  		u32 ioff;  		ioff = btrfs_token_item_offset(right, item, &token); @@ -3885,14 +4132,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,  	int progress = 0;  	int slot;  	u32 nritems; +	int space_needed = data_size;  	slot = path->slots[0]; +	if (slot < btrfs_header_nritems(path->nodes[0])) +		space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);  	/*  	 * try to push all the items after our slot into the  	 * right leaf  	 */ -	ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); +	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);  	if (ret < 0)  		return ret; @@ -3912,7 +4162,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,  	/* try to push all the items before our slot into the next leaf */  	slot = path->slots[0]; -	ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); +	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);  	if (ret < 0)  		return ret; @@ -3956,13 +4206,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,  	/* first try to make some room by pushing left and right */  	if (data_size && path->nodes[1]) { -		wret = push_leaf_right(trans, root, path, data_size, -				       data_size, 0, 0); +		int space_needed = data_size; + +		if (slot < btrfs_header_nritems(l)) +			space_needed -= btrfs_leaf_free_space(root, l); + +		wret = push_leaf_right(trans, root, path, space_needed, +				       space_needed, 0, 0);  		if (wret < 0)  			return wret;  		if (wret) { -			wret = push_leaf_left(trans, root, path, data_size, -					      data_size, 0, (u32)-1); +			wret = push_leaf_left(trans, root, path, 
space_needed, +					      space_needed, 0, (u32)-1);  			if (wret < 0)  				return wret;  		} @@ -4016,7 +4271,7 @@ again:  				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {  					if (data_size && !tried_avoid_double)  						goto push_for_double; -					split = 2 ; +					split = 2;  				}  			}  		} @@ -4042,7 +4297,7 @@ again:  	btrfs_set_header_owner(right, root->root_key.objectid);  	btrfs_set_header_level(right, 0);  	write_extent_buffer(right, root->fs_info->fsid, -			    btrfs_header_fsid(right), BTRFS_FSID_SIZE); +			    btrfs_header_fsid(), BTRFS_FSID_SIZE);  	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,  			    btrfs_header_chunk_tree_uuid(right), @@ -4177,7 +4432,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,  	btrfs_set_path_blocking(path); -	item = btrfs_item_nr(leaf, path->slots[0]); +	item = btrfs_item_nr(path->slots[0]);  	orig_offset = btrfs_item_offset(leaf, item);  	item_size = btrfs_item_size(leaf, item); @@ -4200,7 +4455,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,  	btrfs_cpu_key_to_disk(&disk_key, new_key);  	btrfs_set_item_key(leaf, &disk_key, slot); -	new_item = btrfs_item_nr(leaf, slot); +	new_item = btrfs_item_nr(slot);  	btrfs_set_item_offset(leaf, new_item, orig_offset);  	btrfs_set_item_size(leaf, new_item, item_size - split_offset); @@ -4339,7 +4594,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,  	/* first correct the data pointers */  	for (i = slot; i < nritems; i++) {  		u32 ioff; -		item = btrfs_item_nr(leaf, i); +		item = btrfs_item_nr(i);  		ioff = btrfs_token_item_offset(leaf, item, &token);  		btrfs_set_token_item_offset(leaf, item, @@ -4387,7 +4642,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,  			fixup_low_keys(root, path, &disk_key, 1);  	} -	item = btrfs_item_nr(leaf, slot); +	item = btrfs_item_nr(slot);  	btrfs_set_item_size(leaf, item, new_size);  	btrfs_mark_buffer_dirty(leaf); @@ -4430,7 +4685,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,  	BUG_ON(slot < 0);  	if (slot >= nritems) {  		btrfs_print_leaf(root, leaf); -		printk(KERN_CRIT "slot %d too large, nritems %d\n", +		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",  		       slot, nritems);  		BUG_ON(1);  	} @@ -4441,7 +4696,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,  	/* first correct the data pointers */  	for (i = slot; i < nritems; i++) {  		u32 ioff; -		item = btrfs_item_nr(leaf, i); +		item = btrfs_item_nr(i);  		ioff = btrfs_token_item_offset(leaf, item, &token);  		btrfs_set_token_item_offset(leaf, item, @@ -4455,7 +4710,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,  	data_end = old_data;  	old_size = btrfs_item_size_nr(leaf, slot); -	item = btrfs_item_nr(leaf, slot); +	item = btrfs_item_nr(slot);  	btrfs_set_item_size(leaf, item, old_size + data_size);  	btrfs_mark_buffer_dirty(leaf); @@ -4493,7 +4748,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,  	if (btrfs_leaf_free_space(root, leaf) < total_size) {  		btrfs_print_leaf(root, leaf); -		printk(KERN_CRIT "not enough freespace need %u have %d\n", +		btrfs_crit(root->fs_info, "not enough freespace need %u have %d",  		       total_size, btrfs_leaf_free_space(root, leaf));  		BUG();  	} @@ -4503,7 +4758,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,  		if (old_data < data_end) {  			btrfs_print_leaf(root, leaf); -			
printk(KERN_CRIT "slot %d old_data %d data_end %d\n", +			btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",  			       slot, old_data, data_end);  			BUG_ON(1);  		} @@ -4514,7 +4769,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,  		for (i = slot; i < nritems; i++) {  			u32 ioff; -			item = btrfs_item_nr(leaf, i); +			item = btrfs_item_nr( i);  			ioff = btrfs_token_item_offset(leaf, item, &token);  			btrfs_set_token_item_offset(leaf, item,  						    ioff - total_data, &token); @@ -4535,7 +4790,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,  	for (i = 0; i < nr; i++) {  		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);  		btrfs_set_item_key(leaf, &disk_key, slot + i); -		item = btrfs_item_nr(leaf, slot + i); +		item = btrfs_item_nr(slot + i);  		btrfs_set_token_item_offset(leaf, item,  					    data_end - data_size[i], &token);  		data_end -= data_size[i]; @@ -4730,7 +4985,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		for (i = slot + nr; i < nritems; i++) {  			u32 ioff; -			item = btrfs_item_nr(leaf, i); +			item = btrfs_item_nr(i);  			ioff = btrfs_token_item_offset(leaf, item, &token);  			btrfs_set_token_item_offset(leaf, item,  						    ioff + dsize, &token); @@ -4815,7 +5070,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,   * This may release the path, and so you may lose any locks held at the   * time you call it.   */ -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)  {  	struct btrfs_key key;  	struct btrfs_disk_key found_key; @@ -4823,14 +5078,18 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)  	btrfs_item_key_to_cpu(path->nodes[0], &key, 0); -	if (key.offset > 0) +	if (key.offset > 0) {  		key.offset--; -	else if (key.type > 0) +	} else if (key.type > 0) {  		key.type--; -	else if (key.objectid > 0) +		key.offset = (u64)-1; +	} else if (key.objectid > 0) {  		key.objectid--; -	else +		key.type = (u8)-1; +		key.offset = (u64)-1; +	} else {  		return 1; +	}  	btrfs_release_path(path);  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -4838,7 +5097,17 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)  		return ret;  	btrfs_item_key(path->nodes[0], &found_key, 0);  	ret = comp_keys(&found_key, &key); -	if (ret < 0) +	/* +	 * We might have had an item with the previous key in the tree right +	 * before we released our path. And after we released our path, that +	 * item might have been pushed to the first slot (0) of the leaf we +	 * were holding due to a tree balance. Alternatively, an item with the +	 * previous key can exist as the only element of a leaf (big fat item). +	 * Therefore account for these 2 cases, so that our callers (like +	 * btrfs_previous_item) don't miss an existing item with a key matching +	 * the previous key we computed above. +	 */ +	if (ret <= 0)  		return 0;  	return 1;  } @@ -4866,7 +5135,6 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)   * was nothing in the tree that matched the search criteria.   */  int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, -			 struct btrfs_key *max_key,  			 struct btrfs_path *path,  			 u64 min_trans)  { @@ -4911,10 +5179,8 @@ again:  		 * If it is too old, old, skip to the next one.  		 
*/  		while (slot < nritems) { -			u64 blockptr;  			u64 gen; -			blockptr = btrfs_node_blockptr(cur, slot);  			gen = btrfs_node_ptr_generation(cur, slot);  			if (gen < min_trans) {  				slot++; @@ -5080,7 +5346,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  {  	int ret;  	int cmp; -	struct btrfs_trans_handle *trans = NULL;  	struct btrfs_path *left_path = NULL;  	struct btrfs_path *right_path = NULL;  	struct btrfs_key left_key; @@ -5096,9 +5361,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  	int advance_right;  	u64 left_blockptr;  	u64 right_blockptr; -	u64 left_start_ctransid; -	u64 right_start_ctransid; -	u64 ctransid; +	u64 left_gen; +	u64 right_gen;  	left_path = btrfs_alloc_path();  	if (!left_path) { @@ -5122,21 +5386,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  	right_path->search_commit_root = 1;  	right_path->skip_locking = 1; -	spin_lock(&left_root->root_item_lock); -	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); -	spin_unlock(&left_root->root_item_lock); - -	spin_lock(&right_root->root_item_lock); -	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); -	spin_unlock(&right_root->root_item_lock); - -	trans = btrfs_join_transaction(left_root); -	if (IS_ERR(trans)) { -		ret = PTR_ERR(trans); -		trans = NULL; -		goto out; -	} -  	/*  	 * Strategy: Go to the first items of both trees. Then do  	 * @@ -5173,6 +5422,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  	 *   the right if possible or go up and right.  	 */ +	down_read(&left_root->fs_info->commit_root_sem);  	left_level = btrfs_header_level(left_root->commit_root);  	left_root_level = left_level;  	left_path->nodes[left_level] = left_root->commit_root; @@ -5182,6 +5432,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  	right_root_level = right_level;  	right_path->nodes[right_level] = right_root->commit_root;  	extent_buffer_get(right_path->nodes[right_level]); +	up_read(&left_root->fs_info->commit_root_sem);  	if (left_level == 0)  		btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -5200,67 +5451,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  	advance_left = advance_right = 0;  	while (1) { -		/* -		 * We need to make sure the transaction does not get committed -		 * while we do anything on commit roots. This means, we need to -		 * join and leave transactions for every item that we process. -		 */ -		if (trans && btrfs_should_end_transaction(trans, left_root)) { -			btrfs_release_path(left_path); -			btrfs_release_path(right_path); - -			ret = btrfs_end_transaction(trans, left_root); -			trans = NULL; -			if (ret < 0) -				goto out; -		} -		/* now rejoin the transaction */ -		if (!trans) { -			trans = btrfs_join_transaction(left_root); -			if (IS_ERR(trans)) { -				ret = PTR_ERR(trans); -				trans = NULL; -				goto out; -			} - -			spin_lock(&left_root->root_item_lock); -			ctransid = btrfs_root_ctransid(&left_root->root_item); -			spin_unlock(&left_root->root_item_lock); -			if (ctransid != left_start_ctransid) -				left_start_ctransid = 0; - -			spin_lock(&right_root->root_item_lock); -			ctransid = btrfs_root_ctransid(&right_root->root_item); -			spin_unlock(&right_root->root_item_lock); -			if (ctransid != right_start_ctransid) -				right_start_ctransid = 0; - -			if (!left_start_ctransid || !right_start_ctransid) { -				WARN(1, KERN_WARNING -					"btrfs: btrfs_compare_tree detected " -					"a change in one of the trees while " -					"iterating. 
This is probably a " -					"bug.\n"); -				ret = -EIO; -				goto out; -			} - -			/* -			 * the commit root may have changed, so start again -			 * where we stopped -			 */ -			left_path->lowest_level = left_level; -			right_path->lowest_level = right_level; -			ret = btrfs_search_slot(NULL, left_root, -					&left_key, left_path, 0, 0); -			if (ret < 0) -				goto out; -			ret = btrfs_search_slot(NULL, right_root, -					&right_key, right_path, 0, 0); -			if (ret < 0) -				goto out; -		} -  		if (advance_left && !left_end_reached) {  			ret = tree_advance(left_root, left_path, &left_level,  					left_root_level, @@ -5360,7 +5550,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  				right_blockptr = btrfs_node_blockptr(  						right_path->nodes[right_level],  						right_path->slots[right_level]); -				if (left_blockptr == right_blockptr) { +				left_gen = btrfs_node_ptr_generation( +						left_path->nodes[left_level], +						left_path->slots[left_level]); +				right_gen = btrfs_node_ptr_generation( +						right_path->nodes[right_level], +						right_path->slots[right_level]); +				if (left_blockptr == right_blockptr && +				    left_gen == right_gen) {  					/*  					 * As we're on a shared block, don't  					 * allow to go deeper. @@ -5383,14 +5580,6 @@ out:  	btrfs_free_path(left_path);  	btrfs_free_path(right_path);  	kfree(tmp_buf); - -	if (trans) { -		if (!ret) -			ret = btrfs_end_transaction(trans, left_root); -		else -			btrfs_end_transaction(trans, left_root); -	} -  	return ret;  } @@ -5529,6 +5718,24 @@ again:  		ret = 0;  		goto done;  	} +	/* +	 * So the above check misses one case: +	 * - after releasing the path above, someone has removed the item that +	 *   used to be at the very end of the block, and balance between leafs +	 *   gets another one with bigger key.offset to replace it. +	 * +	 * This one should be returned as well, or we can get leaf corruption +	 * later(esp. in __btrfs_drop_extents()). +	 * +	 * And a bit more explanation about this check, +	 * with ret > 0, the key isn't found, the path points to the slot +	 * where it should be inserted, so the path->slots[0] item must be the +	 * bigger one. +	 */ +	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { +		ret = 0; +		goto done; +	}  	while (level < BTRFS_MAX_LEVEL) {  		if (!path->nodes[level]) { @@ -5677,3 +5884,46 @@ int btrfs_previous_item(struct btrfs_root *root,  	}  	return 1;  } + +/* + * search in extent tree to find a previous Metadata/Data extent item with + * min objecitd. + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, +			struct btrfs_path *path, u64 min_objectid) +{ +	struct btrfs_key found_key; +	struct extent_buffer *leaf; +	u32 nritems; +	int ret; + +	while (1) { +		if (path->slots[0] == 0) { +			btrfs_set_path_blocking(path); +			ret = btrfs_prev_leaf(root, path); +			if (ret != 0) +				return ret; +		} else { +			path->slots[0]--; +		} +		leaf = path->nodes[0]; +		nritems = btrfs_header_nritems(leaf); +		if (nritems == 0) +			return 1; +		if (path->slots[0] == nritems) +			path->slots[0]--; + +		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); +		if (found_key.objectid < min_objectid) +			break; +		if (found_key.type == BTRFS_EXTENT_ITEM_KEY || +		    found_key.type == BTRFS_METADATA_ITEM_KEY) +			return 0; +		if (found_key.objectid == min_objectid && +		    found_key.type < BTRFS_EXTENT_ITEM_KEY) +			break; +	} +	return 1; +}  | 
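A pattern repeated across the hunks above deserves a note: every tree_mod_log_insert_* path now allocates its elements first (alloc_tree_mod_elem(), with the caller's GFP flags), then takes the write lock via tree_mod_dont_log() and either inserts the elements or discards them if logging turned out to be unnecessary, unwinding any partial insertions on error. A condensed user-space sketch of that shape — prepare outside the lock, commit or discard inside it — using hypothetical names (log_enabled, elem) rather than the btrfs helpers:

	#include <pthread.h>
	#include <stdlib.h>

	struct elem { long seq; };

	static pthread_rwlock_t log_lock = PTHREAD_RWLOCK_INITIALIZER;
	static int log_enabled;	/* stands in for the tree_mod_seq_list check */

	static int log_insert(void)
	{
		/* Allocate before locking: the allocation may sleep, and
		 * holding the write lock across it would stall readers. */
		struct elem *e = calloc(1, sizeof(*e));
		if (!e)
			return -1;	/* -ENOMEM in the kernel */

		pthread_rwlock_wrlock(&log_lock);
		if (!log_enabled) {	/* the tree_mod_dont_log() re-check */
			pthread_rwlock_unlock(&log_lock);
			free(e);	/* nobody is listening; not an error */
			return 0;
		}
		/* ...link e into the log here (__tree_mod_log_insert())... */
		pthread_rwlock_unlock(&log_lock);
		return 0;
	}

The cheap unlocked tree_mod_need_log() check in front avoids the allocation entirely in the common case where no one holds a tree mod seq; the locked re-check is still required because the list can drain between the two tests.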
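The diff also adds btrfs_find_item(), a small helper wrapping the search-slot/next-leaf/verify-key dance used when looking up a single item. A usage sketch, assuming kernel context (fs_root stands for whatever root the caller already holds, and inode number 257 is an arbitrary example):

	struct btrfs_path *path;
	struct btrfs_key found_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Look up the INODE_ITEM of inode 257 (objectid 257, offset 0). */
	ret = btrfs_find_item(fs_root, path, 257, 0,
			      BTRFS_INODE_ITEM_KEY, &found_key);
	btrfs_free_path(path);	/* found_key is a copy, safe to free now */
	if (ret < 0)
		return ret;	/* search error */
	if (ret > 0)
		return -ENOENT;	/* no matching item */
	/* ret == 0: found_key holds the key that matched */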
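The btrfs_prev_leaf() fix is about decrementing a composite (objectid, type, offset) key: when a low-order field underflows and a higher field is borrowed from, the lower fields must be reset to their maximum values, otherwise the computed predecessor jumps past existing keys (the old code turned the predecessor of (X, T, 0) into (X, T-1, 0) instead of (X, T-1, MAX)). The decrement in isolation, in plain C with fixed-width types in place of struct btrfs_key:

	#include <stdint.h>

	struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

	/* Compute the key immediately before *key in lexicographic order;
	 * returns 1 if *key is already the minimum (0, 0, 0). */
	static int prev_key(struct key *key)
	{
		if (key->offset > 0) {
			key->offset--;
		} else if (key->type > 0) {
			key->type--;
			key->offset = (uint64_t)-1;	/* borrow: reset to max */
		} else if (key->objectid > 0) {
			key->objectid--;
			key->type = (uint8_t)-1;
			key->offset = (uint64_t)-1;
		} else {
			return 1;
		}
		return 0;
	}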