Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--	fs/btrfs/extent-tree.c | 1475
1 file changed, 961 insertions(+), 514 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d58bef130a4..813537f362f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -25,17 +25,17 @@  #include <linux/slab.h>  #include <linux/ratelimit.h>  #include <linux/percpu_counter.h> -#include "compat.h"  #include "hash.h" -#include "ctree.h" +#include "tree-log.h"  #include "disk-io.h"  #include "print-tree.h" -#include "transaction.h"  #include "volumes.h"  #include "raid56.h"  #include "locking.h"  #include "free-space-cache.h"  #include "math.h" +#include "sysfs.h" +#include "qgroup.h"  #undef SCRAMBLE_DELAYED_REFS @@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  				u64 bytenr, u64 num_bytes, u64 parent,  				u64 root_objectid, u64 owner_objectid,  				u64 owner_offset, int refs_to_drop, -				struct btrfs_delayed_extent_op *extra_op); +				struct btrfs_delayed_extent_op *extra_op, +				int no_quota);  static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,  				    struct extent_buffer *leaf,  				    struct btrfs_extent_item *ei); @@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root,  				     u64 parent, u64 root_objectid,  				     u64 flags, struct btrfs_disk_key *key, -				     int level, struct btrfs_key *ins); +				     int level, struct btrfs_key *ins, +				     int no_quota);  static int do_chunk_alloc(struct btrfs_trans_handle *trans,  			  struct btrfs_root *extent_root, u64 flags,  			  int force); @@ -103,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level,  static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  			    int dump_block_groups);  static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, -				       u64 num_bytes, int reserve); +				       u64 num_bytes, int reserve, +				       int delalloc);  static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,  			       u64 num_bytes);  int btrfs_pin_extent(struct btrfs_root *root, @@ -419,7 +422,7 @@ static noinline void caching_thread(struct btrfs_work *work)  again:  	mutex_lock(&caching_ctl->mutex);  	/* need to make sure the commit_root doesn't disappear */ -	down_read(&fs_info->extent_commit_sem); +	down_read(&fs_info->commit_root_sem);  next:  	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -442,10 +445,11 @@ next:  			if (ret)  				break; -			if (need_resched()) { +			if (need_resched() || +			    rwsem_is_contended(&fs_info->commit_root_sem)) {  				caching_ctl->progress = last;  				btrfs_release_path(path); -				up_read(&fs_info->extent_commit_sem); +				up_read(&fs_info->commit_root_sem);  				mutex_unlock(&caching_ctl->mutex);  				cond_resched();  				goto again; @@ -512,7 +516,7 @@ next:  err:  	btrfs_free_path(path); -	up_read(&fs_info->extent_commit_sem); +	up_read(&fs_info->commit_root_sem);  	free_excluded_extents(extent_root, block_group); @@ -548,7 +552,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,  	caching_ctl->block_group = cache;  	caching_ctl->progress = cache->key.objectid;  	atomic_set(&caching_ctl->count, 1); -	caching_ctl->work.func = caching_thread; +	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);  	spin_lock(&cache->lock);  	/* @@ -632,14 +636,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,  		return 0;  	} -	down_write(&fs_info->extent_commit_sem); +	down_write(&fs_info->commit_root_sem);  	atomic_inc(&caching_ctl->count);  	
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); -	up_write(&fs_info->extent_commit_sem); +	up_write(&fs_info->commit_root_sem);  	btrfs_get_block_group(cache); -	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); +	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);  	return ret;  } @@ -768,20 +772,19 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,  	if (!path)  		return -ENOMEM; -	if (metadata) { -		key.objectid = bytenr; -		key.type = BTRFS_METADATA_ITEM_KEY; -		key.offset = offset; -	} else { -		key.objectid = bytenr; -		key.type = BTRFS_EXTENT_ITEM_KEY; -		key.offset = offset; -	} -  	if (!trans) {  		path->skip_locking = 1;  		path->search_commit_root = 1;  	} + +search_again: +	key.objectid = bytenr; +	key.offset = offset; +	if (metadata) +		key.type = BTRFS_METADATA_ITEM_KEY; +	else +		key.type = BTRFS_EXTENT_ITEM_KEY; +  again:  	ret = btrfs_search_slot(trans, root->fs_info->extent_root,  				&key, path, 0, 0); @@ -789,7 +792,6 @@ again:  		goto out_free;  	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { -		metadata = 0;  		if (path->slots[0]) {  			path->slots[0]--;  			btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -856,14 +858,16 @@ again:  			mutex_lock(&head->mutex);  			mutex_unlock(&head->mutex);  			btrfs_put_delayed_ref(&head->node); -			goto again; +			goto search_again;  		} +		spin_lock(&head->lock);  		if (head->extent_op && head->extent_op->update_flags)  			extent_flags |= head->extent_op->flags_to_set;  		else  			BUG_ON(num_refs == 0);  		num_refs += head->node.ref_mod; +		spin_unlock(&head->lock);  		mutex_unlock(&head->mutex);  	}  	spin_unlock(&delayed_refs->lock); @@ -1073,11 +1077,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)  	__le64 lenum;  	lenum = cpu_to_le64(root_objectid); -	high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); +	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));  	lenum = cpu_to_le64(owner); -	low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); +	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));  	lenum = cpu_to_le64(offset); -	low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); +	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));  	return ((u64)high_crc << 31) ^ (u64)low_crc;  } @@ -1270,7 +1274,7 @@ fail:  static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,  					   struct btrfs_root *root,  					   struct btrfs_path *path, -					   int refs_to_drop) +					   int refs_to_drop, int *last_ref)  {  	struct btrfs_key key;  	struct btrfs_extent_data_ref *ref1 = NULL; @@ -1306,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,  	if (num_refs == 0) {  		ret = btrfs_del_item(trans, root, path); +		*last_ref = 1;  	} else {  		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)  			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1541,6 +1546,7 @@ again:  				ret = 0;  		}  		if (ret) { +			key.objectid = bytenr;  			key.type = BTRFS_EXTENT_ITEM_KEY;  			key.offset = num_bytes;  			btrfs_release_path(path); @@ -1551,9 +1557,8 @@ again:  	if (ret && !insert) {  		err = -ENOENT;  		goto out; -	} else if (ret) { +	} else if (WARN_ON(ret)) {  		err = -EIO; -		WARN_ON(1);  		goto out;  	} @@ -1763,7 +1768,8 @@ void update_inline_extent_backref(struct btrfs_root *root,  				  struct btrfs_path *path,  				  struct btrfs_extent_inline_ref *iref,  				  int refs_to_mod, -				  struct btrfs_delayed_extent_op *extent_op) +				  struct btrfs_delayed_extent_op 
*extent_op, +				  int *last_ref)  {  	struct extent_buffer *leaf;  	struct btrfs_extent_item *ei; @@ -1807,6 +1813,7 @@ void update_inline_extent_backref(struct btrfs_root *root,  		else  			btrfs_set_shared_data_ref_count(leaf, sref, refs);  	} else { +		*last_ref = 1;  		size =  btrfs_extent_inline_ref_size(type);  		item_size = btrfs_item_size_nr(leaf, path->slots[0]);  		ptr = (unsigned long)iref; @@ -1838,7 +1845,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,  	if (ret == 0) {  		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);  		update_inline_extent_backref(root, path, iref, -					     refs_to_add, extent_op); +					     refs_to_add, extent_op, NULL);  	} else if (ret == -ENOENT) {  		setup_inline_extent_backref(root, path, iref, parent,  					    root_objectid, owner, offset, @@ -1871,17 +1878,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,  				 struct btrfs_root *root,  				 struct btrfs_path *path,  				 struct btrfs_extent_inline_ref *iref, -				 int refs_to_drop, int is_data) +				 int refs_to_drop, int is_data, int *last_ref)  {  	int ret = 0;  	BUG_ON(!is_data && refs_to_drop != 1);  	if (iref) {  		update_inline_extent_backref(root, path, iref, -					     -refs_to_drop, NULL); +					     -refs_to_drop, NULL, last_ref);  	} else if (is_data) { -		ret = remove_extent_data_ref(trans, root, path, refs_to_drop); +		ret = remove_extent_data_ref(trans, root, path, refs_to_drop, +					     last_ref);  	} else { +		*last_ref = 1;  		ret = btrfs_del_item(trans, root, path);  	}  	return ret; @@ -1945,7 +1954,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  			 struct btrfs_root *root,  			 u64 bytenr, u64 num_bytes, u64 parent, -			 u64 root_objectid, u64 owner, u64 offset, int for_cow) +			 u64 root_objectid, u64 owner, u64 offset, +			 int no_quota)  {  	int ret;  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -1957,12 +1967,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,  					num_bytes,  					parent, root_objectid, (int)owner, -					BTRFS_ADD_DELAYED_REF, NULL, for_cow); +					BTRFS_ADD_DELAYED_REF, NULL, no_quota);  	} else {  		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,  					num_bytes,  					parent, root_objectid, owner, offset, -					BTRFS_ADD_DELAYED_REF, NULL, for_cow); +					BTRFS_ADD_DELAYED_REF, NULL, no_quota);  	}  	return ret;  } @@ -1972,37 +1982,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  				  u64 bytenr, u64 num_bytes,  				  u64 parent, u64 root_objectid,  				  u64 owner, u64 offset, int refs_to_add, +				  int no_quota,  				  struct btrfs_delayed_extent_op *extent_op)  { +	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_path *path;  	struct extent_buffer *leaf;  	struct btrfs_extent_item *item; +	struct btrfs_key key;  	u64 refs;  	int ret; -	int err = 0; +	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; +	if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) +		no_quota = 1; +  	path->reada = 1;  	path->leave_spinning = 1;  	/* this will setup the path even if it fails to insert the back ref */ -	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, -					   path, bytenr, num_bytes, parent, +	ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, +					   bytenr, num_bytes, 
parent,  					   root_objectid, owner, offset,  					   refs_to_add, extent_op); -	if (ret == 0) +	if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))  		goto out; +	/* +	 * Ok we were able to insert an inline extent and it appears to be a new +	 * reference, deal with the qgroup accounting. +	 */ +	if (!ret && !no_quota) { +		ASSERT(root->fs_info->quota_enabled); +		leaf = path->nodes[0]; +		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +		item = btrfs_item_ptr(leaf, path->slots[0], +				      struct btrfs_extent_item); +		if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) +			type = BTRFS_QGROUP_OPER_ADD_SHARED; +		btrfs_release_path(path); -	if (ret != -EAGAIN) { -		err = ret; +		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +					      bytenr, num_bytes, type, 0);  		goto out;  	} +	/* +	 * Ok we had -EAGAIN which means we didn't have space to insert and +	 * inline extent ref, so just update the reference count and add a +	 * normal backref. +	 */  	leaf = path->nodes[0]; +	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);  	refs = btrfs_extent_refs(leaf, item); +	if (refs) +		type = BTRFS_QGROUP_OPER_ADD_SHARED;  	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);  	if (extent_op)  		__run_delayed_extent_op(extent_op, leaf, item); @@ -2010,9 +2047,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	btrfs_release_path(path); +	if (!no_quota) { +		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +					      bytenr, num_bytes, type, 0); +		if (ret) +			goto out; +	} +  	path->reada = 1;  	path->leave_spinning = 1; -  	/* now insert the actual backref */  	ret = insert_extent_backref(trans, root->fs_info->extent_root,  				    path, bytenr, parent, root_objectid, @@ -2021,7 +2064,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  		btrfs_abort_transaction(trans, root, ret);  out:  	btrfs_free_path(path); -	return err; +	return ret;  }  static int run_delayed_data_ref(struct btrfs_trans_handle *trans, @@ -2046,8 +2089,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,  	if (node->type == BTRFS_SHARED_DATA_REF_KEY)  		parent = ref->parent; -	else -		ref_root = ref->root; +	ref_root = ref->root;  	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {  		if (extent_op) @@ -2061,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,  					     node->num_bytes, parent,  					     ref_root, ref->objectid,  					     ref->offset, node->ref_mod, -					     extent_op); +					     node->no_quota, extent_op);  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {  		ret = __btrfs_free_extent(trans, root, node->bytenr,  					  node->num_bytes, parent,  					  ref_root, ref->objectid,  					  ref->offset, node->ref_mod, -					  extent_op); +					  extent_op, node->no_quota);  	} else {  		BUG();  	} @@ -2137,15 +2179,28 @@ again:  	}  	if (ret > 0) {  		if (metadata) { -			btrfs_release_path(path); -			metadata = 0; +			if (path->slots[0] > 0) { +				path->slots[0]--; +				btrfs_item_key_to_cpu(path->nodes[0], &key, +						      path->slots[0]); +				if (key.objectid == node->bytenr && +				    key.type == BTRFS_EXTENT_ITEM_KEY && +				    key.offset == node->num_bytes) +					ret = 0; +			} +			if (ret > 0) { +				btrfs_release_path(path); +				metadata = 0; -			key.offset = node->num_bytes; -			key.type = BTRFS_EXTENT_ITEM_KEY; -			goto again; +				
key.objectid = node->bytenr; +				key.offset = node->num_bytes; +				key.type = BTRFS_EXTENT_ITEM_KEY; +				goto again; +			} +		} else { +			err = -EIO; +			goto out;  		} -		err = -EIO; -		goto out;  	}  	leaf = path->nodes[0]; @@ -2191,8 +2246,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,  	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)  		parent = ref->parent; -	else -		ref_root = ref->root; +	ref_root = ref->root;  	ins.objectid = node->bytenr;  	if (skinny_metadata) { @@ -2210,15 +2264,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,  						parent, ref_root,  						extent_op->flags_to_set,  						&extent_op->key, -						ref->level, &ins); +						ref->level, &ins, +						node->no_quota);  	} else if (node->action == BTRFS_ADD_DELAYED_REF) {  		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,  					     node->num_bytes, parent, ref_root, -					     ref->level, 0, 1, extent_op); +					     ref->level, 0, 1, node->no_quota, +					     extent_op);  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {  		ret = __btrfs_free_extent(trans, root, node->bytenr,  					  node->num_bytes, parent, ref_root, -					  ref->level, 0, 1, extent_op); +					  ref->level, 0, 1, extent_op, +					  node->no_quota);  	} else {  		BUG();  	} @@ -2234,8 +2291,12 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,  {  	int ret = 0; -	if (trans->aborted) +	if (trans->aborted) { +		if (insert_reserved) +			btrfs_pin_extent(root, node->bytenr, +					 node->num_bytes, 1);  		return 0; +	}  	if (btrfs_delayed_ref_is_head(node)) {  		struct btrfs_delayed_ref_head *head; @@ -2278,64 +2339,62 @@ static noinline struct btrfs_delayed_ref_node *  select_delayed_ref(struct btrfs_delayed_ref_head *head)  {  	struct rb_node *node; -	struct btrfs_delayed_ref_node *ref; -	int action = BTRFS_ADD_DELAYED_REF; -again: +	struct btrfs_delayed_ref_node *ref, *last = NULL;; +  	/*  	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.  	 * this prevents ref count from going down to zero when  	 * there still are pending delayed ref.  	 */ -	node = rb_prev(&head->node.rb_node); -	while (1) { -		if (!node) -			break; +	node = rb_first(&head->ref_root); +	while (node) {  		ref = rb_entry(node, struct btrfs_delayed_ref_node,  				rb_node); -		if (ref->bytenr != head->node.bytenr) -			break; -		if (ref->action == action) +		if (ref->action == BTRFS_ADD_DELAYED_REF)  			return ref; -		node = rb_prev(node); +		else if (last == NULL) +			last = ref; +		node = rb_next(node);  	} -	if (action == BTRFS_ADD_DELAYED_REF) { -		action = BTRFS_DROP_DELAYED_REF; -		goto again; -	} -	return NULL; +	return last;  }  /*   * Returns 0 on success or if called with an already aborted transaction.   * Returns -ENOMEM or -EIO on failure and will abort the transaction.   
*/ -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, -				       struct btrfs_root *root, -				       struct list_head *cluster) +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, +					     struct btrfs_root *root, +					     unsigned long nr)  {  	struct btrfs_delayed_ref_root *delayed_refs;  	struct btrfs_delayed_ref_node *ref;  	struct btrfs_delayed_ref_head *locked_ref = NULL;  	struct btrfs_delayed_extent_op *extent_op;  	struct btrfs_fs_info *fs_info = root->fs_info; +	ktime_t start = ktime_get();  	int ret; -	int count = 0; +	unsigned long count = 0; +	unsigned long actual_count = 0;  	int must_insert_reserved = 0;  	delayed_refs = &trans->transaction->delayed_refs;  	while (1) {  		if (!locked_ref) { -			/* pick a new head ref from the cluster list */ -			if (list_empty(cluster)) +			if (count >= nr)  				break; -			locked_ref = list_entry(cluster->next, -				     struct btrfs_delayed_ref_head, cluster); +			spin_lock(&delayed_refs->lock); +			locked_ref = btrfs_select_ref_head(trans); +			if (!locked_ref) { +				spin_unlock(&delayed_refs->lock); +				break; +			}  			/* grab the lock that says we are going to process  			 * all the refs for this head */  			ret = btrfs_delayed_ref_lock(trans, locked_ref); - +			spin_unlock(&delayed_refs->lock);  			/*  			 * we may have dropped the spin lock to get the head  			 * mutex lock, and that might have given someone else @@ -2356,6 +2415,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  		 * finish.  If we merged anything we need to re-loop so we can  		 * get a good ref.  		 */ +		spin_lock(&locked_ref->lock);  		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,  					 locked_ref); @@ -2367,17 +2427,15 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  		if (ref && ref->seq &&  		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { -			/* -			 * there are still refs with lower seq numbers in the -			 * process of being added. Don't run this ref yet. -			 */ -			list_del_init(&locked_ref->cluster); +			spin_unlock(&locked_ref->lock);  			btrfs_delayed_ref_unlock(locked_ref); -			locked_ref = NULL; +			spin_lock(&delayed_refs->lock); +			locked_ref->processing = 0;  			delayed_refs->num_heads_ready++;  			spin_unlock(&delayed_refs->lock); +			locked_ref = NULL;  			cond_resched(); -			spin_lock(&delayed_refs->lock); +			count++;  			continue;  		} @@ -2392,6 +2450,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  		locked_ref->extent_op = NULL;  		if (!ref) { + +  			/* All delayed refs have been processed, Go ahead  			 * and send the head node to run_one_delayed_ref,  			 * so that any accounting fixes can happen @@ -2404,26 +2464,54 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  			}  			if (extent_op) { -				spin_unlock(&delayed_refs->lock); - +				spin_unlock(&locked_ref->lock);  				ret = run_delayed_extent_op(trans, root,  							    ref, extent_op);  				btrfs_free_delayed_extent_op(extent_op);  				if (ret) { +					/* +					 * Need to reset must_insert_reserved if +					 * there was an error so the abort stuff +					 * can cleanup the reserved space +					 * properly. 
+					 */ +					if (must_insert_reserved) +						locked_ref->must_insert_reserved = 1; +					locked_ref->processing = 0;  					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); -					spin_lock(&delayed_refs->lock);  					btrfs_delayed_ref_unlock(locked_ref);  					return ret;  				} +				continue; +			} -				goto next; +			/* +			 * Need to drop our head ref lock and re-aqcuire the +			 * delayed ref lock and then re-check to make sure +			 * nobody got added. +			 */ +			spin_unlock(&locked_ref->lock); +			spin_lock(&delayed_refs->lock); +			spin_lock(&locked_ref->lock); +			if (rb_first(&locked_ref->ref_root) || +			    locked_ref->extent_op) { +				spin_unlock(&locked_ref->lock); +				spin_unlock(&delayed_refs->lock); +				continue;  			} +			ref->in_tree = 0; +			delayed_refs->num_heads--; +			rb_erase(&locked_ref->href_node, +				 &delayed_refs->href_root); +			spin_unlock(&delayed_refs->lock); +		} else { +			actual_count++; +			ref->in_tree = 0; +			rb_erase(&ref->rb_node, &locked_ref->ref_root);  		} +		atomic_dec(&delayed_refs->num_entries); -		ref->in_tree = 0; -		rb_erase(&ref->rb_node, &delayed_refs->root); -		delayed_refs->num_entries--;  		if (!btrfs_delayed_ref_is_head(ref)) {  			/*  			 * when we play the delayed ref, also correct the @@ -2440,20 +2528,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  			default:  				WARN_ON(1);  			} -		} else { -			list_del_init(&locked_ref->cluster);  		} -		spin_unlock(&delayed_refs->lock); +		spin_unlock(&locked_ref->lock);  		ret = run_one_delayed_ref(trans, root, ref, extent_op,  					  must_insert_reserved);  		btrfs_free_delayed_extent_op(extent_op);  		if (ret) { +			locked_ref->processing = 0;  			btrfs_delayed_ref_unlock(locked_ref);  			btrfs_put_delayed_ref(ref);  			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); -			spin_lock(&delayed_refs->lock);  			return ret;  		} @@ -2469,11 +2555,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  		}  		btrfs_put_delayed_ref(ref);  		count++; -next:  		cond_resched(); +	} + +	/* +	 * We don't want to include ref heads since we can have empty ref heads +	 * and those will drastically skew our runtime down since we just do +	 * accounting, no actual extent tree updates. +	 */ +	if (actual_count > 0) { +		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); +		u64 avg; + +		/* +		 * We weigh the current average higher than our current runtime +		 * to avoid large swings in the average. +		 */  		spin_lock(&delayed_refs->lock); +		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; +		avg = div64_u64(avg, 4); +		fs_info->avg_delayed_ref_runtime = avg; +		spin_unlock(&delayed_refs->lock);  	} -	return count; +	return 0;  }  #ifdef SCRAMBLE_DELAYED_REFS @@ -2519,52 +2623,6 @@ static u64 find_middle(struct rb_root *root)  }  #endif -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, -					 struct btrfs_fs_info *fs_info) -{ -	struct qgroup_update *qgroup_update; -	int ret = 0; - -	if (list_empty(&trans->qgroup_ref_list) != -	    !trans->delayed_ref_elem.seq) { -		/* list without seq or seq without list */ -		btrfs_err(fs_info, -			"qgroup accounting update error, list is%s empty, seq is %#x.%x", -			list_empty(&trans->qgroup_ref_list) ? 
"" : " not", -			(u32)(trans->delayed_ref_elem.seq >> 32), -			(u32)trans->delayed_ref_elem.seq); -		BUG(); -	} - -	if (!trans->delayed_ref_elem.seq) -		return 0; - -	while (!list_empty(&trans->qgroup_ref_list)) { -		qgroup_update = list_first_entry(&trans->qgroup_ref_list, -						 struct qgroup_update, list); -		list_del(&qgroup_update->list); -		if (!ret) -			ret = btrfs_qgroup_account_ref( -					trans, fs_info, qgroup_update->node, -					qgroup_update->extent_op); -		kfree(qgroup_update); -	} - -	btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); - -	return ret; -} - -static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, -		      int count) -{ -	int val = atomic_read(&delayed_refs->ref_seq); - -	if (val < seq || val >= seq + count) -		return 1; -	return 0; -} -  static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)  {  	u64 num_bytes; @@ -2581,7 +2639,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)  	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));  } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,  				       struct btrfs_root *root)  {  	struct btrfs_block_rsv *global_rsv; @@ -2610,6 +2668,101 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,  	return ret;  } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, +				       struct btrfs_root *root) +{ +	struct btrfs_fs_info *fs_info = root->fs_info; +	u64 num_entries = +		atomic_read(&trans->transaction->delayed_refs.num_entries); +	u64 avg_runtime; +	u64 val; + +	smp_mb(); +	avg_runtime = fs_info->avg_delayed_ref_runtime; +	val = num_entries * avg_runtime; +	if (num_entries * avg_runtime >= NSEC_PER_SEC) +		return 1; +	if (val >= NSEC_PER_SEC / 2) +		return 2; + +	return btrfs_check_space_for_delayed_refs(trans, root); +} + +struct async_delayed_refs { +	struct btrfs_root *root; +	int count; +	int error; +	int sync; +	struct completion wait; +	struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ +	struct async_delayed_refs *async; +	struct btrfs_trans_handle *trans; +	int ret; + +	async = container_of(work, struct async_delayed_refs, work); + +	trans = btrfs_join_transaction(async->root); +	if (IS_ERR(trans)) { +		async->error = PTR_ERR(trans); +		goto done; +	} + +	/* +	 * trans->sync means that when we call end_transaciton, we won't +	 * wait on delayed refs +	 */ +	trans->sync = true; +	ret = btrfs_run_delayed_refs(trans, async->root, async->count); +	if (ret) +		async->error = ret; + +	ret = btrfs_end_transaction(trans, async->root); +	if (ret && !async->error) +		async->error = ret; +done: +	if (async->sync) +		complete(&async->wait); +	else +		kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, +				 unsigned long count, int wait) +{ +	struct async_delayed_refs *async; +	int ret; + +	async = kmalloc(sizeof(*async), GFP_NOFS); +	if (!async) +		return -ENOMEM; + +	async->root = root->fs_info->tree_root; +	async->count = count; +	async->error = 0; +	if (wait) +		async->sync = 1; +	else +		async->sync = 0; +	init_completion(&async->wait); + +	btrfs_init_work(&async->work, delayed_ref_async_start, +			NULL, NULL); + +	btrfs_queue_work(root->fs_info->extent_workers, &async->work); + +	if (wait) { +		wait_for_completion(&async->wait); +		ret = async->error; +		kfree(async); +		return ret; +	} +	return 0; +} +  /*   * this starts processing the 
delayed reference count updates and   * extent insertions we have queued up so far.  count can be @@ -2625,13 +2778,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  {  	struct rb_node *node;  	struct btrfs_delayed_ref_root *delayed_refs; -	struct btrfs_delayed_ref_node *ref; -	struct list_head cluster; +	struct btrfs_delayed_ref_head *head;  	int ret; -	u64 delayed_start;  	int run_all = count == (unsigned long)-1;  	int run_most = 0; -	int loops;  	/* We'll clean this up in btrfs_cleanup_transaction */  	if (trans->aborted) @@ -2640,133 +2790,41 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  	if (root == root->fs_info->extent_root)  		root = root->fs_info->tree_root; -	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); -  	delayed_refs = &trans->transaction->delayed_refs; -	INIT_LIST_HEAD(&cluster);  	if (count == 0) { -		count = delayed_refs->num_entries * 2; +		count = atomic_read(&delayed_refs->num_entries) * 2;  		run_most = 1;  	} -	if (!run_all && !run_most) { -		int old; -		int seq = atomic_read(&delayed_refs->ref_seq); - -progress: -		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); -		if (old) { -			DEFINE_WAIT(__wait); -			if (delayed_refs->flushing || -			    !btrfs_should_throttle_delayed_refs(trans, root)) -				return 0; - -			prepare_to_wait(&delayed_refs->wait, &__wait, -					TASK_UNINTERRUPTIBLE); - -			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); -			if (old) { -				schedule(); -				finish_wait(&delayed_refs->wait, &__wait); - -				if (!refs_newer(delayed_refs, seq, 256)) -					goto progress; -				else -					return 0; -			} else { -				finish_wait(&delayed_refs->wait, &__wait); -				goto again; -			} -		} - -	} else { -		atomic_inc(&delayed_refs->procs_running_refs); -	} -  again: -	loops = 0; -	spin_lock(&delayed_refs->lock); -  #ifdef SCRAMBLE_DELAYED_REFS  	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);  #endif - -	while (1) { -		if (!(run_all || run_most) && -		    !btrfs_should_throttle_delayed_refs(trans, root)) -			break; - -		/* -		 * go find something we can process in the rbtree.  We start at -		 * the beginning of the tree, and then build a cluster -		 * of refs to process starting at the first one we are able to -		 * lock -		 */ -		delayed_start = delayed_refs->run_delayed_start; -		ret = btrfs_find_ref_cluster(trans, &cluster, -					     delayed_refs->run_delayed_start); -		if (ret) -			break; - -		ret = run_clustered_refs(trans, root, &cluster); -		if (ret < 0) { -			btrfs_release_ref_cluster(&cluster); -			spin_unlock(&delayed_refs->lock); -			btrfs_abort_transaction(trans, root, ret); -			atomic_dec(&delayed_refs->procs_running_refs); -			wake_up(&delayed_refs->wait); -			return ret; -		} - -		atomic_add(ret, &delayed_refs->ref_seq); - -		count -= min_t(unsigned long, ret, count); - -		if (count == 0) -			break; - -		if (delayed_start >= delayed_refs->run_delayed_start) { -			if (loops == 0) { -				/* -				 * btrfs_find_ref_cluster looped. let's do one -				 * more cycle. if we don't run any delayed ref -				 * during that cycle (because we can't because -				 * all of them are blocked), bail out. 
-				 */ -				loops = 1; -			} else { -				/* -				 * no runnable refs left, stop trying -				 */ -				BUG_ON(run_all); -				break; -			} -		} -		if (ret) { -			/* refs were run, let's reset staleness detection */ -			loops = 0; -		} +	ret = __btrfs_run_delayed_refs(trans, root, count); +	if (ret < 0) { +		btrfs_abort_transaction(trans, root, ret); +		return ret;  	}  	if (run_all) { -		if (!list_empty(&trans->new_bgs)) { -			spin_unlock(&delayed_refs->lock); +		if (!list_empty(&trans->new_bgs))  			btrfs_create_pending_block_groups(trans, root); -			spin_lock(&delayed_refs->lock); -		} -		node = rb_first(&delayed_refs->root); -		if (!node) +		spin_lock(&delayed_refs->lock); +		node = rb_first(&delayed_refs->href_root); +		if (!node) { +			spin_unlock(&delayed_refs->lock);  			goto out; +		}  		count = (unsigned long)-1;  		while (node) { -			ref = rb_entry(node, struct btrfs_delayed_ref_node, -				       rb_node); -			if (btrfs_delayed_ref_is_head(ref)) { -				struct btrfs_delayed_ref_head *head; +			head = rb_entry(node, struct btrfs_delayed_ref_head, +					href_node); +			if (btrfs_delayed_ref_is_head(&head->node)) { +				struct btrfs_delayed_ref_node *ref; -				head = btrfs_delayed_node_to_head(ref); +				ref = &head->node;  				atomic_inc(&ref->refs);  				spin_unlock(&delayed_refs->lock); @@ -2780,20 +2838,19 @@ again:  				btrfs_put_delayed_ref(ref);  				cond_resched();  				goto again; +			} else { +				WARN_ON(1);  			}  			node = rb_next(node);  		}  		spin_unlock(&delayed_refs->lock); -		schedule_timeout(1); +		cond_resched();  		goto again;  	}  out: -	atomic_dec(&delayed_refs->procs_running_refs); -	smp_mb(); -	if (waitqueue_active(&delayed_refs->wait)) -		wake_up(&delayed_refs->wait); - -	spin_unlock(&delayed_refs->lock); +	ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); +	if (ret) +		return ret;  	assert_qgroups_uptodate(trans);  	return 0;  } @@ -2835,12 +2892,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,  	struct rb_node *node;  	int ret = 0; -	ret = -ENOENT;  	delayed_refs = &trans->transaction->delayed_refs;  	spin_lock(&delayed_refs->lock);  	head = btrfs_find_delayed_ref_head(trans, bytenr); -	if (!head) -		goto out; +	if (!head) { +		spin_unlock(&delayed_refs->lock); +		return 0; +	}  	if (!mutex_trylock(&head->mutex)) {  		atomic_inc(&head->node.refs); @@ -2857,40 +2915,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,  		btrfs_put_delayed_ref(&head->node);  		return -EAGAIN;  	} +	spin_unlock(&delayed_refs->lock); -	node = rb_prev(&head->node.rb_node); -	if (!node) -		goto out_unlock; - -	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - -	if (ref->bytenr != bytenr) -		goto out_unlock; - -	ret = 1; -	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) -		goto out_unlock; +	spin_lock(&head->lock); +	node = rb_first(&head->ref_root); +	while (node) { +		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); +		node = rb_next(node); -	data_ref = btrfs_delayed_node_to_data_ref(ref); +		/* If it's a shared ref we know a cross reference exists */ +		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { +			ret = 1; +			break; +		} -	node = rb_prev(node); -	if (node) { -		int seq = ref->seq; +		data_ref = btrfs_delayed_node_to_data_ref(ref); -		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); -		if (ref->bytenr == bytenr && ref->seq == seq) -			goto out_unlock; +		/* +		 * If our ref doesn't match the one we're currently looking at +		 * then we have a cross reference. 
+		 */ +		if (data_ref->root != root->root_key.objectid || +		    data_ref->objectid != objectid || +		    data_ref->offset != offset) { +			ret = 1; +			break; +		}  	} - -	if (data_ref->root != root->root_key.objectid || -	    data_ref->objectid != objectid || data_ref->offset != offset) -		goto out_unlock; - -	ret = 0; -out_unlock: +	spin_unlock(&head->lock);  	mutex_unlock(&head->mutex); -out: -	spin_unlock(&delayed_refs->lock);  	return ret;  } @@ -3004,7 +3057,7 @@ out:  static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct extent_buffer *buf, -			   int full_backref, int inc, int for_cow) +			   int full_backref, int inc, int no_quota)  {  	u64 bytenr;  	u64 num_bytes; @@ -3019,11 +3072,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,  			    u64, u64, u64, u64, u64, u64, int); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return 0; +#endif  	ref_root = btrfs_header_owner(buf);  	nritems = btrfs_header_nritems(buf);  	level = btrfs_header_level(buf); -	if (!root->ref_cows && level == 0) +	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)  		return 0;  	if (inc) @@ -3054,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  			key.offset -= btrfs_file_extent_offset(buf, fi);  			ret = process_func(trans, root, bytenr, num_bytes,  					   parent, ref_root, key.objectid, -					   key.offset, for_cow); +					   key.offset, no_quota);  			if (ret)  				goto fail;  		} else { @@ -3062,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  			num_bytes = btrfs_level_size(root, level - 1);  			ret = process_func(trans, root, bytenr, num_bytes,  					   parent, ref_root, level - 1, 0, -					   for_cow); +					   no_quota);  			if (ret)  				goto fail;  		} @@ -3073,15 +3130,15 @@ fail:  }  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		  struct extent_buffer *buf, int full_backref, int for_cow) +		  struct extent_buffer *buf, int full_backref, int no_quota)  { -	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); +	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);  }  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		  struct extent_buffer *buf, int full_backref, int for_cow) +		  struct extent_buffer *buf, int full_backref, int no_quota)  { -	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); +	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);  }  static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3197,15 +3254,15 @@ again:  		if (ret)  			goto out_put; -		ret = btrfs_truncate_free_space_cache(root, trans, path, -						      inode); +		ret = btrfs_truncate_free_space_cache(root, trans, inode);  		if (ret)  			goto out_put;  	}  	spin_lock(&block_group->lock);  	if (block_group->cached != BTRFS_CACHE_FINISHED || -	    !btrfs_test_opt(root, SPACE_CACHE)) { +	    !btrfs_test_opt(root, SPACE_CACHE) || +	    block_group->delalloc_bytes) {  		/*  		 * don't bother trying to write stuff out _if_  		 * a) we're not cached, @@ -3318,10 +3375,9 @@ again:  		last = cache->key.objectid + cache->key.offset;  		err = write_one_cache_group(trans, root, path, cache); +		btrfs_put_block_group(cache);  		if (err) /* File system offline */  			goto out; - -		btrfs_put_block_group(cache);  	}  	while (1) 
{ @@ -3389,6 +3445,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)  	return readonly;  } +static const char *alloc_name(u64 flags) +{ +	switch (flags) { +	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: +		return "mixed"; +	case BTRFS_BLOCK_GROUP_METADATA: +		return "metadata"; +	case BTRFS_BLOCK_GROUP_DATA: +		return "data"; +	case BTRFS_BLOCK_GROUP_SYSTEM: +		return "system"; +	default: +		WARN_ON(1); +		return "invalid-combination"; +	}; +} +  static int update_space_info(struct btrfs_fs_info *info, u64 flags,  			     u64 total_bytes, u64 bytes_used,  			     struct btrfs_space_info **space_info) @@ -3444,11 +3517,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,  	found->chunk_alloc = 0;  	found->flush = 0;  	init_waitqueue_head(&found->wait); + +	ret = kobject_init_and_add(&found->kobj, &space_info_ktype, +				    info->space_info_kobj, "%s", +				    alloc_name(found->flags)); +	if (ret) { +		kfree(found); +		return ret; +	} +  	*space_info = found;  	list_add_rcu(&found->list, &info->space_info);  	if (flags & BTRFS_BLOCK_GROUP_DATA)  		info->data_sinfo = found; -	return 0; + +	return ret;  }  static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) @@ -3556,11 +3639,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)  	return extended_to_chunk(flags | tmp);  } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)  {  	unsigned seq; +	u64 flags;  	do { +		flags = orig_flags;  		seq = read_seqbegin(&root->fs_info->profiles_lock);  		if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -3605,10 +3690,9 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)  	/* make sure bytes are sectorsize aligned */  	bytes = ALIGN(bytes, root->sectorsize); -	if (root == root->fs_info->tree_root || -	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { -		alloc_chunk = 0; +	if (btrfs_is_free_space_inode(inode)) {  		committed = 1; +		ASSERT(current->journal_info);  	}  	data_sinfo = fs_info->data_sinfo; @@ -3636,6 +3720,16 @@ again:  			spin_unlock(&data_sinfo->lock);  alloc:  			alloc_target = btrfs_get_alloc_profile(root, 1); +			/* +			 * It is ugly that we don't call nolock join +			 * transaction for the free space inode case here. +			 * But it is safe because we only do the data space +			 * reservation for the free space cache in the +			 * transaction context, the common join transaction +			 * just increase the counter of the current transaction +			 * handler, doesn't try to acquire the trans_lock of +			 * the fs. +			 */  			trans = btrfs_join_transaction(root);  			if (IS_ERR(trans))  				return PTR_ERR(trans); @@ -3681,6 +3775,9 @@ commit_trans:  			goto again;  		} +		trace_btrfs_space_reservation(root->fs_info, +					      "space_info:enospc", +					      data_sinfo->flags, bytes, 1);  		return -ENOSPC;  	}  	data_sinfo->bytes_may_use += bytes; @@ -3974,7 +4071,7 @@ static int can_overcommit(struct btrfs_root *root,  }  static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, -					 unsigned long nr_pages) +					 unsigned long nr_pages, int nr_items)  {  	struct super_block *sb = root->fs_info->sb; @@ -3989,12 +4086,26 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,  		 * the filesystem is readonly(all dirty pages are written to  		 * the disk).  		 
*/ -		btrfs_start_all_delalloc_inodes(root->fs_info, 0); +		btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);  		if (!current->journal_info) -			btrfs_wait_all_ordered_extents(root->fs_info); +			btrfs_wait_ordered_roots(root->fs_info, nr_items);  	}  } +static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) +{ +	u64 bytes; +	int nr; + +	bytes = btrfs_calc_trans_metadata_size(root, 1); +	nr = (int)div64_u64(to_reclaim, bytes); +	if (!nr) +		nr = 1; +	return nr; +} + +#define EXTENT_SIZE_PER_ITEM	(256 * 1024) +  /*   * shrink metadata reservation for delalloc   */ @@ -4007,35 +4118,51 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  	u64 delalloc_bytes;  	u64 max_reclaim;  	long time_left; -	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; -	int loops = 0; +	unsigned long nr_pages; +	int loops; +	int items;  	enum btrfs_reserve_flush_enum flush; +	/* Calc the number of the pages we need flush for space reservation */ +	items = calc_reclaim_items_nr(root, to_reclaim); +	to_reclaim = items * EXTENT_SIZE_PER_ITEM; +  	trans = (struct btrfs_trans_handle *)current->journal_info;  	block_rsv = &root->fs_info->delalloc_block_rsv;  	space_info = block_rsv->space_info; -	smp_mb();  	delalloc_bytes = percpu_counter_sum_positive(  						&root->fs_info->delalloc_bytes);  	if (delalloc_bytes == 0) {  		if (trans)  			return; -		btrfs_wait_all_ordered_extents(root->fs_info); +		if (wait_ordered) +			btrfs_wait_ordered_roots(root->fs_info, items);  		return;  	} +	loops = 0;  	while (delalloc_bytes && loops < 3) {  		max_reclaim = min(delalloc_bytes, to_reclaim);  		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; -		btrfs_writeback_inodes_sb_nr(root, nr_pages); +		btrfs_writeback_inodes_sb_nr(root, nr_pages, items);  		/*  		 * We need to wait for the async pages to actually start before  		 * we do anything.  		 
*/ -		wait_event(root->fs_info->async_submit_wait, -			   !atomic_read(&root->fs_info->async_delalloc_pages)); +		max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); +		if (!max_reclaim) +			goto skip_async; + +		if (max_reclaim <= nr_pages) +			max_reclaim = 0; +		else +			max_reclaim -= nr_pages; +		wait_event(root->fs_info->async_submit_wait, +			   atomic_read(&root->fs_info->async_delalloc_pages) <= +			   (int)max_reclaim); +skip_async:  		if (!trans)  			flush = BTRFS_RESERVE_FLUSH_ALL;  		else @@ -4049,13 +4176,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  		loops++;  		if (wait_ordered && !trans) { -			btrfs_wait_all_ordered_extents(root->fs_info); +			btrfs_wait_ordered_roots(root->fs_info, items);  		} else {  			time_left = schedule_timeout_killable(1);  			if (time_left)  				break;  		} -		smp_mb();  		delalloc_bytes = percpu_counter_sum_positive(  						&root->fs_info->delalloc_bytes);  	} @@ -4086,13 +4212,9 @@ static int may_commit_transaction(struct btrfs_root *root,  		goto commit;  	/* See if there is enough pinned space to make this reservation */ -	spin_lock(&space_info->lock);  	if (percpu_counter_compare(&space_info->total_bytes_pinned, -				   bytes) >= 0) { -		spin_unlock(&space_info->lock); +				   bytes) >= 0)  		goto commit; -	} -	spin_unlock(&space_info->lock);  	/*  	 * See if there is some space in the delayed insertion reservation for @@ -4101,16 +4223,13 @@ static int may_commit_transaction(struct btrfs_root *root,  	if (space_info != delayed_rsv->space_info)  		return -ENOSPC; -	spin_lock(&space_info->lock);  	spin_lock(&delayed_rsv->lock);  	if (percpu_counter_compare(&space_info->total_bytes_pinned,  				   bytes - delayed_rsv->size) >= 0) {  		spin_unlock(&delayed_rsv->lock); -		spin_unlock(&space_info->lock);  		return -ENOSPC;  	}  	spin_unlock(&delayed_rsv->lock); -	spin_unlock(&space_info->lock);  commit:  	trans = btrfs_join_transaction(root); @@ -4140,16 +4259,11 @@ static int flush_space(struct btrfs_root *root,  	switch (state) {  	case FLUSH_DELAYED_ITEMS_NR:  	case FLUSH_DELAYED_ITEMS: -		if (state == FLUSH_DELAYED_ITEMS_NR) { -			u64 bytes = btrfs_calc_trans_metadata_size(root, 1); - -			nr = (int)div64_u64(num_bytes, bytes); -			if (!nr) -				nr = 1; -			nr *= 2; -		} else { +		if (state == FLUSH_DELAYED_ITEMS_NR) +			nr = calc_reclaim_items_nr(root, num_bytes) * 2; +		else  			nr = -1; -		} +  		trans = btrfs_join_transaction(root);  		if (IS_ERR(trans)) {  			ret = PTR_ERR(trans); @@ -4160,7 +4274,7 @@ static int flush_space(struct btrfs_root *root,  		break;  	case FLUSH_DELALLOC:  	case FLUSH_DELALLOC_WAIT: -		shrink_delalloc(root, num_bytes, orig_bytes, +		shrink_delalloc(root, num_bytes * 2, orig_bytes,  				state == FLUSH_DELALLOC_WAIT);  		break;  	case ALLOC_CHUNK: @@ -4186,6 +4300,104 @@ static int flush_space(struct btrfs_root *root,  	return ret;  } + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, +				 struct btrfs_space_info *space_info) +{ +	u64 used; +	u64 expected; +	u64 to_reclaim; + +	to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, +				16 * 1024 * 1024); +	spin_lock(&space_info->lock); +	if (can_overcommit(root, space_info, to_reclaim, +			   BTRFS_RESERVE_FLUSH_ALL)) { +		to_reclaim = 0; +		goto out; +	} + +	used = space_info->bytes_used + space_info->bytes_reserved + +	       space_info->bytes_pinned + space_info->bytes_readonly + +	       space_info->bytes_may_use; +	if (can_overcommit(root, space_info, 1024 * 1024, +			   
BTRFS_RESERVE_FLUSH_ALL)) +		expected = div_factor_fine(space_info->total_bytes, 95); +	else +		expected = div_factor_fine(space_info->total_bytes, 90); + +	if (used > expected) +		to_reclaim = used - expected; +	else +		to_reclaim = 0; +	to_reclaim = min(to_reclaim, space_info->bytes_may_use + +				     space_info->bytes_reserved); +out: +	spin_unlock(&space_info->lock); + +	return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, +					struct btrfs_fs_info *fs_info, u64 used) +{ +	return (used >= div_factor_fine(space_info->total_bytes, 98) && +		!btrfs_fs_closing(fs_info) && +		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, +				       struct btrfs_fs_info *fs_info) +{ +	u64 used; + +	spin_lock(&space_info->lock); +	used = space_info->bytes_used + space_info->bytes_reserved + +	       space_info->bytes_pinned + space_info->bytes_readonly + +	       space_info->bytes_may_use; +	if (need_do_async_reclaim(space_info, fs_info, used)) { +		spin_unlock(&space_info->lock); +		return 1; +	} +	spin_unlock(&space_info->lock); + +	return 0; +} + +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ +	struct btrfs_fs_info *fs_info; +	struct btrfs_space_info *space_info; +	u64 to_reclaim; +	int flush_state; + +	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); +	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + +	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, +						      space_info); +	if (!to_reclaim) +		return; + +	flush_state = FLUSH_DELAYED_ITEMS_NR; +	do { +		flush_space(fs_info->fs_root, space_info, to_reclaim, +			    to_reclaim, flush_state); +		flush_state++; +		if (!btrfs_need_do_async_reclaim(space_info, fs_info)) +			return; +	} while (flush_state <= COMMIT_TRANS); + +	if (btrfs_need_do_async_reclaim(space_info, fs_info)) +		queue_work(system_unbound_wq, work); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ +	INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} +  /**   * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space   * @root - the root we're allocating for @@ -4293,8 +4505,13 @@ again:  	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {  		flushing = true;  		space_info->flush = 1; +	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { +		used += orig_bytes; +		if (need_do_async_reclaim(space_info, root->fs_info, used) && +		    !work_busy(&root->fs_info->async_reclaim_work)) +			queue_work(system_unbound_wq, +				   &root->fs_info->async_reclaim_work);  	} -  	spin_unlock(&space_info->lock);  	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) @@ -4332,6 +4549,10 @@ out:  		    !block_rsv_use_bytes(global_rsv, orig_bytes))  			ret = 0;  	} +	if (ret == -ENOSPC) +		trace_btrfs_space_reservation(root->fs_info, +					      "space_info:enospc", +					      space_info->flags, orig_bytes, 1);  	if (flushing) {  		spin_lock(&space_info->lock);  		space_info->flush = 0; @@ -4347,7 +4568,7 @@ static struct btrfs_block_rsv *get_block_rsv(  {  	struct btrfs_block_rsv *block_rsv = NULL; -	if (root->ref_cows) +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))  		block_rsv = trans->block_rsv;  	if (root == root->fs_info->csum_root && trans->adding_csums) @@ -4584,7 +4805,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root,  			     u64 num_bytes)  {  	struct btrfs_block_rsv *global_rsv = 
&root->fs_info->global_block_rsv; -	if (global_rsv->full || global_rsv == block_rsv || +	if (global_rsv == block_rsv ||  	    block_rsv->space_info != global_rsv->space_info)  		global_rsv = NULL;  	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, @@ -4986,7 +5207,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);  	if (to_reserve) -		trace_btrfs_space_reservation(root->fs_info,"delalloc", +		trace_btrfs_space_reservation(root->fs_info, "delalloc",  					      btrfs_ino(inode), to_reserve, 1);  	block_rsv_add_bytes(block_rsv, to_reserve, 1); @@ -5264,6 +5485,8 @@ static int pin_down_extent(struct btrfs_root *root,  	set_extent_dirty(root->fs_info->pinned_extents, bytenr,  			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); +	if (reserved) +		trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);  	return 0;  } @@ -5392,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,   * @cache:	The cache we are manipulating   * @num_bytes:	The number of bytes in question   * @reserve:	One of the reservation enums + * @delalloc:   The blocks are allocated for the delalloc write   *   * This is called by the allocator when it reserves space, or by somebody who is   * freeing space that was never actually used on disk.  For example if you @@ -5410,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,   * succeeds.   */  static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, -				       u64 num_bytes, int reserve) +				       u64 num_bytes, int reserve, int delalloc)  {  	struct btrfs_space_info *space_info = cache->space_info;  	int ret = 0; @@ -5429,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,  						num_bytes, 0);  				space_info->bytes_may_use -= num_bytes;  			} + +			if (delalloc) +				cache->delalloc_bytes += num_bytes;  		}  	} else {  		if (cache->ro)  			space_info->bytes_readonly += num_bytes;  		cache->reserved -= num_bytes;  		space_info->bytes_reserved -= num_bytes; + +		if (delalloc) +			cache->delalloc_bytes -= num_bytes;  	}  	spin_unlock(&cache->lock);  	spin_unlock(&space_info->lock); @@ -5448,9 +5678,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,  	struct btrfs_caching_control *next;  	struct btrfs_caching_control *caching_ctl;  	struct btrfs_block_group_cache *cache; -	struct btrfs_space_info *space_info; -	down_write(&fs_info->extent_commit_sem); +	down_write(&fs_info->commit_root_sem);  	list_for_each_entry_safe(caching_ctl, next,  				 &fs_info->caching_block_groups, list) { @@ -5469,10 +5698,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,  	else  		fs_info->pinned_extents = &fs_info->freed_extents[0]; -	up_write(&fs_info->extent_commit_sem); - -	list_for_each_entry_rcu(space_info, &fs_info->space_info, list) -		percpu_counter_set(&space_info->total_bytes_pinned, 0); +	up_write(&fs_info->commit_root_sem);  	update_global_block_rsv(fs_info);  } @@ -5511,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)  		spin_lock(&cache->lock);  		cache->pinned -= len;  		space_info->bytes_pinned -= len; +		percpu_counter_add(&space_info->total_bytes_pinned, -len);  		if (cache->ro) {  			space_info->bytes_readonly += len;  			readonly = true; @@ -5597,7 +5824,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  				u64 bytenr, u64 num_bytes, u64 parent,  				u64 root_objectid, u64 owner_objectid,  	
			u64 owner_offset, int refs_to_drop, -				struct btrfs_delayed_extent_op *extent_op) +				struct btrfs_delayed_extent_op *extent_op, +				int no_quota)  {  	struct btrfs_key key;  	struct btrfs_path *path; @@ -5613,9 +5841,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	int num_to_del = 1;  	u32 item_size;  	u64 refs; +	int last_ref = 0; +	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;  	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,  						 SKINNY_METADATA); +	if (!info->quota_enabled || !is_fstree(root_objectid)) +		no_quota = 1; +  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; @@ -5663,7 +5896,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			BUG_ON(iref);  			ret = remove_extent_backref(trans, extent_root, path,  						    NULL, refs_to_drop, -						    is_data); +						    is_data, &last_ref);  			if (ret) {  				btrfs_abort_transaction(trans, extent_root, ret);  				goto out; @@ -5698,6 +5931,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			if (ret > 0 && skinny_metadata) {  				skinny_metadata = false; +				key.objectid = bytenr;  				key.type = BTRFS_EXTENT_ITEM_KEY;  				key.offset = num_bytes;  				btrfs_release_path(path); @@ -5718,13 +5952,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			}  			extent_slot = path->slots[0];  		} -	} else if (ret == -ENOENT) { +	} else if (WARN_ON(ret == -ENOENT)) {  		btrfs_print_leaf(extent_root, path->nodes[0]); -		WARN_ON(1);  		btrfs_err(info,  			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",  			bytenr, parent, root_objectid, owner_objectid,  			owner_offset); +		btrfs_abort_transaction(trans, extent_root, ret); +		goto out;  	} else {  		btrfs_abort_transaction(trans, extent_root, ret);  		goto out; @@ -5780,7 +6015,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	refs = btrfs_extent_refs(leaf, ei);  	if (refs < refs_to_drop) {  		btrfs_err(info, "trying to drop %d refs but we only have %Lu " -			  "for bytenr %Lu\n", refs_to_drop, refs, bytenr); +			  "for bytenr %Lu", refs_to_drop, refs, bytenr);  		ret = -EINVAL;  		btrfs_abort_transaction(trans, extent_root, ret);  		goto out; @@ -5788,6 +6023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	refs -= refs_to_drop;  	if (refs > 0) { +		type = BTRFS_QGROUP_OPER_SUB_SHARED;  		if (extent_op)  			__run_delayed_extent_op(extent_op, leaf, ei);  		/* @@ -5803,7 +6039,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  		if (found_extent) {  			ret = remove_extent_backref(trans, extent_root, path,  						    iref, refs_to_drop, -						    is_data); +						    is_data, &last_ref);  			if (ret) {  				btrfs_abort_transaction(trans, extent_root, ret);  				goto out; @@ -5824,6 +6060,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			}  		} +		last_ref = 1;  		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],  				      num_to_del);  		if (ret) { @@ -5846,6 +6083,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			goto out;  		}  	} +	btrfs_release_path(path); + +	/* Deal with the quota accounting */ +	if (!ret && last_ref && !no_quota) { +		int mod_seq = 0; + +		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && +		    type == BTRFS_QGROUP_OPER_SUB_SHARED) +			mod_seq = 1; + +		ret = btrfs_qgroup_record_ref(trans, info, root_objectid, +					      bytenr, num_bytes, type, +					      mod_seq); +	}  
out:  	btrfs_free_path(path);  	return ret; @@ -5862,24 +6113,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  {  	struct btrfs_delayed_ref_head *head;  	struct btrfs_delayed_ref_root *delayed_refs; -	struct btrfs_delayed_ref_node *ref; -	struct rb_node *node;  	int ret = 0;  	delayed_refs = &trans->transaction->delayed_refs;  	spin_lock(&delayed_refs->lock);  	head = btrfs_find_delayed_ref_head(trans, bytenr);  	if (!head) -		goto out; +		goto out_delayed_unlock; -	node = rb_prev(&head->node.rb_node); -	if (!node) -		goto out; - -	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - -	/* there are still entries for this ref, we can't drop it */ -	if (ref->bytenr == bytenr) +	spin_lock(&head->lock); +	if (rb_first(&head->ref_root))  		goto out;  	if (head->extent_op) { @@ -5901,19 +6144,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  	 * ahead and process it.  	 */  	head->node.in_tree = 0; -	rb_erase(&head->node.rb_node, &delayed_refs->root); +	rb_erase(&head->href_node, &delayed_refs->href_root); -	delayed_refs->num_entries--; +	atomic_dec(&delayed_refs->num_entries);  	/*  	 * we don't take a ref on the node because we're removing it from the  	 * tree, so we just steal the ref the tree was holding.  	 */  	delayed_refs->num_heads--; -	if (list_empty(&head->cluster)) +	if (head->processing == 0)  		delayed_refs->num_heads_ready--; - -	list_del_init(&head->cluster); +	head->processing = 0; +	spin_unlock(&head->lock);  	spin_unlock(&delayed_refs->lock);  	BUG_ON(head->extent_op); @@ -5924,6 +6167,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  	btrfs_put_delayed_ref(&head->node);  	return ret;  out: +	spin_unlock(&head->lock); + +out_delayed_unlock:  	spin_unlock(&delayed_refs->lock);  	return 0;  } @@ -5966,7 +6212,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));  		btrfs_add_free_space(cache, buf->start, buf->len); -		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); +		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); +		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);  		pin = 0;  	}  out: @@ -5986,11 +6233,15 @@ out:  /* Can return -ENOMEM */  int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, -		      u64 owner, u64 offset, int for_cow) +		      u64 owner, u64 offset, int no_quota)  {  	int ret;  	struct btrfs_fs_info *fs_info = root->fs_info; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return 0; +#endif  	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);  	/* @@ -6006,13 +6257,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,  					num_bytes,  					parent, root_objectid, (int)owner, -					BTRFS_DROP_DELAYED_REF, NULL, for_cow); +					BTRFS_DROP_DELAYED_REF, NULL, no_quota);  	} else {  		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,  						num_bytes,  						parent, root_objectid, owner,  						offset, BTRFS_DROP_DELAYED_REF, -						NULL, for_cow); +						NULL, no_quota);  	}  	return ret;  } @@ -6090,11 +6341,29 @@ int __get_raid_index(u64 flags)  	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */  } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +int 
get_block_group_index(struct btrfs_block_group_cache *cache)  {  	return __get_raid_index(cache->flags);  } +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { +	[BTRFS_RAID_RAID10]	= "raid10", +	[BTRFS_RAID_RAID1]	= "raid1", +	[BTRFS_RAID_DUP]	= "dup", +	[BTRFS_RAID_RAID0]	= "raid0", +	[BTRFS_RAID_SINGLE]	= "single", +	[BTRFS_RAID_RAID5]	= "raid5", +	[BTRFS_RAID_RAID6]	= "raid6", +}; + +static const char *get_raid_name(enum btrfs_raid_types type) +{ +	if (type >= BTRFS_NR_RAID_TYPES) +		return NULL; + +	return btrfs_raid_type_names[type]; +} +  enum btrfs_loop_type {  	LOOP_CACHING_NOWAIT = 0,  	LOOP_CACHING_WAIT = 1, @@ -6102,6 +6371,70 @@ enum btrfs_loop_type {  	LOOP_NO_EMPTY_SIZE = 3,  }; +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, +		       int delalloc) +{ +	if (delalloc) +		down_read(&cache->data_rwsem); +} + +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, +		       int delalloc) +{ +	btrfs_get_block_group(cache); +	if (delalloc) +		down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, +		   struct btrfs_free_cluster *cluster, +		   int delalloc) +{ +	struct btrfs_block_group_cache *used_bg; +	bool locked = false; +again: +	spin_lock(&cluster->refill_lock); +	if (locked) { +		if (used_bg == cluster->block_group) +			return used_bg; + +		up_read(&used_bg->data_rwsem); +		btrfs_put_block_group(used_bg); +	} + +	used_bg = cluster->block_group; +	if (!used_bg) +		return NULL; + +	if (used_bg == block_group) +		return used_bg; + +	btrfs_get_block_group(used_bg); + +	if (!delalloc) +		return used_bg; + +	if (down_read_trylock(&used_bg->data_rwsem)) +		return used_bg; + +	spin_unlock(&cluster->refill_lock); +	down_read(&used_bg->data_rwsem); +	locked = true; +	goto again; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, +			 int delalloc) +{ +	if (delalloc) +		up_read(&cache->data_rwsem); +	btrfs_put_block_group(cache); +} +  /*   * walks the btree of allocated extents and find a hole of a given size.   * The key ins is changed to record the hole: @@ -6116,13 +6449,12 @@ enum btrfs_loop_type {  static noinline int find_free_extent(struct btrfs_root *orig_root,  				     u64 num_bytes, u64 empty_size,  				     u64 hint_byte, struct btrfs_key *ins, -				     u64 flags) +				     u64 flags, int delalloc)  {  	int ret = 0;  	struct btrfs_root *root = orig_root->fs_info->extent_root;  	struct btrfs_free_cluster *last_ptr = NULL;  	struct btrfs_block_group_cache *block_group = NULL; -	struct btrfs_block_group_cache *used_block_group;  	u64 search_start = 0;  	u64 max_extent_size = 0;  	int empty_cluster = 2 * 1024 * 1024; @@ -6131,7 +6463,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,  	int index = __get_raid_index(flags);  	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?  		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; -	bool found_uncached_bg = false;  	bool failed_cluster_refill = false;  	bool failed_alloc = false;  	bool use_cluster = true; @@ -6184,7 +6515,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,  	if (search_start == hint_byte) {  		block_group = btrfs_lookup_block_group(root->fs_info,  						       search_start); -		used_block_group = block_group;  		/*  		 * we don't want to use the block group if it doesn't match our  		 * allocation bits, or if its not cached. 
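Much of the churn in find_free_extent() below exists to serve the new btrfs_lock_cluster() helper above, and its locking order is the interesting part: the cluster's refill_lock is a spinlock, so the block group's data_rwsem may only be try-locked under it; on failure the spinlock is dropped, the rwsem is taken blocking, and the whole lookup is retried, because the cluster may have switched to a different block group in the meantime. A userspace sketch of that retry shape, stripped of the refcounting and with pthread primitives and made-up structures standing in for the kernel locks, could read:

/*
 * Illustrative model of the lock-ordering retry in btrfs_lock_cluster().
 * Returns with c->refill_lock held, as the kernel helper does; the caller
 * is expected to drop it.  Refcounting on the group is omitted here.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct group { pthread_rwlock_t data_rwsem; };
struct cluster { pthread_mutex_t refill_lock; struct group *group; };

struct group *lock_cluster_group(struct cluster *c, bool delalloc)
{
	struct group *g = NULL;
	bool locked = false;
again:
	pthread_mutex_lock(&c->refill_lock);
	if (locked) {
		if (g == c->group)
			return g;		/* still current: done */
		pthread_rwlock_unlock(&g->data_rwsem);	/* cluster moved on */
	}

	g = c->group;
	if (!g || !delalloc)
		return g;			/* no rwsem needed */

	if (pthread_rwlock_tryrdlock(&g->data_rwsem) == 0)
		return g;			/* fast path */

	/* cannot sleep under the spinlock-like lock: drop, block, retry */
	pthread_mutex_unlock(&c->refill_lock);
	pthread_rwlock_rdlock(&g->data_rwsem);
	locked = true;
	goto again;
}

The delalloc flag threaded through find_free_extent() and btrfs_update_reserved_bytes() in the following hunks decides whether these read locks are taken at all, so allocations that are not serving delalloc skip the new data_rwsem entirely.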
@@ -6207,6 +6537,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,  				up_read(&space_info->groups_sem);  			} else {  				index = get_block_group_index(block_group); +				btrfs_lock_block_group(block_group, delalloc);  				goto have_block_group;  			}  		} else if (block_group) { @@ -6221,8 +6552,7 @@ search:  		u64 offset;  		int cached; -		used_block_group = block_group; -		btrfs_get_block_group(block_group); +		btrfs_grab_block_group(block_group, delalloc);  		search_start = block_group->key.objectid;  		/* @@ -6249,7 +6579,6 @@ search:  have_block_group:  		cached = block_group_cache_done(block_group);  		if (unlikely(!cached)) { -			found_uncached_bg = true;  			ret = cache_block_group(block_group, 0);  			BUG_ON(ret < 0);  			ret = 0; @@ -6265,23 +6594,22 @@ have_block_group:  		 * lets look there  		 */  		if (last_ptr) { +			struct btrfs_block_group_cache *used_block_group;  			unsigned long aligned_cluster;  			/*  			 * the refill lock keeps out other  			 * people trying to start a new cluster  			 */ -			spin_lock(&last_ptr->refill_lock); -			used_block_group = last_ptr->block_group; -			if (used_block_group != block_group && -			    (!used_block_group || -			     used_block_group->ro || -			     !block_group_bits(used_block_group, flags))) { -				used_block_group = block_group; +			used_block_group = btrfs_lock_cluster(block_group, +							      last_ptr, +							      delalloc); +			if (!used_block_group)  				goto refill_cluster; -			} -			if (used_block_group != block_group) -				btrfs_get_block_group(used_block_group); +			if (used_block_group != block_group && +			    (used_block_group->ro || +			     !block_group_bits(used_block_group, flags))) +				goto release_cluster;  			offset = btrfs_alloc_from_cluster(used_block_group,  						last_ptr, @@ -6292,17 +6620,18 @@ have_block_group:  				/* we have a block, we're done */  				spin_unlock(&last_ptr->refill_lock);  				trace_btrfs_reserve_extent_cluster(root, -					block_group, search_start, num_bytes); +						used_block_group, +						search_start, num_bytes); +				if (used_block_group != block_group) { +					btrfs_release_block_group(block_group, +								  delalloc); +					block_group = used_block_group; +				}  				goto checks;  			}  			WARN_ON(last_ptr->block_group != used_block_group); -			if (used_block_group != block_group) { -				btrfs_put_block_group(used_block_group); -				used_block_group = block_group; -			} -refill_cluster: -			BUG_ON(used_block_group != block_group); +release_cluster:  			/* If we are on LOOP_NO_EMPTY_SIZE, we can't  			 * set up a new clusters, so lets just skip it  			 * and let the allocator find whatever block @@ -6319,8 +6648,10 @@ refill_cluster:  			 * succeeding in the unclustered  			 * allocation.  
*/  			if (loop >= LOOP_NO_EMPTY_SIZE && -			    last_ptr->block_group != block_group) { +			    used_block_group != block_group) {  				spin_unlock(&last_ptr->refill_lock); +				btrfs_release_block_group(used_block_group, +							  delalloc);  				goto unclustered_alloc;  			} @@ -6330,6 +6661,10 @@ refill_cluster:  			 */  			btrfs_return_cluster_to_free_space(NULL, last_ptr); +			if (used_block_group != block_group) +				btrfs_release_block_group(used_block_group, +							  delalloc); +refill_cluster:  			if (loop >= LOOP_NO_EMPTY_SIZE) {  				spin_unlock(&last_ptr->refill_lock);  				goto unclustered_alloc; @@ -6421,25 +6756,25 @@ unclustered_alloc:  			goto loop;  		}  checks: -		search_start = stripe_align(root, used_block_group, +		search_start = stripe_align(root, block_group,  					    offset, num_bytes);  		/* move on to the next group */  		if (search_start + num_bytes > -		    used_block_group->key.objectid + used_block_group->key.offset) { -			btrfs_add_free_space(used_block_group, offset, num_bytes); +		    block_group->key.objectid + block_group->key.offset) { +			btrfs_add_free_space(block_group, offset, num_bytes);  			goto loop;  		}  		if (offset < search_start) -			btrfs_add_free_space(used_block_group, offset, +			btrfs_add_free_space(block_group, offset,  					     search_start - offset);  		BUG_ON(offset > search_start); -		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, -						  alloc_type); +		ret = btrfs_update_reserved_bytes(block_group, num_bytes, +						  alloc_type, delalloc);  		if (ret == -EAGAIN) { -			btrfs_add_free_space(used_block_group, offset, num_bytes); +			btrfs_add_free_space(block_group, offset, num_bytes);  			goto loop;  		} @@ -6449,17 +6784,13 @@ checks:  		trace_btrfs_reserve_extent(orig_root, block_group,  					   search_start, num_bytes); -		if (used_block_group != block_group) -			btrfs_put_block_group(used_block_group); -		btrfs_put_block_group(block_group); +		btrfs_release_block_group(block_group, delalloc);  		break;  loop:  		failed_cluster_refill = false;  		failed_alloc = false;  		BUG_ON(index != get_block_group_index(block_group)); -		if (used_block_group != block_group) -			btrfs_put_block_group(used_block_group); -		btrfs_put_block_group(block_group); +		btrfs_release_block_group(block_group, delalloc);  	}  	up_read(&space_info->groups_sem); @@ -6482,8 +6813,14 @@ loop:  		loop++;  		if (loop == LOOP_ALLOC_CHUNK) {  			struct btrfs_trans_handle *trans; +			int exist = 0; + +			trans = current->journal_info; +			if (trans) +				exist = 1; +			else +				trans = btrfs_join_transaction(root); -			trans = btrfs_join_transaction(root);  			if (IS_ERR(trans)) {  				ret = PTR_ERR(trans);  				goto out; @@ -6500,7 +6837,8 @@ loop:  							root, ret);  			else  				ret = 0; -			btrfs_end_transaction(trans, root); +			if (!exist) +				btrfs_end_transaction(trans, root);  			if (ret)  				goto out;  		} @@ -6529,12 +6867,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  	int index = 0;  	spin_lock(&info->lock); -	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", +	printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",  	       info->flags,  	       info->total_bytes - info->bytes_used - info->bytes_pinned -  	       info->bytes_reserved - info->bytes_readonly,  	       (info->full) ? 
"" : "not "); -	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " +	printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "  	       "reserved=%llu, may_use=%llu, readonly=%llu\n",  	       info->total_bytes, info->bytes_used, info->bytes_pinned,  	       info->bytes_reserved, info->bytes_may_use, @@ -6548,7 +6886,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  again:  	list_for_each_entry(cache, &info->block_groups[index], list) {  		spin_lock(&cache->lock); -		printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", +		printk(KERN_INFO "BTRFS: " +			   "block group %llu has %llu bytes, " +			   "%llu used %llu pinned %llu reserved %s\n",  		       cache->key.objectid, cache->key.offset,  		       btrfs_block_group_used(&cache->item), cache->pinned,  		       cache->reserved, cache->ro ? "[readonly]" : ""); @@ -6563,7 +6903,7 @@ again:  int btrfs_reserve_extent(struct btrfs_root *root,  			 u64 num_bytes, u64 min_alloc_size,  			 u64 empty_size, u64 hint_byte, -			 struct btrfs_key *ins, int is_data) +			 struct btrfs_key *ins, int is_data, int delalloc)  {  	bool final_tried = false;  	u64 flags; @@ -6573,7 +6913,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,  again:  	WARN_ON(num_bytes < root->sectorsize);  	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, -			       flags); +			       flags, delalloc);  	if (ret == -ENOSPC) {  		if (!final_tried && ins->offset) { @@ -6594,13 +6934,12 @@ again:  		}  	} -	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); -  	return ret;  }  static int __btrfs_free_reserved_extent(struct btrfs_root *root, -					u64 start, u64 len, int pin) +					u64 start, u64 len, +					int pin, int delalloc)  {  	struct btrfs_block_group_cache *cache;  	int ret = 0; @@ -6619,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,  		pin_down_extent(root, cache, start, len, 1);  	else {  		btrfs_add_free_space(cache, start, len); -		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); +		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);  	}  	btrfs_put_block_group(cache); @@ -6629,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,  }  int btrfs_free_reserved_extent(struct btrfs_root *root, -					u64 start, u64 len) +			       u64 start, u64 len, int delalloc)  { -	return __btrfs_free_reserved_extent(root, start, len, 0); +	return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);  }  int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,  				       u64 start, u64 len)  { -	return __btrfs_free_reserved_extent(root, start, len, 1); +	return __btrfs_free_reserved_extent(root, start, len, 1, 0);  }  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -6701,12 +7040,20 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(path->nodes[0]);  	btrfs_free_path(path); +	/* Always set parent to 0 here since its exclusive anyway. 
*/ +	ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +				      ins->objectid, ins->offset, +				      BTRFS_QGROUP_OPER_ADD_EXCL, 0); +	if (ret) +		return ret; +  	ret = update_block_group(root, ins->objectid, ins->offset, 1);  	if (ret) { /* -ENOENT, logic error */  		btrfs_err(fs_info, "update block group failed for %llu %llu",  			ins->objectid, ins->offset);  		BUG();  	} +	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);  	return ret;  } @@ -6714,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root,  				     u64 parent, u64 root_objectid,  				     u64 flags, struct btrfs_disk_key *key, -				     int level, struct btrfs_key *ins) +				     int level, struct btrfs_key *ins, +				     int no_quota)  {  	int ret;  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -6724,6 +7072,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	struct btrfs_path *path;  	struct extent_buffer *leaf;  	u32 size = sizeof(*extent_item) + sizeof(*iref); +	u64 num_bytes = ins->offset;  	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,  						 SKINNY_METADATA); @@ -6731,13 +7080,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  		size += sizeof(*block_info);  	path = btrfs_alloc_path(); -	if (!path) +	if (!path) { +		btrfs_free_and_pin_reserved_extent(root, ins->objectid, +						   root->leafsize);  		return -ENOMEM; +	}  	path->leave_spinning = 1;  	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,  				      ins, size);  	if (ret) { +		btrfs_free_and_pin_reserved_extent(root, ins->objectid, +						   root->leafsize);  		btrfs_free_path(path);  		return ret;  	} @@ -6752,6 +7106,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	if (skinny_metadata) {  		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); +		num_bytes = root->leafsize;  	} else {  		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);  		btrfs_set_tree_block_key(leaf, block_info, key); @@ -6773,12 +7128,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	btrfs_free_path(path); +	if (!no_quota) { +		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +					      ins->objectid, num_bytes, +					      BTRFS_QGROUP_OPER_ADD_EXCL, 0); +		if (ret) +			return ret; +	} +  	ret = update_block_group(root, ins->objectid, root->leafsize, 1);  	if (ret) { /* -ENOENT, logic error */  		btrfs_err(fs_info, "update block group failed for %llu %llu",  			ins->objectid, ins->offset);  		BUG();  	} + +	trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);  	return ret;  } @@ -6826,7 +7191,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,  		return -EINVAL;  	ret = btrfs_update_reserved_bytes(block_group, ins->offset, -					  RESERVE_ALLOC_NO_ACCOUNT); +					  RESERVE_ALLOC_NO_ACCOUNT, 0);  	BUG_ON(ret); /* logic error */  	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,  					 0, owner, offset, ins, 1); @@ -6905,7 +7270,7 @@ again:  				/*DEFAULT_RATELIMIT_BURST*/ 1);  		if (__ratelimit(&_rs))  			WARN(1, KERN_DEBUG -				"btrfs: block rsv returned %d\n", ret); +				"BTRFS: block rsv returned %d\n", ret);  	}  try_reserve:  	ret = reserve_metadata_bytes(root, block_rsv, blocksize, @@ -6954,12 +7319,21 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  	bool skinny_metadata = 
btrfs_fs_incompat(root->fs_info,  						 SKINNY_METADATA); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { +		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, +					    blocksize, level); +		if (!IS_ERR(buf)) +			root->alloc_bytenr += blocksize; +		return buf; +	} +#endif  	block_rsv = use_block_rsv(trans, root, blocksize);  	if (IS_ERR(block_rsv))  		return ERR_CAST(block_rsv);  	ret = btrfs_reserve_extent(root, blocksize, blocksize, -				   empty_size, hint, &ins, 0); +				   empty_size, hint, &ins, 0, 0);  	if (ret) {  		unuse_block_rsv(root->fs_info, block_rsv, blocksize);  		return ERR_PTR(ret); @@ -7653,7 +8027,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  			btrfs_end_transaction_throttle(trans, tree_root);  			if (!for_reloc && btrfs_need_cleaner_sleep(root)) { -				pr_debug("btrfs: drop snapshot early exit\n"); +				pr_debug("BTRFS: drop snapshot early exit\n");  				err = -EAGAIN;  				goto out_free;  			} @@ -7695,7 +8069,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  		}  	} -	if (root->in_radix) { +	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {  		btrfs_drop_and_free_fs_root(tree_root->fs_info, root);  	} else {  		free_extent_buffer(root->node); @@ -7718,7 +8092,7 @@ out:  	 */  	if (!for_reloc && root_dropped == false)  		btrfs_add_dead_root(root); -	if (err) +	if (err && err != -EAGAIN)  		btrfs_std_error(root->fs_info, err);  	return err;  } @@ -7983,7 +8357,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)  	spin_lock(&sinfo->lock); -	for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) +	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)  		if (!list_empty(&sinfo->block_groups[i]))  			free_bytes += __btrfs_get_ro_block_group_free_space(  						&sinfo->block_groups[i]); @@ -8222,14 +8596,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  	struct btrfs_caching_control *caching_ctl;  	struct rb_node *n; -	down_write(&info->extent_commit_sem); +	down_write(&info->commit_root_sem);  	while (!list_empty(&info->caching_block_groups)) {  		caching_ctl = list_entry(info->caching_block_groups.next,  					 struct btrfs_caching_control, list);  		list_del(&caching_ctl->list);  		put_caching_control(caching_ctl);  	} -	up_write(&info->extent_commit_sem); +	up_write(&info->commit_root_sem);  	spin_lock(&info->block_group_cache_lock);  	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { @@ -8271,21 +8645,31 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  	release_global_block_rsv(info); -	while(!list_empty(&info->space_info)) { +	while (!list_empty(&info->space_info)) { +		int i; +  		space_info = list_entry(info->space_info.next,  					struct btrfs_space_info,  					list);  		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { -			if (space_info->bytes_pinned > 0 || +			if (WARN_ON(space_info->bytes_pinned > 0 ||  			    space_info->bytes_reserved > 0 || -			    space_info->bytes_may_use > 0) { -				WARN_ON(1); +			    space_info->bytes_may_use > 0)) {  				dump_space_info(space_info, 0, 0);  			}  		} -		percpu_counter_destroy(&space_info->total_bytes_pinned);  		list_del(&space_info->list); -		kfree(space_info); +		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { +			struct kobject *kobj; +			kobj = space_info->block_group_kobjs[i]; +			space_info->block_group_kobjs[i] = NULL; +			if (kobj) { +				kobject_del(kobj); +				kobject_put(kobj); +			} +		} +		kobject_del(&space_info->kobj); +		kobject_put(&space_info->kobj);  	}  	return 0;  } @@ -8294,10 
+8678,71 @@ static void __link_block_group(struct btrfs_space_info *space_info,  			       struct btrfs_block_group_cache *cache)  {  	int index = get_block_group_index(cache); +	bool first = false;  	down_write(&space_info->groups_sem); +	if (list_empty(&space_info->block_groups[index])) +		first = true;  	list_add_tail(&cache->list, &space_info->block_groups[index]);  	up_write(&space_info->groups_sem); + +	if (first) { +		struct raid_kobject *rkobj; +		int ret; + +		rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); +		if (!rkobj) +			goto out_err; +		rkobj->raid_type = index; +		kobject_init(&rkobj->kobj, &btrfs_raid_ktype); +		ret = kobject_add(&rkobj->kobj, &space_info->kobj, +				  "%s", get_raid_name(index)); +		if (ret) { +			kobject_put(&rkobj->kobj); +			goto out_err; +		} +		space_info->block_group_kobjs[index] = &rkobj->kobj; +	} + +	return; +out_err: +	pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); +} + +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ +	struct btrfs_block_group_cache *cache; + +	cache = kzalloc(sizeof(*cache), GFP_NOFS); +	if (!cache) +		return NULL; + +	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), +					GFP_NOFS); +	if (!cache->free_space_ctl) { +		kfree(cache); +		return NULL; +	} + +	cache->key.objectid = start; +	cache->key.offset = size; +	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + +	cache->sectorsize = root->sectorsize; +	cache->fs_info = root->fs_info; +	cache->full_stripe_len = btrfs_full_stripe_len(root, +					       &root->fs_info->mapping_tree, +					       start); +	atomic_set(&cache->count, 1); +	spin_lock_init(&cache->lock); +	init_rwsem(&cache->data_rwsem); +	INIT_LIST_HEAD(&cache->list); +	INIT_LIST_HEAD(&cache->cluster_list); +	INIT_LIST_HEAD(&cache->new_bg_list); +	btrfs_init_free_space_ctl(cache); + +	return cache;  }  int btrfs_read_block_groups(struct btrfs_root *root) @@ -8335,26 +8780,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)  			break;  		if (ret != 0)  			goto error; +  		leaf = path->nodes[0];  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -		cache = kzalloc(sizeof(*cache), GFP_NOFS); + +		cache = btrfs_create_block_group_cache(root, found_key.objectid, +						       found_key.offset);  		if (!cache) {  			ret = -ENOMEM;  			goto error;  		} -		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), -						GFP_NOFS); -		if (!cache->free_space_ctl) { -			kfree(cache); -			ret = -ENOMEM; -			goto error; -		} - -		atomic_set(&cache->count, 1); -		spin_lock_init(&cache->lock); -		cache->fs_info = info; -		INIT_LIST_HEAD(&cache->list); -		INIT_LIST_HEAD(&cache->cluster_list);  		if (need_clear) {  			/* @@ -8375,16 +8810,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		read_extent_buffer(leaf, &cache->item,  				   btrfs_item_ptr_offset(leaf, path->slots[0]),  				   sizeof(cache->item)); -		memcpy(&cache->key, &found_key, sizeof(found_key)); +		cache->flags = btrfs_block_group_flags(&cache->item);  		key.objectid = found_key.objectid + found_key.offset;  		btrfs_release_path(path); -		cache->flags = btrfs_block_group_flags(&cache->item); -		cache->sectorsize = root->sectorsize; -		cache->full_stripe_len = btrfs_full_stripe_len(root, -					       &root->fs_info->mapping_tree, -					       found_key.objectid); -		btrfs_init_free_space_ctl(cache);  		/*  		 * We need to exclude the super stripes now so that the space @@ -8398,8 +8827,7 @@ int btrfs_read_block_groups(struct 
btrfs_root *root)  			 * case.  			 */  			free_excluded_extents(root, cache); -			kfree(cache->free_space_ctl); -			kfree(cache); +			btrfs_put_block_group(cache);  			goto error;  		} @@ -8528,40 +8956,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	extent_root = root->fs_info->extent_root; -	root->fs_info->last_trans_log_full_commit = trans->transid; +	btrfs_set_log_full_commit(root->fs_info, trans); -	cache = kzalloc(sizeof(*cache), GFP_NOFS); +	cache = btrfs_create_block_group_cache(root, chunk_offset, size);  	if (!cache)  		return -ENOMEM; -	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), -					GFP_NOFS); -	if (!cache->free_space_ctl) { -		kfree(cache); -		return -ENOMEM; -	} - -	cache->key.objectid = chunk_offset; -	cache->key.offset = size; -	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; -	cache->sectorsize = root->sectorsize; -	cache->fs_info = root->fs_info; -	cache->full_stripe_len = btrfs_full_stripe_len(root, -					       &root->fs_info->mapping_tree, -					       chunk_offset); - -	atomic_set(&cache->count, 1); -	spin_lock_init(&cache->lock); -	INIT_LIST_HEAD(&cache->list); -	INIT_LIST_HEAD(&cache->cluster_list); -	INIT_LIST_HEAD(&cache->new_bg_list); - -	btrfs_init_free_space_ctl(cache);  	btrfs_set_block_group_used(&cache->item, bytes_used);  	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); -	cache->flags = type;  	btrfs_set_block_group_flags(&cache->item, type); +	cache->flags = type;  	cache->last_byte_to_unpin = (u64)-1;  	cache->cached = BTRFS_CACHE_FINISHED;  	ret = exclude_super_stripes(root, cache); @@ -8571,8 +8976,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  		 * case.  		 */  		free_excluded_extents(root, cache); -		kfree(cache->free_space_ctl); -		kfree(cache); +		btrfs_put_block_group(cache);  		return ret;  	} @@ -8638,6 +9042,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	struct btrfs_root *tree_root = root->fs_info->tree_root;  	struct btrfs_key key;  	struct inode *inode; +	struct kobject *kobj = NULL;  	int ret;  	int index;  	int factor; @@ -8736,9 +9141,16 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	 * are still on the list after taking the semaphore  	 */  	list_del_init(&block_group->list); -	if (list_empty(&block_group->space_info->block_groups[index])) +	if (list_empty(&block_group->space_info->block_groups[index])) { +		kobj = block_group->space_info->block_group_kobjs[index]; +		block_group->space_info->block_group_kobjs[index] = NULL;  		clear_avail_alloc_bits(root->fs_info, block_group->flags); +	}  	up_write(&block_group->space_info->groups_sem); +	if (kobj) { +		kobject_del(kobj); +		kobject_put(kobj); +	}  	if (block_group->cached == BTRFS_CACHE_STARTED)  		wait_block_group_cache_done(block_group); @@ -8880,3 +9292,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)  	range->len = trimmed;  	return ret;  } + +/* + * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), + * they are used to prevent the some tasks writing data into the page cache + * by nocow before the subvolume is snapshoted, but flush the data into + * the disk after the snapshot creation. + */ +void btrfs_end_nocow_write(struct btrfs_root *root) +{ +	percpu_counter_dec(&root->subv_writers->counter); +	/* +	 * Make sure counter is updated before we wake up +	 * waiters. 
+	 */ +	smp_mb(); +	if (waitqueue_active(&root->subv_writers->wait)) +		wake_up(&root->subv_writers->wait); +} + +int btrfs_start_nocow_write(struct btrfs_root *root) +{ +	if (unlikely(atomic_read(&root->will_be_snapshoted))) +		return 0; + +	percpu_counter_inc(&root->subv_writers->counter); +	/* +	 * Make sure counter is updated before we check for snapshot creation. +	 */ +	smp_mb(); +	if (unlikely(atomic_read(&root->will_be_snapshoted))) { +		btrfs_end_nocow_write(root); +		return 0; +	} +	return 1; +}
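The two nocow helpers that close the diff form a small admission gate between nocow writers and snapshot creation: a writer checks the will_be_snapshoted flag, publishes itself in the per-cpu counter, and re-checks the flag with a full barrier in between, so it cannot be missed by a snapshot task that has already seen the counter drain to zero. The same check/publish/re-check shape can be modeled with C11 atomics; everything below is an illustrative stand-in, not the kernel API:

/*
 * Userspace model of btrfs_start_nocow_write()/btrfs_end_nocow_write().
 * A seq_cst RMW plays the role of smp_mb(); the waitqueue wakeup on the
 * kernel side is reduced to a comment.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int  snapshot_pending;	/* set by the snapshot path */
static atomic_long nocow_writers;	/* stands in for the percpu counter */

bool start_nocow_write(void)
{
	if (atomic_load(&snapshot_pending))
		return false;			/* snapshot imminent: back off */

	atomic_fetch_add(&nocow_writers, 1);	/* publish ourselves ... */
	if (atomic_load(&snapshot_pending)) {	/* ... then re-check the flag */
		atomic_fetch_sub(&nocow_writers, 1);
		return false;			/* lost the race, undo */
	}
	return true;				/* safe to do the nocow write */
}

void end_nocow_write(void)
{
	atomic_fetch_sub(&nocow_writers, 1);
	/* the snapshot side waits for this counter to reach zero */
}

When btrfs_start_nocow_write() returns 0 in the kernel version, the caller falls back to the ordinary COW path, which is exactly the behaviour a pending snapshot needs.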
