Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c | 7224
1 file changed, 4005 insertions(+), 3219 deletions(-)
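Note (illustration only, not part of the commit): one of the hunks below adds btrfs_should_throttle_delayed_refs, which throttles based on a weighted running average of delayed-ref runtimes, updated as avg = (3 * avg + runtime) / 4 so a single slow batch cannot swing the estimate. A minimal userspace C sketch of that weighting follows; the update_avg helper and the sample values are made up for the example and do not exist in the kernel.

	/*
	 * Toy model of the weighted running average kept in
	 * fs_info->avg_delayed_ref_runtime: the previous average counts
	 * three times as much as the newest sample, avg' = (3*avg + runtime)/4,
	 * so one slow batch of delayed refs only nudges the estimate.
	 */
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t update_avg(uint64_t avg, uint64_t runtime)
	{
		/* mirrors: avg = div64_u64(avg * 3 + runtime, 4) */
		return (avg * 3 + runtime) / 4;
	}

	int main(void)
	{
		uint64_t avg = 1000000;		/* arbitrary 1 ms starting estimate, in ns */
		uint64_t samples[] = { 1000000, 1000000, 8000000, 1000000 };

		for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
			avg = update_avg(avg, samples[i]);
			printf("sample %u: runtime=%llu ns, avg=%llu ns\n", i,
			       (unsigned long long)samples[i],
			       (unsigned long long)avg);
		}
		return 0;
	}

Weighting the old average 3:1 over the new sample is what the in-kernel hunk does with avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; avg = div64_u64(avg, 4); the 8 ms outlier above only moves the average to about 2.75 ms instead of jumping to 8 ms.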
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0c097f3aec4..813537f362f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,27 +23,66 @@  #include <linux/rcupdate.h>  #include <linux/kthread.h>  #include <linux/slab.h> -#include "compat.h" +#include <linux/ratelimit.h> +#include <linux/percpu_counter.h>  #include "hash.h" -#include "ctree.h" +#include "tree-log.h"  #include "disk-io.h"  #include "print-tree.h" -#include "transaction.h"  #include "volumes.h" +#include "raid56.h"  #include "locking.h"  #include "free-space-cache.h" +#include "math.h" +#include "sysfs.h" +#include "qgroup.h" -static int update_block_group(struct btrfs_trans_handle *trans, -			      struct btrfs_root *root, +#undef SCRAMBLE_DELAYED_REFS + +/* + * control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated.  This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + */ +enum { +	CHUNK_ALLOC_NO_FORCE = 0, +	CHUNK_ALLOC_LIMITED = 1, +	CHUNK_ALLOC_FORCE = 2, +}; + +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + *   ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + *   bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { +	RESERVE_FREE = 0, +	RESERVE_ALLOC = 1, +	RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + +static int update_block_group(struct btrfs_root *root,  			      u64 bytenr, u64 num_bytes, int alloc); -static int update_reserved_bytes(struct btrfs_block_group_cache *cache, -				 u64 num_bytes, int reserve, int sinfo);  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  				struct btrfs_root *root,  				u64 bytenr, u64 num_bytes, u64 parent,  				u64 root_objectid, u64 owner_objectid,  				u64 owner_offset, int refs_to_drop, -				struct btrfs_delayed_extent_op *extra_op); +				struct btrfs_delayed_extent_op *extra_op, +				int no_quota);  static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,  				    struct extent_buffer *leaf,  				    struct btrfs_extent_item *ei); @@ -56,20 +95,29 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root,  				     u64 parent, u64 root_objectid,  				     u64 flags, struct btrfs_disk_key *key, -				     int level, struct btrfs_key *ins); +				     int level, struct btrfs_key *ins, +				     int no_quota);  static int do_chunk_alloc(struct btrfs_trans_handle *trans, -			  struct btrfs_root *extent_root, u64 alloc_bytes, -			  u64 flags, int force); +			  struct btrfs_root *extent_root, u64 flags, +			  int force);  static int find_next_key(struct btrfs_path *path, int level,  			 struct btrfs_key *key);  static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  			    int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, +				       u64 num_bytes, int reserve, +				       int delalloc); +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, +			       u64 num_bytes); +int btrfs_pin_extent(struct btrfs_root *root, +		     u64 bytenr, u64 num_bytes, int 
reserved);  static noinline int  block_group_cache_done(struct btrfs_block_group_cache *cache)  {  	smp_mb(); -	return cache->cached == BTRFS_CACHE_FINISHED; +	return cache->cached == BTRFS_CACHE_FINISHED || +		cache->cached == BTRFS_CACHE_ERROR;  }  static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) @@ -77,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)  	return (cache->flags & bits) == bits;  } -void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)  {  	atomic_inc(&cache->count);  } @@ -87,7 +135,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)  	if (atomic_dec_and_test(&cache->count)) {  		WARN_ON(cache->pinned > 0);  		WARN_ON(cache->reserved > 0); -		WARN_ON(cache->reserved_pinned > 0); +		kfree(cache->free_space_ctl);  		kfree(cache);  	}  } @@ -123,6 +171,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,  	rb_link_node(&block_group->cache_node, parent, p);  	rb_insert_color(&block_group->cache_node,  			&info->block_group_cache_tree); + +	if (info->first_logical_byte > block_group->key.objectid) +		info->first_logical_byte = block_group->key.objectid; +  	spin_unlock(&info->block_group_cache_lock);  	return 0; @@ -164,8 +216,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,  			break;  		}  	} -	if (ret) +	if (ret) {  		btrfs_get_block_group(ret); +		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) +			info->first_logical_byte = ret->key.objectid; +	}  	spin_unlock(&info->block_group_cache_lock);  	return ret; @@ -209,7 +264,8 @@ static int exclude_super_stripes(struct btrfs_root *root,  		cache->bytes_super += stripe_len;  		ret = add_excluded_extent(root, cache->key.objectid,  					  stripe_len); -		BUG_ON(ret); +		if (ret) +			return ret;  	}  	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -217,13 +273,35 @@ static int exclude_super_stripes(struct btrfs_root *root,  		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,  				       cache->key.objectid, bytenr,  				       0, &logical, &nr, &stripe_len); -		BUG_ON(ret); +		if (ret) +			return ret;  		while (nr--) { -			cache->bytes_super += stripe_len; -			ret = add_excluded_extent(root, logical[nr], -						  stripe_len); -			BUG_ON(ret); +			u64 start, len; + +			if (logical[nr] > cache->key.objectid + +			    cache->key.offset) +				continue; + +			if (logical[nr] + stripe_len <= cache->key.objectid) +				continue; + +			start = logical[nr]; +			if (start < cache->key.objectid) { +				start = cache->key.objectid; +				len = (logical[nr] + stripe_len) - start; +			} else { +				len = min_t(u64, stripe_len, +					    cache->key.objectid + +					    cache->key.offset - start); +			} + +			cache->bytes_super += len; +			ret = add_excluded_extent(root, start, len); +			if (ret) { +				kfree(logical); +				return ret; +			}  		}  		kfree(logical); @@ -274,7 +352,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,  	while (start < end) {  		ret = find_first_extent_bit(info->pinned_extents, start,  					    &extent_start, &extent_end, -					    EXTENT_DIRTY | EXTENT_UPTODATE); +					    EXTENT_DIRTY | EXTENT_UPTODATE, +					    NULL);  		if (ret)  			break; @@ -285,7 +364,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,  			total_added += size;  			ret = btrfs_add_free_space(block_group, start,  						   size); -			BUG_ON(ret); +			
BUG_ON(ret); /* -ENOMEM or logic error */  			start = extent_end + 1;  		} else {  			break; @@ -296,34 +375,34 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,  		size = end - start;  		total_added += size;  		ret = btrfs_add_free_space(block_group, start, size); -		BUG_ON(ret); +		BUG_ON(ret); /* -ENOMEM or logic error */  	}  	return total_added;  } -static int caching_kthread(void *data) +static noinline void caching_thread(struct btrfs_work *work)  { -	struct btrfs_block_group_cache *block_group = data; -	struct btrfs_fs_info *fs_info = block_group->fs_info; -	struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; -	struct btrfs_root *extent_root = fs_info->extent_root; +	struct btrfs_block_group_cache *block_group; +	struct btrfs_fs_info *fs_info; +	struct btrfs_caching_control *caching_ctl; +	struct btrfs_root *extent_root;  	struct btrfs_path *path;  	struct extent_buffer *leaf;  	struct btrfs_key key;  	u64 total_found = 0;  	u64 last = 0;  	u32 nritems; -	int ret = 0; +	int ret = -ENOMEM; + +	caching_ctl = container_of(work, struct btrfs_caching_control, work); +	block_group = caching_ctl->block_group; +	fs_info = block_group->fs_info; +	extent_root = fs_info->extent_root;  	path = btrfs_alloc_path();  	if (!path) -		return -ENOMEM; - -	exclude_super_stripes(extent_root, block_group); -	spin_lock(&block_group->space_info->lock); -	block_group->space_info->bytes_readonly += block_group->bytes_super; -	spin_unlock(&block_group->space_info->lock); +		goto out;  	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -335,7 +414,7 @@ static int caching_kthread(void *data)  	 */  	path->skip_locking = 1;  	path->search_commit_root = 1; -	path->reada = 2; +	path->reada = 1;  	key.objectid = last;  	key.offset = 0; @@ -343,8 +422,9 @@ static int caching_kthread(void *data)  again:  	mutex_lock(&caching_ctl->mutex);  	/* need to make sure the commit_root doesn't disappear */ -	down_read(&fs_info->extent_commit_sem); +	down_read(&fs_info->commit_root_sem); +next:  	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);  	if (ret < 0)  		goto err; @@ -353,8 +433,7 @@ again:  	nritems = btrfs_header_nritems(leaf);  	while (1) { -		smp_mb(); -		if (fs_info->closing > 1) { +		if (btrfs_fs_closing(fs_info) > 1) {  			last = (u64)-1;  			break;  		} @@ -366,15 +445,34 @@ again:  			if (ret)  				break; -			caching_ctl->progress = last; -			btrfs_release_path(extent_root, path); -			up_read(&fs_info->extent_commit_sem); -			mutex_unlock(&caching_ctl->mutex); -			if (btrfs_transaction_in_commit(fs_info)) -				schedule_timeout(1); -			else +			if (need_resched() || +			    rwsem_is_contended(&fs_info->commit_root_sem)) { +				caching_ctl->progress = last; +				btrfs_release_path(path); +				up_read(&fs_info->commit_root_sem); +				mutex_unlock(&caching_ctl->mutex);  				cond_resched(); -			goto again; +				goto again; +			} + +			ret = btrfs_next_leaf(extent_root, path); +			if (ret < 0) +				goto err; +			if (ret) +				break; +			leaf = path->nodes[0]; +			nritems = btrfs_header_nritems(leaf); +			continue; +		} + +		if (key.objectid < last) { +			key.objectid = last; +			key.offset = 0; +			key.type = BTRFS_EXTENT_ITEM_KEY; + +			caching_ctl->progress = last; +			btrfs_release_path(path); +			goto next;  		}  		if (key.objectid < block_group->key.objectid) { @@ -386,11 +484,16 @@ again:  		    block_group->key.offset)  			break; -		if (key.type == BTRFS_EXTENT_ITEM_KEY) { +		if (key.type == BTRFS_EXTENT_ITEM_KEY || +		    
key.type == BTRFS_METADATA_ITEM_KEY) {  			total_found += add_new_free_space(block_group,  							  fs_info, last,  							  key.objectid); -			last = key.objectid + key.offset; +			if (key.type == BTRFS_METADATA_ITEM_KEY) +				last = key.objectid + +					fs_info->tree_root->leafsize; +			else +				last = key.objectid + key.offset;  			if (total_found > (1024 * 1024 * 2)) {  				total_found = 0; @@ -413,98 +516,134 @@ again:  err:  	btrfs_free_path(path); -	up_read(&fs_info->extent_commit_sem); +	up_read(&fs_info->commit_root_sem);  	free_excluded_extents(extent_root, block_group);  	mutex_unlock(&caching_ctl->mutex); +out: +	if (ret) { +		spin_lock(&block_group->lock); +		block_group->caching_ctl = NULL; +		block_group->cached = BTRFS_CACHE_ERROR; +		spin_unlock(&block_group->lock); +	}  	wake_up(&caching_ctl->wait);  	put_caching_control(caching_ctl); -	atomic_dec(&block_group->space_info->caching_threads);  	btrfs_put_block_group(block_group); - -	return 0;  }  static int cache_block_group(struct btrfs_block_group_cache *cache, -			     struct btrfs_trans_handle *trans,  			     int load_cache_only)  { +	DEFINE_WAIT(wait);  	struct btrfs_fs_info *fs_info = cache->fs_info;  	struct btrfs_caching_control *caching_ctl; -	struct task_struct *tsk;  	int ret = 0; -	smp_mb(); -	if (cache->cached != BTRFS_CACHE_NO) -		return 0; +	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); +	if (!caching_ctl) +		return -ENOMEM; + +	INIT_LIST_HEAD(&caching_ctl->list); +	mutex_init(&caching_ctl->mutex); +	init_waitqueue_head(&caching_ctl->wait); +	caching_ctl->block_group = cache; +	caching_ctl->progress = cache->key.objectid; +	atomic_set(&caching_ctl->count, 1); +	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); +	spin_lock(&cache->lock);  	/* -	 * We can't do the read from on-disk cache during a commit since we need -	 * to have the normal tree locking. +	 * This should be a rare occasion, but this could happen I think in the +	 * case where one thread starts to load the space cache info, and then +	 * some other thread starts a transaction commit which tries to do an +	 * allocation while the other thread is still loading the space cache +	 * info.  The previous loop should have kept us from choosing this block +	 * group, but if we've moved to the state where we will wait on caching +	 * block groups we need to first check if we're doing a fast load here, +	 * so we can wait for it to finish, otherwise we could end up allocating +	 * from a block group who's cache gets evicted for one reason or +	 * another.  	 
*/ -	if (!trans->transaction->in_commit) { +	while (cache->cached == BTRFS_CACHE_FAST) { +		struct btrfs_caching_control *ctl; + +		ctl = cache->caching_ctl; +		atomic_inc(&ctl->count); +		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); +		spin_unlock(&cache->lock); + +		schedule(); + +		finish_wait(&ctl->wait, &wait); +		put_caching_control(ctl);  		spin_lock(&cache->lock); -		if (cache->cached != BTRFS_CACHE_NO) { -			spin_unlock(&cache->lock); -			return 0; -		} -		cache->cached = BTRFS_CACHE_STARTED; +	} + +	if (cache->cached != BTRFS_CACHE_NO) {  		spin_unlock(&cache->lock); +		kfree(caching_ctl); +		return 0; +	} +	WARN_ON(cache->caching_ctl); +	cache->caching_ctl = caching_ctl; +	cache->cached = BTRFS_CACHE_FAST; +	spin_unlock(&cache->lock); +	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {  		ret = load_free_space_cache(fs_info, cache);  		spin_lock(&cache->lock);  		if (ret == 1) { +			cache->caching_ctl = NULL;  			cache->cached = BTRFS_CACHE_FINISHED;  			cache->last_byte_to_unpin = (u64)-1;  		} else { -			cache->cached = BTRFS_CACHE_NO; +			if (load_cache_only) { +				cache->caching_ctl = NULL; +				cache->cached = BTRFS_CACHE_NO; +			} else { +				cache->cached = BTRFS_CACHE_STARTED; +			}  		}  		spin_unlock(&cache->lock); -		if (ret == 1) +		wake_up(&caching_ctl->wait); +		if (ret == 1) { +			put_caching_control(caching_ctl); +			free_excluded_extents(fs_info->extent_root, cache);  			return 0; +		} +	} else { +		/* +		 * We are not going to do the fast caching, set cached to the +		 * appropriate value and wakeup any waiters. +		 */ +		spin_lock(&cache->lock); +		if (load_cache_only) { +			cache->caching_ctl = NULL; +			cache->cached = BTRFS_CACHE_NO; +		} else { +			cache->cached = BTRFS_CACHE_STARTED; +		} +		spin_unlock(&cache->lock); +		wake_up(&caching_ctl->wait);  	} -	if (load_cache_only) -		return 0; - -	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); -	BUG_ON(!caching_ctl); - -	INIT_LIST_HEAD(&caching_ctl->list); -	mutex_init(&caching_ctl->mutex); -	init_waitqueue_head(&caching_ctl->wait); -	caching_ctl->block_group = cache; -	caching_ctl->progress = cache->key.objectid; -	/* one for caching kthread, one for caching block group list */ -	atomic_set(&caching_ctl->count, 2); - -	spin_lock(&cache->lock); -	if (cache->cached != BTRFS_CACHE_NO) { -		spin_unlock(&cache->lock); -		kfree(caching_ctl); +	if (load_cache_only) { +		put_caching_control(caching_ctl);  		return 0;  	} -	cache->caching_ctl = caching_ctl; -	cache->cached = BTRFS_CACHE_STARTED; -	spin_unlock(&cache->lock); -	down_write(&fs_info->extent_commit_sem); +	down_write(&fs_info->commit_root_sem); +	atomic_inc(&caching_ctl->count);  	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); -	up_write(&fs_info->extent_commit_sem); +	up_write(&fs_info->commit_root_sem); -	atomic_inc(&cache->space_info->caching_threads);  	btrfs_get_block_group(cache); -	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", -			  cache->key.objectid); -	if (IS_ERR(tsk)) { -		ret = PTR_ERR(tsk); -		printk(KERN_ERR "error running thread %d\n", ret); -		BUG(); -	} +	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);  	return ret;  } @@ -542,8 +681,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,  	struct list_head *head = &info->space_info;  	struct btrfs_space_info *found; -	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | -		 BTRFS_BLOCK_GROUP_METADATA; +	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;  	rcu_read_lock();  	
list_for_each_entry_rcu(found, head, list) { @@ -571,73 +709,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)  	rcu_read_unlock();  } -static u64 div_factor(u64 num, int factor) -{ -	if (factor == 10) -		return num; -	num *= factor; -	do_div(num, 10); -	return num; -} - -static u64 div_factor_fine(u64 num, int factor) -{ -	if (factor == 100) -		return num; -	num *= factor; -	do_div(num, 100); -	return num; -} - -u64 btrfs_find_block_group(struct btrfs_root *root, -			   u64 search_start, u64 search_hint, int owner) -{ -	struct btrfs_block_group_cache *cache; -	u64 used; -	u64 last = max(search_hint, search_start); -	u64 group_start = 0; -	int full_search = 0; -	int factor = 9; -	int wrapped = 0; -again: -	while (1) { -		cache = btrfs_lookup_first_block_group(root->fs_info, last); -		if (!cache) -			break; - -		spin_lock(&cache->lock); -		last = cache->key.objectid + cache->key.offset; -		used = btrfs_block_group_used(&cache->item); - -		if ((full_search || !cache->ro) && -		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { -			if (used + cache->pinned + cache->reserved < -			    div_factor(cache->key.offset, factor)) { -				group_start = cache->key.objectid; -				spin_unlock(&cache->lock); -				btrfs_put_block_group(cache); -				goto found; -			} -		} -		spin_unlock(&cache->lock); -		btrfs_put_block_group(cache); -		cond_resched(); -	} -	if (!wrapped) { -		last = search_start; -		wrapped = 1; -		goto again; -	} -	if (!full_search && factor < 10) { -		last = search_start; -		full_search = 1; -		factor = 10; -		goto again; -	} -found: -	return group_start; -} -  /* simple helper to search for an existing extent at a given offset */  int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)  { @@ -646,18 +717,26 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)  	struct btrfs_path *path;  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) +		return -ENOMEM; +  	key.objectid = start;  	key.offset = len; -	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); +	key.type = BTRFS_EXTENT_ITEM_KEY;  	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,  				0, 0); +	if (ret > 0) { +		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +		if (key.objectid == start && +		    key.type == BTRFS_METADATA_ITEM_KEY) +			ret = 0; +	}  	btrfs_free_path(path);  	return ret;  }  /* - * helper function to lookup reference count and flags of extent. + * helper function to lookup reference count and flags of a tree block.   *   * the head node for delayed ref is used to store the sum of all the   * reference count modifications queued up in the rbtree. 
the head @@ -667,7 +746,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)   */  int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root, u64 bytenr, -			     u64 num_bytes, u64 *refs, u64 *flags) +			     u64 offset, int metadata, u64 *refs, u64 *flags)  {  	struct btrfs_delayed_ref_head *head;  	struct btrfs_delayed_ref_root *delayed_refs; @@ -680,23 +759,57 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,  	u64 extent_flags;  	int ret; +	/* +	 * If we don't have skinny metadata, don't bother doing anything +	 * different +	 */ +	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { +		offset = root->leafsize; +		metadata = 0; +	} +  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; -	key.objectid = bytenr; -	key.type = BTRFS_EXTENT_ITEM_KEY; -	key.offset = num_bytes;  	if (!trans) {  		path->skip_locking = 1;  		path->search_commit_root = 1;  	} + +search_again: +	key.objectid = bytenr; +	key.offset = offset; +	if (metadata) +		key.type = BTRFS_METADATA_ITEM_KEY; +	else +		key.type = BTRFS_EXTENT_ITEM_KEY; +  again:  	ret = btrfs_search_slot(trans, root->fs_info->extent_root,  				&key, path, 0, 0);  	if (ret < 0)  		goto out_free; +	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { +		if (path->slots[0]) { +			path->slots[0]--; +			btrfs_item_key_to_cpu(path->nodes[0], &key, +					      path->slots[0]); +			if (key.objectid == bytenr && +			    key.type == BTRFS_EXTENT_ITEM_KEY && +			    key.offset == root->leafsize) +				ret = 0; +		} +		if (ret) { +			key.objectid = bytenr; +			key.type = BTRFS_EXTENT_ITEM_KEY; +			key.offset = root->leafsize; +			btrfs_release_path(path); +			goto again; +		} +	} +  	if (ret == 0) {  		leaf = path->nodes[0];  		item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -736,19 +849,25 @@ again:  			atomic_inc(&head->node.refs);  			spin_unlock(&delayed_refs->lock); -			btrfs_release_path(root->fs_info->extent_root, path); +			btrfs_release_path(path); +			/* +			 * Mutex was contended, block until it's released and try +			 * again +			 */  			mutex_lock(&head->mutex);  			mutex_unlock(&head->mutex);  			btrfs_put_delayed_ref(&head->node); -			goto again; +			goto search_again;  		} +		spin_lock(&head->lock);  		if (head->extent_op && head->extent_op->update_flags)  			extent_flags |= head->extent_op->flags_to_set;  		else  			BUG_ON(num_refs == 0);  		num_refs += head->node.ref_mod; +		spin_unlock(&head->lock);  		mutex_unlock(&head->mutex);  	}  	spin_unlock(&delayed_refs->lock); @@ -900,7 +1019,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,  				ret = btrfs_next_leaf(root, path);  				if (ret < 0)  					return ret; -				BUG_ON(ret > 0); +				BUG_ON(ret > 0); /* Corruption */  				leaf = path->nodes[0];  			}  			btrfs_item_key_to_cpu(leaf, &found_key, @@ -916,7 +1035,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,  			break;  		}  	} -	btrfs_release_path(root, path); +	btrfs_release_path(path);  	if (owner < BTRFS_FIRST_FREE_OBJECTID)  		new_size += sizeof(*bi); @@ -926,10 +1045,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,  				new_size + extra_size, 1);  	if (ret < 0)  		return ret; -	BUG_ON(ret); +	BUG_ON(ret); /* Corruption */ -	ret = btrfs_extend_item(trans, root, path, new_size); -	BUG_ON(ret); +	btrfs_extend_item(root, path, new_size);  	leaf = path->nodes[0];  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -959,11 
+1077,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)  	__le64 lenum;  	lenum = cpu_to_le64(root_objectid); -	high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); +	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));  	lenum = cpu_to_le64(owner); -	low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); +	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));  	lenum = cpu_to_le64(offset); -	low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); +	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));  	return ((u64)high_crc << 31) ^ (u64)low_crc;  } @@ -1024,7 +1142,7 @@ again:  			return 0;  #ifdef BTRFS_COMPAT_EXTENT_TREE_V0  		key.type = BTRFS_EXTENT_REF_V0_KEY; -		btrfs_release_path(root, path); +		btrfs_release_path(path);  		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);  		if (ret < 0) {  			err = ret; @@ -1062,7 +1180,7 @@ again:  		if (match_extent_data_ref(leaf, ref, root_objectid,  					  owner, offset)) {  			if (recow) { -				btrfs_release_path(root, path); +				btrfs_release_path(path);  				goto again;  			}  			err = 0; @@ -1123,7 +1241,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,  			if (match_extent_data_ref(leaf, ref, root_objectid,  						  owner, offset))  				break; -			btrfs_release_path(root, path); +			btrfs_release_path(path);  			key.offset++;  			ret = btrfs_insert_empty_item(trans, root, path, &key,  						      size); @@ -1149,14 +1267,14 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	ret = 0;  fail: -	btrfs_release_path(root, path); +	btrfs_release_path(path);  	return ret;  }  static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,  					   struct btrfs_root *root,  					   struct btrfs_path *path, -					   int refs_to_drop) +					   int refs_to_drop, int *last_ref)  {  	struct btrfs_key key;  	struct btrfs_extent_data_ref *ref1 = NULL; @@ -1192,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,  	if (num_refs == 0) {  		ret = btrfs_del_item(trans, root, path); +		*last_ref = 1;  	} else {  		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)  			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1275,7 +1394,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,  		ret = -ENOENT;  #ifdef BTRFS_COMPAT_EXTENT_TREE_V0  	if (ret == -ENOENT && parent) { -		btrfs_release_path(root, path); +		btrfs_release_path(path);  		key.type = BTRFS_EXTENT_REF_V0_KEY;  		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);  		if (ret > 0) @@ -1304,7 +1423,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,  	}  	ret = btrfs_insert_empty_item(trans, root, path, &key, 0); -	btrfs_release_path(root, path); +	btrfs_release_path(path);  	return ret;  } @@ -1381,6 +1500,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,  	int want;  	int ret;  	int err = 0; +	bool skinny_metadata = btrfs_fs_incompat(root->fs_info, +						 SKINNY_METADATA);  	key.objectid = bytenr;  	key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1392,12 +1513,54 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,  		path->keep_locks = 1;  	} else  		extra_size = -1; + +	/* +	 * Owner is our parent level, so we can just add one to get the level +	 * for the block we are interested in. 
+	 */ +	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { +		key.type = BTRFS_METADATA_ITEM_KEY; +		key.offset = owner; +	} + +again:  	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);  	if (ret < 0) {  		err = ret;  		goto out;  	} -	BUG_ON(ret); + +	/* +	 * We may be a newly converted file system which still has the old fat +	 * extent entries for metadata, so try and see if we have one of those. +	 */ +	if (ret > 0 && skinny_metadata) { +		skinny_metadata = false; +		if (path->slots[0]) { +			path->slots[0]--; +			btrfs_item_key_to_cpu(path->nodes[0], &key, +					      path->slots[0]); +			if (key.objectid == bytenr && +			    key.type == BTRFS_EXTENT_ITEM_KEY && +			    key.offset == num_bytes) +				ret = 0; +		} +		if (ret) { +			key.objectid = bytenr; +			key.type = BTRFS_EXTENT_ITEM_KEY; +			key.offset = num_bytes; +			btrfs_release_path(path); +			goto again; +		} +	} + +	if (ret && !insert) { +		err = -ENOENT; +		goto out; +	} else if (WARN_ON(ret)) { +		err = -EIO; +		goto out; +	}  	leaf = path->nodes[0];  	item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -1425,11 +1588,9 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,  	ptr = (unsigned long)(ei + 1);  	end = (unsigned long)ei + item_size; -	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {  		ptr += sizeof(struct btrfs_tree_block_info);  		BUG_ON(ptr > end); -	} else { -		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));  	}  	err = -ENOENT; @@ -1511,13 +1672,12 @@ out:   * helper to add new inline back ref   */  static noinline_for_stack -int setup_inline_extent_backref(struct btrfs_trans_handle *trans, -				struct btrfs_root *root, -				struct btrfs_path *path, -				struct btrfs_extent_inline_ref *iref, -				u64 parent, u64 root_objectid, -				u64 owner, u64 offset, int refs_to_add, -				struct btrfs_delayed_extent_op *extent_op) +void setup_inline_extent_backref(struct btrfs_root *root, +				 struct btrfs_path *path, +				 struct btrfs_extent_inline_ref *iref, +				 u64 parent, u64 root_objectid, +				 u64 owner, u64 offset, int refs_to_add, +				 struct btrfs_delayed_extent_op *extent_op)  {  	struct extent_buffer *leaf;  	struct btrfs_extent_item *ei; @@ -1527,7 +1687,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,  	u64 refs;  	int size;  	int type; -	int ret;  	leaf = path->nodes[0];  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1536,8 +1695,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,  	type = extent_ref_type(parent, owner);  	size = btrfs_extent_inline_ref_size(type); -	ret = btrfs_extend_item(trans, root, path, size); -	BUG_ON(ret); +	btrfs_extend_item(root, path, size);  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);  	refs = btrfs_extent_refs(leaf, ei); @@ -1572,7 +1730,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,  		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);  	}  	btrfs_mark_buffer_dirty(leaf); -	return 0;  }  static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1590,7 +1747,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,  	if (ret != -ENOENT)  		return ret; -	btrfs_release_path(root, path); +	btrfs_release_path(path);  	*ref_ret = NULL;  	if (owner < BTRFS_FIRST_FREE_OBJECTID) { @@ -1607,12 +1764,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,   * helper to update/remove inline back ref   */  
static noinline_for_stack -int update_inline_extent_backref(struct btrfs_trans_handle *trans, -				 struct btrfs_root *root, -				 struct btrfs_path *path, -				 struct btrfs_extent_inline_ref *iref, -				 int refs_to_mod, -				 struct btrfs_delayed_extent_op *extent_op) +void update_inline_extent_backref(struct btrfs_root *root, +				  struct btrfs_path *path, +				  struct btrfs_extent_inline_ref *iref, +				  int refs_to_mod, +				  struct btrfs_delayed_extent_op *extent_op, +				  int *last_ref)  {  	struct extent_buffer *leaf;  	struct btrfs_extent_item *ei; @@ -1623,7 +1780,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,  	u32 item_size;  	int size;  	int type; -	int ret;  	u64 refs;  	leaf = path->nodes[0]; @@ -1657,6 +1813,7 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,  		else  			btrfs_set_shared_data_ref_count(leaf, sref, refs);  	} else { +		*last_ref = 1;  		size =  btrfs_extent_inline_ref_size(type);  		item_size = btrfs_item_size_nr(leaf, path->slots[0]);  		ptr = (unsigned long)iref; @@ -1665,11 +1822,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,  			memmove_extent_buffer(leaf, ptr, ptr + size,  					      end - ptr - size);  		item_size -= size; -		ret = btrfs_truncate_item(trans, root, path, item_size, 1); -		BUG_ON(ret); +		btrfs_truncate_item(root, path, item_size, 1);  	}  	btrfs_mark_buffer_dirty(leaf); -	return 0;  }  static noinline_for_stack @@ -1689,13 +1844,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,  					   root_objectid, owner, offset, 1);  	if (ret == 0) {  		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); -		ret = update_inline_extent_backref(trans, root, path, iref, -						   refs_to_add, extent_op); +		update_inline_extent_backref(root, path, iref, +					     refs_to_add, extent_op, NULL);  	} else if (ret == -ENOENT) { -		ret = setup_inline_extent_backref(trans, root, path, iref, -						  parent, root_objectid, -						  owner, offset, refs_to_add, -						  extent_op); +		setup_inline_extent_backref(root, path, iref, parent, +					    root_objectid, owner, offset, +					    refs_to_add, extent_op); +		ret = 0;  	}  	return ret;  } @@ -1723,76 +1878,101 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,  				 struct btrfs_root *root,  				 struct btrfs_path *path,  				 struct btrfs_extent_inline_ref *iref, -				 int refs_to_drop, int is_data) +				 int refs_to_drop, int is_data, int *last_ref)  { -	int ret; +	int ret = 0;  	BUG_ON(!is_data && refs_to_drop != 1);  	if (iref) { -		ret = update_inline_extent_backref(trans, root, path, iref, -						   -refs_to_drop, NULL); +		update_inline_extent_backref(root, path, iref, +					     -refs_to_drop, NULL, last_ref);  	} else if (is_data) { -		ret = remove_extent_data_ref(trans, root, path, refs_to_drop); +		ret = remove_extent_data_ref(trans, root, path, refs_to_drop, +					     last_ref);  	} else { +		*last_ref = 1;  		ret = btrfs_del_item(trans, root, path);  	}  	return ret;  } -static void btrfs_issue_discard(struct block_device *bdev, +static int btrfs_issue_discard(struct block_device *bdev,  				u64 start, u64 len)  { -	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); +	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);  }  static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, -				u64 num_bytes) +				u64 num_bytes, u64 *actual_bytes)  {  	int ret; -	u64 map_length = num_bytes; -	struct btrfs_multi_bio *multi = NULL; +	u64 discarded_bytes = 
0; +	struct btrfs_bio *bbio = NULL; -	if (!btrfs_test_opt(root, DISCARD)) -		return 0;  	/* Tell the block device(s) that the sectors can be discarded */ -	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, -			      bytenr, &map_length, &multi, 0); +	ret = btrfs_map_block(root->fs_info, REQ_DISCARD, +			      bytenr, &num_bytes, &bbio, 0); +	/* Error condition is -ENOMEM */  	if (!ret) { -		struct btrfs_bio_stripe *stripe = multi->stripes; +		struct btrfs_bio_stripe *stripe = bbio->stripes;  		int i; -		if (map_length > num_bytes) -			map_length = num_bytes; -		for (i = 0; i < multi->num_stripes; i++, stripe++) { -			btrfs_issue_discard(stripe->dev->bdev, -					    stripe->physical, -					    map_length); +		for (i = 0; i < bbio->num_stripes; i++, stripe++) { +			if (!stripe->dev->can_discard) +				continue; + +			ret = btrfs_issue_discard(stripe->dev->bdev, +						  stripe->physical, +						  stripe->length); +			if (!ret) +				discarded_bytes += stripe->length; +			else if (ret != -EOPNOTSUPP) +				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ + +			/* +			 * Just in case we get back EOPNOTSUPP for some reason, +			 * just ignore the return value so we don't screw up +			 * people calling discard_extent. +			 */ +			ret = 0;  		} -		kfree(multi); +		kfree(bbio);  	} +	if (actual_bytes) +		*actual_bytes = discarded_bytes; + + +	if (ret == -EOPNOTSUPP) +		ret = 0;  	return ret;  } +/* Can return -ENOMEM */  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  			 struct btrfs_root *root,  			 u64 bytenr, u64 num_bytes, u64 parent, -			 u64 root_objectid, u64 owner, u64 offset) +			 u64 root_objectid, u64 owner, u64 offset, +			 int no_quota)  {  	int ret; +	struct btrfs_fs_info *fs_info = root->fs_info; +  	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&  	       root_objectid == BTRFS_TREE_LOG_OBJECTID);  	if (owner < BTRFS_FIRST_FREE_OBJECTID) { -		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, +		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, +					num_bytes,  					parent, root_objectid, (int)owner, -					BTRFS_ADD_DELAYED_REF, NULL); +					BTRFS_ADD_DELAYED_REF, NULL, no_quota);  	} else { -		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, +		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, +					num_bytes,  					parent, root_objectid, owner, offset, -					BTRFS_ADD_DELAYED_REF, NULL); +					BTRFS_ADD_DELAYED_REF, NULL, no_quota);  	}  	return ret;  } @@ -1802,55 +1982,89 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  				  u64 bytenr, u64 num_bytes,  				  u64 parent, u64 root_objectid,  				  u64 owner, u64 offset, int refs_to_add, +				  int no_quota,  				  struct btrfs_delayed_extent_op *extent_op)  { +	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_path *path;  	struct extent_buffer *leaf;  	struct btrfs_extent_item *item; +	struct btrfs_key key;  	u64 refs;  	int ret; -	int err = 0; +	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; +	if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) +		no_quota = 1; +  	path->reada = 1;  	path->leave_spinning = 1;  	/* this will setup the path even if it fails to insert the back ref */ -	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, -					   path, bytenr, num_bytes, parent, +	ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, +					   bytenr, num_bytes, parent,  					   
root_objectid, owner, offset,  					   refs_to_add, extent_op); -	if (ret == 0) +	if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))  		goto out; - -	if (ret != -EAGAIN) { -		err = ret; +	/* +	 * Ok we were able to insert an inline extent and it appears to be a new +	 * reference, deal with the qgroup accounting. +	 */ +	if (!ret && !no_quota) { +		ASSERT(root->fs_info->quota_enabled); +		leaf = path->nodes[0]; +		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +		item = btrfs_item_ptr(leaf, path->slots[0], +				      struct btrfs_extent_item); +		if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) +			type = BTRFS_QGROUP_OPER_ADD_SHARED; +		btrfs_release_path(path); + +		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +					      bytenr, num_bytes, type, 0);  		goto out;  	} +	/* +	 * Ok we had -EAGAIN which means we didn't have space to insert and +	 * inline extent ref, so just update the reference count and add a +	 * normal backref. +	 */  	leaf = path->nodes[0]; +	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);  	refs = btrfs_extent_refs(leaf, item); +	if (refs) +		type = BTRFS_QGROUP_OPER_ADD_SHARED;  	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);  	if (extent_op)  		__run_delayed_extent_op(extent_op, leaf, item);  	btrfs_mark_buffer_dirty(leaf); -	btrfs_release_path(root->fs_info->extent_root, path); +	btrfs_release_path(path); + +	if (!no_quota) { +		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +					      bytenr, num_bytes, type, 0); +		if (ret) +			goto out; +	}  	path->reada = 1;  	path->leave_spinning = 1; -  	/* now insert the actual backref */  	ret = insert_extent_backref(trans, root->fs_info->extent_root,  				    path, bytenr, parent, root_objectid,  				    owner, offset, refs_to_add); -	BUG_ON(ret); +	if (ret) +		btrfs_abort_transaction(trans, root, ret);  out:  	btrfs_free_path(path); -	return err; +	return ret;  }  static int run_delayed_data_ref(struct btrfs_trans_handle *trans, @@ -1871,16 +2085,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,  	ins.type = BTRFS_EXTENT_ITEM_KEY;  	ref = btrfs_delayed_node_to_data_ref(node); +	trace_run_delayed_data_ref(node, ref, node->action); +  	if (node->type == BTRFS_SHARED_DATA_REF_KEY)  		parent = ref->parent; -	else -		ref_root = ref->root; +	ref_root = ref->root;  	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { -		if (extent_op) { -			BUG_ON(extent_op->update_key); +		if (extent_op)  			flags |= extent_op->flags_to_set; -		}  		ret = alloc_reserved_file_extent(trans, root,  						 parent, ref_root, flags,  						 ref->objectid, ref->offset, @@ -1890,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,  					     node->num_bytes, parent,  					     ref_root, ref->objectid,  					     ref->offset, node->ref_mod, -					     extent_op); +					     node->no_quota, extent_op);  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {  		ret = __btrfs_free_extent(trans, root, node->bytenr,  					  node->num_bytes, parent,  					  ref_root, ref->objectid,  					  ref->offset, node->ref_mod, -					  extent_op); +					  extent_op, node->no_quota);  	} else {  		BUG();  	} @@ -1933,15 +2146,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,  	u32 item_size;  	int ret;  	int err = 0; +	int metadata = !extent_op->is_data; + +	if (trans->aborted) +		return 0; + +	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) +	
	metadata = 0;  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM;  	key.objectid = node->bytenr; -	key.type = BTRFS_EXTENT_ITEM_KEY; -	key.offset = node->num_bytes; +	if (metadata) { +		key.type = BTRFS_METADATA_ITEM_KEY; +		key.offset = extent_op->level; +	} else { +		key.type = BTRFS_EXTENT_ITEM_KEY; +		key.offset = node->num_bytes; +	} + +again:  	path->reada = 1;  	path->leave_spinning = 1;  	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, @@ -1951,8 +2178,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,  		goto out;  	}  	if (ret > 0) { -		err = -EIO; -		goto out; +		if (metadata) { +			if (path->slots[0] > 0) { +				path->slots[0]--; +				btrfs_item_key_to_cpu(path->nodes[0], &key, +						      path->slots[0]); +				if (key.objectid == node->bytenr && +				    key.type == BTRFS_EXTENT_ITEM_KEY && +				    key.offset == node->num_bytes) +					ret = 0; +			} +			if (ret > 0) { +				btrfs_release_path(path); +				metadata = 0; + +				key.objectid = node->bytenr; +				key.offset = node->num_bytes; +				key.type = BTRFS_EXTENT_ITEM_KEY; +				goto again; +			} +		} else { +			err = -EIO; +			goto out; +		}  	}  	leaf = path->nodes[0]; @@ -1990,34 +2238,44 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,  	struct btrfs_key ins;  	u64 parent = 0;  	u64 ref_root = 0; - -	ins.objectid = node->bytenr; -	ins.offset = node->num_bytes; -	ins.type = BTRFS_EXTENT_ITEM_KEY; +	bool skinny_metadata = btrfs_fs_incompat(root->fs_info, +						 SKINNY_METADATA);  	ref = btrfs_delayed_node_to_tree_ref(node); +	trace_run_delayed_tree_ref(node, ref, node->action); +  	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)  		parent = ref->parent; -	else -		ref_root = ref->root; +	ref_root = ref->root; + +	ins.objectid = node->bytenr; +	if (skinny_metadata) { +		ins.offset = ref->level; +		ins.type = BTRFS_METADATA_ITEM_KEY; +	} else { +		ins.offset = node->num_bytes; +		ins.type = BTRFS_EXTENT_ITEM_KEY; +	}  	BUG_ON(node->ref_mod != 1);  	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { -		BUG_ON(!extent_op || !extent_op->update_flags || -		       !extent_op->update_key); +		BUG_ON(!extent_op || !extent_op->update_flags);  		ret = alloc_reserved_tree_block(trans, root,  						parent, ref_root,  						extent_op->flags_to_set,  						&extent_op->key, -						ref->level, &ins); +						ref->level, &ins, +						node->no_quota);  	} else if (node->action == BTRFS_ADD_DELAYED_REF) {  		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,  					     node->num_bytes, parent, ref_root, -					     ref->level, 0, 1, extent_op); +					     ref->level, 0, 1, node->no_quota, +					     extent_op);  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {  		ret = __btrfs_free_extent(trans, root, node->bytenr,  					  node->num_bytes, parent, ref_root, -					  ref->level, 0, 1, extent_op); +					  ref->level, 0, 1, extent_op, +					  node->no_quota);  	} else {  		BUG();  	} @@ -2031,7 +2289,15 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,  			       struct btrfs_delayed_extent_op *extent_op,  			       int insert_reserved)  { -	int ret; +	int ret = 0; + +	if (trans->aborted) { +		if (insert_reserved) +			btrfs_pin_extent(root, node->bytenr, +					 node->num_bytes, 1); +		return 0; +	} +  	if (btrfs_delayed_ref_is_head(node)) {  		struct btrfs_delayed_ref_head *head;  		/* @@ -2042,6 +2308,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,  		 */  		BUG_ON(extent_op);  		head = 
btrfs_delayed_node_to_head(node); +		trace_run_delayed_ref_head(node, head, node->action); +  		if (insert_reserved) {  			btrfs_pin_extent(root, node->bytenr,  					 node->num_bytes, 1); @@ -2049,11 +2317,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,  				ret = btrfs_del_csums(trans, root,  						      node->bytenr,  						      node->num_bytes); -				BUG_ON(ret);  			}  		} -		mutex_unlock(&head->mutex); -		return 0; +		return ret;  	}  	if (node->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -2073,59 +2339,62 @@ static noinline struct btrfs_delayed_ref_node *  select_delayed_ref(struct btrfs_delayed_ref_head *head)  {  	struct rb_node *node; -	struct btrfs_delayed_ref_node *ref; -	int action = BTRFS_ADD_DELAYED_REF; -again: +	struct btrfs_delayed_ref_node *ref, *last = NULL;; +  	/*  	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.  	 * this prevents ref count from going down to zero when  	 * there still are pending delayed ref.  	 */ -	node = rb_prev(&head->node.rb_node); -	while (1) { -		if (!node) -			break; +	node = rb_first(&head->ref_root); +	while (node) {  		ref = rb_entry(node, struct btrfs_delayed_ref_node,  				rb_node); -		if (ref->bytenr != head->node.bytenr) -			break; -		if (ref->action == action) +		if (ref->action == BTRFS_ADD_DELAYED_REF)  			return ref; -		node = rb_prev(node); +		else if (last == NULL) +			last = ref; +		node = rb_next(node);  	} -	if (action == BTRFS_ADD_DELAYED_REF) { -		action = BTRFS_DROP_DELAYED_REF; -		goto again; -	} -	return NULL; +	return last;  } -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, -				       struct btrfs_root *root, -				       struct list_head *cluster) +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, +					     struct btrfs_root *root, +					     unsigned long nr)  {  	struct btrfs_delayed_ref_root *delayed_refs;  	struct btrfs_delayed_ref_node *ref;  	struct btrfs_delayed_ref_head *locked_ref = NULL;  	struct btrfs_delayed_extent_op *extent_op; +	struct btrfs_fs_info *fs_info = root->fs_info; +	ktime_t start = ktime_get();  	int ret; -	int count = 0; +	unsigned long count = 0; +	unsigned long actual_count = 0;  	int must_insert_reserved = 0;  	delayed_refs = &trans->transaction->delayed_refs;  	while (1) {  		if (!locked_ref) { -			/* pick a new head ref from the cluster list */ -			if (list_empty(cluster)) +			if (count >= nr)  				break; -			locked_ref = list_entry(cluster->next, -				     struct btrfs_delayed_ref_head, cluster); +			spin_lock(&delayed_refs->lock); +			locked_ref = btrfs_select_ref_head(trans); +			if (!locked_ref) { +				spin_unlock(&delayed_refs->lock); +				break; +			}  			/* grab the lock that says we are going to process  			 * all the refs for this head */  			ret = btrfs_delayed_ref_lock(trans, locked_ref); - +			spin_unlock(&delayed_refs->lock);  			/*  			 * we may have dropped the spin lock to get the head  			 * mutex lock, and that might have given someone else @@ -2140,6 +2409,37 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  		}  		/* +		 * We need to try and merge add/drops of the same ref since we +		 * can run into issues with relocate dropping the implicit ref +		 * and then it being added back again before the drop can +		 * finish.  If we merged anything we need to re-loop so we can +		 * get a good ref. 
+		 */ +		spin_lock(&locked_ref->lock); +		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, +					 locked_ref); + +		/* +		 * locked_ref is the head node, so we have to go one +		 * node back for any delayed ref updates +		 */ +		ref = select_delayed_ref(locked_ref); + +		if (ref && ref->seq && +		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { +			spin_unlock(&locked_ref->lock); +			btrfs_delayed_ref_unlock(locked_ref); +			spin_lock(&delayed_refs->lock); +			locked_ref->processing = 0; +			delayed_refs->num_heads_ready++; +			spin_unlock(&delayed_refs->lock); +			locked_ref = NULL; +			cond_resched(); +			count++; +			continue; +		} + +		/*  		 * record the must insert reserved flag before we  		 * drop the spin lock.  		 */ @@ -2149,12 +2449,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  		extent_op = locked_ref->extent_op;  		locked_ref->extent_op = NULL; -		/* -		 * locked_ref is the head node, so we have to go one -		 * node back for any delayed ref updates -		 */ -		ref = select_delayed_ref(locked_ref);  		if (!ref) { + +  			/* All delayed refs have been processed, Go ahead  			 * and send the head node to run_one_delayed_ref,  			 * so that any accounting fixes can happen @@ -2162,45 +2459,308 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  			ref = &locked_ref->node;  			if (extent_op && must_insert_reserved) { -				kfree(extent_op); +				btrfs_free_delayed_extent_op(extent_op);  				extent_op = NULL;  			}  			if (extent_op) { -				spin_unlock(&delayed_refs->lock); - +				spin_unlock(&locked_ref->lock);  				ret = run_delayed_extent_op(trans, root,  							    ref, extent_op); -				BUG_ON(ret); -				kfree(extent_op); - -				cond_resched(); -				spin_lock(&delayed_refs->lock); +				btrfs_free_delayed_extent_op(extent_op); + +				if (ret) { +					/* +					 * Need to reset must_insert_reserved if +					 * there was an error so the abort stuff +					 * can cleanup the reserved space +					 * properly. +					 */ +					if (must_insert_reserved) +						locked_ref->must_insert_reserved = 1; +					locked_ref->processing = 0; +					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); +					btrfs_delayed_ref_unlock(locked_ref); +					return ret; +				}  				continue;  			} -			list_del_init(&locked_ref->cluster); -			locked_ref = NULL; +			/* +			 * Need to drop our head ref lock and re-aqcuire the +			 * delayed ref lock and then re-check to make sure +			 * nobody got added. 
+			 */ +			spin_unlock(&locked_ref->lock); +			spin_lock(&delayed_refs->lock); +			spin_lock(&locked_ref->lock); +			if (rb_first(&locked_ref->ref_root) || +			    locked_ref->extent_op) { +				spin_unlock(&locked_ref->lock); +				spin_unlock(&delayed_refs->lock); +				continue; +			} +			ref->in_tree = 0; +			delayed_refs->num_heads--; +			rb_erase(&locked_ref->href_node, +				 &delayed_refs->href_root); +			spin_unlock(&delayed_refs->lock); +		} else { +			actual_count++; +			ref->in_tree = 0; +			rb_erase(&ref->rb_node, &locked_ref->ref_root);  		} +		atomic_dec(&delayed_refs->num_entries); -		ref->in_tree = 0; -		rb_erase(&ref->rb_node, &delayed_refs->root); -		delayed_refs->num_entries--; - -		spin_unlock(&delayed_refs->lock); +		if (!btrfs_delayed_ref_is_head(ref)) { +			/* +			 * when we play the delayed ref, also correct the +			 * ref_mod on head +			 */ +			switch (ref->action) { +			case BTRFS_ADD_DELAYED_REF: +			case BTRFS_ADD_DELAYED_EXTENT: +				locked_ref->node.ref_mod -= ref->ref_mod; +				break; +			case BTRFS_DROP_DELAYED_REF: +				locked_ref->node.ref_mod += ref->ref_mod; +				break; +			default: +				WARN_ON(1); +			} +		} +		spin_unlock(&locked_ref->lock);  		ret = run_one_delayed_ref(trans, root, ref, extent_op,  					  must_insert_reserved); -		BUG_ON(ret); +		btrfs_free_delayed_extent_op(extent_op); +		if (ret) { +			locked_ref->processing = 0; +			btrfs_delayed_ref_unlock(locked_ref); +			btrfs_put_delayed_ref(ref); +			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); +			return ret; +		} + +		/* +		 * If this node is a head, that means all the refs in this head +		 * have been dealt with, and we will pick the next head to deal +		 * with, so we must unlock the head and drop it from the cluster +		 * list before we release it. +		 */ +		if (btrfs_delayed_ref_is_head(ref)) { +			btrfs_delayed_ref_unlock(locked_ref); +			locked_ref = NULL; +		}  		btrfs_put_delayed_ref(ref); -		kfree(extent_op);  		count++; -  		cond_resched(); +	} + +	/* +	 * We don't want to include ref heads since we can have empty ref heads +	 * and those will drastically skew our runtime down since we just do +	 * accounting, no actual extent tree updates. +	 */ +	if (actual_count > 0) { +		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); +		u64 avg; + +		/* +		 * We weigh the current average higher than our current runtime +		 * to avoid large swings in the average. +		 */  		spin_lock(&delayed_refs->lock); +		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; +		avg = div64_u64(avg, 4); +		fs_info->avg_delayed_ref_runtime = avg; +		spin_unlock(&delayed_refs->lock); +	} +	return 0; +} + +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. 
To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) +{ +	struct rb_node *n = root->rb_node; +	struct btrfs_delayed_ref_node *entry; +	int alt = 1; +	u64 middle; +	u64 first = 0, last = 0; + +	n = rb_first(root); +	if (n) { +		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); +		first = entry->bytenr; +	} +	n = rb_last(root); +	if (n) { +		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); +		last = entry->bytenr; +	} +	n = root->rb_node; + +	while (n) { +		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); +		WARN_ON(!entry->in_tree); + +		middle = entry->bytenr; + +		if (alt) +			n = n->rb_left; +		else +			n = n->rb_right; + +		alt = 1 - alt; +	} +	return middle; +} +#endif + +static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) +{ +	u64 num_bytes; + +	num_bytes = heads * (sizeof(struct btrfs_extent_item) + +			     sizeof(struct btrfs_extent_inline_ref)); +	if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) +		num_bytes += heads * sizeof(struct btrfs_tree_block_info); + +	/* +	 * We don't ever fill up leaves all the way so multiply by 2 just to be +	 * closer to what we're really going to want to ouse. +	 */ +	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, +				       struct btrfs_root *root) +{ +	struct btrfs_block_rsv *global_rsv; +	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; +	u64 num_bytes; +	int ret = 0; + +	num_bytes = btrfs_calc_trans_metadata_size(root, 1); +	num_heads = heads_to_leaves(root, num_heads); +	if (num_heads > 1) +		num_bytes += (num_heads - 1) * root->leafsize; +	num_bytes <<= 1; +	global_rsv = &root->fs_info->global_block_rsv; + +	/* +	 * If we can't allocate any more chunks lets make sure we have _lots_ of +	 * wiggle room since running delayed refs can create more delayed refs. 
+	 */ +	if (global_rsv->space_info->full) +		num_bytes <<= 1; + +	spin_lock(&global_rsv->lock); +	if (global_rsv->reserved <= num_bytes) +		ret = 1; +	spin_unlock(&global_rsv->lock); +	return ret; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, +				       struct btrfs_root *root) +{ +	struct btrfs_fs_info *fs_info = root->fs_info; +	u64 num_entries = +		atomic_read(&trans->transaction->delayed_refs.num_entries); +	u64 avg_runtime; +	u64 val; + +	smp_mb(); +	avg_runtime = fs_info->avg_delayed_ref_runtime; +	val = num_entries * avg_runtime; +	if (num_entries * avg_runtime >= NSEC_PER_SEC) +		return 1; +	if (val >= NSEC_PER_SEC / 2) +		return 2; + +	return btrfs_check_space_for_delayed_refs(trans, root); +} + +struct async_delayed_refs { +	struct btrfs_root *root; +	int count; +	int error; +	int sync; +	struct completion wait; +	struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ +	struct async_delayed_refs *async; +	struct btrfs_trans_handle *trans; +	int ret; + +	async = container_of(work, struct async_delayed_refs, work); + +	trans = btrfs_join_transaction(async->root); +	if (IS_ERR(trans)) { +		async->error = PTR_ERR(trans); +		goto done; +	} + +	/* +	 * trans->sync means that when we call end_transaciton, we won't +	 * wait on delayed refs +	 */ +	trans->sync = true; +	ret = btrfs_run_delayed_refs(trans, async->root, async->count); +	if (ret) +		async->error = ret; + +	ret = btrfs_end_transaction(trans, async->root); +	if (ret && !async->error) +		async->error = ret; +done: +	if (async->sync) +		complete(&async->wait); +	else +		kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, +				 unsigned long count, int wait) +{ +	struct async_delayed_refs *async; +	int ret; + +	async = kmalloc(sizeof(*async), GFP_NOFS); +	if (!async) +		return -ENOMEM; + +	async->root = root->fs_info->tree_root; +	async->count = count; +	async->error = 0; +	if (wait) +		async->sync = 1; +	else +		async->sync = 0; +	init_completion(&async->wait); + +	btrfs_init_work(&async->work, delayed_ref_async_start, +			NULL, NULL); + +	btrfs_queue_work(root->fs_info->extent_workers, &async->work); + +	if (wait) { +		wait_for_completion(&async->wait); +		ret = async->error; +		kfree(async); +		return ret;  	} -	return count; +	return 0;  }  /* @@ -2209,97 +2769,101 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,   * 0, which means to process everything in the tree at the start   * of the run (but not newly added entries), or it can be some target   * number you'd like to process. 
+ * + * Returns 0 on success or if called with an aborted transaction + * Returns <0 on error and aborts the transaction   */  int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root, unsigned long count)  {  	struct rb_node *node;  	struct btrfs_delayed_ref_root *delayed_refs; -	struct btrfs_delayed_ref_node *ref; -	struct list_head cluster; +	struct btrfs_delayed_ref_head *head;  	int ret;  	int run_all = count == (unsigned long)-1;  	int run_most = 0; +	/* We'll clean this up in btrfs_cleanup_transaction */ +	if (trans->aborted) +		return 0; +  	if (root == root->fs_info->extent_root)  		root = root->fs_info->tree_root;  	delayed_refs = &trans->transaction->delayed_refs; -	INIT_LIST_HEAD(&cluster); -again: -	spin_lock(&delayed_refs->lock);  	if (count == 0) { -		count = delayed_refs->num_entries * 2; +		count = atomic_read(&delayed_refs->num_entries) * 2;  		run_most = 1;  	} -	while (1) { -		if (!(run_all || run_most) && -		    delayed_refs->num_heads_ready < 64) -			break; - -		/* -		 * go find something we can process in the rbtree.  We start at -		 * the beginning of the tree, and then build a cluster -		 * of refs to process starting at the first one we are able to -		 * lock -		 */ -		ret = btrfs_find_ref_cluster(trans, &cluster, -					     delayed_refs->run_delayed_start); -		if (ret) -			break; -		ret = run_clustered_refs(trans, root, &cluster); -		BUG_ON(ret < 0); - -		count -= min_t(unsigned long, ret, count); - -		if (count == 0) -			break; +again: +#ifdef SCRAMBLE_DELAYED_REFS +	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif +	ret = __btrfs_run_delayed_refs(trans, root, count); +	if (ret < 0) { +		btrfs_abort_transaction(trans, root, ret); +		return ret;  	}  	if (run_all) { -		node = rb_first(&delayed_refs->root); -		if (!node) +		if (!list_empty(&trans->new_bgs)) +			btrfs_create_pending_block_groups(trans, root); + +		spin_lock(&delayed_refs->lock); +		node = rb_first(&delayed_refs->href_root); +		if (!node) { +			spin_unlock(&delayed_refs->lock);  			goto out; +		}  		count = (unsigned long)-1;  		while (node) { -			ref = rb_entry(node, struct btrfs_delayed_ref_node, -				       rb_node); -			if (btrfs_delayed_ref_is_head(ref)) { -				struct btrfs_delayed_ref_head *head; +			head = rb_entry(node, struct btrfs_delayed_ref_head, +					href_node); +			if (btrfs_delayed_ref_is_head(&head->node)) { +				struct btrfs_delayed_ref_node *ref; -				head = btrfs_delayed_node_to_head(ref); +				ref = &head->node;  				atomic_inc(&ref->refs);  				spin_unlock(&delayed_refs->lock); +				/* +				 * Mutex was contended, block until it's +				 * released and try again +				 */  				mutex_lock(&head->mutex);  				mutex_unlock(&head->mutex);  				btrfs_put_delayed_ref(ref);  				cond_resched();  				goto again; +			} else { +				WARN_ON(1);  			}  			node = rb_next(node);  		}  		spin_unlock(&delayed_refs->lock); -		schedule_timeout(1); +		cond_resched();  		goto again;  	}  out: -	spin_unlock(&delayed_refs->lock); +	ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); +	if (ret) +		return ret; +	assert_qgroups_uptodate(trans);  	return 0;  }  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,  				struct btrfs_root *root,  				u64 bytenr, u64 num_bytes, u64 flags, -				int is_data) +				int level, int is_data)  {  	struct btrfs_delayed_extent_op *extent_op;  	int ret; -	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); +	extent_op = btrfs_alloc_delayed_extent_op();  	if (!extent_op)  		return -ENOMEM; 
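The throttling logic introduced in the hunks above estimates how long the queued delayed refs would take to process by multiplying the entry count with a running average of per-ref runtime, and starts throttling once that estimate reaches half a second (return value 2) or a full second (return value 1) of work. The following is only a minimal userspace sketch of that arithmetic, using hypothetical stand-in types rather than the kernel's own structures:

#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Hypothetical stand-in for the delayed-ref bookkeeping shown above. */
struct ref_backlog {
	uint64_t num_entries;    /* queued delayed ref nodes */
	uint64_t avg_runtime_ns; /* moving average cost of running one ref */
};

/* 0: no throttling, 1: >= 1s of estimated backlog, 2: >= 0.5s of backlog */
static int should_throttle(const struct ref_backlog *b)
{
	uint64_t est_ns = b->num_entries * b->avg_runtime_ns;

	if (est_ns >= NSEC_PER_SEC)
		return 1;
	if (est_ns >= NSEC_PER_SEC / 2)
		return 2;
	return 0;
}

In the patch itself, when neither time threshold is hit the decision falls back to btrfs_check_space_for_delayed_refs(), so callers also throttle when the global reserve is too small to absorb the leaves those refs may dirty.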
@@ -2307,10 +2871,12 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,  	extent_op->update_flags = 1;  	extent_op->update_key = 0;  	extent_op->is_data = is_data ? 1 : 0; +	extent_op->level = level; -	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); +	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, +					  num_bytes, extent_op);  	if (ret) -		kfree(extent_op); +		btrfs_free_delayed_extent_op(extent_op);  	return ret;  } @@ -2326,56 +2892,58 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,  	struct rb_node *node;  	int ret = 0; -	ret = -ENOENT;  	delayed_refs = &trans->transaction->delayed_refs;  	spin_lock(&delayed_refs->lock);  	head = btrfs_find_delayed_ref_head(trans, bytenr); -	if (!head) -		goto out; +	if (!head) { +		spin_unlock(&delayed_refs->lock); +		return 0; +	}  	if (!mutex_trylock(&head->mutex)) {  		atomic_inc(&head->node.refs);  		spin_unlock(&delayed_refs->lock); -		btrfs_release_path(root->fs_info->extent_root, path); +		btrfs_release_path(path); +		/* +		 * Mutex was contended, block until it's released and let +		 * caller try again +		 */  		mutex_lock(&head->mutex);  		mutex_unlock(&head->mutex);  		btrfs_put_delayed_ref(&head->node);  		return -EAGAIN;  	} +	spin_unlock(&delayed_refs->lock); -	node = rb_prev(&head->node.rb_node); -	if (!node) -		goto out_unlock; - -	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - -	if (ref->bytenr != bytenr) -		goto out_unlock; +	spin_lock(&head->lock); +	node = rb_first(&head->ref_root); +	while (node) { +		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); +		node = rb_next(node); -	ret = 1; -	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) -		goto out_unlock; +		/* If it's a shared ref we know a cross reference exists */ +		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { +			ret = 1; +			break; +		} -	data_ref = btrfs_delayed_node_to_data_ref(ref); +		data_ref = btrfs_delayed_node_to_data_ref(ref); -	node = rb_prev(node); -	if (node) { -		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); -		if (ref->bytenr == bytenr) -			goto out_unlock; +		/* +		 * If our ref doesn't match the one we're currently looking at +		 * then we have a cross reference. 
+		 */ +		if (data_ref->root != root->root_key.objectid || +		    data_ref->objectid != objectid || +		    data_ref->offset != offset) { +			ret = 1; +			break; +		}  	} - -	if (data_ref->root != root->root_key.objectid || -	    data_ref->objectid != objectid || data_ref->offset != offset) -		goto out_unlock; - -	ret = 0; -out_unlock: +	spin_unlock(&head->lock);  	mutex_unlock(&head->mutex); -out: -	spin_unlock(&delayed_refs->lock);  	return ret;  } @@ -2400,7 +2968,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans,  	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);  	if (ret < 0)  		goto out; -	BUG_ON(ret == 0); +	BUG_ON(ret == 0); /* Corruption */  	ret = -ENOENT;  	if (path->slots[0] == 0) @@ -2486,130 +3054,10 @@ out:  	return ret;  } -#if 0 -int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		    struct extent_buffer *buf, u32 nr_extents) -{ -	struct btrfs_key key; -	struct btrfs_file_extent_item *fi; -	u64 root_gen; -	u32 nritems; -	int i; -	int level; -	int ret = 0; -	int shared = 0; - -	if (!root->ref_cows) -		return 0; - -	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { -		shared = 0; -		root_gen = root->root_key.offset; -	} else { -		shared = 1; -		root_gen = trans->transid - 1; -	} - -	level = btrfs_header_level(buf); -	nritems = btrfs_header_nritems(buf); - -	if (level == 0) { -		struct btrfs_leaf_ref *ref; -		struct btrfs_extent_info *info; - -		ref = btrfs_alloc_leaf_ref(root, nr_extents); -		if (!ref) { -			ret = -ENOMEM; -			goto out; -		} - -		ref->root_gen = root_gen; -		ref->bytenr = buf->start; -		ref->owner = btrfs_header_owner(buf); -		ref->generation = btrfs_header_generation(buf); -		ref->nritems = nr_extents; -		info = ref->extents; - -		for (i = 0; nr_extents > 0 && i < nritems; i++) { -			u64 disk_bytenr; -			btrfs_item_key_to_cpu(buf, &key, i); -			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) -				continue; -			fi = btrfs_item_ptr(buf, i, -					    struct btrfs_file_extent_item); -			if (btrfs_file_extent_type(buf, fi) == -			    BTRFS_FILE_EXTENT_INLINE) -				continue; -			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); -			if (disk_bytenr == 0) -				continue; - -			info->bytenr = disk_bytenr; -			info->num_bytes = -				btrfs_file_extent_disk_num_bytes(buf, fi); -			info->objectid = key.objectid; -			info->offset = key.offset; -			info++; -		} - -		ret = btrfs_add_leaf_ref(root, ref, shared); -		if (ret == -EEXIST && shared) { -			struct btrfs_leaf_ref *old; -			old = btrfs_lookup_leaf_ref(root, ref->bytenr); -			BUG_ON(!old); -			btrfs_remove_leaf_ref(root, old); -			btrfs_free_leaf_ref(root, old); -			ret = btrfs_add_leaf_ref(root, ref, shared); -		} -		WARN_ON(ret); -		btrfs_free_leaf_ref(root, ref); -	} -out: -	return ret; -} - -/* when a block goes through cow, we update the reference counts of - * everything that block points to.  The internal pointers of the block - * can be in just about any order, and it is likely to have clusters of - * things that are close together and clusters of things that are not. - * - * To help reduce the seeks that come with updating all of these reference - * counts, sort them by byte number before actual updates are done. - * - * struct refsort is used to match byte number to slot in the btree block. - * we sort based on the byte number and then use the slot to actually - * find the item. - * - * struct refsort is smaller than strcut btrfs_item and smaller than - * struct btrfs_key_ptr.  
Since we're currently limited to the page size - * for a btree block, there's no way for a kmalloc of refsorts for a - * single node to be bigger than a page. - */ -struct refsort { -	u64 bytenr; -	u32 slot; -}; - -/* - * for passing into sort() - */ -static int refsort_cmp(const void *a_void, const void *b_void) -{ -	const struct refsort *a = a_void; -	const struct refsort *b = b_void; - -	if (a->bytenr < b->bytenr) -		return -1; -	if (a->bytenr > b->bytenr) -		return 1; -	return 0; -} -#endif -  static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct extent_buffer *buf, -			   int full_backref, int inc) +			   int full_backref, int inc, int no_quota)  {  	u64 bytenr;  	u64 num_bytes; @@ -2622,13 +3070,17 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  	int level;  	int ret = 0;  	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, -			    u64, u64, u64, u64, u64, u64); +			    u64, u64, u64, u64, u64, u64, int); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return 0; +#endif  	ref_root = btrfs_header_owner(buf);  	nritems = btrfs_header_nritems(buf);  	level = btrfs_header_level(buf); -	if (!root->ref_cows && level == 0) +	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)  		return 0;  	if (inc) @@ -2659,34 +3111,34 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,  			key.offset -= btrfs_file_extent_offset(buf, fi);  			ret = process_func(trans, root, bytenr, num_bytes,  					   parent, ref_root, key.objectid, -					   key.offset); +					   key.offset, no_quota);  			if (ret)  				goto fail;  		} else {  			bytenr = btrfs_node_blockptr(buf, i);  			num_bytes = btrfs_level_size(root, level - 1);  			ret = process_func(trans, root, bytenr, num_bytes, -					   parent, ref_root, level - 1, 0); +					   parent, ref_root, level - 1, 0, +					   no_quota);  			if (ret)  				goto fail;  		}  	}  	return 0;  fail: -	BUG();  	return ret;  }  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		  struct extent_buffer *buf, int full_backref) +		  struct extent_buffer *buf, int full_backref, int no_quota)  { -	return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);  }  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		  struct extent_buffer *buf, int full_backref) +		  struct extent_buffer *buf, int full_backref, int no_quota)  { -	return __btrfs_mod_ref(trans, root, buf, full_backref, 0); +	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);  }  static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -2702,16 +3154,18 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,  	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);  	if (ret < 0)  		goto fail; -	BUG_ON(ret); +	BUG_ON(ret); /* Corruption */  	leaf = path->nodes[0];  	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);  	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));  	btrfs_mark_buffer_dirty(leaf); -	btrfs_release_path(extent_root, path); +	btrfs_release_path(path);  fail: -	if (ret) +	if (ret) { +		btrfs_abort_transaction(trans, root, ret);  		return ret; +	}  	return 0;  } @@ -2741,6 +3195,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,  	struct btrfs_root *root = block_group->fs_info->tree_root;  	struct inode *inode = NULL;  	
u64 alloc_hint = 0; +	int dcs = BTRFS_DC_ERROR;  	int num_pages = 0;  	int retries = 0;  	int ret = 0; @@ -2760,7 +3215,7 @@ again:  	inode = lookup_free_space_inode(root, block_group, path);  	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {  		ret = PTR_ERR(inode); -		btrfs_release_path(root, path); +		btrfs_release_path(path);  		goto out;  	} @@ -2777,6 +3232,13 @@ again:  		goto again;  	} +	/* We've already setup this transaction, go ahead and exit */ +	if (block_group->cache_generation == trans->transid && +	    i_size_read(inode)) { +		dcs = BTRFS_DC_SETUP; +		goto out_put; +	} +  	/*  	 * We want to set the generation to 0, that way if anything goes wrong  	 * from here on out we know not to trust this cache when we load up next @@ -2787,30 +3249,41 @@ again:  	WARN_ON(ret);  	if (i_size_read(inode) > 0) { -		ret = btrfs_truncate_free_space_cache(root, trans, path, -						      inode); +		ret = btrfs_check_trunc_cache_free_space(root, +					&root->fs_info->global_block_rsv); +		if (ret) +			goto out_put; + +		ret = btrfs_truncate_free_space_cache(root, trans, inode);  		if (ret)  			goto out_put;  	}  	spin_lock(&block_group->lock); -	if (block_group->cached != BTRFS_CACHE_FINISHED) { +	if (block_group->cached != BTRFS_CACHE_FINISHED || +	    !btrfs_test_opt(root, SPACE_CACHE) || +	    block_group->delalloc_bytes) { +		/* +		 * don't bother trying to write stuff out _if_ +		 * a) we're not cached, +		 * b) we're with nospace_cache mount option. +		 */ +		dcs = BTRFS_DC_WRITTEN;  		spin_unlock(&block_group->lock);  		goto out_put;  	}  	spin_unlock(&block_group->lock); -	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); +	/* +	 * Try to preallocate enough space based on how big the block group is. +	 * Keep in mind this has to include any pinned space which could end up +	 * taking up quite a bit since it's not folded into the other space +	 * cache. +	 */ +	num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);  	if (!num_pages)  		num_pages = 1; -	/* -	 * Just to make absolutely sure we have enough space, we're going to -	 * preallocate 12 pages worth of space for each block group.  In -	 * practice we ought to use at most 8, but we need extra space so we can -	 * add our header and have a terminator between the extents and the -	 * bitmaps. 
-	 */  	num_pages *= 16;  	num_pages *= PAGE_CACHE_SIZE; @@ -2821,17 +3294,19 @@ again:  	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,  					      num_pages, num_pages,  					      &alloc_hint); +	if (!ret) +		dcs = BTRFS_DC_SETUP;  	btrfs_free_reserved_data_space(inode, num_pages); +  out_put:  	iput(inode);  out_free: -	btrfs_release_path(root, path); +	btrfs_release_path(path);  out:  	spin_lock(&block_group->lock); -	if (ret) -		block_group->disk_cache_state = BTRFS_DC_ERROR; -	else -		block_group->disk_cache_state = BTRFS_DC_SETUP; +	if (!ret && dcs == BTRFS_DC_SETUP) +		block_group->cache_generation = trans->transid; +	block_group->disk_cache_state = dcs;  	spin_unlock(&block_group->lock);  	return ret; @@ -2872,7 +3347,8 @@ again:  		if (last == 0) {  			err = btrfs_run_delayed_refs(trans, root,  						     (unsigned long)-1); -			BUG_ON(err); +			if (err) /* File system offline */ +				goto out;  		}  		cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -2899,8 +3375,9 @@ again:  		last = cache->key.objectid + cache->key.offset;  		err = write_one_cache_group(trans, root, path, cache); -		BUG_ON(err);  		btrfs_put_block_group(cache); +		if (err) /* File system offline */ +			goto out;  	}  	while (1) { @@ -2912,7 +3389,8 @@ again:  		if (last == 0) {  			err = btrfs_run_delayed_refs(trans, root,  						     (unsigned long)-1); -			BUG_ON(err); +			if (err) /* File system offline */ +				goto out;  		}  		cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -2937,20 +3415,21 @@ again:  			continue;  		} -		btrfs_write_out_cache(root, trans, cache, path); +		err = btrfs_write_out_cache(root, trans, cache, path);  		/*  		 * If we didn't have an error then the cache state is still  		 * NEED_WRITE, so we can set it to WRITTEN.  		 
*/ -		if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) +		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)  			cache->disk_cache_state = BTRFS_DC_WRITTEN;  		last = cache->key.objectid + cache->key.offset;  		btrfs_put_block_group(cache);  	} +out:  	btrfs_free_path(path); -	return 0; +	return err;  }  int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -2966,6 +3445,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)  	return readonly;  } +static const char *alloc_name(u64 flags) +{ +	switch (flags) { +	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: +		return "mixed"; +	case BTRFS_BLOCK_GROUP_METADATA: +		return "metadata"; +	case BTRFS_BLOCK_GROUP_DATA: +		return "data"; +	case BTRFS_BLOCK_GROUP_SYSTEM: +		return "system"; +	default: +		WARN_ON(1); +		return "invalid-combination"; +	}; +} +  static int update_space_info(struct btrfs_fs_info *info, u64 flags,  			     u64 total_bytes, u64 bytes_used,  			     struct btrfs_space_info **space_info) @@ -2973,6 +3469,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,  	struct btrfs_space_info *found;  	int i;  	int factor; +	int ret;  	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |  		     BTRFS_BLOCK_GROUP_RAID10)) @@ -2996,13 +3493,17 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,  	if (!found)  		return -ENOMEM; +	ret = percpu_counter_init(&found->total_bytes_pinned, 0); +	if (ret) { +		kfree(found); +		return ret; +	} +  	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)  		INIT_LIST_HEAD(&found->block_groups[i]);  	init_rwsem(&found->groups_sem);  	spin_lock_init(&found->lock); -	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | -				BTRFS_BLOCK_GROUP_SYSTEM | -				BTRFS_BLOCK_GROUP_METADATA); +	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;  	found->total_bytes = total_bytes;  	found->disk_total = total_bytes * factor;  	found->bytes_used = bytes_used; @@ -3012,74 +3513,156 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,  	found->bytes_readonly = 0;  	found->bytes_may_use = 0;  	found->full = 0; -	found->force_alloc = 0; +	found->force_alloc = CHUNK_ALLOC_NO_FORCE; +	found->chunk_alloc = 0; +	found->flush = 0; +	init_waitqueue_head(&found->wait); + +	ret = kobject_init_and_add(&found->kobj, &space_info_ktype, +				    info->space_info_kobj, "%s", +				    alloc_name(found->flags)); +	if (ret) { +		kfree(found); +		return ret; +	} +  	*space_info = found;  	list_add_rcu(&found->list, &info->space_info); -	atomic_set(&found->caching_threads, 0); -	return 0; +	if (flags & BTRFS_BLOCK_GROUP_DATA) +		info->data_sinfo = found; + +	return ret;  }  static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)  { -	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | -				   BTRFS_BLOCK_GROUP_RAID1 | -				   BTRFS_BLOCK_GROUP_RAID10 | -				   BTRFS_BLOCK_GROUP_DUP); -	if (extra_flags) { -		if (flags & BTRFS_BLOCK_GROUP_DATA) -			fs_info->avail_data_alloc_bits |= extra_flags; -		if (flags & BTRFS_BLOCK_GROUP_METADATA) -			fs_info->avail_metadata_alloc_bits |= extra_flags; -		if (flags & BTRFS_BLOCK_GROUP_SYSTEM) -			fs_info->avail_system_alloc_bits |= extra_flags; +	u64 extra_flags = chunk_to_extended(flags) & +				BTRFS_EXTENDED_PROFILE_MASK; + +	write_seqlock(&fs_info->profiles_lock); +	if (flags & BTRFS_BLOCK_GROUP_DATA) +		fs_info->avail_data_alloc_bits |= extra_flags; +	if (flags & BTRFS_BLOCK_GROUP_METADATA) +		fs_info->avail_metadata_alloc_bits |= extra_flags; +	if (flags & 
BTRFS_BLOCK_GROUP_SYSTEM) +		fs_info->avail_system_alloc_bits |= extra_flags; +	write_sequnlock(&fs_info->profiles_lock); +} + +/* + * returns target flags in extended format or 0 if restripe for this + * chunk_type is not in progress + * + * should be called with either volume_mutex or balance_lock held + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ +	struct btrfs_balance_control *bctl = fs_info->balance_ctl; +	u64 target = 0; + +	if (!bctl) +		return 0; + +	if (flags & BTRFS_BLOCK_GROUP_DATA && +	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { +		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; +	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && +		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { +		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; +	} else if (flags & BTRFS_BLOCK_GROUP_METADATA && +		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { +		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;  	} + +	return target;  } -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Returns reduced profile in chunk format.  If profile changing is in + * progress (either running or paused) picks the target profile (if it's + * already available), otherwise falls back to plain reducing. + */ +static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)  { -	u64 num_devices = root->fs_info->fs_devices->rw_devices; +	/* +	 * we add in the count of missing devices because we want +	 * to make sure that any RAID levels on a degraded FS +	 * continue to be honored. +	 */ +	u64 num_devices = root->fs_info->fs_devices->rw_devices + +		root->fs_info->fs_devices->missing_devices; +	u64 target; +	u64 tmp; + +	/* +	 * see if restripe for this chunk_type is in progress, if so +	 * try to reduce to the target profile +	 */ +	spin_lock(&root->fs_info->balance_lock); +	target = get_restripe_target(root->fs_info, flags); +	if (target) { +		/* pick target profile only if it's already available */ +		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { +			spin_unlock(&root->fs_info->balance_lock); +			return extended_to_chunk(target); +		} +	} +	spin_unlock(&root->fs_info->balance_lock); +	/* First, mask out the RAID levels which aren't possible */  	if (num_devices == 1) -		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); +		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | +			   BTRFS_BLOCK_GROUP_RAID5); +	if (num_devices < 3) +		flags &= ~BTRFS_BLOCK_GROUP_RAID6;  	if (num_devices < 4)  		flags &= ~BTRFS_BLOCK_GROUP_RAID10; -	if ((flags & BTRFS_BLOCK_GROUP_DUP) && -	    (flags & (BTRFS_BLOCK_GROUP_RAID1 | -		      BTRFS_BLOCK_GROUP_RAID10))) { -		flags &= ~BTRFS_BLOCK_GROUP_DUP; -	} +	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | +		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | +		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); +	flags &= ~tmp; -	if ((flags & BTRFS_BLOCK_GROUP_RAID1) && -	    (flags & BTRFS_BLOCK_GROUP_RAID10)) { -		flags &= ~BTRFS_BLOCK_GROUP_RAID1; -	} +	if (tmp & BTRFS_BLOCK_GROUP_RAID6) +		tmp = BTRFS_BLOCK_GROUP_RAID6; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID5) +		tmp = BTRFS_BLOCK_GROUP_RAID5; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID10) +		tmp = BTRFS_BLOCK_GROUP_RAID10; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID1) +		tmp = BTRFS_BLOCK_GROUP_RAID1; +	else if (tmp & BTRFS_BLOCK_GROUP_RAID0) +		tmp = BTRFS_BLOCK_GROUP_RAID0; -	if ((flags & 
BTRFS_BLOCK_GROUP_RAID0) && -	    ((flags & BTRFS_BLOCK_GROUP_RAID1) | -	     (flags & BTRFS_BLOCK_GROUP_RAID10) | -	     (flags & BTRFS_BLOCK_GROUP_DUP))) -		flags &= ~BTRFS_BLOCK_GROUP_RAID0; -	return flags; +	return extended_to_chunk(flags | tmp);  } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)  { -	if (flags & BTRFS_BLOCK_GROUP_DATA) -		flags |= root->fs_info->avail_data_alloc_bits & -			 root->fs_info->data_alloc_profile; -	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) -		flags |= root->fs_info->avail_system_alloc_bits & -			 root->fs_info->system_alloc_profile; -	else if (flags & BTRFS_BLOCK_GROUP_METADATA) -		flags |= root->fs_info->avail_metadata_alloc_bits & -			 root->fs_info->metadata_alloc_profile; +	unsigned seq; +	u64 flags; + +	do { +		flags = orig_flags; +		seq = read_seqbegin(&root->fs_info->profiles_lock); + +		if (flags & BTRFS_BLOCK_GROUP_DATA) +			flags |= root->fs_info->avail_data_alloc_bits; +		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) +			flags |= root->fs_info->avail_system_alloc_bits; +		else if (flags & BTRFS_BLOCK_GROUP_METADATA) +			flags |= root->fs_info->avail_metadata_alloc_bits; +	} while (read_seqretry(&root->fs_info->profiles_lock, seq)); +  	return btrfs_reduce_alloc_profile(root, flags);  } -static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)  {  	u64 flags; +	u64 ret;  	if (data)  		flags = BTRFS_BLOCK_GROUP_DATA; @@ -3088,13 +3671,8 @@ static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)  	else  		flags = BTRFS_BLOCK_GROUP_METADATA; -	return get_alloc_profile(root, flags); -} - -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) -{ -	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, -						       BTRFS_BLOCK_GROUP_DATA); +	ret = get_alloc_profile(root, flags); +	return ret;  }  /* @@ -3105,18 +3683,19 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)  {  	struct btrfs_space_info *data_sinfo;  	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct btrfs_fs_info *fs_info = root->fs_info;  	u64 used;  	int ret = 0, committed = 0, alloc_chunk = 1;  	/* make sure bytes are sectorsize aligned */ -	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); +	bytes = ALIGN(bytes, root->sectorsize); -	if (root == root->fs_info->tree_root) { -		alloc_chunk = 0; +	if (btrfs_is_free_space_inode(inode)) {  		committed = 1; +		ASSERT(current->journal_info);  	} -	data_sinfo = BTRFS_I(inode)->space_info; +	data_sinfo = fs_info->data_sinfo;  	if (!data_sinfo)  		goto alloc; @@ -3137,33 +3716,57 @@ again:  		if (!data_sinfo->full && alloc_chunk) {  			u64 alloc_target; -			data_sinfo->force_alloc = 1; +			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;  			spin_unlock(&data_sinfo->lock);  alloc:  			alloc_target = btrfs_get_alloc_profile(root, 1); -			trans = btrfs_join_transaction(root, 1); +			/* +			 * It is ugly that we don't call nolock join +			 * transaction for the free space inode case here. +			 * But it is safe because we only do the data space +			 * reservation for the free space cache in the +			 * transaction context, the common join transaction +			 * just increase the counter of the current transaction +			 * handler, doesn't try to acquire the trans_lock of +			 * the fs. 
+			 */ +			trans = btrfs_join_transaction(root);  			if (IS_ERR(trans))  				return PTR_ERR(trans);  			ret = do_chunk_alloc(trans, root->fs_info->extent_root, -					     bytes + 2 * 1024 * 1024, -					     alloc_target, 0); +					     alloc_target, +					     CHUNK_ALLOC_NO_FORCE);  			btrfs_end_transaction(trans, root); -			if (ret < 0) -				return ret; - -			if (!data_sinfo) { -				btrfs_set_inode_space_info(root, inode); -				data_sinfo = BTRFS_I(inode)->space_info; +			if (ret < 0) { +				if (ret != -ENOSPC) +					return ret; +				else +					goto commit_trans;  			} + +			if (!data_sinfo) +				data_sinfo = fs_info->data_sinfo; +  			goto again;  		} + +		/* +		 * If we don't have enough pinned space to deal with this +		 * allocation don't bother committing the transaction. +		 */ +		if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, +					   bytes) < 0) +			committed = 1;  		spin_unlock(&data_sinfo->lock);  		/* commit the current transaction and try again */ -		if (!committed && !root->fs_info->open_ioctl_trans) { +commit_trans: +		if (!committed && +		    !atomic_read(&root->fs_info->open_ioctl_trans)) {  			committed = 1; -			trans = btrfs_join_transaction(root, 1); + +			trans = btrfs_join_transaction(root);  			if (IS_ERR(trans))  				return PTR_ERR(trans);  			ret = btrfs_commit_transaction(trans, root); @@ -3172,31 +3775,21 @@ alloc:  			goto again;  		} -#if 0 /* I hope we never need this code again, just in case */ -		printk(KERN_ERR "no space left, need %llu, %llu bytes_used, " -		       "%llu bytes_reserved, " "%llu bytes_pinned, " -		       "%llu bytes_readonly, %llu may use %llu total\n", -		       (unsigned long long)bytes, -		       (unsigned long long)data_sinfo->bytes_used, -		       (unsigned long long)data_sinfo->bytes_reserved, -		       (unsigned long long)data_sinfo->bytes_pinned, -		       (unsigned long long)data_sinfo->bytes_readonly, -		       (unsigned long long)data_sinfo->bytes_may_use, -		       (unsigned long long)data_sinfo->total_bytes); -#endif +		trace_btrfs_space_reservation(root->fs_info, +					      "space_info:enospc", +					      data_sinfo->flags, bytes, 1);  		return -ENOSPC;  	}  	data_sinfo->bytes_may_use += bytes; -	BTRFS_I(inode)->reserved_bytes += bytes; +	trace_btrfs_space_reservation(root->fs_info, "space_info", +				      data_sinfo->flags, bytes, 1);  	spin_unlock(&data_sinfo->lock);  	return 0;  }  /* - * called when we are clearing an delalloc extent from the - * inode's io_tree or there was an error for whatever reason - * after calling btrfs_check_data_free_space + * Called if we need to clear a data reservation for this inode.   
*/  void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)  { @@ -3204,12 +3797,14 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)  	struct btrfs_space_info *data_sinfo;  	/* make sure bytes are sectorsize aligned */ -	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); +	bytes = ALIGN(bytes, root->sectorsize); -	data_sinfo = BTRFS_I(inode)->space_info; +	data_sinfo = root->fs_info->data_sinfo;  	spin_lock(&data_sinfo->lock); +	WARN_ON(data_sinfo->bytes_may_use < bytes);  	data_sinfo->bytes_may_use -= bytes; -	BTRFS_I(inode)->reserved_bytes -= bytes; +	trace_btrfs_space_reservation(root->fs_info, "space_info", +				      data_sinfo->flags, bytes, 0);  	spin_unlock(&data_sinfo->lock);  } @@ -3221,69 +3816,159 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)  	rcu_read_lock();  	list_for_each_entry_rcu(found, head, list) {  		if (found->flags & BTRFS_BLOCK_GROUP_METADATA) -			found->force_alloc = 1; +			found->force_alloc = CHUNK_ALLOC_FORCE;  	}  	rcu_read_unlock();  } +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ +	return (global->size << 1); +} +  static int should_alloc_chunk(struct btrfs_root *root, -			      struct btrfs_space_info *sinfo, u64 alloc_bytes) +			      struct btrfs_space_info *sinfo, int force)  { +	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;  	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; +	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;  	u64 thresh; -	if (sinfo->bytes_used + sinfo->bytes_reserved + -	    alloc_bytes + 256 * 1024 * 1024 < num_bytes) -		return 0; +	if (force == CHUNK_ALLOC_FORCE) +		return 1; -	if (sinfo->bytes_used + sinfo->bytes_reserved + -	    alloc_bytes < div_factor(num_bytes, 8)) -		return 0; +	/* +	 * We need to take into account the global rsv because for all intents +	 * and purposes it's used space.  Don't worry about locking the +	 * global_rsv, it doesn't change except when the transaction commits. +	 */ +	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) +		num_allocated += calc_global_rsv_need_space(global_rsv); -	thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); -	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); +	/* +	 * in limited mode, we want to have some free space up to +	 * about 1% of the FS size. 
+	 */ +	if (force == CHUNK_ALLOC_LIMITED) { +		thresh = btrfs_super_total_bytes(root->fs_info->super_copy); +		thresh = max_t(u64, 64 * 1024 * 1024, +			       div_factor_fine(thresh, 1)); -	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) -		return 0; +		if (num_bytes - num_allocated < thresh) +			return 1; +	} +	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) +		return 0;  	return 1;  } +static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +{ +	u64 num_dev; + +	if (type & (BTRFS_BLOCK_GROUP_RAID10 | +		    BTRFS_BLOCK_GROUP_RAID0 | +		    BTRFS_BLOCK_GROUP_RAID5 | +		    BTRFS_BLOCK_GROUP_RAID6)) +		num_dev = root->fs_info->fs_devices->rw_devices; +	else if (type & BTRFS_BLOCK_GROUP_RAID1) +		num_dev = 2; +	else +		num_dev = 1;	/* DUP or single */ + +	/* metadata for updaing devices and chunk tree */ +	return btrfs_calc_trans_metadata_size(root, num_dev + 1); +} + +static void check_system_chunk(struct btrfs_trans_handle *trans, +			       struct btrfs_root *root, u64 type) +{ +	struct btrfs_space_info *info; +	u64 left; +	u64 thresh; + +	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +	spin_lock(&info->lock); +	left = info->total_bytes - info->bytes_used - info->bytes_pinned - +		info->bytes_reserved - info->bytes_readonly; +	spin_unlock(&info->lock); + +	thresh = get_system_chunk_thresh(root, type); +	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { +		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", +			left, thresh, type); +		dump_space_info(info, 0, 0); +	} + +	if (left < thresh) { +		u64 flags; + +		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); +		btrfs_alloc_chunk(trans, root, flags); +	} +} +  static int do_chunk_alloc(struct btrfs_trans_handle *trans, -			  struct btrfs_root *extent_root, u64 alloc_bytes, -			  u64 flags, int force) +			  struct btrfs_root *extent_root, u64 flags, int force)  {  	struct btrfs_space_info *space_info;  	struct btrfs_fs_info *fs_info = extent_root->fs_info; +	int wait_for_alloc = 0;  	int ret = 0; -	mutex_lock(&fs_info->chunk_mutex); - -	flags = btrfs_reduce_alloc_profile(extent_root, flags); +	/* Don't re-enter if we're already allocating a chunk */ +	if (trans->allocating_chunk) +		return -ENOSPC;  	space_info = __find_space_info(extent_root->fs_info, flags);  	if (!space_info) {  		ret = update_space_info(extent_root->fs_info, flags,  					0, 0, &space_info); -		BUG_ON(ret); +		BUG_ON(ret); /* -ENOMEM */  	} -	BUG_ON(!space_info); +	BUG_ON(!space_info); /* Logic error */ +again:  	spin_lock(&space_info->lock); -	if (space_info->force_alloc) -		force = 1; +	if (force < space_info->force_alloc) +		force = space_info->force_alloc;  	if (space_info->full) { +		if (should_alloc_chunk(extent_root, space_info, force)) +			ret = -ENOSPC; +		else +			ret = 0;  		spin_unlock(&space_info->lock); -		goto out; +		return ret;  	} -	if (!force && !should_alloc_chunk(extent_root, space_info, -					  alloc_bytes)) { +	if (!should_alloc_chunk(extent_root, space_info, force)) {  		spin_unlock(&space_info->lock); -		goto out; +		return 0; +	} else if (space_info->chunk_alloc) { +		wait_for_alloc = 1; +	} else { +		space_info->chunk_alloc = 1;  	} +  	spin_unlock(&space_info->lock); +	mutex_lock(&fs_info->chunk_mutex); + +	/* +	 * The chunk_mutex is held throughout the entirety of a chunk +	 * allocation, so once we've acquired the chunk_mutex we know that the +	 * other guy is done and we need to recheck and see if we should +	 * allocate. 
+	 */ +	if (wait_for_alloc) { +		mutex_unlock(&fs_info->chunk_mutex); +		wait_for_alloc = 0; +		goto again; +	} + +	trans->allocating_chunk = true; +  	/*  	 * If we have mixed data/metadata chunks we want to make sure we keep  	 * allocating mixed chunks instead of individual chunks. @@ -3303,106 +3988,472 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,  			force_metadata_allocation(fs_info);  	} +	/* +	 * Check if we have enough space in SYSTEM chunk because we may need +	 * to update devices. +	 */ +	check_system_chunk(trans, extent_root, flags); +  	ret = btrfs_alloc_chunk(trans, extent_root, flags); +	trans->allocating_chunk = false; +  	spin_lock(&space_info->lock); +	if (ret < 0 && ret != -ENOSPC) +		goto out;  	if (ret)  		space_info->full = 1;  	else  		ret = 1; -	space_info->force_alloc = 0; -	spin_unlock(&space_info->lock); + +	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;  out: -	mutex_unlock(&extent_root->fs_info->chunk_mutex); +	space_info->chunk_alloc = 0; +	spin_unlock(&space_info->lock); +	mutex_unlock(&fs_info->chunk_mutex);  	return ret;  } +static int can_overcommit(struct btrfs_root *root, +			  struct btrfs_space_info *space_info, u64 bytes, +			  enum btrfs_reserve_flush_enum flush) +{ +	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; +	u64 profile = btrfs_get_alloc_profile(root, 0); +	u64 space_size; +	u64 avail; +	u64 used; + +	used = space_info->bytes_used + space_info->bytes_reserved + +		space_info->bytes_pinned + space_info->bytes_readonly; + +	/* +	 * We only want to allow over committing if we have lots of actual space +	 * free, but if we don't have enough space to handle the global reserve +	 * space then we could end up having a real enospc problem when trying +	 * to allocate a chunk or some other such important allocation. +	 */ +	spin_lock(&global_rsv->lock); +	space_size = calc_global_rsv_need_space(global_rsv); +	spin_unlock(&global_rsv->lock); +	if (used + space_size >= space_info->total_bytes) +		return 0; + +	used += space_info->bytes_may_use; + +	spin_lock(&root->fs_info->free_chunk_lock); +	avail = root->fs_info->free_chunk_space; +	spin_unlock(&root->fs_info->free_chunk_lock); + +	/* +	 * If we have dup, raid1 or raid10 then only half of the free +	 * space is actually useable.  For raid56, the space info used +	 * doesn't include the parity drive, so we don't have to +	 * change the math +	 */ +	if (profile & (BTRFS_BLOCK_GROUP_DUP | +		       BTRFS_BLOCK_GROUP_RAID1 | +		       BTRFS_BLOCK_GROUP_RAID10)) +		avail >>= 1; + +	/* +	 * If we aren't flushing all things, let us overcommit up to +	 * 1/2th of the space. If we can flush, don't let us overcommit +	 * too much, let it overcommit up to 1/8 of the space. +	 */ +	if (flush == BTRFS_RESERVE_FLUSH_ALL) +		avail >>= 3; +	else +		avail >>= 1; + +	if (used + bytes < space_info->total_bytes + avail) +		return 1; +	return 0; +} + +static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, +					 unsigned long nr_pages, int nr_items) +{ +	struct super_block *sb = root->fs_info->sb; + +	if (down_read_trylock(&sb->s_umount)) { +		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); +		up_read(&sb->s_umount); +	} else { +		/* +		 * We needn't worry the filesystem going from r/w to r/o though +		 * we don't acquire ->s_umount mutex, because the filesystem +		 * should guarantee the delalloc inodes list be empty after +		 * the filesystem is readonly(all dirty pages are written to +		 * the disk). 
+		 */ +		btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); +		if (!current->journal_info) +			btrfs_wait_ordered_roots(root->fs_info, nr_items); +	} +} + +static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) +{ +	u64 bytes; +	int nr; + +	bytes = btrfs_calc_trans_metadata_size(root, 1); +	nr = (int)div64_u64(to_reclaim, bytes); +	if (!nr) +		nr = 1; +	return nr; +} + +#define EXTENT_SIZE_PER_ITEM	(256 * 1024) +  /*   * shrink metadata reservation for delalloc   */ -static int shrink_delalloc(struct btrfs_trans_handle *trans, -			   struct btrfs_root *root, u64 to_reclaim, int sync) +static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, +			    bool wait_ordered)  {  	struct btrfs_block_rsv *block_rsv;  	struct btrfs_space_info *space_info; -	u64 reserved; +	struct btrfs_trans_handle *trans; +	u64 delalloc_bytes;  	u64 max_reclaim; -	u64 reclaimed = 0; -	int pause = 1; -	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; +	long time_left; +	unsigned long nr_pages; +	int loops; +	int items; +	enum btrfs_reserve_flush_enum flush; +	/* Calc the number of the pages we need flush for space reservation */ +	items = calc_reclaim_items_nr(root, to_reclaim); +	to_reclaim = items * EXTENT_SIZE_PER_ITEM; + +	trans = (struct btrfs_trans_handle *)current->journal_info;  	block_rsv = &root->fs_info->delalloc_block_rsv;  	space_info = block_rsv->space_info; -	smp_mb(); -	reserved = space_info->bytes_reserved; - -	if (reserved == 0) -		return 0; - -	max_reclaim = min(reserved, to_reclaim); +	delalloc_bytes = percpu_counter_sum_positive( +						&root->fs_info->delalloc_bytes); +	if (delalloc_bytes == 0) { +		if (trans) +			return; +		if (wait_ordered) +			btrfs_wait_ordered_roots(root->fs_info, items); +		return; +	} -	while (1) { -		/* have the flusher threads jump in and do some IO */ -		smp_mb(); -		nr_pages = min_t(unsigned long, nr_pages, -		       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); -		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); +	loops = 0; +	while (delalloc_bytes && loops < 3) { +		max_reclaim = min(delalloc_bytes, to_reclaim); +		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; +		btrfs_writeback_inodes_sb_nr(root, nr_pages, items); +		/* +		 * We need to wait for the async pages to actually start before +		 * we do anything. 
+		 */ +		max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); +		if (!max_reclaim) +			goto skip_async; +		if (max_reclaim <= nr_pages) +			max_reclaim = 0; +		else +			max_reclaim -= nr_pages; + +		wait_event(root->fs_info->async_submit_wait, +			   atomic_read(&root->fs_info->async_delalloc_pages) <= +			   (int)max_reclaim); +skip_async: +		if (!trans) +			flush = BTRFS_RESERVE_FLUSH_ALL; +		else +			flush = BTRFS_RESERVE_NO_FLUSH;  		spin_lock(&space_info->lock); -		if (reserved > space_info->bytes_reserved) -			reclaimed += reserved - space_info->bytes_reserved; -		reserved = space_info->bytes_reserved; +		if (can_overcommit(root, space_info, orig, flush)) { +			spin_unlock(&space_info->lock); +			break; +		}  		spin_unlock(&space_info->lock); -		if (reserved == 0 || reclaimed >= max_reclaim) +		loops++; +		if (wait_ordered && !trans) { +			btrfs_wait_ordered_roots(root->fs_info, items); +		} else { +			time_left = schedule_timeout_killable(1); +			if (time_left) +				break; +		} +		delalloc_bytes = percpu_counter_sum_positive( +						&root->fs_info->delalloc_bytes); +	} +} + +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does.  Otherwise it + * will return -ENOSPC. + */ +static int may_commit_transaction(struct btrfs_root *root, +				  struct btrfs_space_info *space_info, +				  u64 bytes, int force) +{ +	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; +	struct btrfs_trans_handle *trans; + +	trans = (struct btrfs_trans_handle *)current->journal_info; +	if (trans) +		return -EAGAIN; + +	if (force) +		goto commit; + +	/* See if there is enough pinned space to make this reservation */ +	if (percpu_counter_compare(&space_info->total_bytes_pinned, +				   bytes) >= 0) +		goto commit; + +	/* +	 * See if there is some space in the delayed insertion reservation for +	 * this reservation. 
+	 */ +	if (space_info != delayed_rsv->space_info) +		return -ENOSPC; + +	spin_lock(&delayed_rsv->lock); +	if (percpu_counter_compare(&space_info->total_bytes_pinned, +				   bytes - delayed_rsv->size) >= 0) { +		spin_unlock(&delayed_rsv->lock); +		return -ENOSPC; +	} +	spin_unlock(&delayed_rsv->lock); + +commit: +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) +		return -ENOSPC; + +	return btrfs_commit_transaction(trans, root); +} + +enum flush_state { +	FLUSH_DELAYED_ITEMS_NR	=	1, +	FLUSH_DELAYED_ITEMS	=	2, +	FLUSH_DELALLOC		=	3, +	FLUSH_DELALLOC_WAIT	=	4, +	ALLOC_CHUNK		=	5, +	COMMIT_TRANS		=	6, +}; + +static int flush_space(struct btrfs_root *root, +		       struct btrfs_space_info *space_info, u64 num_bytes, +		       u64 orig_bytes, int state) +{ +	struct btrfs_trans_handle *trans; +	int nr; +	int ret = 0; + +	switch (state) { +	case FLUSH_DELAYED_ITEMS_NR: +	case FLUSH_DELAYED_ITEMS: +		if (state == FLUSH_DELAYED_ITEMS_NR) +			nr = calc_reclaim_items_nr(root, num_bytes) * 2; +		else +			nr = -1; + +		trans = btrfs_join_transaction(root); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans);  			break; +		} +		ret = btrfs_run_delayed_items_nr(trans, root, nr); +		btrfs_end_transaction(trans, root); +		break; +	case FLUSH_DELALLOC: +	case FLUSH_DELALLOC_WAIT: +		shrink_delalloc(root, num_bytes * 2, orig_bytes, +				state == FLUSH_DELALLOC_WAIT); +		break; +	case ALLOC_CHUNK: +		trans = btrfs_join_transaction(root); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans); +			break; +		} +		ret = do_chunk_alloc(trans, root->fs_info->extent_root, +				     btrfs_get_alloc_profile(root, 0), +				     CHUNK_ALLOC_NO_FORCE); +		btrfs_end_transaction(trans, root); +		if (ret == -ENOSPC) +			ret = 0; +		break; +	case COMMIT_TRANS: +		ret = may_commit_transaction(root, space_info, orig_bytes, 0); +		break; +	default: +		ret = -ENOSPC; +		break; +	} -		if (trans && trans->transaction->blocked) -			return -EAGAIN; +	return ret; +} + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, +				 struct btrfs_space_info *space_info) +{ +	u64 used; +	u64 expected; +	u64 to_reclaim; + +	to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, +				16 * 1024 * 1024); +	spin_lock(&space_info->lock); +	if (can_overcommit(root, space_info, to_reclaim, +			   BTRFS_RESERVE_FLUSH_ALL)) { +		to_reclaim = 0; +		goto out; +	} + +	used = space_info->bytes_used + space_info->bytes_reserved + +	       space_info->bytes_pinned + space_info->bytes_readonly + +	       space_info->bytes_may_use; +	if (can_overcommit(root, space_info, 1024 * 1024, +			   BTRFS_RESERVE_FLUSH_ALL)) +		expected = div_factor_fine(space_info->total_bytes, 95); +	else +		expected = div_factor_fine(space_info->total_bytes, 90); + +	if (used > expected) +		to_reclaim = used - expected; +	else +		to_reclaim = 0; +	to_reclaim = min(to_reclaim, space_info->bytes_may_use + +				     space_info->bytes_reserved); +out: +	spin_unlock(&space_info->lock); + +	return to_reclaim; +} -		__set_current_state(TASK_INTERRUPTIBLE); -		schedule_timeout(pause); -		pause <<= 1; -		if (pause > HZ / 10) -			pause = HZ / 10; +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, +					struct btrfs_fs_info *fs_info, u64 used) +{ +	return (used >= div_factor_fine(space_info->total_bytes, 98) && +		!btrfs_fs_closing(fs_info) && +		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, +				       struct btrfs_fs_info *fs_info) +{ +	u64 
used;
+	spin_lock(&space_info->lock);
+	used = space_info->bytes_used + space_info->bytes_reserved +
+	       space_info->bytes_pinned + space_info->bytes_readonly +
+	       space_info->bytes_may_use;
+	if (need_do_async_reclaim(space_info, fs_info, used)) {
+		spin_unlock(&space_info->lock);
+		return 1;
 	}
-	return reclaimed >= to_reclaim;
+	spin_unlock(&space_info->lock);
+
+	return 0;
 }
 
-/*
- * Retries tells us how many times we've called reserve_metadata_bytes.  The
- * idea is if this is the first call (retries == 0) then we will add to our
- * reserved count if we can't make the allocation in order to hold our place
- * while we go and try and free up space.  That way for retries > 1 we don't try
- * and add space, we just check to see if the amount of unused space is >= the
- * total space, meaning that our reservation is valid.
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_space_info *space_info;
+	u64 to_reclaim;
+	int flush_state;
+
+	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+						      space_info);
+	if (!to_reclaim)
+		return;
+
+	flush_state = FLUSH_DELAYED_ITEMS_NR;
+	do {
+		flush_space(fs_info->fs_root, space_info, to_reclaim,
+			    to_reclaim, flush_state);
+		flush_state++;
+		if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+			return;
+	} while (flush_state <= COMMIT_TRANS);
+
+	if (btrfs_need_do_async_reclaim(space_info, fs_info))
+		queue_work(system_unbound_wq, work);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
  *
- * However if we don't intend to retry this reservation, pass -1 as retries so
- * that it short circuits this logic.
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv.  If there is not enough space it will make an attempt to
+ * flush out space to make room.  It will do this by flushing delalloc if
+ * possible or committing the transaction.  If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
*/ -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, -				  struct btrfs_root *root, +static int reserve_metadata_bytes(struct btrfs_root *root,  				  struct btrfs_block_rsv *block_rsv, -				  u64 orig_bytes, int flush) +				  u64 orig_bytes, +				  enum btrfs_reserve_flush_enum flush)  {  	struct btrfs_space_info *space_info = block_rsv->space_info; -	u64 unused; +	u64 used;  	u64 num_bytes = orig_bytes; -	int retries = 0; +	int flush_state = FLUSH_DELAYED_ITEMS_NR;  	int ret = 0; -	bool reserved = false; -	bool committed = false; +	bool flushing = false;  again: -	ret = -ENOSPC; -	if (reserved) -		num_bytes = 0; - +	ret = 0;  	spin_lock(&space_info->lock); -	unused = space_info->bytes_used + space_info->bytes_reserved + -		 space_info->bytes_pinned + space_info->bytes_readonly + -		 space_info->bytes_may_use; +	/* +	 * We only want to wait if somebody other than us is flushing and we +	 * are actually allowed to flush all things. +	 */ +	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && +	       space_info->flush) { +		spin_unlock(&space_info->lock); +		/* +		 * If we have a trans handle we can't wait because the flusher +		 * may have to commit the transaction, which would mean we would +		 * deadlock since we are waiting for the flusher to finish, but +		 * hold the current transaction open. +		 */ +		if (current->journal_info) +			return -EAGAIN; +		ret = wait_event_killable(space_info->wait, !space_info->flush); +		/* Must have been killed, return */ +		if (ret) +			return -EINTR; + +		spin_lock(&space_info->lock); +	} + +	ret = -ENOSPC; +	used = space_info->bytes_used + space_info->bytes_reserved + +		space_info->bytes_pinned + space_info->bytes_readonly + +		space_info->bytes_may_use;  	/*  	 * The idea here is that we've not already over-reserved the block group @@ -3411,11 +4462,11 @@ again:  	 * lets start flushing stuff first and then come back and try to make  	 * our reservation.  	 */ -	if (unused <= space_info->total_bytes) { -		unused -= space_info->total_bytes; -		if (unused >= num_bytes) { -			if (!reserved) -				space_info->bytes_reserved += orig_bytes; +	if (used <= space_info->total_bytes) { +		if (used + orig_bytes <= space_info->total_bytes) { +			space_info->bytes_may_use += orig_bytes; +			trace_btrfs_space_reservation(root->fs_info, +				"space_info", space_info->flags, orig_bytes, 1);  			ret = 0;  		} else {  			/* @@ -3431,91 +4482,102 @@ again:  		 * amount plus the amount of bytes that we need for this  		 * reservation.  		 */ -		num_bytes = unused - space_info->total_bytes + -			(orig_bytes * (retries + 1)); +		num_bytes = used - space_info->total_bytes + +			(orig_bytes * 2); +	} + +	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { +		space_info->bytes_may_use += orig_bytes; +		trace_btrfs_space_reservation(root->fs_info, "space_info", +					      space_info->flags, orig_bytes, +					      1); +		ret = 0;  	}  	/*  	 * Couldn't make our reservation, save our place so while we're trying  	 * to reclaim space we can actually use it instead of somebody else  	 * stealing it from us. +	 * +	 * We make the other tasks wait for the flush only when we can flush +	 * all things.  	 
*/ -	if (ret && !reserved) { -		space_info->bytes_reserved += orig_bytes; -		reserved = true; +	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { +		flushing = true; +		space_info->flush = 1; +	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { +		used += orig_bytes; +		if (need_do_async_reclaim(space_info, root->fs_info, used) && +		    !work_busy(&root->fs_info->async_reclaim_work)) +			queue_work(system_unbound_wq, +				   &root->fs_info->async_reclaim_work);  	} -  	spin_unlock(&space_info->lock); -	if (!ret) -		return 0; - -	if (!flush) +	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)  		goto out; -	/* -	 * We do synchronous shrinking since we don't actually unreserve -	 * metadata until after the IO is completed. -	 */ -	ret = shrink_delalloc(trans, root, num_bytes, 1); -	if (ret > 0) -		return 0; -	else if (ret < 0) -		goto out; +	ret = flush_space(root, space_info, num_bytes, orig_bytes, +			  flush_state); +	flush_state++;  	/* -	 * So if we were overcommitted it's possible that somebody else flushed -	 * out enough space and we simply didn't have enough space to reclaim, -	 * so go back around and try again. +	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock +	 * would happen. So skip delalloc flush.  	 */ -	if (retries < 2) { -		retries++; -		goto again; -	} +	if (flush == BTRFS_RESERVE_FLUSH_LIMIT && +	    (flush_state == FLUSH_DELALLOC || +	     flush_state == FLUSH_DELALLOC_WAIT)) +		flush_state = ALLOC_CHUNK; -	spin_lock(&space_info->lock); -	/* -	 * Not enough space to be reclaimed, don't bother committing the -	 * transaction. -	 */ -	if (space_info->bytes_pinned < orig_bytes) -		ret = -ENOSPC; -	spin_unlock(&space_info->lock); -	if (ret) -		goto out; - -	ret = -EAGAIN; -	if (trans || committed) -		goto out; - -	ret = -ENOSPC; -	trans = btrfs_join_transaction(root, 1); -	if (IS_ERR(trans)) -		goto out; -	ret = btrfs_commit_transaction(trans, root); -	if (!ret) { -		trans = NULL; -		committed = true; +	if (!ret) +		goto again; +	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && +		 flush_state < COMMIT_TRANS) +		goto again; +	else if (flush == BTRFS_RESERVE_FLUSH_ALL && +		 flush_state <= COMMIT_TRANS)  		goto again; -	}  out: -	if (reserved) { +	if (ret == -ENOSPC && +	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { +		struct btrfs_block_rsv *global_rsv = +			&root->fs_info->global_block_rsv; + +		if (block_rsv != global_rsv && +		    !block_rsv_use_bytes(global_rsv, orig_bytes)) +			ret = 0; +	} +	if (ret == -ENOSPC) +		trace_btrfs_space_reservation(root->fs_info, +					      "space_info:enospc", +					      space_info->flags, orig_bytes, 1); +	if (flushing) {  		spin_lock(&space_info->lock); -		space_info->bytes_reserved -= orig_bytes; +		space_info->flush = 0; +		wake_up_all(&space_info->wait);  		spin_unlock(&space_info->lock);  	} -  	return ret;  } -static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, -					     struct btrfs_root *root) +static struct btrfs_block_rsv *get_block_rsv( +					const struct btrfs_trans_handle *trans, +					const struct btrfs_root *root)  { -	struct btrfs_block_rsv *block_rsv; -	if (root->ref_cows) +	struct btrfs_block_rsv *block_rsv = NULL; + +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))  		block_rsv = trans->block_rsv; -	else + +	if (root == root->fs_info->csum_root && trans->adding_csums) +		block_rsv = trans->block_rsv; + +	if (root == root->fs_info->uuid_root) +		block_rsv = trans->block_rsv; + +	if (!block_rsv)  		block_rsv = root->block_rsv;  	if 
(!block_rsv) @@ -3551,8 +4613,34 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,  	spin_unlock(&block_rsv->lock);  } -void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, -			     struct btrfs_block_rsv *dest, u64 num_bytes) +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, +			     struct btrfs_block_rsv *dest, u64 num_bytes, +			     int min_factor) +{ +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; +	u64 min_bytes; + +	if (global_rsv->space_info != dest->space_info) +		return -ENOSPC; + +	spin_lock(&global_rsv->lock); +	min_bytes = div_factor(global_rsv->size, min_factor); +	if (global_rsv->reserved < min_bytes + num_bytes) { +		spin_unlock(&global_rsv->lock); +		return -ENOSPC; +	} +	global_rsv->reserved -= num_bytes; +	if (global_rsv->reserved < global_rsv->size) +		global_rsv->full = 0; +	spin_unlock(&global_rsv->lock); + +	block_rsv_add_bytes(dest, num_bytes, 1); +	return 0; +} + +static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, +				    struct btrfs_block_rsv *block_rsv, +				    struct btrfs_block_rsv *dest, u64 num_bytes)  {  	struct btrfs_space_info *space_info = block_rsv->space_info; @@ -3571,10 +4659,24 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,  	if (num_bytes > 0) {  		if (dest) { -			block_rsv_add_bytes(dest, num_bytes, 0); -		} else { +			spin_lock(&dest->lock); +			if (!dest->full) { +				u64 bytes_to_add; + +				bytes_to_add = dest->size - dest->reserved; +				bytes_to_add = min(num_bytes, bytes_to_add); +				dest->reserved += bytes_to_add; +				if (dest->reserved >= dest->size) +					dest->full = 1; +				num_bytes -= bytes_to_add; +			} +			spin_unlock(&dest->lock); +		} +		if (num_bytes) {  			spin_lock(&space_info->lock); -			space_info->bytes_reserved -= num_bytes; +			space_info->bytes_may_use -= num_bytes; +			trace_btrfs_space_reservation(fs_info, "space_info", +					space_info->flags, num_bytes, 0);  			spin_unlock(&space_info->lock);  		}  	} @@ -3593,16 +4695,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,  	return 0;  } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)  {  	memset(rsv, 0, sizeof(*rsv));  	spin_lock_init(&rsv->lock); -	atomic_set(&rsv->usage, 1); -	rsv->priority = 6; -	INIT_LIST_HEAD(&rsv->list); +	rsv->type = type;  } -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, +					      unsigned short type)  {  	struct btrfs_block_rsv *block_rsv;  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -3611,7 +4712,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)  	if (!block_rsv)  		return NULL; -	btrfs_init_block_rsv(block_rsv); +	btrfs_init_block_rsv(block_rsv, type);  	block_rsv->space_info = __find_space_info(fs_info,  						  BTRFS_BLOCK_GROUP_METADATA);  	return block_rsv; @@ -3620,38 +4721,22 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)  void btrfs_free_block_rsv(struct btrfs_root *root,  			  struct btrfs_block_rsv *rsv)  { -	if (rsv && atomic_dec_and_test(&rsv->usage)) { -		btrfs_block_rsv_release(root, rsv, (u64)-1); -		if (!rsv->durable) -			kfree(rsv); -	} -} - -/* - * make the block_rsv struct be able to capture freed space. 
- * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, -				 struct btrfs_block_rsv *block_rsv) -{ -	block_rsv->durable = 1; -	mutex_lock(&fs_info->durable_block_rsv_mutex); -	list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); -	mutex_unlock(&fs_info->durable_block_rsv_mutex); +	if (!rsv) +		return; +	btrfs_block_rsv_release(root, rsv, (u64)-1); +	kfree(rsv);  } -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, -			struct btrfs_root *root, -			struct btrfs_block_rsv *block_rsv, -			u64 num_bytes) +int btrfs_block_rsv_add(struct btrfs_root *root, +			struct btrfs_block_rsv *block_rsv, u64 num_bytes, +			enum btrfs_reserve_flush_enum flush)  {  	int ret;  	if (num_bytes == 0)  		return 0; -	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); +	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);  	if (!ret) {  		block_rsv_add_bytes(block_rsv, num_bytes, 1);  		return 0; @@ -3660,61 +4745,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,  	return ret;  } -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, -			  struct btrfs_root *root, -			  struct btrfs_block_rsv *block_rsv, -			  u64 min_reserved, int min_factor) +int btrfs_block_rsv_check(struct btrfs_root *root, +			  struct btrfs_block_rsv *block_rsv, int min_factor)  {  	u64 num_bytes = 0; -	int commit_trans = 0;  	int ret = -ENOSPC;  	if (!block_rsv)  		return 0;  	spin_lock(&block_rsv->lock); -	if (min_factor > 0) -		num_bytes = div_factor(block_rsv->size, min_factor); -	if (min_reserved > num_bytes) -		num_bytes = min_reserved; +	num_bytes = div_factor(block_rsv->size, min_factor); +	if (block_rsv->reserved >= num_bytes) +		ret = 0; +	spin_unlock(&block_rsv->lock); -	if (block_rsv->reserved >= num_bytes) { +	return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, +			   struct btrfs_block_rsv *block_rsv, u64 min_reserved, +			   enum btrfs_reserve_flush_enum flush) +{ +	u64 num_bytes = 0; +	int ret = -ENOSPC; + +	if (!block_rsv) +		return 0; + +	spin_lock(&block_rsv->lock); +	num_bytes = min_reserved; +	if (block_rsv->reserved >= num_bytes)  		ret = 0; -	} else { +	else  		num_bytes -= block_rsv->reserved; -		if (block_rsv->durable && -		    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) -			commit_trans = 1; -	}  	spin_unlock(&block_rsv->lock); +  	if (!ret)  		return 0; -	if (block_rsv->refill_used) { -		ret = reserve_metadata_bytes(trans, root, block_rsv, -					     num_bytes, 0); -		if (!ret) { -			block_rsv_add_bytes(block_rsv, num_bytes, 0); -			return 0; -		} -	} - -	if (commit_trans) { -		if (trans) -			return -EAGAIN; - -		trans = btrfs_join_transaction(root, 1); -		BUG_ON(IS_ERR(trans)); -		ret = btrfs_commit_transaction(trans, root); +	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); +	if (!ret) { +		block_rsv_add_bytes(block_rsv, num_bytes, 0);  		return 0;  	} -	WARN_ON(1); -	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", -		block_rsv->size, block_rsv->reserved, -		block_rsv->freed[0], block_rsv->freed[1]); - -	return -ENOSPC; +	return ret;  }  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, @@ -3729,10 +4805,11 @@ void btrfs_block_rsv_release(struct btrfs_root *root,  			     u64 num_bytes)  {  	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; -	if (global_rsv->full || global_rsv == block_rsv || +	if (global_rsv == block_rsv ||  	    
block_rsv->space_info != global_rsv->space_info)  		global_rsv = NULL; -	block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); +	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, +				num_bytes);  }  /* @@ -3746,24 +4823,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)  	u64 num_bytes;  	u64 meta_used;  	u64 data_used; -	int csum_size = btrfs_super_csum_size(&fs_info->super_copy); -#if 0 -	/* -	 * per tree used space accounting can be inaccuracy, so we -	 * can't rely on it. -	 */ -	spin_lock(&fs_info->extent_root->accounting_lock); -	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item); -	spin_unlock(&fs_info->extent_root->accounting_lock); - -	spin_lock(&fs_info->csum_root->accounting_lock); -	num_bytes += btrfs_root_used(&fs_info->csum_root->root_item); -	spin_unlock(&fs_info->csum_root->accounting_lock); +	int csum_size = btrfs_super_csum_size(fs_info->super_copy); -	spin_lock(&fs_info->tree_root->accounting_lock); -	num_bytes += btrfs_root_used(&fs_info->tree_root->root_item); -	spin_unlock(&fs_info->tree_root->accounting_lock); -#endif  	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);  	spin_lock(&sinfo->lock);  	data_used = sinfo->bytes_used; @@ -3794,10 +4855,10 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)  	num_bytes = calc_global_metadata_size(fs_info); -	spin_lock(&block_rsv->lock);  	spin_lock(&sinfo->lock); +	spin_lock(&block_rsv->lock); -	block_rsv->size = num_bytes; +	block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);  	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +  		    sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -3806,21 +4867,22 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)  	if (sinfo->total_bytes > num_bytes) {  		num_bytes = sinfo->total_bytes - num_bytes;  		block_rsv->reserved += num_bytes; -		sinfo->bytes_reserved += num_bytes; +		sinfo->bytes_may_use += num_bytes; +		trace_btrfs_space_reservation(fs_info, "space_info", +				      sinfo->flags, num_bytes, 1);  	}  	if (block_rsv->reserved >= block_rsv->size) {  		num_bytes = block_rsv->reserved - block_rsv->size; -		sinfo->bytes_reserved -= num_bytes; +		sinfo->bytes_may_use -= num_bytes; +		trace_btrfs_space_reservation(fs_info, "space_info", +				      sinfo->flags, num_bytes, 0);  		block_rsv->reserved = block_rsv->size;  		block_rsv->full = 1;  	} -#if 0 -	printk(KERN_INFO"global block rsv size %llu reserved %llu\n", -		block_rsv->size, block_rsv->reserved); -#endif -	spin_unlock(&sinfo->lock); +  	spin_unlock(&block_rsv->lock); +	spin_unlock(&sinfo->lock);  }  static void init_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -3829,79 +4891,55 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)  	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);  	fs_info->chunk_block_rsv.space_info = space_info; -	fs_info->chunk_block_rsv.priority = 10;  	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);  	fs_info->global_block_rsv.space_info = space_info; -	fs_info->global_block_rsv.priority = 10; -	fs_info->global_block_rsv.refill_used = 1;  	fs_info->delalloc_block_rsv.space_info = space_info;  	fs_info->trans_block_rsv.space_info = space_info;  	fs_info->empty_block_rsv.space_info = space_info; -	fs_info->empty_block_rsv.priority = 10; +	fs_info->delayed_block_rsv.space_info = space_info;  	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;  	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;  	
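For illustration only, not part of the patch: a sketch of how a caller is expected to use the reworked reservation helpers in this hunk. The wrapper name is made up, and BTRFS_BLOCK_RSV_TEMP is assumed as the type argument; the rsv type constants are defined outside this file.

static int example_private_rsv(struct btrfs_root *root, u64 bytes)
{
	struct btrfs_block_rsv *rsv;
	int ret;

	/* The type argument is new; BTRFS_BLOCK_RSV_TEMP is assumed here. */
	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;

	/* May flush delayed items, delalloc, even commit, to find space. */
	ret = btrfs_block_rsv_add(root, rsv, bytes, BTRFS_RESERVE_FLUSH_ALL);
	if (!ret) {
		/* ... consume the reservation ... */
	}

	/* Releases anything left over and frees the rsv itself. */
	btrfs_free_block_rsv(root, rsv);
	return ret;
}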
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;  	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; +	if (fs_info->quota_root) +		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;  	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; -	btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - -	btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); -  	update_global_block_rsv(fs_info);  }  static void release_global_block_rsv(struct btrfs_fs_info *fs_info)  { -	block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); +	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, +				(u64)-1);  	WARN_ON(fs_info->delalloc_block_rsv.size > 0);  	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);  	WARN_ON(fs_info->trans_block_rsv.size > 0);  	WARN_ON(fs_info->trans_block_rsv.reserved > 0);  	WARN_ON(fs_info->chunk_block_rsv.size > 0);  	WARN_ON(fs_info->chunk_block_rsv.reserved > 0); -} - -static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) -{ -	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * -		3 * num_items; -} - -int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, -				 struct btrfs_root *root, -				 int num_items) -{ -	u64 num_bytes; -	int ret; - -	if (num_items == 0 || root->fs_info->chunk_root == root) -		return 0; - -	num_bytes = calc_trans_metadata_size(root, num_items); -	ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, -				  num_bytes); -	if (!ret) { -		trans->bytes_reserved += num_bytes; -		trans->block_rsv = &root->fs_info->trans_block_rsv; -	} -	return ret; +	WARN_ON(fs_info->delayed_block_rsv.size > 0); +	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);  }  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,  				  struct btrfs_root *root)  { +	if (!trans->block_rsv) +		return; +  	if (!trans->bytes_reserved)  		return; -	BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); -	btrfs_block_rsv_release(root, trans->block_rsv, -				trans->bytes_reserved); +	trace_btrfs_space_reservation(root->fs_info, "transaction", +				      trans->transid, trans->bytes_reserved, 0); +	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);  	trans->bytes_reserved = 0;  } +/* Can only return 0 or -ENOSPC */  int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,  				  struct inode *inode)  { @@ -3910,115 +4948,386 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,  	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;  	/* -	 * one for deleting orphan item, one for updating inode and -	 * two for calling btrfs_truncate_inode_items. -	 * -	 * btrfs_truncate_inode_items is a delete operation, it frees -	 * more space than it uses in most cases. So two units of -	 * metadata space should be enough for calling it many times. -	 * If all of the metadata space is used, we can commit -	 * transaction and use space it freed. +	 * We need to hold space in order to delete our orphan item once we've +	 * added it, so this takes the reservation so we can release it later +	 * when we are truly done with the orphan item.  	 
*/ -	u64 num_bytes = calc_trans_metadata_size(root, 4); +	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); +	trace_btrfs_space_reservation(root->fs_info, "orphan", +				      btrfs_ino(inode), num_bytes, 1);  	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);  }  void btrfs_orphan_release_metadata(struct inode *inode)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; -	u64 num_bytes = calc_trans_metadata_size(root, 4); +	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); +	trace_btrfs_space_reservation(root->fs_info, "orphan", +				      btrfs_ino(inode), num_bytes, 0);  	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);  } -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, -				struct btrfs_pending_snapshot *pending) +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, +				     struct btrfs_block_rsv *rsv, +				     int items, +				     u64 *qgroup_reserved, +				     bool use_global_rsv)  { -	struct btrfs_root *root = pending->root; -	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); -	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; -	/* -	 * two for root back/forward refs, two for directory entries -	 * and one for root of the snapshot. -	 */ -	u64 num_bytes = calc_trans_metadata_size(root, 5); -	dst_rsv->space_info = src_rsv->space_info; -	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +	u64 num_bytes; +	int ret; +	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + +	if (root->fs_info->quota_enabled) { +		/* One for parent inode, two for dir entries */ +		num_bytes = 3 * root->leafsize; +		ret = btrfs_qgroup_reserve(root, num_bytes); +		if (ret) +			return ret; +	} else { +		num_bytes = 0; +	} + +	*qgroup_reserved = num_bytes; + +	num_bytes = btrfs_calc_trans_metadata_size(root, items); +	rsv->space_info = __find_space_info(root->fs_info, +					    BTRFS_BLOCK_GROUP_METADATA); +	ret = btrfs_block_rsv_add(root, rsv, num_bytes, +				  BTRFS_RESERVE_FLUSH_ALL); + +	if (ret == -ENOSPC && use_global_rsv) +		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + +	if (ret) { +		if (*qgroup_reserved) +			btrfs_qgroup_free(root, *qgroup_reserved); +	} + +	return ret;  } -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +void btrfs_subvolume_release_metadata(struct btrfs_root *root, +				      struct btrfs_block_rsv *rsv, +				      u64 qgroup_reserved)  { -	return num_bytes >>= 3; +	btrfs_block_rsv_release(root, rsv, (u64)-1); +	if (qgroup_reserved) +		btrfs_qgroup_free(root, qgroup_reserved); +} + +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written.  
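For illustration only, not part of the patch: a sketch of how the new subvolume reserve/release pair above is meant to bracket snapshot or subvolume creation. The item count of 5 follows the old comment removed in this hunk (two root back/forward refs, two directory entries, one root item); the wrapper name and error handling are illustrative only.

static int example_subvol_reservation(struct btrfs_root *root,
				      struct btrfs_block_rsv *rsv)
{
	u64 qgroup_reserved = 0;
	int ret;

	ret = btrfs_subvolume_reserve_metadata(root, rsv, 5,
					       &qgroup_reserved, false);
	if (ret)
		return ret;

	/* ... create the subvolume, using rsv as the transaction's block_rsv ... */

	btrfs_subvolume_release_metadata(root, rsv, qgroup_reserved);
	return 0;
}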
This will return the number of + * reserved extents that need to be freed.  This must be called with + * BTRFS_I(inode)->lock held. + */ +static unsigned drop_outstanding_extent(struct inode *inode) +{ +	unsigned drop_inode_space = 0; +	unsigned dropped_extents = 0; + +	BUG_ON(!BTRFS_I(inode)->outstanding_extents); +	BTRFS_I(inode)->outstanding_extents--; + +	if (BTRFS_I(inode)->outstanding_extents == 0 && +	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, +			       &BTRFS_I(inode)->runtime_flags)) +		drop_inode_space = 1; + +	/* +	 * If we have more or the same amount of outsanding extents than we have +	 * reserved then we need to leave the reserved extents count alone. +	 */ +	if (BTRFS_I(inode)->outstanding_extents >= +	    BTRFS_I(inode)->reserved_extents) +		return drop_inode_space; + +	dropped_extents = BTRFS_I(inode)->reserved_extents - +		BTRFS_I(inode)->outstanding_extents; +	BTRFS_I(inode)->reserved_extents -= dropped_extents; +	return dropped_extents + drop_inode_space; +} + +/** + * calc_csum_metadata_size - return the amount of metada space that must be + *	reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed.  We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required.  If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved.  If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. 
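For illustration only, not part of the patch: the leaf-count arithmetic described here, written as a small standalone helper. csum_leaves and its parameters are made-up names, the per-leaf csum capacity is passed in instead of being derived from BTRFS_LEAF_DATA_SIZE(), and u64 is typedef'd locally so the snippet compiles outside the kernel.

#include <stdint.h>

typedef uint64_t u64;

/* How many csum leaves does @csum_bytes of checksummed data need? */
static u64 csum_leaves(u64 csum_bytes, u64 sectorsize, u64 csums_per_leaf)
{
	u64 num_csums = csum_bytes / sectorsize;

	/* Round up to whole leaves. */
	return (num_csums + csums_per_leaf - 1) / csums_per_leaf;
}

/*
 * calc_csum_metadata_size() then reserves or frees
 * btrfs_calc_trans_metadata_size(root, delta), where delta is the change in
 * csum_leaves() caused by adding or removing num_bytes; if the leaf count is
 * unchanged, nothing extra is reserved or freed.
 */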
+ */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, +				   int reserve) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	u64 csum_size; +	int num_csums_per_leaf; +	int num_csums; +	int old_csums; + +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && +	    BTRFS_I(inode)->csum_bytes == 0) +		return 0; + +	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); +	if (reserve) +		BTRFS_I(inode)->csum_bytes += num_bytes; +	else +		BTRFS_I(inode)->csum_bytes -= num_bytes; +	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); +	num_csums_per_leaf = (int)div64_u64(csum_size, +					    sizeof(struct btrfs_csum_item) + +					    sizeof(struct btrfs_disk_key)); +	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); +	num_csums = num_csums + num_csums_per_leaf - 1; +	num_csums = num_csums / num_csums_per_leaf; + +	old_csums = old_csums + num_csums_per_leaf - 1; +	old_csums = old_csums / num_csums_per_leaf; + +	/* No change, no need to reserve more */ +	if (old_csums == num_csums) +		return 0; + +	if (reserve) +		return btrfs_calc_trans_metadata_size(root, +						      num_csums - old_csums); + +	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);  }  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; -	u64 to_reserve; -	int nr_extents; -	int ret; +	u64 to_reserve = 0; +	u64 csum_bytes; +	unsigned nr_extents = 0; +	int extra_reserve = 0; +	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; +	int ret = 0; +	bool delalloc_lock = true; +	u64 to_free = 0; +	unsigned dropped; + +	/* If we are a free space inode we need to not flush since we will be in +	 * the middle of a transaction commit.  We also don't need the delalloc +	 * mutex since we won't race with anybody.  We need this mostly to make +	 * lockdep shut its filthy mouth. +	 */ +	if (btrfs_is_free_space_inode(inode)) { +		flush = BTRFS_RESERVE_NO_FLUSH; +		delalloc_lock = false; +	} -	if (btrfs_transaction_in_commit(root->fs_info)) +	if (flush != BTRFS_RESERVE_NO_FLUSH && +	    btrfs_transaction_in_commit(root->fs_info))  		schedule_timeout(1); +	if (delalloc_lock) +		mutex_lock(&BTRFS_I(inode)->delalloc_mutex); +  	num_bytes = ALIGN(num_bytes, root->sectorsize); -	spin_lock(&BTRFS_I(inode)->accounting_lock); -	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; -	if (nr_extents > BTRFS_I(inode)->reserved_extents) { -		nr_extents -= BTRFS_I(inode)->reserved_extents; -		to_reserve = calc_trans_metadata_size(root, nr_extents); -	} else { -		nr_extents = 0; -		to_reserve = 0; +	spin_lock(&BTRFS_I(inode)->lock); +	BTRFS_I(inode)->outstanding_extents++; + +	if (BTRFS_I(inode)->outstanding_extents > +	    BTRFS_I(inode)->reserved_extents) +		nr_extents = BTRFS_I(inode)->outstanding_extents - +			BTRFS_I(inode)->reserved_extents; + +	/* +	 * Add an item to reserve for updating the inode when we complete the +	 * delalloc io. 
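For illustration only, not part of the patch: a model of how the reservation size for one more delalloc extent is assembled in the function below. per_item stands for btrfs_calc_trans_metadata_size(root, 1) and csum_bytes_needed for the calc_csum_metadata_size() contribution; both names are made up, and the sketch assumes the per-item cost scales linearly with the item count.

#include <stdint.h>

typedef uint64_t u64;

static u64 delalloc_to_reserve(u64 outstanding, u64 reserved,
			       int inode_item_reserved,
			       u64 per_item, u64 csum_bytes_needed)
{
	u64 nr_extents = 0;

	/* Only extents beyond what is already reserved cost anything. */
	if (outstanding > reserved)
		nr_extents = outstanding - reserved;

	/* One extra item for the inode update when the delalloc IO finishes. */
	if (!inode_item_reserved)
		nr_extents++;

	return nr_extents * per_item + csum_bytes_needed;
}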
+	 */ +	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, +		      &BTRFS_I(inode)->runtime_flags)) { +		nr_extents++; +		extra_reserve = 1;  	} -	spin_unlock(&BTRFS_I(inode)->accounting_lock); -	to_reserve += calc_csum_metadata_size(inode, num_bytes); -	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); -	if (ret) -		return ret; +	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); +	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); +	csum_bytes = BTRFS_I(inode)->csum_bytes; +	spin_unlock(&BTRFS_I(inode)->lock); + +	if (root->fs_info->quota_enabled) { +		ret = btrfs_qgroup_reserve(root, num_bytes + +					   nr_extents * root->leafsize); +		if (ret) +			goto out_fail; +	} -	spin_lock(&BTRFS_I(inode)->accounting_lock); +	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); +	if (unlikely(ret)) { +		if (root->fs_info->quota_enabled) +			btrfs_qgroup_free(root, num_bytes + +						nr_extents * root->leafsize); +		goto out_fail; +	} + +	spin_lock(&BTRFS_I(inode)->lock); +	if (extra_reserve) { +		set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, +			&BTRFS_I(inode)->runtime_flags); +		nr_extents--; +	}  	BTRFS_I(inode)->reserved_extents += nr_extents; -	atomic_inc(&BTRFS_I(inode)->outstanding_extents); -	spin_unlock(&BTRFS_I(inode)->accounting_lock); +	spin_unlock(&BTRFS_I(inode)->lock); -	block_rsv_add_bytes(block_rsv, to_reserve, 1); +	if (delalloc_lock) +		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); -	if (block_rsv->size > 512 * 1024 * 1024) -		shrink_delalloc(NULL, root, to_reserve, 0); +	if (to_reserve) +		trace_btrfs_space_reservation(root->fs_info, "delalloc", +					      btrfs_ino(inode), to_reserve, 1); +	block_rsv_add_bytes(block_rsv, to_reserve, 1);  	return 0; + +out_fail: +	spin_lock(&BTRFS_I(inode)->lock); +	dropped = drop_outstanding_extent(inode); +	/* +	 * If the inodes csum_bytes is the same as the original +	 * csum_bytes then we know we haven't raced with any free()ers +	 * so we can just reduce our inodes csum bytes and carry on. +	 */ +	if (BTRFS_I(inode)->csum_bytes == csum_bytes) { +		calc_csum_metadata_size(inode, num_bytes, 0); +	} else { +		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; +		u64 bytes; + +		/* +		 * This is tricky, but first we need to figure out how much we +		 * free'd from any free-ers that occured during this +		 * reservation, so we reset ->csum_bytes to the csum_bytes +		 * before we dropped our lock, and then call the free for the +		 * number of bytes that were freed while we were trying our +		 * reservation. +		 */ +		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; +		BTRFS_I(inode)->csum_bytes = csum_bytes; +		to_free = calc_csum_metadata_size(inode, bytes, 0); + + +		/* +		 * Now we need to see how much we would have freed had we not +		 * been making this reservation and our ->csum_bytes were not +		 * artificially inflated. +		 */ +		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; +		bytes = csum_bytes - orig_csum_bytes; +		bytes = calc_csum_metadata_size(inode, bytes, 0); + +		/* +		 * Now reset ->csum_bytes to what it should be.  If bytes is +		 * more than to_free then we would have free'd more space had we +		 * not had an artificially high ->csum_bytes, so we need to free +		 * the remainder.  If bytes is the same or less then we don't +		 * need to do anything, the other free-ers did the correct +		 * thing. 
+		 */ +		BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; +		if (bytes > to_free) +			to_free = bytes - to_free; +		else +			to_free = 0; +	} +	spin_unlock(&BTRFS_I(inode)->lock); +	if (dropped) +		to_free += btrfs_calc_trans_metadata_size(root, dropped); + +	if (to_free) { +		btrfs_block_rsv_release(root, block_rsv, to_free); +		trace_btrfs_space_reservation(root->fs_info, "delalloc", +					      btrfs_ino(inode), to_free, 0); +	} +	if (delalloc_lock) +		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); +	return ret;  } +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode.  This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. + */  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; -	u64 to_free; -	int nr_extents; +	u64 to_free = 0; +	unsigned dropped;  	num_bytes = ALIGN(num_bytes, root->sectorsize); -	atomic_dec(&BTRFS_I(inode)->outstanding_extents); +	spin_lock(&BTRFS_I(inode)->lock); +	dropped = drop_outstanding_extent(inode); -	spin_lock(&BTRFS_I(inode)->accounting_lock); -	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); -	if (nr_extents < BTRFS_I(inode)->reserved_extents) { -		nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; -		BTRFS_I(inode)->reserved_extents -= nr_extents; -	} else { -		nr_extents = 0; -	} -	spin_unlock(&BTRFS_I(inode)->accounting_lock); +	if (num_bytes) +		to_free = calc_csum_metadata_size(inode, num_bytes, 0); +	spin_unlock(&BTRFS_I(inode)->lock); +	if (dropped > 0) +		to_free += btrfs_calc_trans_metadata_size(root, dropped); -	to_free = calc_csum_metadata_size(inode, num_bytes); -	if (nr_extents > 0) -		to_free += calc_trans_metadata_size(root, nr_extents); +	trace_btrfs_space_reservation(root->fs_info, "delalloc", +				      btrfs_ino(inode), to_free, 0); +	if (root->fs_info->quota_enabled) { +		btrfs_qgroup_free(root, num_bytes + +					dropped * root->leafsize); +	}  	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,  				to_free);  } +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + *   extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */  int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)  {  	int ret; @@ -4036,14 +5345,26 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)  	return 0;  } +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space.  This is + * called in the case that we don't need the metadata AND data reservations + * anymore.  So if there is an error or we insert an inline extent. 
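For illustration only, not part of the patch: the intended pairing of the data-plus-metadata delalloc helpers documented above. The wrapper name and its failed flag are made up; error handling is abbreviated.

static int example_buffered_write(struct inode *inode, u64 bytes, bool failed)
{
	int ret;

	/* Reserves data space plus metadata for extents and csums. */
	ret = btrfs_delalloc_reserve_space(inode, bytes);
	if (ret)
		return ret;

	if (failed) {
		/* Nothing will be written: drop data and metadata together. */
		btrfs_delalloc_release_space(inode, bytes);
		return -EIO;	/* a real caller would propagate its own error */
	}

	/*
	 * On success the metadata half is dropped later, once the delalloc IO
	 * for these bytes completes, via btrfs_delalloc_release_metadata().
	 */
	return 0;
}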
+ * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + */  void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)  {  	btrfs_delalloc_release_metadata(inode, num_bytes);  	btrfs_free_reserved_data_space(inode, num_bytes);  } -static int update_block_group(struct btrfs_trans_handle *trans, -			      struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root,  			      u64 bytenr, u64 num_bytes, int alloc)  {  	struct btrfs_block_group_cache *cache = NULL; @@ -4054,19 +5375,19 @@ static int update_block_group(struct btrfs_trans_handle *trans,  	int factor;  	/* block accounting for super block */ -	spin_lock(&info->delalloc_lock); -	old_val = btrfs_super_bytes_used(&info->super_copy); +	spin_lock(&info->delalloc_root_lock); +	old_val = btrfs_super_bytes_used(info->super_copy);  	if (alloc)  		old_val += num_bytes;  	else  		old_val -= num_bytes; -	btrfs_set_super_bytes_used(&info->super_copy, old_val); -	spin_unlock(&info->delalloc_lock); +	btrfs_set_super_bytes_used(info->super_copy, old_val); +	spin_unlock(&info->delalloc_root_lock);  	while (total) {  		cache = btrfs_lookup_block_group(info, bytenr);  		if (!cache) -			return -1; +			return -ENOENT;  		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |  				    BTRFS_BLOCK_GROUP_RAID1 |  				    BTRFS_BLOCK_GROUP_RAID10)) @@ -4080,7 +5401,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,  		 * space back to the block group, otherwise we will leak space.  		 */  		if (!alloc && cache->cached == BTRFS_CACHE_NO) -			cache_block_group(cache, trans, 1); +			cache_block_group(cache, 1);  		byte_in_group = bytenr - cache->key.objectid;  		WARN_ON(byte_in_group > cache->key.offset); @@ -4088,7 +5409,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,  		spin_lock(&cache->space_info->lock);  		spin_lock(&cache->lock); -		if (btrfs_super_cache_generation(&info->super_copy) != 0 && +		if (btrfs_test_opt(root, SPACE_CACHE) &&  		    cache->disk_cache_state < BTRFS_DC_CLEAR)  			cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -4130,6 +5451,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)  	struct btrfs_block_group_cache *cache;  	u64 bytenr; +	spin_lock(&root->fs_info->block_group_cache_lock); +	bytenr = root->fs_info->first_logical_byte; +	spin_unlock(&root->fs_info->block_group_cache_lock); + +	if (bytenr < (u64)-1) +		return bytenr; +  	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);  	if (!cache)  		return 0; @@ -4157,6 +5485,8 @@ static int pin_down_extent(struct btrfs_root *root,  	set_extent_dirty(root->fs_info->pinned_extents, bytenr,  			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); +	if (reserved) +		trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);  	return 0;  } @@ -4169,7 +5499,7 @@ int btrfs_pin_extent(struct btrfs_root *root,  	struct btrfs_block_group_cache *cache;  	cache = btrfs_lookup_block_group(root->fs_info, bytenr); -	BUG_ON(!cache); +	BUG_ON(!cache); /* Logic error */  	pin_down_extent(root, cache, bytenr, num_bytes, reserved); @@ -4178,48 +5508,170 @@ int btrfs_pin_extent(struct btrfs_root *root,  }  /* - * update size of reserved extents. this function may return -EAGAIN - * if 'reserve' is true or 'sinfo' is false. 
+ * this function must be called within transaction   */ -static int update_reserved_bytes(struct btrfs_block_group_cache *cache, -				 u64 num_bytes, int reserve, int sinfo) +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, +				    u64 bytenr, u64 num_bytes)  { -	int ret = 0; -	if (sinfo) { -		struct btrfs_space_info *space_info = cache->space_info; -		spin_lock(&space_info->lock); -		spin_lock(&cache->lock); -		if (reserve) { -			if (cache->ro) { -				ret = -EAGAIN; -			} else { -				cache->reserved += num_bytes; -				space_info->bytes_reserved += num_bytes; -			} +	struct btrfs_block_group_cache *cache; +	int ret; + +	cache = btrfs_lookup_block_group(root->fs_info, bytenr); +	if (!cache) +		return -EINVAL; + +	/* +	 * pull in the free space cache (if any) so that our pin +	 * removes the free space from the cache.  We have load_only set +	 * to one because the slow code to read in the free extents does check +	 * the pinned extents. +	 */ +	cache_block_group(cache, 1); + +	pin_down_extent(root, cache, bytenr, num_bytes, 0); + +	/* remove us from the free space cache (if we're there at all) */ +	ret = btrfs_remove_free_space(cache, bytenr, num_bytes); +	btrfs_put_block_group(cache); +	return ret; +} + +static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) +{ +	int ret; +	struct btrfs_block_group_cache *block_group; +	struct btrfs_caching_control *caching_ctl; + +	block_group = btrfs_lookup_block_group(root->fs_info, start); +	if (!block_group) +		return -EINVAL; + +	cache_block_group(block_group, 0); +	caching_ctl = get_caching_control(block_group); + +	if (!caching_ctl) { +		/* Logic error */ +		BUG_ON(!block_group_cache_done(block_group)); +		ret = btrfs_remove_free_space(block_group, start, num_bytes); +	} else { +		mutex_lock(&caching_ctl->mutex); + +		if (start >= caching_ctl->progress) { +			ret = add_excluded_extent(root, start, num_bytes); +		} else if (start + num_bytes <= caching_ctl->progress) { +			ret = btrfs_remove_free_space(block_group, +						      start, num_bytes);  		} else { -			if (cache->ro) -				space_info->bytes_readonly += num_bytes; -			cache->reserved -= num_bytes; -			space_info->bytes_reserved -= num_bytes; +			num_bytes = caching_ctl->progress - start; +			ret = btrfs_remove_free_space(block_group, +						      start, num_bytes); +			if (ret) +				goto out_lock; + +			num_bytes = (start + num_bytes) - +				caching_ctl->progress; +			start = caching_ctl->progress; +			ret = add_excluded_extent(root, start, num_bytes);  		} -		spin_unlock(&cache->lock); -		spin_unlock(&space_info->lock); -	} else { -		spin_lock(&cache->lock); +out_lock: +		mutex_unlock(&caching_ctl->mutex); +		put_caching_control(caching_ctl); +	} +	btrfs_put_block_group(block_group); +	return ret; +} + +int btrfs_exclude_logged_extents(struct btrfs_root *log, +				 struct extent_buffer *eb) +{ +	struct btrfs_file_extent_item *item; +	struct btrfs_key key; +	int found_type; +	int i; + +	if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) +		return 0; + +	for (i = 0; i < btrfs_header_nritems(eb); i++) { +		btrfs_item_key_to_cpu(eb, &key, i); +		if (key.type != BTRFS_EXTENT_DATA_KEY) +			continue; +		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); +		found_type = btrfs_file_extent_type(eb, item); +		if (found_type == BTRFS_FILE_EXTENT_INLINE) +			continue; +		if (btrfs_file_extent_disk_bytenr(eb, item) == 0) +			continue; +		key.objectid = btrfs_file_extent_disk_bytenr(eb, item); +		key.offset = btrfs_file_extent_disk_num_bytes(eb, 
item); +		__exclude_logged_extent(log, key.objectid, key.offset); +	} + +	return 0; +} + +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache:	The cache we are manipulating + * @num_bytes:	The number of bytes in question + * @reserve:	One of the reservation enums + * @delalloc:   The blocks are allocated for the delalloc write + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk.  For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. + * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting.  For data we handle the reservation through clearing the + * delalloc bits in the io_tree.  We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. + */ +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, +				       u64 num_bytes, int reserve, int delalloc) +{ +	struct btrfs_space_info *space_info = cache->space_info; +	int ret = 0; + +	spin_lock(&space_info->lock); +	spin_lock(&cache->lock); +	if (reserve != RESERVE_FREE) {  		if (cache->ro) {  			ret = -EAGAIN;  		} else { -			if (reserve) -				cache->reserved += num_bytes; -			else -				cache->reserved -= num_bytes; +			cache->reserved += num_bytes; +			space_info->bytes_reserved += num_bytes; +			if (reserve == RESERVE_ALLOC) { +				trace_btrfs_space_reservation(cache->fs_info, +						"space_info", space_info->flags, +						num_bytes, 0); +				space_info->bytes_may_use -= num_bytes; +			} + +			if (delalloc) +				cache->delalloc_bytes += num_bytes;  		} -		spin_unlock(&cache->lock); +	} else { +		if (cache->ro) +			space_info->bytes_readonly += num_bytes; +		cache->reserved -= num_bytes; +		space_info->bytes_reserved -= num_bytes; + +		if (delalloc) +			cache->delalloc_bytes -= num_bytes;  	} +	spin_unlock(&cache->lock); +	spin_unlock(&space_info->lock);  	return ret;  } -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,  				struct btrfs_root *root)  {  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -4227,7 +5679,7 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,  	struct btrfs_caching_control *caching_ctl;  	struct btrfs_block_group_cache *cache; -	down_write(&fs_info->extent_commit_sem); +	down_write(&fs_info->commit_root_sem);  	list_for_each_entry_safe(caching_ctl, next,  				 &fs_info->caching_block_groups, list) { @@ -4246,25 +5698,28 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,  	else  		fs_info->pinned_extents = &fs_info->freed_extents[0]; -	up_write(&fs_info->extent_commit_sem); +	up_write(&fs_info->commit_root_sem);  	update_global_block_rsv(fs_info); -	return 0;  }  static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)  {  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_block_group_cache *cache = NULL; +	struct btrfs_space_info *space_info; +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;  	u64 len; +	bool readonly;  	while (start <= end) { +	
	readonly = false;  		if (!cache ||  		    start >= cache->key.objectid + cache->key.offset) {  			if (cache)  				btrfs_put_block_group(cache);  			cache = btrfs_lookup_block_group(fs_info, start); -			BUG_ON(!cache); +			BUG_ON(!cache); /* Logic error */  		}  		len = cache->key.objectid + cache->key.offset - start; @@ -4276,20 +5731,31 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)  		}  		start += len; +		space_info = cache->space_info; -		spin_lock(&cache->space_info->lock); +		spin_lock(&space_info->lock);  		spin_lock(&cache->lock);  		cache->pinned -= len; -		cache->space_info->bytes_pinned -= len; +		space_info->bytes_pinned -= len; +		percpu_counter_add(&space_info->total_bytes_pinned, -len);  		if (cache->ro) { -			cache->space_info->bytes_readonly += len; -		} else if (cache->reserved_pinned > 0) { -			len = min(len, cache->reserved_pinned); -			cache->reserved_pinned -= len; -			cache->space_info->bytes_reserved += len; +			space_info->bytes_readonly += len; +			readonly = true;  		}  		spin_unlock(&cache->lock); -		spin_unlock(&cache->space_info->lock); +		if (!readonly && global_rsv->space_info == space_info) { +			spin_lock(&global_rsv->lock); +			if (!global_rsv->full) { +				len = min(len, global_rsv->size - +					  global_rsv->reserved); +				global_rsv->reserved += len; +				space_info->bytes_may_use += len; +				if (global_rsv->reserved >= global_rsv->size) +					global_rsv->full = 1; +			} +			spin_unlock(&global_rsv->lock); +		} +		spin_unlock(&space_info->lock);  	}  	if (cache) @@ -4302,13 +5768,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,  {  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct extent_io_tree *unpin; -	struct btrfs_block_rsv *block_rsv; -	struct btrfs_block_rsv *next_rsv;  	u64 start;  	u64 end; -	int idx;  	int ret; +	if (trans->aborted) +		return 0; +  	if (fs_info->pinned_extents == &fs_info->freed_extents[0])  		unpin = &fs_info->freed_extents[1];  	else @@ -4316,50 +5782,50 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,  	while (1) {  		ret = find_first_extent_bit(unpin, 0, &start, &end, -					    EXTENT_DIRTY); +					    EXTENT_DIRTY, NULL);  		if (ret)  			break; -		ret = btrfs_discard_extent(root, start, end + 1 - start); +		if (btrfs_test_opt(root, DISCARD)) +			ret = btrfs_discard_extent(root, start, +						   end + 1 - start, NULL);  		clear_extent_dirty(unpin, start, end, GFP_NOFS);  		unpin_extent_range(root, start, end);  		cond_resched();  	} -	mutex_lock(&fs_info->durable_block_rsv_mutex); -	list_for_each_entry_safe(block_rsv, next_rsv, -				 &fs_info->durable_block_rsv_list, list) { +	return 0; +} -		idx = trans->transid & 0x1; -		if (block_rsv->freed[idx] > 0) { -			block_rsv_add_bytes(block_rsv, -					    block_rsv->freed[idx], 0); -			block_rsv->freed[idx] = 0; -		} -		if (atomic_read(&block_rsv->usage) == 0) { -			btrfs_block_rsv_release(root, block_rsv, (u64)-1); +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, +			     u64 owner, u64 root_objectid) +{ +	struct btrfs_space_info *space_info; +	u64 flags; -			if (block_rsv->freed[0] == 0 && -			    block_rsv->freed[1] == 0) { -				list_del_init(&block_rsv->list); -				kfree(block_rsv); -			} -		} else { -			btrfs_block_rsv_release(root, block_rsv, 0); -		} +	if (owner < BTRFS_FIRST_FREE_OBJECTID) { +		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) +			flags = BTRFS_BLOCK_GROUP_SYSTEM; +		else +			flags = BTRFS_BLOCK_GROUP_METADATA; +	} else { +		flags = 
BTRFS_BLOCK_GROUP_DATA;  	} -	mutex_unlock(&fs_info->durable_block_rsv_mutex); -	return 0; +	space_info = __find_space_info(fs_info, flags); +	BUG_ON(!space_info); /* Logic bug */ +	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);  } +  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  				struct btrfs_root *root,  				u64 bytenr, u64 num_bytes, u64 parent,  				u64 root_objectid, u64 owner_objectid,  				u64 owner_offset, int refs_to_drop, -				struct btrfs_delayed_extent_op *extent_op) +				struct btrfs_delayed_extent_op *extent_op, +				int no_quota)  {  	struct btrfs_key key;  	struct btrfs_path *path; @@ -4375,6 +5841,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	int num_to_del = 1;  	u32 item_size;  	u64 refs; +	int last_ref = 0; +	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; +	bool skinny_metadata = btrfs_fs_incompat(root->fs_info, +						 SKINNY_METADATA); + +	if (!info->quota_enabled || !is_fstree(root_objectid)) +		no_quota = 1;  	path = btrfs_alloc_path();  	if (!path) @@ -4386,6 +5859,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;  	BUG_ON(!is_data && refs_to_drop != 1); +	if (is_data) +		skinny_metadata = 0; +  	ret = lookup_extent_backref(trans, extent_root, path, &iref,  				    bytenr, num_bytes, parent,  				    root_objectid, owner_objectid, @@ -4402,6 +5878,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  				found_extent = 1;  				break;  			} +			if (key.type == BTRFS_METADATA_ITEM_KEY && +			    key.offset == owner_objectid) { +				found_extent = 1; +				break; +			}  			if (path->slots[0] - extent_slot > 5)  				break;  			extent_slot--; @@ -4415,36 +5896,73 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			BUG_ON(iref);  			ret = remove_extent_backref(trans, extent_root, path,  						    NULL, refs_to_drop, -						    is_data); -			BUG_ON(ret); -			btrfs_release_path(extent_root, path); +						    is_data, &last_ref); +			if (ret) { +				btrfs_abort_transaction(trans, extent_root, ret); +				goto out; +			} +			btrfs_release_path(path);  			path->leave_spinning = 1;  			key.objectid = bytenr;  			key.type = BTRFS_EXTENT_ITEM_KEY;  			key.offset = num_bytes; +			if (!is_data && skinny_metadata) { +				key.type = BTRFS_METADATA_ITEM_KEY; +				key.offset = owner_objectid; +			} +  			ret = btrfs_search_slot(trans, extent_root,  						&key, path, -1, 1); +			if (ret > 0 && skinny_metadata && path->slots[0]) { +				/* +				 * Couldn't find our skinny metadata item, +				 * see if we have ye olde extent item. 
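For illustration only, not part of the patch: the two key shapes the lookup above has to handle once the SKINNY_METADATA incompat flag is set. The helper name is made up; the field values mirror the assignments in this hunk.

static void fill_extent_keys(struct btrfs_key *skinny,
			     struct btrfs_key *classic,
			     u64 bytenr, u64 num_bytes, u64 owner_objectid)
{
	/* Skinny form: offset holds the owner (the tree level), not a size. */
	skinny->objectid = bytenr;
	skinny->type = BTRFS_METADATA_ITEM_KEY;
	skinny->offset = owner_objectid;

	/* Classic form: offset is the extent size in bytes. */
	classic->objectid = bytenr;
	classic->type = BTRFS_EXTENT_ITEM_KEY;
	classic->offset = num_bytes;
}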
+				 */ +				path->slots[0]--; +				btrfs_item_key_to_cpu(path->nodes[0], &key, +						      path->slots[0]); +				if (key.objectid == bytenr && +				    key.type == BTRFS_EXTENT_ITEM_KEY && +				    key.offset == num_bytes) +					ret = 0; +			} + +			if (ret > 0 && skinny_metadata) { +				skinny_metadata = false; +				key.objectid = bytenr; +				key.type = BTRFS_EXTENT_ITEM_KEY; +				key.offset = num_bytes; +				btrfs_release_path(path); +				ret = btrfs_search_slot(trans, extent_root, +							&key, path, -1, 1); +			} +  			if (ret) { -				printk(KERN_ERR "umm, got %d back from search" -				       ", was looking for %llu\n", ret, -				       (unsigned long long)bytenr); -				btrfs_print_leaf(extent_root, path->nodes[0]); +				btrfs_err(info, "umm, got %d back from search, was looking for %llu", +					ret, bytenr); +				if (ret > 0) +					btrfs_print_leaf(extent_root, +							 path->nodes[0]); +			} +			if (ret < 0) { +				btrfs_abort_transaction(trans, extent_root, ret); +				goto out;  			} -			BUG_ON(ret);  			extent_slot = path->slots[0];  		} -	} else { +	} else if (WARN_ON(ret == -ENOENT)) {  		btrfs_print_leaf(extent_root, path->nodes[0]); -		WARN_ON(1); -		printk(KERN_ERR "btrfs unable to find ref byte nr %llu " -		       "parent %llu root %llu  owner %llu offset %llu\n", -		       (unsigned long long)bytenr, -		       (unsigned long long)parent, -		       (unsigned long long)root_objectid, -		       (unsigned long long)owner_objectid, -		       (unsigned long long)owner_offset); +		btrfs_err(info, +			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu", +			bytenr, parent, root_objectid, owner_objectid, +			owner_offset); +		btrfs_abort_transaction(trans, extent_root, ret); +		goto out; +	} else { +		btrfs_abort_transaction(trans, extent_root, ret); +		goto out;  	}  	leaf = path->nodes[0]; @@ -4454,9 +5972,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  		BUG_ON(found_extent || extent_slot != path->slots[0]);  		ret = convert_extent_item_v0(trans, extent_root, path,  					     owner_objectid, 0); -		BUG_ON(ret < 0); +		if (ret < 0) { +			btrfs_abort_transaction(trans, extent_root, ret); +			goto out; +		} -		btrfs_release_path(extent_root, path); +		btrfs_release_path(path);  		path->leave_spinning = 1;  		key.objectid = bytenr; @@ -4466,12 +5987,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  		ret = btrfs_search_slot(trans, extent_root, &key, path,  					-1, 1);  		if (ret) { -			printk(KERN_ERR "umm, got %d back from search" -			       ", was looking for %llu\n", ret, -			       (unsigned long long)bytenr); +			btrfs_err(info, "umm, got %d back from search, was looking for %llu", +				ret, bytenr);  			btrfs_print_leaf(extent_root, path->nodes[0]);  		} -		BUG_ON(ret); +		if (ret < 0) { +			btrfs_abort_transaction(trans, extent_root, ret); +			goto out; +		} +  		extent_slot = path->slots[0];  		leaf = path->nodes[0];  		item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -4480,7 +6004,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	BUG_ON(item_size < sizeof(*ei));  	ei = btrfs_item_ptr(leaf, extent_slot,  			    struct btrfs_extent_item); -	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { +	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && +	    key.type == BTRFS_EXTENT_ITEM_KEY) {  		struct btrfs_tree_block_info *bi;  		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));  		bi = (struct btrfs_tree_block_info *)(ei + 1); @@ -4488,10 +6013,17 @@ static int 
__btrfs_free_extent(struct btrfs_trans_handle *trans,  	}  	refs = btrfs_extent_refs(leaf, ei); -	BUG_ON(refs < refs_to_drop); +	if (refs < refs_to_drop) { +		btrfs_err(info, "trying to drop %d refs but we only have %Lu " +			  "for bytenr %Lu", refs_to_drop, refs, bytenr); +		ret = -EINVAL; +		btrfs_abort_transaction(trans, extent_root, ret); +		goto out; +	}  	refs -= refs_to_drop;  	if (refs > 0) { +		type = BTRFS_QGROUP_OPER_SUB_SHARED;  		if (extent_op)  			__run_delayed_extent_op(extent_op, leaf, ei);  		/* @@ -4507,9 +6039,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  		if (found_extent) {  			ret = remove_extent_backref(trans, extent_root, path,  						    iref, refs_to_drop, -						    is_data); -			BUG_ON(ret); +						    is_data, &last_ref); +			if (ret) { +				btrfs_abort_transaction(trans, extent_root, ret); +				goto out; +			}  		} +		add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, +				 root_objectid);  	} else {  		if (found_extent) {  			BUG_ON(is_data && refs_to_drop != @@ -4523,23 +6060,44 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  			}  		} +		last_ref = 1;  		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],  				      num_to_del); -		BUG_ON(ret); -		btrfs_release_path(extent_root, path); +		if (ret) { +			btrfs_abort_transaction(trans, extent_root, ret); +			goto out; +		} +		btrfs_release_path(path);  		if (is_data) {  			ret = btrfs_del_csums(trans, root, bytenr, num_bytes); -			BUG_ON(ret); -		} else { -			invalidate_mapping_pages(info->btree_inode->i_mapping, -			     bytenr >> PAGE_CACHE_SHIFT, -			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); +			if (ret) { +				btrfs_abort_transaction(trans, extent_root, ret); +				goto out; +			}  		} -		ret = update_block_group(trans, root, bytenr, num_bytes, 0); -		BUG_ON(ret); +		ret = update_block_group(root, bytenr, num_bytes, 0); +		if (ret) { +			btrfs_abort_transaction(trans, extent_root, ret); +			goto out; +		} +	} +	btrfs_release_path(path); + +	/* Deal with the quota accounting */ +	if (!ret && last_ref && !no_quota) { +		int mod_seq = 0; + +		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && +		    type == BTRFS_QGROUP_OPER_SUB_SHARED) +			mod_seq = 1; + +		ret = btrfs_qgroup_record_ref(trans, info, root_objectid, +					      bytenr, num_bytes, type, +					      mod_seq);  	} +out:  	btrfs_free_path(path);  	return ret;  } @@ -4555,30 +6113,22 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  {  	struct btrfs_delayed_ref_head *head;  	struct btrfs_delayed_ref_root *delayed_refs; -	struct btrfs_delayed_ref_node *ref; -	struct rb_node *node;  	int ret = 0;  	delayed_refs = &trans->transaction->delayed_refs;  	spin_lock(&delayed_refs->lock);  	head = btrfs_find_delayed_ref_head(trans, bytenr);  	if (!head) -		goto out; +		goto out_delayed_unlock; -	node = rb_prev(&head->node.rb_node); -	if (!node) -		goto out; - -	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - -	/* there are still entries for this ref, we can't drop it */ -	if (ref->bytenr == bytenr) +	spin_lock(&head->lock); +	if (rb_first(&head->ref_root))  		goto out;  	if (head->extent_op) {  		if (!head->must_insert_reserved)  			goto out; -		kfree(head->extent_op); +		btrfs_free_delayed_extent_op(head->extent_op);  		head->extent_op = NULL;  	} @@ -4594,19 +6144,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  	 * ahead and process it.  	 
*/  	head->node.in_tree = 0; -	rb_erase(&head->node.rb_node, &delayed_refs->root); +	rb_erase(&head->href_node, &delayed_refs->href_root); -	delayed_refs->num_entries--; +	atomic_dec(&delayed_refs->num_entries);  	/*  	 * we don't take a ref on the node because we're removing it from the  	 * tree, so we just steal the ref the tree was holding.  	 */  	delayed_refs->num_heads--; -	if (list_empty(&head->cluster)) +	if (head->processing == 0)  		delayed_refs->num_heads_ready--; - -	list_del_init(&head->cluster); +	head->processing = 0; +	spin_unlock(&head->lock);  	spin_unlock(&delayed_refs->lock);  	BUG_ON(head->extent_op); @@ -4617,6 +6167,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,  	btrfs_put_delayed_ref(&head->node);  	return ret;  out: +	spin_unlock(&head->lock); + +out_delayed_unlock:  	spin_unlock(&delayed_refs->lock);  	return 0;  } @@ -4626,89 +6179,70 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  			   struct extent_buffer *buf,  			   u64 parent, int last_ref)  { -	struct btrfs_block_rsv *block_rsv;  	struct btrfs_block_group_cache *cache = NULL; +	int pin = 1;  	int ret;  	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { -		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, -						parent, root->root_key.objectid, -						btrfs_header_level(buf), -						BTRFS_DROP_DELAYED_REF, NULL); -		BUG_ON(ret); +		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, +					buf->start, buf->len, +					parent, root->root_key.objectid, +					btrfs_header_level(buf), +					BTRFS_DROP_DELAYED_REF, NULL, 0); +		BUG_ON(ret); /* -ENOMEM */  	}  	if (!last_ref)  		return; -	block_rsv = get_block_rsv(trans, root);  	cache = btrfs_lookup_block_group(root->fs_info, buf->start); -	if (block_rsv->space_info != cache->space_info) -		goto out;  	if (btrfs_header_generation(buf) == trans->transid) {  		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {  			ret = check_ref_cleanup(trans, root, buf->start);  			if (!ret) -				goto pin; +				goto out;  		}  		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {  			pin_down_extent(root, cache, buf->start, buf->len, 1); -			goto pin; +			goto out;  		}  		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));  		btrfs_add_free_space(cache, buf->start, buf->len); -		ret = update_reserved_bytes(cache, buf->len, 0, 0); -		if (ret == -EAGAIN) { -			/* block group became read-only */ -			update_reserved_bytes(cache, buf->len, 0, 1); -			goto out; -		} - -		ret = 1; -		spin_lock(&block_rsv->lock); -		if (block_rsv->reserved < block_rsv->size) { -			block_rsv->reserved += buf->len; -			ret = 0; -		} -		spin_unlock(&block_rsv->lock); - -		if (ret) { -			spin_lock(&cache->space_info->lock); -			cache->space_info->bytes_reserved -= buf->len; -			spin_unlock(&cache->space_info->lock); -		} -		goto out; -	} -pin: -	if (block_rsv->durable && !cache->ro) { -		ret = 0; -		spin_lock(&cache->lock); -		if (!cache->ro) { -			cache->reserved_pinned += buf->len; -			ret = 1; -		} -		spin_unlock(&cache->lock); - -		if (ret) { -			spin_lock(&block_rsv->lock); -			block_rsv->freed[trans->transid & 0x1] += buf->len; -			spin_unlock(&block_rsv->lock); -		} +		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); +		trace_btrfs_reserved_extent_free(root, buf->start, buf->len); +		pin = 0;  	}  out: +	if (pin) +		add_pinned_bytes(root->fs_info, buf->len, +				 btrfs_header_level(buf), +				 root->root_key.objectid); + +	/* +	 * Deleting the buffer, clear the corrupt flag since it doesn't matter +	 * 
anymore. +	 */ +	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);  	btrfs_put_block_group(cache);  } -int btrfs_free_extent(struct btrfs_trans_handle *trans, -		      struct btrfs_root *root, -		      u64 bytenr, u64 num_bytes, u64 parent, -		      u64 root_objectid, u64 owner, u64 offset) +/* Can return -ENOMEM */ +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, +		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, +		      u64 owner, u64 offset, int no_quota)  {  	int ret; +	struct btrfs_fs_info *fs_info = root->fs_info; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) +		return 0; +#endif +	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);  	/*  	 * tree log blocks never actually go into the extent allocation @@ -4720,23 +6254,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,  		btrfs_pin_extent(root, bytenr, num_bytes, 1);  		ret = 0;  	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) { -		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, +		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, +					num_bytes,  					parent, root_objectid, (int)owner, -					BTRFS_DROP_DELAYED_REF, NULL); -		BUG_ON(ret); +					BTRFS_DROP_DELAYED_REF, NULL, no_quota);  	} else { -		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, -					parent, root_objectid, owner, -					offset, BTRFS_DROP_DELAYED_REF, NULL); -		BUG_ON(ret); +		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, +						num_bytes, +						parent, root_objectid, owner, +						offset, BTRFS_DROP_DELAYED_REF, +						NULL, no_quota);  	}  	return ret;  } -static u64 stripe_align(struct btrfs_root *root, u64 val) +static u64 stripe_align(struct btrfs_root *root, +			struct btrfs_block_group_cache *cache, +			u64 val, u64 num_bytes)  { -	u64 mask = ((u64)root->stripesize - 1); -	u64 ret = (val + mask) & ~mask; +	u64 ret = ALIGN(val, root->stripesize);  	return ret;  } @@ -4750,106 +6286,198 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)   * for our min num_bytes.  Another option is to have it go ahead   * and look in the rbtree for a free extent of a given size, but this   * is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group.   */ -static noinline int +static noinline void  wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,  				u64 num_bytes)  {  	struct btrfs_caching_control *caching_ctl; -	DEFINE_WAIT(wait);  	caching_ctl = get_caching_control(cache);  	if (!caching_ctl) -		return 0; +		return;  	wait_event(caching_ctl->wait, block_group_cache_done(cache) || -		   (cache->free_space >= num_bytes)); +		   (cache->free_space_ctl->free_space >= num_bytes));  	put_caching_control(caching_ctl); -	return 0;  }  static noinline int  wait_block_group_cache_done(struct btrfs_block_group_cache *cache)  {  	struct btrfs_caching_control *caching_ctl; -	DEFINE_WAIT(wait); +	int ret = 0;  	caching_ctl = get_caching_control(cache);  	if (!caching_ctl) -		return 0; +		return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0;  	wait_event(caching_ctl->wait, block_group_cache_done(cache)); - +	if (cache->cached == BTRFS_CACHE_ERROR) +		ret = -EIO;  	put_caching_control(caching_ctl); -	return 0; +	return ret;  } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +int __get_raid_index(u64 flags)  { -	int index; -	if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) -		index = 0; -	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) -		index = 1; -	else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) -		index = 2; -	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) -		index = 3; -	else -		index = 4; -	return index; +	if (flags & BTRFS_BLOCK_GROUP_RAID10) +		return BTRFS_RAID_RAID10; +	else if (flags & BTRFS_BLOCK_GROUP_RAID1) +		return BTRFS_RAID_RAID1; +	else if (flags & BTRFS_BLOCK_GROUP_DUP) +		return BTRFS_RAID_DUP; +	else if (flags & BTRFS_BLOCK_GROUP_RAID0) +		return BTRFS_RAID_RAID0; +	else if (flags & BTRFS_BLOCK_GROUP_RAID5) +		return BTRFS_RAID_RAID5; +	else if (flags & BTRFS_BLOCK_GROUP_RAID6) +		return BTRFS_RAID_RAID6; + +	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ +} + +int get_block_group_index(struct btrfs_block_group_cache *cache) +{ +	return __get_raid_index(cache->flags); +} + +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { +	[BTRFS_RAID_RAID10]	= "raid10", +	[BTRFS_RAID_RAID1]	= "raid1", +	[BTRFS_RAID_DUP]	= "dup", +	[BTRFS_RAID_RAID0]	= "raid0", +	[BTRFS_RAID_SINGLE]	= "single", +	[BTRFS_RAID_RAID5]	= "raid5", +	[BTRFS_RAID_RAID6]	= "raid6", +}; + +static const char *get_raid_name(enum btrfs_raid_types type) +{ +	if (type >= BTRFS_NR_RAID_TYPES) +		return NULL; + +	return btrfs_raid_type_names[type];  }  enum btrfs_loop_type { -	LOOP_FIND_IDEAL = 0, -	LOOP_CACHING_NOWAIT = 1, -	LOOP_CACHING_WAIT = 2, -	LOOP_ALLOC_CHUNK = 3, -	LOOP_NO_EMPTY_SIZE = 4, +	LOOP_CACHING_NOWAIT = 0, +	LOOP_CACHING_WAIT = 1, +	LOOP_ALLOC_CHUNK = 2, +	LOOP_NO_EMPTY_SIZE = 3,  }; +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, +		       int delalloc) +{ +	if (delalloc) +		down_read(&cache->data_rwsem); +} + +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, +		       int delalloc) +{ +	btrfs_get_block_group(cache); +	if (delalloc) +		down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, +		   struct btrfs_free_cluster *cluster, +		   int delalloc) +{ +	struct btrfs_block_group_cache *used_bg; +	bool locked = false; +again: +	spin_lock(&cluster->refill_lock); +	if (locked) { +		if (used_bg == cluster->block_group) +			return used_bg; + +		up_read(&used_bg->data_rwsem); +		btrfs_put_block_group(used_bg); +	} + +	used_bg = cluster->block_group; +	if (!used_bg) +		return NULL; + +	if (used_bg == block_group) +		return used_bg; + +	btrfs_get_block_group(used_bg); + +	if (!delalloc) +		return used_bg; + +	if (down_read_trylock(&used_bg->data_rwsem)) +		return used_bg; + +	spin_unlock(&cluster->refill_lock); +	down_read(&used_bg->data_rwsem); +	locked = true; +	goto again; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, +			 int delalloc) +{ +	if (delalloc) +		up_read(&cache->data_rwsem); +	btrfs_put_block_group(cache); +} +  /*   * walks the btree of allocated extents and find a hole of a given size.   
* The key ins is changed to record the hole: - * ins->objectid == block start + * ins->objectid == start position   * ins->flags = BTRFS_EXTENT_ITEM_KEY - * ins->offset == number of blocks + * ins->offset == the size of the hole.   * Any available blocks before search_start are skipped. + * + * If there is no suitable free space, we will record the max size of + * the free space extent currently.   */ -static noinline int find_free_extent(struct btrfs_trans_handle *trans, -				     struct btrfs_root *orig_root, +static noinline int find_free_extent(struct btrfs_root *orig_root,  				     u64 num_bytes, u64 empty_size, -				     u64 search_start, u64 search_end,  				     u64 hint_byte, struct btrfs_key *ins, -				     int data) +				     u64 flags, int delalloc)  {  	int ret = 0;  	struct btrfs_root *root = orig_root->fs_info->extent_root;  	struct btrfs_free_cluster *last_ptr = NULL;  	struct btrfs_block_group_cache *block_group = NULL; +	u64 search_start = 0; +	u64 max_extent_size = 0;  	int empty_cluster = 2 * 1024 * 1024; -	int allowed_chunk_alloc = 0; -	int done_chunk_alloc = 0;  	struct btrfs_space_info *space_info; -	int last_ptr_loop = 0;  	int loop = 0; -	int index = 0; -	bool found_uncached_bg = false; +	int index = __get_raid_index(flags); +	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? +		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;  	bool failed_cluster_refill = false;  	bool failed_alloc = false;  	bool use_cluster = true; -	u64 ideal_cache_percent = 0; -	u64 ideal_cache_offset = 0; +	bool have_caching_bg = false;  	WARN_ON(num_bytes < root->sectorsize);  	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);  	ins->objectid = 0;  	ins->offset = 0; -	space_info = __find_space_info(root->fs_info, data); +	trace_find_free_extent(orig_root, num_bytes, empty_size, flags); + +	space_info = __find_space_info(root->fs_info, flags);  	if (!space_info) { -		printk(KERN_ERR "No space info for %d\n", data); +		btrfs_err(root->fs_info, "No space info for %llu", flags);  		return -ENOSPC;  	} @@ -4860,16 +6488,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,  	if (btrfs_mixed_space_info(space_info))  		use_cluster = false; -	if (orig_root->ref_cows || empty_size) -		allowed_chunk_alloc = 1; - -	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { +	if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {  		last_ptr = &root->fs_info->meta_alloc_cluster;  		if (!btrfs_test_opt(root, SSD))  			empty_cluster = 64 * 1024;  	} -	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && +	if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&  	    btrfs_test_opt(root, SSD)) {  		last_ptr = &root->fs_info->data_alloc_cluster;  	} @@ -4888,7 +6513,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,  		empty_cluster = 0;  	if (search_start == hint_byte) { -ideal_cache:  		block_group = btrfs_lookup_block_group(root->fs_info,  						       search_start);  		/* @@ -4898,9 +6522,8 @@ ideal_cache:  		 * However if we are re-searching with an ideal block group  		 * picked out then we don't care that the block group is cached.  		 
*/ -		if (block_group && block_group_bits(block_group, data) && -		    (block_group->cached != BTRFS_CACHE_NO || -		     search_start == ideal_cache_offset)) { +		if (block_group && block_group_bits(block_group, flags) && +		    block_group->cached != BTRFS_CACHE_NO) {  			down_read(&space_info->groups_sem);  			if (list_empty(&block_group->list) ||  			    block_group->ro) { @@ -4914,6 +6537,7 @@ ideal_cache:  				up_read(&space_info->groups_sem);  			} else {  				index = get_block_group_index(block_group); +				btrfs_lock_block_group(block_group, delalloc);  				goto have_block_group;  			}  		} else if (block_group) { @@ -4921,145 +6545,156 @@ ideal_cache:  		}  	}  search: +	have_caching_bg = false;  	down_read(&space_info->groups_sem);  	list_for_each_entry(block_group, &space_info->block_groups[index],  			    list) {  		u64 offset;  		int cached; -		btrfs_get_block_group(block_group); +		btrfs_grab_block_group(block_group, delalloc);  		search_start = block_group->key.objectid; -have_block_group: -		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { -			u64 free_percent; - -			ret = cache_block_group(block_group, trans, 1); -			if (block_group->cached == BTRFS_CACHE_FINISHED) -				goto have_block_group; - -			free_percent = btrfs_block_group_used(&block_group->item); -			free_percent *= 100; -			free_percent = div64_u64(free_percent, -						 block_group->key.offset); -			free_percent = 100 - free_percent; -			if (free_percent > ideal_cache_percent && -			    likely(!block_group->ro)) { -				ideal_cache_offset = block_group->key.objectid; -				ideal_cache_percent = free_percent; -			} - -			/* -			 * We only want to start kthread caching if we are at -			 * the point where we will wait for caching to make -			 * progress, or if our ideal search is over and we've -			 * found somebody to start caching. -			 */ -			if (loop > LOOP_CACHING_NOWAIT || -			    (loop > LOOP_FIND_IDEAL && -			     atomic_read(&space_info->caching_threads) < 2)) { -				ret = cache_block_group(block_group, trans, 0); -				BUG_ON(ret); -			} -			found_uncached_bg = true; +		/* +		 * this can happen if we end up cycling through all the +		 * raid types, but we want to make sure we only allocate +		 * for the proper type. +		 */ +		if (!block_group_bits(block_group, flags)) { +		    u64 extra = BTRFS_BLOCK_GROUP_DUP | +				BTRFS_BLOCK_GROUP_RAID1 | +				BTRFS_BLOCK_GROUP_RAID5 | +				BTRFS_BLOCK_GROUP_RAID6 | +				BTRFS_BLOCK_GROUP_RAID10;  			/* -			 * If loop is set for cached only, try the next block -			 * group. +			 * if they asked for extra copies and this block group +			 * doesn't provide them, bail.  This does allow us to +			 * fill raid0 from raid1.  			 
*/ -			if (loop == LOOP_FIND_IDEAL) +			if ((flags & extra) && !(block_group->flags & extra))  				goto loop;  		} +have_block_group:  		cached = block_group_cache_done(block_group); -		if (unlikely(!cached)) -			found_uncached_bg = true; +		if (unlikely(!cached)) { +			ret = cache_block_group(block_group, 0); +			BUG_ON(ret < 0); +			ret = 0; +		} +		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) +			goto loop;  		if (unlikely(block_group->ro))  			goto loop;  		/* -		 * Ok we want to try and use the cluster allocator, so lets look -		 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will -		 * have tried the cluster allocator plenty of times at this -		 * point and not have found anything, so we are likely way too -		 * fragmented for the clustering stuff to find anything, so lets -		 * just skip it and let the allocator find whatever block it can -		 * find +		 * Ok we want to try and use the cluster allocator, so +		 * lets look there  		 */ -		if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { +		if (last_ptr) { +			struct btrfs_block_group_cache *used_block_group; +			unsigned long aligned_cluster;  			/*  			 * the refill lock keeps out other  			 * people trying to start a new cluster  			 */ -			spin_lock(&last_ptr->refill_lock); -			if (last_ptr->block_group && -			    (last_ptr->block_group->ro || -			    !block_group_bits(last_ptr->block_group, data))) { -				offset = 0; +			used_block_group = btrfs_lock_cluster(block_group, +							      last_ptr, +							      delalloc); +			if (!used_block_group)  				goto refill_cluster; -			} -			offset = btrfs_alloc_from_cluster(block_group, last_ptr, -						 num_bytes, search_start); +			if (used_block_group != block_group && +			    (used_block_group->ro || +			     !block_group_bits(used_block_group, flags))) +				goto release_cluster; + +			offset = btrfs_alloc_from_cluster(used_block_group, +						last_ptr, +						num_bytes, +						used_block_group->key.objectid, +						&max_extent_size);  			if (offset) {  				/* we have a block, we're done */  				spin_unlock(&last_ptr->refill_lock); +				trace_btrfs_reserve_extent_cluster(root, +						used_block_group, +						search_start, num_bytes); +				if (used_block_group != block_group) { +					btrfs_release_block_group(block_group, +								  delalloc); +					block_group = used_block_group; +				}  				goto checks;  			} -			spin_lock(&last_ptr->lock); -			/* -			 * whoops, this cluster doesn't actually point to -			 * this block group.  Get a ref on the block -			 * group is does point to and try again -			 */ -			if (!last_ptr_loop && last_ptr->block_group && -			    last_ptr->block_group != block_group) { - -				btrfs_put_block_group(block_group); -				block_group = last_ptr->block_group; -				btrfs_get_block_group(block_group); -				spin_unlock(&last_ptr->lock); +			WARN_ON(last_ptr->block_group != used_block_group); +release_cluster: +			/* If we are on LOOP_NO_EMPTY_SIZE, we can't +			 * set up a new clusters, so lets just skip it +			 * and let the allocator find whatever block +			 * it can find.  If we reach this point, we +			 * will have tried the cluster allocator +			 * plenty of times and not have found +			 * anything, so we are likely way too +			 * fragmented for the clustering stuff to find +			 * anything. +			 * +			 * However, if the cluster is taken from the +			 * current block group, release the cluster +			 * first, so that we stand a better chance of +			 * succeeding in the unclustered +			 * allocation.  
*/ +			if (loop >= LOOP_NO_EMPTY_SIZE && +			    used_block_group != block_group) {  				spin_unlock(&last_ptr->refill_lock); - -				last_ptr_loop = 1; -				search_start = block_group->key.objectid; -				/* -				 * we know this block group is properly -				 * in the list because -				 * btrfs_remove_block_group, drops the -				 * cluster before it removes the block -				 * group from the list -				 */ -				goto have_block_group; +				btrfs_release_block_group(used_block_group, +							  delalloc); +				goto unclustered_alloc;  			} -			spin_unlock(&last_ptr->lock); -refill_cluster: +  			/*  			 * this cluster didn't work out, free it and  			 * start over  			 */  			btrfs_return_cluster_to_free_space(NULL, last_ptr); -			last_ptr_loop = 0; +			if (used_block_group != block_group) +				btrfs_release_block_group(used_block_group, +							  delalloc); +refill_cluster: +			if (loop >= LOOP_NO_EMPTY_SIZE) { +				spin_unlock(&last_ptr->refill_lock); +				goto unclustered_alloc; +			} + +			aligned_cluster = max_t(unsigned long, +						empty_cluster + empty_size, +					      block_group->full_stripe_len);  			/* allocate a cluster in this block group */ -			ret = btrfs_find_space_cluster(trans, root, -					       block_group, last_ptr, -					       offset, num_bytes, -					       empty_cluster + empty_size); +			ret = btrfs_find_space_cluster(root, block_group, +						       last_ptr, search_start, +						       num_bytes, +						       aligned_cluster);  			if (ret == 0) {  				/*  				 * now pull our allocation out of this  				 * cluster  				 */  				offset = btrfs_alloc_from_cluster(block_group, -						  last_ptr, num_bytes, -						  search_start); +							last_ptr, +							num_bytes, +							search_start, +							&max_extent_size);  				if (offset) {  					/* we found one, proceed */  					spin_unlock(&last_ptr->refill_lock); +					trace_btrfs_reserve_extent_cluster(root, +						block_group, search_start, +						num_bytes);  					goto checks;  				}  			} else if (!cached && loop > LOOP_CACHING_NOWAIT @@ -5083,8 +6718,23 @@ refill_cluster:  			goto loop;  		} +unclustered_alloc: +		spin_lock(&block_group->free_space_ctl->tree_lock); +		if (cached && +		    block_group->free_space_ctl->free_space < +		    num_bytes + empty_cluster + empty_size) { +			if (block_group->free_space_ctl->free_space > +			    max_extent_size) +				max_extent_size = +					block_group->free_space_ctl->free_space; +			spin_unlock(&block_group->free_space_ctl->tree_lock); +			goto loop; +		} +		spin_unlock(&block_group->free_space_ctl->tree_lock); +  		offset = btrfs_find_space_for_alloc(block_group, search_start, -						    num_bytes, empty_size); +						    num_bytes, empty_size, +						    &max_extent_size);  		/*  		 * If we didn't find a chunk, and we haven't failed on this  		 * block group before, and this block group is in the middle of @@ -5101,15 +6751,13 @@ refill_cluster:  			failed_alloc = true;  			goto have_block_group;  		} else if (!offset) { +			if (!cached) +				have_caching_bg = true;  			goto loop;  		}  checks: -		search_start = stripe_align(root, offset); -		/* move on to the next group */ -		if (search_start + num_bytes >= search_end) { -			btrfs_add_free_space(block_group, offset, num_bytes); -			goto loop; -		} +		search_start = stripe_align(root, block_group, +					    offset, num_bytes);  		/* move on to the next group */  		if (search_start + num_bytes > @@ -5118,16 +6766,13 @@ checks:  			goto loop;  		} -		ins->objectid = search_start; -		ins->offset = num_bytes; -  		if (offset < 
search_start)  			btrfs_add_free_space(block_group, offset,  					     search_start - offset);  		BUG_ON(offset > search_start); -		ret = update_reserved_bytes(block_group, num_bytes, 1, -					    (data & BTRFS_BLOCK_GROUP_DATA)); +		ret = btrfs_update_reserved_bytes(block_group, num_bytes, +						  alloc_type, delalloc);  		if (ret == -EAGAIN) {  			btrfs_add_free_space(block_group, offset, num_bytes);  			goto loop; @@ -5137,25 +6782,25 @@ checks:  		ins->objectid = search_start;  		ins->offset = num_bytes; -		if (offset < search_start) -			btrfs_add_free_space(block_group, offset, -					     search_start - offset); -		BUG_ON(offset > search_start); +		trace_btrfs_reserve_extent(orig_root, block_group, +					   search_start, num_bytes); +		btrfs_release_block_group(block_group, delalloc);  		break;  loop:  		failed_cluster_refill = false;  		failed_alloc = false;  		BUG_ON(index != get_block_group_index(block_group)); -		btrfs_put_block_group(block_group); +		btrfs_release_block_group(block_group, delalloc);  	}  	up_read(&space_info->groups_sem); +	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) +		goto search; +  	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)  		goto search; -	/* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for -	 *			for them to make caching progress.  Also -	 *			determine the best possible bg to cache +	/*  	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking  	 *			caching kthreads as we move along  	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching @@ -5163,85 +6808,55 @@ loop:  	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try  	 *			again  	 */ -	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && -	    (found_uncached_bg || empty_size || empty_cluster || -	     allowed_chunk_alloc)) { +	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {  		index = 0; -		if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { -			found_uncached_bg = false; -			loop++; -			if (!ideal_cache_percent && -			    atomic_read(&space_info->caching_threads)) -				goto search; +		loop++; +		if (loop == LOOP_ALLOC_CHUNK) { +			struct btrfs_trans_handle *trans; +			int exist = 0; +			trans = current->journal_info; +			if (trans) +				exist = 1; +			else +				trans = btrfs_join_transaction(root); + +			if (IS_ERR(trans)) { +				ret = PTR_ERR(trans); +				goto out; +			} + +			ret = do_chunk_alloc(trans, root, flags, +					     CHUNK_ALLOC_FORCE);  			/* -			 * 1 of the following 2 things have happened so far -			 * -			 * 1) We found an ideal block group for caching that -			 * is mostly full and will cache quickly, so we might -			 * as well wait for it. -			 * -			 * 2) We searched for cached only and we didn't find -			 * anything, and we didn't start any caching kthreads -			 * either, so chances are we will loop through and -			 * start a couple caching kthreads, and then come back -			 * around and just wait for them.  This will be slower -			 * because we will have 2 caching kthreads reading at -			 * the same time when we could have just started one -			 * and waited for it to get far enough to give us an -			 * allocation, so go ahead and go to the wait caching -			 * loop. -			 */ -			loop = LOOP_CACHING_WAIT; -			search_start = ideal_cache_offset; -			ideal_cache_percent = 0; -			goto ideal_cache; -		} else if (loop == LOOP_FIND_IDEAL) { -			/* -			 * Didn't find a uncached bg, wait on anything we find -			 * next. 
+			 * Do not bail out on ENOSPC since we +			 * can do more things.  			 */ -			loop = LOOP_CACHING_WAIT; -			goto search; -		} - -		if (loop < LOOP_CACHING_WAIT) { -			loop++; -			goto search; +			if (ret < 0 && ret != -ENOSPC) +				btrfs_abort_transaction(trans, +							root, ret); +			else +				ret = 0; +			if (!exist) +				btrfs_end_transaction(trans, root); +			if (ret) +				goto out;  		} -		if (loop == LOOP_ALLOC_CHUNK) { +		if (loop == LOOP_NO_EMPTY_SIZE) {  			empty_size = 0;  			empty_cluster = 0;  		} -		if (allowed_chunk_alloc) { -			ret = do_chunk_alloc(trans, root, num_bytes + -					     2 * 1024 * 1024, data, 1); -			allowed_chunk_alloc = 0; -			done_chunk_alloc = 1; -		} else if (!done_chunk_alloc) { -			space_info->force_alloc = 1; -		} - -		if (loop < LOOP_NO_EMPTY_SIZE) { -			loop++; -			goto search; -		} -		ret = -ENOSPC; +		goto search;  	} else if (!ins->objectid) {  		ret = -ENOSPC; -	} - -	/* we found what we needed */ -	if (ins->objectid) { -		if (!(data & BTRFS_BLOCK_GROUP_DATA)) -			trans->block_group = block_group->key.objectid; - -		btrfs_put_block_group(block_group); +	} else if (ins->objectid) {  		ret = 0;  	} - +out: +	if (ret == -ENOSPC) +		ins->offset = max_extent_size;  	return ret;  } @@ -5252,19 +6867,16 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  	int index = 0;  	spin_lock(&info->lock); -	printk(KERN_INFO "space_info has %llu free, is %sfull\n", -	       (unsigned long long)(info->total_bytes - info->bytes_used - -				    info->bytes_pinned - info->bytes_reserved - -				    info->bytes_readonly), +	printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", +	       info->flags, +	       info->total_bytes - info->bytes_used - info->bytes_pinned - +	       info->bytes_reserved - info->bytes_readonly,  	       (info->full) ? "" : "not "); -	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " +	printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "  	       "reserved=%llu, may_use=%llu, readonly=%llu\n", -	       (unsigned long long)info->total_bytes, -	       (unsigned long long)info->bytes_used, -	       (unsigned long long)info->bytes_pinned, -	       (unsigned long long)info->bytes_reserved, -	       (unsigned long long)info->bytes_may_use, -	       (unsigned long long)info->bytes_readonly); +	       info->total_bytes, info->bytes_used, info->bytes_pinned, +	       info->bytes_reserved, info->bytes_may_use, +	       info->bytes_readonly);  	spin_unlock(&info->lock);  	if (!dump_block_groups) @@ -5274,13 +6886,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  again:  	list_for_each_entry(cache, &info->block_groups[index], list) {  		spin_lock(&cache->lock); -		printk(KERN_INFO "block group %llu has %llu bytes, %llu used " -		       "%llu pinned %llu reserved\n", -		       (unsigned long long)cache->key.objectid, -		       (unsigned long long)cache->key.offset, -		       (unsigned long long)btrfs_block_group_used(&cache->item), -		       (unsigned long long)cache->pinned, -		       (unsigned long long)cache->reserved); +		printk(KERN_INFO "BTRFS: " +			   "block group %llu has %llu bytes, " +			   "%llu used %llu pinned %llu reserved %s\n", +		       cache->key.objectid, cache->key.offset, +		       btrfs_block_group_used(&cache->item), cache->pinned, +		       cache->reserved, cache->ro ? 
"[readonly]" : "");  		btrfs_dump_free_space(cache, bytes);  		spin_unlock(&cache->lock);  	} @@ -5289,73 +6900,85 @@ again:  	up_read(&info->groups_sem);  } -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, -			 struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root,  			 u64 num_bytes, u64 min_alloc_size,  			 u64 empty_size, u64 hint_byte, -			 u64 search_end, struct btrfs_key *ins, -			 u64 data) +			 struct btrfs_key *ins, int is_data, int delalloc)  { +	bool final_tried = false; +	u64 flags;  	int ret; -	u64 search_start = 0; -	data = btrfs_get_alloc_profile(root, data); +	flags = btrfs_get_alloc_profile(root, is_data);  again: -	/* -	 * the only place that sets empty_size is btrfs_realloc_node, which -	 * is not called recursively on allocations -	 */ -	if (empty_size || root->ref_cows) -		ret = do_chunk_alloc(trans, root->fs_info->extent_root, -				     num_bytes + 2 * 1024 * 1024, data, 0); -  	WARN_ON(num_bytes < root->sectorsize); -	ret = find_free_extent(trans, root, num_bytes, empty_size, -			       search_start, search_end, hint_byte, -			       ins, data); - -	if (ret == -ENOSPC && num_bytes > min_alloc_size) { -		num_bytes = num_bytes >> 1; -		num_bytes = num_bytes & ~(root->sectorsize - 1); -		num_bytes = max(num_bytes, min_alloc_size); -		do_chunk_alloc(trans, root->fs_info->extent_root, -			       num_bytes, data, 1); -		goto again; -	} +	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, +			       flags, delalloc); +  	if (ret == -ENOSPC) { -		struct btrfs_space_info *sinfo; +		if (!final_tried && ins->offset) { +			num_bytes = min(num_bytes >> 1, ins->offset); +			num_bytes = round_down(num_bytes, root->sectorsize); +			num_bytes = max(num_bytes, min_alloc_size); +			if (num_bytes == min_alloc_size) +				final_tried = true; +			goto again; +		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { +			struct btrfs_space_info *sinfo; -		sinfo = __find_space_info(root->fs_info, data); -		printk(KERN_ERR "btrfs allocation failed flags %llu, " -		       "wanted %llu\n", (unsigned long long)data, -		       (unsigned long long)num_bytes); -		dump_space_info(sinfo, num_bytes, 1); +			sinfo = __find_space_info(root->fs_info, flags); +			btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", +				flags, num_bytes); +			if (sinfo) +				dump_space_info(sinfo, num_bytes, 1); +		}  	}  	return ret;  } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, +					u64 start, u64 len, +					int pin, int delalloc)  {  	struct btrfs_block_group_cache *cache;  	int ret = 0;  	cache = btrfs_lookup_block_group(root->fs_info, start);  	if (!cache) { -		printk(KERN_ERR "Unable to find block group for %llu\n", -		       (unsigned long long)start); +		btrfs_err(root->fs_info, "Unable to find block group for %llu", +			start);  		return -ENOSPC;  	} -	ret = btrfs_discard_extent(root, start, len); +	if (btrfs_test_opt(root, DISCARD)) +		ret = btrfs_discard_extent(root, start, len, NULL); -	btrfs_add_free_space(cache, start, len); -	update_reserved_bytes(cache, len, 0, 1); +	if (pin) +		pin_down_extent(root, cache, start, len, 1); +	else { +		btrfs_add_free_space(cache, start, len); +		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); +	}  	btrfs_put_block_group(cache); +	trace_btrfs_reserved_extent_free(root, start, len); +  	return ret;  } +int btrfs_free_reserved_extent(struct btrfs_root *root, +			       u64 start, u64 len, int delalloc) 
+{ +	return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, +				       u64 start, u64 len) +{ +	return __btrfs_free_reserved_extent(root, start, len, 1, 0); +} +  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,  				      struct btrfs_root *root,  				      u64 parent, u64 root_objectid, @@ -5379,12 +7002,16 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,  	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) +		return -ENOMEM;  	path->leave_spinning = 1;  	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,  				      ins, size); -	BUG_ON(ret); +	if (ret) { +		btrfs_free_path(path); +		return ret; +	}  	leaf = path->nodes[0];  	extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -5413,13 +7040,20 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(path->nodes[0]);  	btrfs_free_path(path); -	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); -	if (ret) { -		printk(KERN_ERR "btrfs update block group failed for %llu " -		       "%llu\n", (unsigned long long)ins->objectid, -		       (unsigned long long)ins->offset); +	/* Always set parent to 0 here since its exclusive anyway. */ +	ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +				      ins->objectid, ins->offset, +				      BTRFS_QGROUP_OPER_ADD_EXCL, 0); +	if (ret) +		return ret; + +	ret = update_block_group(root, ins->objectid, ins->offset, 1); +	if (ret) { /* -ENOENT, logic error */ +		btrfs_err(fs_info, "update block group failed for %llu %llu", +			ins->objectid, ins->offset);  		BUG();  	} +	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);  	return ret;  } @@ -5427,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root,  				     u64 parent, u64 root_objectid,  				     u64 flags, struct btrfs_disk_key *key, -				     int level, struct btrfs_key *ins) +				     int level, struct btrfs_key *ins, +				     int no_quota)  {  	int ret;  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -5436,15 +7071,30 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	struct btrfs_extent_inline_ref *iref;  	struct btrfs_path *path;  	struct extent_buffer *leaf; -	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); +	u32 size = sizeof(*extent_item) + sizeof(*iref); +	u64 num_bytes = ins->offset; +	bool skinny_metadata = btrfs_fs_incompat(root->fs_info, +						 SKINNY_METADATA); + +	if (!skinny_metadata) +		size += sizeof(*block_info);  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) { +		btrfs_free_and_pin_reserved_extent(root, ins->objectid, +						   root->leafsize); +		return -ENOMEM; +	}  	path->leave_spinning = 1;  	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,  				      ins, size); -	BUG_ON(ret); +	if (ret) { +		btrfs_free_and_pin_reserved_extent(root, ins->objectid, +						   root->leafsize); +		btrfs_free_path(path); +		return ret; +	}  	leaf = path->nodes[0];  	extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -5453,12 +7103,17 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	btrfs_set_extent_generation(leaf, extent_item, trans->transid);  	btrfs_set_extent_flags(leaf, extent_item,  			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); -	block_info = (struct 
btrfs_tree_block_info *)(extent_item + 1); -	btrfs_set_tree_block_key(leaf, block_info, key); -	btrfs_set_tree_block_level(leaf, block_info, level); +	if (skinny_metadata) { +		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); +		num_bytes = root->leafsize; +	} else { +		block_info = (struct btrfs_tree_block_info *)(extent_item + 1); +		btrfs_set_tree_block_key(leaf, block_info, key); +		btrfs_set_tree_block_level(leaf, block_info, level); +		iref = (struct btrfs_extent_inline_ref *)(block_info + 1); +	} -	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);  	if (parent > 0) {  		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));  		btrfs_set_extent_inline_ref_type(leaf, iref, @@ -5473,13 +7128,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	btrfs_free_path(path); -	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); -	if (ret) { -		printk(KERN_ERR "btrfs update block group failed for %llu " -		       "%llu\n", (unsigned long long)ins->objectid, -		       (unsigned long long)ins->offset); +	if (!no_quota) { +		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, +					      ins->objectid, num_bytes, +					      BTRFS_QGROUP_OPER_ADD_EXCL, 0); +		if (ret) +			return ret; +	} + +	ret = update_block_group(root, ins->objectid, root->leafsize, 1); +	if (ret) { /* -ENOENT, logic error */ +		btrfs_err(fs_info, "update block group failed for %llu %llu", +			ins->objectid, ins->offset);  		BUG();  	} + +	trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);  	return ret;  } @@ -5492,9 +7156,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,  	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); -	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, -					 0, root_objectid, owner, offset, -					 BTRFS_ADD_DELAYED_EXTENT, NULL); +	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, +					 ins->offset, 0, +					 root_objectid, owner, offset, +					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);  	return ret;  } @@ -5510,57 +7175,33 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,  {  	int ret;  	struct btrfs_block_group_cache *block_group; -	struct btrfs_caching_control *caching_ctl; -	u64 start = ins->objectid; -	u64 num_bytes = ins->offset; - -	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); -	cache_block_group(block_group, trans, 0); -	caching_ctl = get_caching_control(block_group); -	if (!caching_ctl) { -		BUG_ON(!block_group_cache_done(block_group)); -		ret = btrfs_remove_free_space(block_group, start, num_bytes); -		BUG_ON(ret); -	} else { -		mutex_lock(&caching_ctl->mutex); - -		if (start >= caching_ctl->progress) { -			ret = add_excluded_extent(root, start, num_bytes); -			BUG_ON(ret); -		} else if (start + num_bytes <= caching_ctl->progress) { -			ret = btrfs_remove_free_space(block_group, -						      start, num_bytes); -			BUG_ON(ret); -		} else { -			num_bytes = caching_ctl->progress - start; -			ret = btrfs_remove_free_space(block_group, -						      start, num_bytes); -			BUG_ON(ret); - -			start = caching_ctl->progress; -			num_bytes = ins->objectid + ins->offset - -				    caching_ctl->progress; -			ret = add_excluded_extent(root, start, num_bytes); -			BUG_ON(ret); -		} - -		mutex_unlock(&caching_ctl->mutex); -		put_caching_control(caching_ctl); +	/* +	 * Mixed block groups will exclude before processing the log so we only +	 * need to do the exlude dance if this fs isn't 
mixed. +	 */ +	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { +		ret = __exclude_logged_extent(root, ins->objectid, ins->offset); +		if (ret) +			return ret;  	} -	ret = update_reserved_bytes(block_group, ins->offset, 1, 1); -	BUG_ON(ret); -	btrfs_put_block_group(block_group); +	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); +	if (!block_group) +		return -EINVAL; + +	ret = btrfs_update_reserved_bytes(block_group, ins->offset, +					  RESERVE_ALLOC_NO_ACCOUNT, 0); +	BUG_ON(ret); /* logic error */  	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,  					 0, owner, offset, ins, 1); +	btrfs_put_block_group(block_group);  	return ret;  } -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, -					    struct btrfs_root *root, -					    u64 bytenr, u32 blocksize, -					    int level) +static struct extent_buffer * +btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, +		      u64 bytenr, u32 blocksize, int level)  {  	struct extent_buffer *buf; @@ -5568,9 +7209,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,  	if (!buf)  		return ERR_PTR(-ENOMEM);  	btrfs_set_header_generation(buf, trans->transid); -	btrfs_set_buffer_lockdep_class(buf, level); +	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);  	btrfs_tree_lock(buf);  	clean_tree_block(trans, root, buf); +	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);  	btrfs_set_lock_blocking(buf);  	btrfs_set_buffer_uptodate(buf); @@ -5600,29 +7242,60 @@ use_block_rsv(struct btrfs_trans_handle *trans,  	      struct btrfs_root *root, u32 blocksize)  {  	struct btrfs_block_rsv *block_rsv; +	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;  	int ret; +	bool global_updated = false;  	block_rsv = get_block_rsv(trans, root); -	if (block_rsv->size == 0) { -		ret = reserve_metadata_bytes(trans, root, block_rsv, -					     blocksize, 0); -		if (ret) -			return ERR_PTR(ret); +	if (unlikely(block_rsv->size == 0)) +		goto try_reserve; +again: +	ret = block_rsv_use_bytes(block_rsv, blocksize); +	if (!ret)  		return block_rsv; + +	if (block_rsv->failfast) +		return ERR_PTR(ret); + +	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { +		global_updated = true; +		update_global_block_rsv(root->fs_info); +		goto again;  	} -	ret = block_rsv_use_bytes(block_rsv, blocksize); +	if (btrfs_test_opt(root, ENOSPC_DEBUG)) { +		static DEFINE_RATELIMIT_STATE(_rs, +				DEFAULT_RATELIMIT_INTERVAL * 10, +				/*DEFAULT_RATELIMIT_BURST*/ 1); +		if (__ratelimit(&_rs)) +			WARN(1, KERN_DEBUG +				"BTRFS: block rsv returned %d\n", ret); +	} +try_reserve: +	ret = reserve_metadata_bytes(root, block_rsv, blocksize, +				     BTRFS_RESERVE_NO_FLUSH);  	if (!ret)  		return block_rsv; - -	return ERR_PTR(-ENOSPC); +	/* +	 * If we couldn't reserve metadata bytes try and use some from +	 * the global reserve if its space type is the same as the global +	 * reservation. 
+	 */ +	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && +	    block_rsv->space_info == global_rsv->space_info) { +		ret = block_rsv_use_bytes(global_rsv, blocksize); +		if (!ret) +			return global_rsv; +	} +	return ERR_PTR(ret);  } -static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +static void unuse_block_rsv(struct btrfs_fs_info *fs_info, +			    struct btrfs_block_rsv *block_rsv, u32 blocksize)  {  	block_rsv_add_bytes(block_rsv, blocksize, 0); -	block_rsv_release_bytes(block_rsv, NULL, 0); +	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);  }  /* @@ -5643,22 +7316,32 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  	struct extent_buffer *buf;  	u64 flags = 0;  	int ret; +	bool skinny_metadata = btrfs_fs_incompat(root->fs_info, +						 SKINNY_METADATA); - +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { +		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, +					    blocksize, level); +		if (!IS_ERR(buf)) +			root->alloc_bytenr += blocksize; +		return buf; +	} +#endif  	block_rsv = use_block_rsv(trans, root, blocksize);  	if (IS_ERR(block_rsv))  		return ERR_CAST(block_rsv); -	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, -				   empty_size, hint, (u64)-1, &ins, 0); +	ret = btrfs_reserve_extent(root, blocksize, blocksize, +				   empty_size, hint, &ins, 0, 0);  	if (ret) { -		unuse_block_rsv(block_rsv, blocksize); +		unuse_block_rsv(root->fs_info, block_rsv, blocksize);  		return ERR_PTR(ret);  	}  	buf = btrfs_init_new_buffer(trans, root, ins.objectid,  				    blocksize, level); -	BUG_ON(IS_ERR(buf)); +	BUG_ON(IS_ERR(buf)); /* -ENOMEM */  	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {  		if (parent == 0) @@ -5669,22 +7352,27 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,  	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {  		struct btrfs_delayed_extent_op *extent_op; -		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); -		BUG_ON(!extent_op); +		extent_op = btrfs_alloc_delayed_extent_op(); +		BUG_ON(!extent_op); /* -ENOMEM */  		if (key)  			memcpy(&extent_op->key, key, sizeof(extent_op->key));  		else  			memset(&extent_op->key, 0, sizeof(extent_op->key));  		extent_op->flags_to_set = flags; -		extent_op->update_key = 1; +		if (skinny_metadata) +			extent_op->update_key = 0; +		else +			extent_op->update_key = 1;  		extent_op->update_flags = 1;  		extent_op->is_data = 0; +		extent_op->level = level; -		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, +		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, +					ins.objectid,  					ins.offset, parent, root_objectid,  					level, BTRFS_ADD_DELAYED_EXTENT, -					extent_op); -		BUG_ON(ret); +					extent_op, 0); +		BUG_ON(ret); /* -ENOMEM */  	}  	return buf;  } @@ -5700,6 +7388,7 @@ struct walk_control {  	int keep_locks;  	int reada_slot;  	int reada_count; +	int for_reloc;  };  #define DROP_REFERENCE	1 @@ -5751,9 +7440,12 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,  			continue;  		/* We don't lock the tree block, it's OK to be racy here */ -		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, -					       &refs, &flags); -		BUG_ON(ret); +		ret = btrfs_lookup_extent_info(trans, root, bytenr, +					       wc->level - 1, 1, &refs, +					       &flags); +		/* We don't care about errors in readahead. 
*/ +		if (ret < 0) +			continue;  		BUG_ON(refs == 0);  		if (wc->stage == DROP_REFERENCE) { @@ -5787,7 +7479,7 @@ reada:  }  /* - * hepler to process tree block while walking down the tree. + * helper to process tree block while walking down the tree.   *   * when wc->stage == UPDATE_BACKREF, this function updates   * back refs for pointers in the block. @@ -5817,10 +7509,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,  	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {  		BUG_ON(!path->locks[level]);  		ret = btrfs_lookup_extent_info(trans, root, -					       eb->start, eb->len, +					       eb->start, level, 1,  					       &wc->refs[level],  					       &wc->flags[level]); -		BUG_ON(ret); +		BUG_ON(ret == -ENOMEM); +		if (ret) +			return ret;  		BUG_ON(wc->refs[level] == 0);  	} @@ -5829,7 +7523,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,  			return 1;  		if (path->locks[level] && !wc->keep_locks) { -			btrfs_tree_unlock(eb); +			btrfs_tree_unlock_rw(eb, path->locks[level]);  			path->locks[level] = 0;  		}  		return 0; @@ -5838,13 +7532,14 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,  	/* wc->stage == UPDATE_BACKREF */  	if (!(wc->flags[level] & flag)) {  		BUG_ON(!path->locks[level]); -		ret = btrfs_inc_ref(trans, root, eb, 1); -		BUG_ON(ret); -		ret = btrfs_dec_ref(trans, root, eb, 0); -		BUG_ON(ret); +		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); +		BUG_ON(ret); /* -ENOMEM */ +		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); +		BUG_ON(ret); /* -ENOMEM */  		ret = btrfs_set_disk_extent_flags(trans, root, eb->start, -						  eb->len, flag, 0); -		BUG_ON(ret); +						  eb->len, flag, +						  btrfs_header_level(eb), 0); +		BUG_ON(ret); /* -ENOMEM */  		wc->flags[level] |= flag;  	} @@ -5853,14 +7548,14 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,  	 * keep the tree lock  	 */  	if (path->locks[level] && level > 0) { -		btrfs_tree_unlock(eb); +		btrfs_tree_unlock_rw(eb, path->locks[level]);  		path->locks[level] = 0;  	}  	return 0;  }  /* - * hepler to process tree block pointer. + * helper to process tree block pointer.   *   * when wc->stage == DROP_REFERENCE, this function checks   * reference count of the block pointed to. 
if the block @@ -5908,16 +7603,25 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,  		next = btrfs_find_create_tree_block(root, bytenr, blocksize);  		if (!next)  			return -ENOMEM; +		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, +					       level - 1);  		reada = 1;  	}  	btrfs_tree_lock(next);  	btrfs_set_lock_blocking(next); -	ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, +	ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,  				       &wc->refs[level - 1],  				       &wc->flags[level - 1]); -	BUG_ON(ret); -	BUG_ON(wc->refs[level - 1] == 0); +	if (ret < 0) { +		btrfs_tree_unlock(next); +		return ret; +	} + +	if (unlikely(wc->refs[level - 1] == 0)) { +		btrfs_err(root->fs_info, "Missing references."); +		BUG(); +	}  	*lookup_info = 0;  	if (wc->stage == DROP_REFERENCE) { @@ -5945,7 +7649,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,  			goto skip;  	} -	if (!btrfs_buffer_uptodate(next, generation)) { +	if (!btrfs_buffer_uptodate(next, generation, 0)) {  		btrfs_tree_unlock(next);  		free_extent_buffer(next);  		next = NULL; @@ -5956,6 +7660,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,  		if (reada && level == 1)  			reada_walk_down(trans, root, wc, path);  		next = read_tree_block(root, bytenr, blocksize, generation); +		if (!next || !extent_buffer_uptodate(next)) { +			free_extent_buffer(next); +			return -EIO; +		}  		btrfs_tree_lock(next);  		btrfs_set_lock_blocking(next);  	} @@ -5964,7 +7672,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,  	BUG_ON(level != btrfs_header_level(next));  	path->nodes[level] = next;  	path->slots[level] = 0; -	path->locks[level] = 1; +	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;  	wc->level = level;  	if (wc->level == 1)  		wc->reada_slot = 0; @@ -5982,8 +7690,8 @@ skip:  		}  		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, -					root->root_key.objectid, level - 1, 0); -		BUG_ON(ret); +				root->root_key.objectid, level - 1, 0, 0); +		BUG_ON(ret); /* -ENOMEM */  	}  	btrfs_tree_unlock(next);  	free_extent_buffer(next); @@ -5992,7 +7700,7 @@ skip:  }  /* - * hepler to process tree block while walking up the tree. + * helper to process tree block while walking up the tree.   *   * when wc->stage == DROP_REFERENCE, this function drops   * reference count on the block. 
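Both the do_walk_down() hunk above and the walk_up_proc() hunk below replace an unconditional BUG_ON(ret) after btrfs_lookup_extent_info() with real error propagation: release exactly the lock taken for the lookup, clear path->locks[level] where it was set, and return the error to the caller. A minimal, self-contained sketch of that shape follows; it is illustrative only and not part of the commit, and every name in it (demo_locked, demo_lookup, demo_walk_level) is a hypothetical stand-in for the btrfs locking and lookup calls.

#include <errno.h>
#include <stdio.h>

static int demo_locked;	/* stands in for the tree block lock state */

/* Stands in for btrfs_lookup_extent_info(); may fail, e.g. with -EIO. */
static int demo_lookup(int simulate_error)
{
	return simulate_error ? -EIO : 0;
}

/* Shape of the new error handling in do_walk_down()/walk_up_proc(). */
static int demo_walk_level(int simulate_error)
{
	int ret;

	demo_locked = 1;		/* like btrfs_tree_lock() + set blocking */
	ret = demo_lookup(simulate_error);
	if (ret < 0) {
		demo_locked = 0;	/* undo only what this path took ...   */
		return ret;		/* ... then propagate instead of BUG_ON */
	}
	/* ... use the looked-up refs/flags here ... */
	demo_locked = 0;
	return 0;
}

int main(void)
{
	printf("ok path: %d\n", demo_walk_level(0));
	printf("error path: %d\n", demo_walk_level(1));
	return 0;
}

The same convention runs through the rest of the diff: BUG_ON() survives only for cases annotated as "can't happen" or -ENOMEM, while recoverable failures are either returned to the caller or funneled into btrfs_abort_transaction().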
@@ -6035,16 +7743,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,  			BUG_ON(level == 0);  			btrfs_tree_lock(eb);  			btrfs_set_lock_blocking(eb); -			path->locks[level] = 1; +			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;  			ret = btrfs_lookup_extent_info(trans, root, -						       eb->start, eb->len, +						       eb->start, level, 1,  						       &wc->refs[level],  						       &wc->flags[level]); -			BUG_ON(ret); +			if (ret < 0) { +				btrfs_tree_unlock_rw(eb, path->locks[level]); +				path->locks[level] = 0; +				return ret; +			}  			BUG_ON(wc->refs[level] == 0);  			if (wc->refs[level] == 1) { -				btrfs_tree_unlock(eb); +				btrfs_tree_unlock_rw(eb, path->locks[level]);  				path->locks[level] = 0;  				return 1;  			} @@ -6057,17 +7769,19 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,  	if (wc->refs[level] == 1) {  		if (level == 0) {  			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) -				ret = btrfs_dec_ref(trans, root, eb, 1); +				ret = btrfs_dec_ref(trans, root, eb, 1, +						    wc->for_reloc);  			else -				ret = btrfs_dec_ref(trans, root, eb, 0); -			BUG_ON(ret); +				ret = btrfs_dec_ref(trans, root, eb, 0, +						    wc->for_reloc); +			BUG_ON(ret); /* -ENOMEM */  		}  		/* make block locked assertion in clean_tree_block happy */  		if (!path->locks[level] &&  		    btrfs_header_generation(eb) == trans->transid) {  			btrfs_tree_lock(eb);  			btrfs_set_lock_blocking(eb); -			path->locks[level] = 1; +			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;  		}  		clean_tree_block(trans, root, eb);  	} @@ -6146,7 +7860,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,  				return 0;  			if (path->locks[level]) { -				btrfs_tree_unlock(path->nodes[level]); +				btrfs_tree_unlock_rw(path->nodes[level], +						     path->locks[level]);  				path->locks[level] = 0;  			}  			free_extent_buffer(path->nodes[level]); @@ -6167,9 +7882,12 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,   * reference count by one. if update_ref is true, this function   * also make sure backrefs for the shared block and all lower level   * blocks are properly updated. 
+ * + * If called with for_reloc == 0, may exit early with -EAGAIN   */  int btrfs_drop_snapshot(struct btrfs_root *root, -			struct btrfs_block_rsv *block_rsv, int update_ref) +			 struct btrfs_block_rsv *block_rsv, int update_ref, +			 int for_reloc)  {  	struct btrfs_path *path;  	struct btrfs_trans_handle *trans; @@ -6180,14 +7898,27 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  	int err = 0;  	int ret;  	int level; +	bool root_dropped = false;  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) { +		err = -ENOMEM; +		goto out; +	}  	wc = kzalloc(sizeof(*wc), GFP_NOFS); -	BUG_ON(!wc); +	if (!wc) { +		btrfs_free_path(path); +		err = -ENOMEM; +		goto out; +	}  	trans = btrfs_start_transaction(tree_root, 0); +	if (IS_ERR(trans)) { +		err = PTR_ERR(trans); +		goto out_free; +	} +  	if (block_rsv)  		trans->block_rsv = block_rsv; @@ -6196,7 +7927,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  		path->nodes[level] = btrfs_lock_root_node(root);  		btrfs_set_lock_blocking(path->nodes[level]);  		path->slots[level] = 0; -		path->locks[level] = 1; +		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;  		memset(&wc->update_progress, 0,  		       sizeof(wc->update_progress));  	} else { @@ -6211,7 +7942,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  		path->lowest_level = 0;  		if (ret < 0) {  			err = ret; -			goto out; +			goto out_end_trans;  		}  		WARN_ON(ret > 0); @@ -6225,19 +7956,23 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  		while (1) {  			btrfs_tree_lock(path->nodes[level]);  			btrfs_set_lock_blocking(path->nodes[level]); +			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;  			ret = btrfs_lookup_extent_info(trans, root,  						path->nodes[level]->start, -						path->nodes[level]->len, -						&wc->refs[level], +						level, 1, &wc->refs[level],  						&wc->flags[level]); -			BUG_ON(ret); +			if (ret < 0) { +				err = ret; +				goto out_end_trans; +			}  			BUG_ON(wc->refs[level] == 0);  			if (level == root_item->drop_level)  				break;  			btrfs_tree_unlock(path->nodes[level]); +			path->locks[level] = 0;  			WARN_ON(wc->refs[level] != 1);  			level--;  		} @@ -6248,9 +7983,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  	wc->stage = DROP_REFERENCE;  	wc->update_ref = update_ref;  	wc->keep_locks = 0; +	wc->for_reloc = for_reloc;  	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);  	while (1) { +  		ret = walk_down_tree(trans, root, path, wc);  		if (ret < 0) {  			err = ret; @@ -6277,46 +8014,86 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  		}  		BUG_ON(wc->level == 0); -		if (btrfs_should_end_transaction(trans, tree_root)) { +		if (btrfs_should_end_transaction(trans, tree_root) || +		    (!for_reloc && btrfs_need_cleaner_sleep(root))) {  			ret = btrfs_update_root(trans, tree_root,  						&root->root_key,  						root_item); -			BUG_ON(ret); +			if (ret) { +				btrfs_abort_transaction(trans, tree_root, ret); +				err = ret; +				goto out_end_trans; +			}  			btrfs_end_transaction_throttle(trans, tree_root); +			if (!for_reloc && btrfs_need_cleaner_sleep(root)) { +				pr_debug("BTRFS: drop snapshot early exit\n"); +				err = -EAGAIN; +				goto out_free; +			} +  			trans = btrfs_start_transaction(tree_root, 0); +			if (IS_ERR(trans)) { +				err = PTR_ERR(trans); +				goto out_free; +			}  			if (block_rsv)  				trans->block_rsv = block_rsv;  		}  	} -	btrfs_release_path(root, path); -	BUG_ON(err); +	btrfs_release_path(path); +	if (err) +		goto out_end_trans;  	ret = btrfs_del_root(trans, tree_root, &root->root_key); -	BUG_ON(ret); +	
if (ret) { +		btrfs_abort_transaction(trans, tree_root, ret); +		goto out_end_trans; +	}  	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { -		ret = btrfs_find_last_root(tree_root, root->root_key.objectid, -					   NULL, NULL); -		BUG_ON(ret < 0); -		if (ret > 0) { -			ret = btrfs_del_orphan_item(trans, tree_root, -						    root->root_key.objectid); -			BUG_ON(ret); +		ret = btrfs_find_root(tree_root, &root->root_key, path, +				      NULL, NULL); +		if (ret < 0) { +			btrfs_abort_transaction(trans, tree_root, ret); +			err = ret; +			goto out_end_trans; +		} else if (ret > 0) { +			/* if we fail to delete the orphan item this time +			 * around, it'll get picked up the next time. +			 * +			 * The most common failure here is just -ENOENT. +			 */ +			btrfs_del_orphan_item(trans, tree_root, +					      root->root_key.objectid);  		}  	} -	if (root->in_radix) { -		btrfs_free_fs_root(tree_root->fs_info, root); +	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { +		btrfs_drop_and_free_fs_root(tree_root->fs_info, root);  	} else {  		free_extent_buffer(root->node);  		free_extent_buffer(root->commit_root); -		kfree(root); +		btrfs_put_fs_root(root);  	} -out: +	root_dropped = true; +out_end_trans:  	btrfs_end_transaction_throttle(trans, tree_root); +out_free:  	kfree(wc);  	btrfs_free_path(path); +out: +	/* +	 * So if we need to stop dropping the snapshot for whatever reason we +	 * need to make sure to add it back to the dead root list so that we +	 * keep trying to do the work later.  This also cleans up roots if we +	 * don't have it in the radix (like when we recover after a power fail +	 * or unmount) so we don't leak memory. +	 */ +	if (!for_reloc && root_dropped == false) +		btrfs_add_dead_root(root); +	if (err && err != -EAGAIN) +		btrfs_std_error(root->fs_info, err);  	return err;  } @@ -6324,6 +8101,7 @@ out:   * drop subtree rooted at tree block 'node'.   
*   * NOTE: this function will unlock and release tree block 'node' + * only used by relocation code   */  int btrfs_drop_subtree(struct btrfs_trans_handle *trans,  			struct btrfs_root *root, @@ -6340,10 +8118,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,  	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) +		return -ENOMEM;  	wc = kzalloc(sizeof(*wc), GFP_NOFS); -	BUG_ON(!wc); +	if (!wc) { +		btrfs_free_path(path); +		return -ENOMEM; +	}  	btrfs_assert_tree_locked(parent);  	parent_level = btrfs_header_level(parent); @@ -6355,7 +8137,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,  	level = btrfs_header_level(node);  	path->nodes[level] = node;  	path->slots[level] = 0; -	path->locks[level] = 1; +	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;  	wc->refs[parent_level] = 1;  	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -6364,6 +8146,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,  	wc->stage = DROP_REFERENCE;  	wc->update_ref = 0;  	wc->keep_locks = 1; +	wc->for_reloc = 1;  	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);  	while (1) { @@ -6385,1500 +8168,31 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,  	return ret;  } -#if 0 -static unsigned long calc_ra(unsigned long start, unsigned long last, -			     unsigned long nr) -{ -	return min(last, start + nr - 1); -} - -static noinline int relocate_inode_pages(struct inode *inode, u64 start, -					 u64 len) -{ -	u64 page_start; -	u64 page_end; -	unsigned long first_index; -	unsigned long last_index; -	unsigned long i; -	struct page *page; -	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; -	struct file_ra_state *ra; -	struct btrfs_ordered_extent *ordered; -	unsigned int total_read = 0; -	unsigned int total_dirty = 0; -	int ret = 0; - -	ra = kzalloc(sizeof(*ra), GFP_NOFS); - -	mutex_lock(&inode->i_mutex); -	first_index = start >> PAGE_CACHE_SHIFT; -	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; - -	/* make sure the dirty trick played by the caller work */ -	ret = invalidate_inode_pages2_range(inode->i_mapping, -					    first_index, last_index); -	if (ret) -		goto out_unlock; - -	file_ra_state_init(ra, inode->i_mapping); - -	for (i = first_index ; i <= last_index; i++) { -		if (total_read % ra->ra_pages == 0) { -			btrfs_force_ra(inode->i_mapping, ra, NULL, i, -				       calc_ra(i, last_index, ra->ra_pages)); -		} -		total_read++; -again: -		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) -			BUG_ON(1); -		page = grab_cache_page(inode->i_mapping, i); -		if (!page) { -			ret = -ENOMEM; -			goto out_unlock; -		} -		if (!PageUptodate(page)) { -			btrfs_readpage(NULL, page); -			lock_page(page); -			if (!PageUptodate(page)) { -				unlock_page(page); -				page_cache_release(page); -				ret = -EIO; -				goto out_unlock; -			} -		} -		wait_on_page_writeback(page); - -		page_start = (u64)page->index << PAGE_CACHE_SHIFT; -		page_end = page_start + PAGE_CACHE_SIZE - 1; -		lock_extent(io_tree, page_start, page_end, GFP_NOFS); - -		ordered = btrfs_lookup_ordered_extent(inode, page_start); -		if (ordered) { -			unlock_extent(io_tree, page_start, page_end, GFP_NOFS); -			unlock_page(page); -			page_cache_release(page); -			btrfs_start_ordered_extent(inode, ordered, 1); -			btrfs_put_ordered_extent(ordered); -			goto again; -		} -		set_page_extent_mapped(page); - -		if (i == first_index) -			set_extent_bits(io_tree, page_start, page_end, -					EXTENT_BOUNDARY, GFP_NOFS); -		
btrfs_set_extent_delalloc(inode, page_start, page_end); - -		set_page_dirty(page); -		total_dirty++; - -		unlock_extent(io_tree, page_start, page_end, GFP_NOFS); -		unlock_page(page); -		page_cache_release(page); -	} - -out_unlock: -	kfree(ra); -	mutex_unlock(&inode->i_mutex); -	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); -	return ret; -} - -static noinline int relocate_data_extent(struct inode *reloc_inode, -					 struct btrfs_key *extent_key, -					 u64 offset) -{ -	struct btrfs_root *root = BTRFS_I(reloc_inode)->root; -	struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; -	struct extent_map *em; -	u64 start = extent_key->objectid - offset; -	u64 end = start + extent_key->offset - 1; - -	em = alloc_extent_map(GFP_NOFS); -	BUG_ON(!em || IS_ERR(em)); - -	em->start = start; -	em->len = extent_key->offset; -	em->block_len = extent_key->offset; -	em->block_start = extent_key->objectid; -	em->bdev = root->fs_info->fs_devices->latest_bdev; -	set_bit(EXTENT_FLAG_PINNED, &em->flags); - -	/* setup extent map to cheat btrfs_readpage */ -	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); -	while (1) { -		int ret; -		write_lock(&em_tree->lock); -		ret = add_extent_mapping(em_tree, em); -		write_unlock(&em_tree->lock); -		if (ret != -EEXIST) { -			free_extent_map(em); -			break; -		} -		btrfs_drop_extent_cache(reloc_inode, start, end, 0); -	} -	unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); - -	return relocate_inode_pages(reloc_inode, start, extent_key->offset); -} - -struct btrfs_ref_path { -	u64 extent_start; -	u64 nodes[BTRFS_MAX_LEVEL]; -	u64 root_objectid; -	u64 root_generation; -	u64 owner_objectid; -	u32 num_refs; -	int lowest_level; -	int current_level; -	int shared_level; - -	struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; -	u64 new_nodes[BTRFS_MAX_LEVEL]; -}; - -struct disk_extent { -	u64 ram_bytes; -	u64 disk_bytenr; -	u64 disk_num_bytes; -	u64 offset; -	u64 num_bytes; -	u8 compression; -	u8 encryption; -	u16 other_encoding; -}; - -static int is_cowonly_root(u64 root_objectid) -{ -	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || -	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID || -	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID || -	    root_objectid == BTRFS_DEV_TREE_OBJECTID || -	    root_objectid == BTRFS_TREE_LOG_OBJECTID || -	    root_objectid == BTRFS_CSUM_TREE_OBJECTID) -		return 1; -	return 0; -} - -static noinline int __next_ref_path(struct btrfs_trans_handle *trans, -				    struct btrfs_root *extent_root, -				    struct btrfs_ref_path *ref_path, -				    int first_time) -{ -	struct extent_buffer *leaf; -	struct btrfs_path *path; -	struct btrfs_extent_ref *ref; -	struct btrfs_key key; -	struct btrfs_key found_key; -	u64 bytenr; -	u32 nritems; -	int level; -	int ret = 1; - -	path = btrfs_alloc_path(); -	if (!path) -		return -ENOMEM; - -	if (first_time) { -		ref_path->lowest_level = -1; -		ref_path->current_level = -1; -		ref_path->shared_level = -1; -		goto walk_up; -	} -walk_down: -	level = ref_path->current_level - 1; -	while (level >= -1) { -		u64 parent; -		if (level < ref_path->lowest_level) -			break; - -		if (level >= 0) -			bytenr = ref_path->nodes[level]; -		else -			bytenr = ref_path->extent_start; -		BUG_ON(bytenr == 0); - -		parent = ref_path->nodes[level + 1]; -		ref_path->nodes[level + 1] = 0; -		ref_path->current_level = level; -		BUG_ON(parent == 0); - -		key.objectid = bytenr; -		key.offset = parent + 1; -		key.type = BTRFS_EXTENT_REF_KEY; - -		ret = btrfs_search_slot(trans, 
extent_root, &key, path, 0, 0); -		if (ret < 0) -			goto out; -		BUG_ON(ret == 0); - -		leaf = path->nodes[0]; -		nritems = btrfs_header_nritems(leaf); -		if (path->slots[0] >= nritems) { -			ret = btrfs_next_leaf(extent_root, path); -			if (ret < 0) -				goto out; -			if (ret > 0) -				goto next; -			leaf = path->nodes[0]; -		} - -		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -		if (found_key.objectid == bytenr && -		    found_key.type == BTRFS_EXTENT_REF_KEY) { -			if (level < ref_path->shared_level) -				ref_path->shared_level = level; -			goto found; -		} -next: -		level--; -		btrfs_release_path(extent_root, path); -		cond_resched(); -	} -	/* reached lowest level */ -	ret = 1; -	goto out; -walk_up: -	level = ref_path->current_level; -	while (level < BTRFS_MAX_LEVEL - 1) { -		u64 ref_objectid; - -		if (level >= 0) -			bytenr = ref_path->nodes[level]; -		else -			bytenr = ref_path->extent_start; - -		BUG_ON(bytenr == 0); - -		key.objectid = bytenr; -		key.offset = 0; -		key.type = BTRFS_EXTENT_REF_KEY; - -		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); -		if (ret < 0) -			goto out; - -		leaf = path->nodes[0]; -		nritems = btrfs_header_nritems(leaf); -		if (path->slots[0] >= nritems) { -			ret = btrfs_next_leaf(extent_root, path); -			if (ret < 0) -				goto out; -			if (ret > 0) { -				/* the extent was freed by someone */ -				if (ref_path->lowest_level == level) -					goto out; -				btrfs_release_path(extent_root, path); -				goto walk_down; -			} -			leaf = path->nodes[0]; -		} - -		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -		if (found_key.objectid != bytenr || -				found_key.type != BTRFS_EXTENT_REF_KEY) { -			/* the extent was freed by someone */ -			if (ref_path->lowest_level == level) { -				ret = 1; -				goto out; -			} -			btrfs_release_path(extent_root, path); -			goto walk_down; -		} -found: -		ref = btrfs_item_ptr(leaf, path->slots[0], -				struct btrfs_extent_ref); -		ref_objectid = btrfs_ref_objectid(leaf, ref); -		if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { -			if (first_time) { -				level = (int)ref_objectid; -				BUG_ON(level >= BTRFS_MAX_LEVEL); -				ref_path->lowest_level = level; -				ref_path->current_level = level; -				ref_path->nodes[level] = bytenr; -			} else { -				WARN_ON(ref_objectid != level); -			} -		} else { -			WARN_ON(level != -1); -		} -		first_time = 0; - -		if (ref_path->lowest_level == level) { -			ref_path->owner_objectid = ref_objectid; -			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); -		} - -		/* -		 * the block is tree root or the block isn't in reference -		 * counted tree. -		 */ -		if (found_key.objectid == found_key.offset || -		    is_cowonly_root(btrfs_ref_root(leaf, ref))) { -			ref_path->root_objectid = btrfs_ref_root(leaf, ref); -			ref_path->root_generation = -				btrfs_ref_generation(leaf, ref); -			if (level < 0) { -				/* special reference from the tree log */ -				ref_path->nodes[0] = found_key.offset; -				ref_path->current_level = 0; -			} -			ret = 0; -			goto out; -		} - -		level++; -		BUG_ON(ref_path->nodes[level] != 0); -		ref_path->nodes[level] = found_key.offset; -		ref_path->current_level = level; - -		/* -		 * the reference was created in the running transaction, -		 * no need to continue walking up. 
-		 */ -		if (btrfs_ref_generation(leaf, ref) == trans->transid) { -			ref_path->root_objectid = btrfs_ref_root(leaf, ref); -			ref_path->root_generation = -				btrfs_ref_generation(leaf, ref); -			ret = 0; -			goto out; -		} - -		btrfs_release_path(extent_root, path); -		cond_resched(); -	} -	/* reached max tree level, but no tree root found. */ -	BUG(); -out: -	btrfs_free_path(path); -	return ret; -} - -static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, -				struct btrfs_root *extent_root, -				struct btrfs_ref_path *ref_path, -				u64 extent_start) -{ -	memset(ref_path, 0, sizeof(*ref_path)); -	ref_path->extent_start = extent_start; - -	return __next_ref_path(trans, extent_root, ref_path, 1); -} - -static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, -			       struct btrfs_root *extent_root, -			       struct btrfs_ref_path *ref_path) -{ -	return __next_ref_path(trans, extent_root, ref_path, 0); -} - -static noinline int get_new_locations(struct inode *reloc_inode, -				      struct btrfs_key *extent_key, -				      u64 offset, int no_fragment, -				      struct disk_extent **extents, -				      int *nr_extents) -{ -	struct btrfs_root *root = BTRFS_I(reloc_inode)->root; -	struct btrfs_path *path; -	struct btrfs_file_extent_item *fi; -	struct extent_buffer *leaf; -	struct disk_extent *exts = *extents; -	struct btrfs_key found_key; -	u64 cur_pos; -	u64 last_byte; -	u32 nritems; -	int nr = 0; -	int max = *nr_extents; -	int ret; - -	WARN_ON(!no_fragment && *extents); -	if (!exts) { -		max = 1; -		exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); -		if (!exts) -			return -ENOMEM; -	} - -	path = btrfs_alloc_path(); -	BUG_ON(!path); - -	cur_pos = extent_key->objectid - offset; -	last_byte = extent_key->objectid + extent_key->offset; -	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, -				       cur_pos, 0); -	if (ret < 0) -		goto out; -	if (ret > 0) { -		ret = -ENOENT; -		goto out; -	} - -	while (1) { -		leaf = path->nodes[0]; -		nritems = btrfs_header_nritems(leaf); -		if (path->slots[0] >= nritems) { -			ret = btrfs_next_leaf(root, path); -			if (ret < 0) -				goto out; -			if (ret > 0) -				break; -			leaf = path->nodes[0]; -		} - -		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -		if (found_key.offset != cur_pos || -		    found_key.type != BTRFS_EXTENT_DATA_KEY || -		    found_key.objectid != reloc_inode->i_ino) -			break; - -		fi = btrfs_item_ptr(leaf, path->slots[0], -				    struct btrfs_file_extent_item); -		if (btrfs_file_extent_type(leaf, fi) != -		    BTRFS_FILE_EXTENT_REG || -		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0) -			break; - -		if (nr == max) { -			struct disk_extent *old = exts; -			max *= 2; -			exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); -			memcpy(exts, old, sizeof(*exts) * nr); -			if (old != *extents) -				kfree(old); -		} - -		exts[nr].disk_bytenr = -			btrfs_file_extent_disk_bytenr(leaf, fi); -		exts[nr].disk_num_bytes = -			btrfs_file_extent_disk_num_bytes(leaf, fi); -		exts[nr].offset = btrfs_file_extent_offset(leaf, fi); -		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); -		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); -		exts[nr].compression = btrfs_file_extent_compression(leaf, fi); -		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); -		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, -									   fi); -		BUG_ON(exts[nr].offset > 0); -		BUG_ON(exts[nr].compression || exts[nr].encryption); -		BUG_ON(exts[nr].num_bytes != 
exts[nr].disk_num_bytes); - -		cur_pos += exts[nr].num_bytes; -		nr++; - -		if (cur_pos + offset >= last_byte) -			break; - -		if (no_fragment) { -			ret = 1; -			goto out; -		} -		path->slots[0]++; -	} - -	BUG_ON(cur_pos + offset > last_byte); -	if (cur_pos + offset < last_byte) { -		ret = -ENOENT; -		goto out; -	} -	ret = 0; -out: -	btrfs_free_path(path); -	if (ret) { -		if (exts != *extents) -			kfree(exts); -	} else { -		*extents = exts; -		*nr_extents = nr; -	} -	return ret; -} - -static noinline int replace_one_extent(struct btrfs_trans_handle *trans, -					struct btrfs_root *root, -					struct btrfs_path *path, -					struct btrfs_key *extent_key, -					struct btrfs_key *leaf_key, -					struct btrfs_ref_path *ref_path, -					struct disk_extent *new_extents, -					int nr_extents) -{ -	struct extent_buffer *leaf; -	struct btrfs_file_extent_item *fi; -	struct inode *inode = NULL; -	struct btrfs_key key; -	u64 lock_start = 0; -	u64 lock_end = 0; -	u64 num_bytes; -	u64 ext_offset; -	u64 search_end = (u64)-1; -	u32 nritems; -	int nr_scaned = 0; -	int extent_locked = 0; -	int extent_type; -	int ret; - -	memcpy(&key, leaf_key, sizeof(key)); -	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { -		if (key.objectid < ref_path->owner_objectid || -		    (key.objectid == ref_path->owner_objectid && -		     key.type < BTRFS_EXTENT_DATA_KEY)) { -			key.objectid = ref_path->owner_objectid; -			key.type = BTRFS_EXTENT_DATA_KEY; -			key.offset = 0; -		} -	} - -	while (1) { -		ret = btrfs_search_slot(trans, root, &key, path, 0, 1); -		if (ret < 0) -			goto out; - -		leaf = path->nodes[0]; -		nritems = btrfs_header_nritems(leaf); -next: -		if (extent_locked && ret > 0) { -			/* -			 * the file extent item was modified by someone -			 * before the extent got locked. 
-			 */ -			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, -				      lock_end, GFP_NOFS); -			extent_locked = 0; -		} - -		if (path->slots[0] >= nritems) { -			if (++nr_scaned > 2) -				break; - -			BUG_ON(extent_locked); -			ret = btrfs_next_leaf(root, path); -			if (ret < 0) -				goto out; -			if (ret > 0) -				break; -			leaf = path->nodes[0]; -			nritems = btrfs_header_nritems(leaf); -		} - -		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - -		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { -			if ((key.objectid > ref_path->owner_objectid) || -			    (key.objectid == ref_path->owner_objectid && -			     key.type > BTRFS_EXTENT_DATA_KEY) || -			    key.offset >= search_end) -				break; -		} - -		if (inode && key.objectid != inode->i_ino) { -			BUG_ON(extent_locked); -			btrfs_release_path(root, path); -			mutex_unlock(&inode->i_mutex); -			iput(inode); -			inode = NULL; -			continue; -		} - -		if (key.type != BTRFS_EXTENT_DATA_KEY) { -			path->slots[0]++; -			ret = 1; -			goto next; -		} -		fi = btrfs_item_ptr(leaf, path->slots[0], -				    struct btrfs_file_extent_item); -		extent_type = btrfs_file_extent_type(leaf, fi); -		if ((extent_type != BTRFS_FILE_EXTENT_REG && -		     extent_type != BTRFS_FILE_EXTENT_PREALLOC) || -		    (btrfs_file_extent_disk_bytenr(leaf, fi) != -		     extent_key->objectid)) { -			path->slots[0]++; -			ret = 1; -			goto next; -		} - -		num_bytes = btrfs_file_extent_num_bytes(leaf, fi); -		ext_offset = btrfs_file_extent_offset(leaf, fi); - -		if (search_end == (u64)-1) { -			search_end = key.offset - ext_offset + -				btrfs_file_extent_ram_bytes(leaf, fi); -		} - -		if (!extent_locked) { -			lock_start = key.offset; -			lock_end = lock_start + num_bytes - 1; -		} else { -			if (lock_start > key.offset || -			    lock_end + 1 < key.offset + num_bytes) { -				unlock_extent(&BTRFS_I(inode)->io_tree, -					      lock_start, lock_end, GFP_NOFS); -				extent_locked = 0; -			} -		} - -		if (!inode) { -			btrfs_release_path(root, path); - -			inode = btrfs_iget_locked(root->fs_info->sb, -						  key.objectid, root); -			if (inode->i_state & I_NEW) { -				BTRFS_I(inode)->root = root; -				BTRFS_I(inode)->location.objectid = -					key.objectid; -				BTRFS_I(inode)->location.type = -					BTRFS_INODE_ITEM_KEY; -				BTRFS_I(inode)->location.offset = 0; -				btrfs_read_locked_inode(inode); -				unlock_new_inode(inode); -			} -			/* -			 * some code call btrfs_commit_transaction while -			 * holding the i_mutex, so we can't use mutex_lock -			 * here. 
-			 */ -			if (is_bad_inode(inode) || -			    !mutex_trylock(&inode->i_mutex)) { -				iput(inode); -				inode = NULL; -				key.offset = (u64)-1; -				goto skip; -			} -		} - -		if (!extent_locked) { -			struct btrfs_ordered_extent *ordered; - -			btrfs_release_path(root, path); - -			lock_extent(&BTRFS_I(inode)->io_tree, lock_start, -				    lock_end, GFP_NOFS); -			ordered = btrfs_lookup_first_ordered_extent(inode, -								    lock_end); -			if (ordered && -			    ordered->file_offset <= lock_end && -			    ordered->file_offset + ordered->len > lock_start) { -				unlock_extent(&BTRFS_I(inode)->io_tree, -					      lock_start, lock_end, GFP_NOFS); -				btrfs_start_ordered_extent(inode, ordered, 1); -				btrfs_put_ordered_extent(ordered); -				key.offset += num_bytes; -				goto skip; -			} -			if (ordered) -				btrfs_put_ordered_extent(ordered); - -			extent_locked = 1; -			continue; -		} - -		if (nr_extents == 1) { -			/* update extent pointer in place */ -			btrfs_set_file_extent_disk_bytenr(leaf, fi, -						new_extents[0].disk_bytenr); -			btrfs_set_file_extent_disk_num_bytes(leaf, fi, -						new_extents[0].disk_num_bytes); -			btrfs_mark_buffer_dirty(leaf); - -			btrfs_drop_extent_cache(inode, key.offset, -						key.offset + num_bytes - 1, 0); - -			ret = btrfs_inc_extent_ref(trans, root, -						new_extents[0].disk_bytenr, -						new_extents[0].disk_num_bytes, -						leaf->start, -						root->root_key.objectid, -						trans->transid, -						key.objectid); -			BUG_ON(ret); - -			ret = btrfs_free_extent(trans, root, -						extent_key->objectid, -						extent_key->offset, -						leaf->start, -						btrfs_header_owner(leaf), -						btrfs_header_generation(leaf), -						key.objectid, 0); -			BUG_ON(ret); - -			btrfs_release_path(root, path); -			key.offset += num_bytes; -		} else { -			BUG_ON(1); -#if 0 -			u64 alloc_hint; -			u64 extent_len; -			int i; -			/* -			 * drop old extent pointer at first, then insert the -			 * new pointers one bye one -			 */ -			btrfs_release_path(root, path); -			ret = btrfs_drop_extents(trans, root, inode, key.offset, -						 key.offset + num_bytes, -						 key.offset, &alloc_hint); -			BUG_ON(ret); - -			for (i = 0; i < nr_extents; i++) { -				if (ext_offset >= new_extents[i].num_bytes) { -					ext_offset -= new_extents[i].num_bytes; -					continue; -				} -				extent_len = min(new_extents[i].num_bytes - -						 ext_offset, num_bytes); - -				ret = btrfs_insert_empty_item(trans, root, -							      path, &key, -							      sizeof(*fi)); -				BUG_ON(ret); - -				leaf = path->nodes[0]; -				fi = btrfs_item_ptr(leaf, path->slots[0], -						struct btrfs_file_extent_item); -				btrfs_set_file_extent_generation(leaf, fi, -							trans->transid); -				btrfs_set_file_extent_type(leaf, fi, -							BTRFS_FILE_EXTENT_REG); -				btrfs_set_file_extent_disk_bytenr(leaf, fi, -						new_extents[i].disk_bytenr); -				btrfs_set_file_extent_disk_num_bytes(leaf, fi, -						new_extents[i].disk_num_bytes); -				btrfs_set_file_extent_ram_bytes(leaf, fi, -						new_extents[i].ram_bytes); - -				btrfs_set_file_extent_compression(leaf, fi, -						new_extents[i].compression); -				btrfs_set_file_extent_encryption(leaf, fi, -						new_extents[i].encryption); -				btrfs_set_file_extent_other_encoding(leaf, fi, -						new_extents[i].other_encoding); - -				btrfs_set_file_extent_num_bytes(leaf, fi, -							extent_len); -				ext_offset += new_extents[i].offset; -				btrfs_set_file_extent_offset(leaf, fi, -							ext_offset); -				btrfs_mark_buffer_dirty(leaf); - -				
btrfs_drop_extent_cache(inode, key.offset, -						key.offset + extent_len - 1, 0); - -				ret = btrfs_inc_extent_ref(trans, root, -						new_extents[i].disk_bytenr, -						new_extents[i].disk_num_bytes, -						leaf->start, -						root->root_key.objectid, -						trans->transid, key.objectid); -				BUG_ON(ret); -				btrfs_release_path(root, path); - -				inode_add_bytes(inode, extent_len); - -				ext_offset = 0; -				num_bytes -= extent_len; -				key.offset += extent_len; - -				if (num_bytes == 0) -					break; -			} -			BUG_ON(i >= nr_extents); -#endif -		} - -		if (extent_locked) { -			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, -				      lock_end, GFP_NOFS); -			extent_locked = 0; -		} -skip: -		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && -		    key.offset >= search_end) -			break; - -		cond_resched(); -	} -	ret = 0; -out: -	btrfs_release_path(root, path); -	if (inode) { -		mutex_unlock(&inode->i_mutex); -		if (extent_locked) { -			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, -				      lock_end, GFP_NOFS); -		} -		iput(inode); -	} -	return ret; -} - -int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, -			       struct btrfs_root *root, -			       struct extent_buffer *buf, u64 orig_start) -{ -	int level; -	int ret; - -	BUG_ON(btrfs_header_generation(buf) != trans->transid); -	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - -	level = btrfs_header_level(buf); -	if (level == 0) { -		struct btrfs_leaf_ref *ref; -		struct btrfs_leaf_ref *orig_ref; - -		orig_ref = btrfs_lookup_leaf_ref(root, orig_start); -		if (!orig_ref) -			return -ENOENT; - -		ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); -		if (!ref) { -			btrfs_free_leaf_ref(root, orig_ref); -			return -ENOMEM; -		} - -		ref->nritems = orig_ref->nritems; -		memcpy(ref->extents, orig_ref->extents, -			sizeof(ref->extents[0]) * ref->nritems); - -		btrfs_free_leaf_ref(root, orig_ref); - -		ref->root_gen = trans->transid; -		ref->bytenr = buf->start; -		ref->owner = btrfs_header_owner(buf); -		ref->generation = btrfs_header_generation(buf); - -		ret = btrfs_add_leaf_ref(root, ref, 0); -		WARN_ON(ret); -		btrfs_free_leaf_ref(root, ref); -	} -	return 0; -} - -static noinline int invalidate_extent_cache(struct btrfs_root *root, -					struct extent_buffer *leaf, -					struct btrfs_block_group_cache *group, -					struct btrfs_root *target_root) -{ -	struct btrfs_key key; -	struct inode *inode = NULL; -	struct btrfs_file_extent_item *fi; -	struct extent_state *cached_state = NULL; -	u64 num_bytes; -	u64 skip_objectid = 0; -	u32 nritems; -	u32 i; - -	nritems = btrfs_header_nritems(leaf); -	for (i = 0; i < nritems; i++) { -		btrfs_item_key_to_cpu(leaf, &key, i); -		if (key.objectid == skip_objectid || -		    key.type != BTRFS_EXTENT_DATA_KEY) -			continue; -		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); -		if (btrfs_file_extent_type(leaf, fi) == -		    BTRFS_FILE_EXTENT_INLINE) -			continue; -		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) -			continue; -		if (!inode || inode->i_ino != key.objectid) { -			iput(inode); -			inode = btrfs_ilookup(target_root->fs_info->sb, -					      key.objectid, target_root, 1); -		} -		if (!inode) { -			skip_objectid = key.objectid; -			continue; -		} -		num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - -		lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset, -				 key.offset + num_bytes - 1, 0, &cached_state, -				 GFP_NOFS); -		btrfs_drop_extent_cache(inode, key.offset, -					key.offset + num_bytes - 1, 1); -		
unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset, -				     key.offset + num_bytes - 1, &cached_state, -				     GFP_NOFS); -		cond_resched(); -	} -	iput(inode); -	return 0; -} - -static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, -					struct btrfs_root *root, -					struct extent_buffer *leaf, -					struct btrfs_block_group_cache *group, -					struct inode *reloc_inode) -{ -	struct btrfs_key key; -	struct btrfs_key extent_key; -	struct btrfs_file_extent_item *fi; -	struct btrfs_leaf_ref *ref; -	struct disk_extent *new_extent; -	u64 bytenr; -	u64 num_bytes; -	u32 nritems; -	u32 i; -	int ext_index; -	int nr_extent; -	int ret; - -	new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); -	BUG_ON(!new_extent); - -	ref = btrfs_lookup_leaf_ref(root, leaf->start); -	BUG_ON(!ref); - -	ext_index = -1; -	nritems = btrfs_header_nritems(leaf); -	for (i = 0; i < nritems; i++) { -		btrfs_item_key_to_cpu(leaf, &key, i); -		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) -			continue; -		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); -		if (btrfs_file_extent_type(leaf, fi) == -		    BTRFS_FILE_EXTENT_INLINE) -			continue; -		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); -		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); -		if (bytenr == 0) -			continue; - -		ext_index++; -		if (bytenr >= group->key.objectid + group->key.offset || -		    bytenr + num_bytes <= group->key.objectid) -			continue; - -		extent_key.objectid = bytenr; -		extent_key.offset = num_bytes; -		extent_key.type = BTRFS_EXTENT_ITEM_KEY; -		nr_extent = 1; -		ret = get_new_locations(reloc_inode, &extent_key, -					group->key.objectid, 1, -					&new_extent, &nr_extent); -		if (ret > 0) -			continue; -		BUG_ON(ret < 0); - -		BUG_ON(ref->extents[ext_index].bytenr != bytenr); -		BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); -		ref->extents[ext_index].bytenr = new_extent->disk_bytenr; -		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; - -		btrfs_set_file_extent_disk_bytenr(leaf, fi, -						new_extent->disk_bytenr); -		btrfs_set_file_extent_disk_num_bytes(leaf, fi, -						new_extent->disk_num_bytes); -		btrfs_mark_buffer_dirty(leaf); - -		ret = btrfs_inc_extent_ref(trans, root, -					new_extent->disk_bytenr, -					new_extent->disk_num_bytes, -					leaf->start, -					root->root_key.objectid, -					trans->transid, key.objectid); -		BUG_ON(ret); - -		ret = btrfs_free_extent(trans, root, -					bytenr, num_bytes, leaf->start, -					btrfs_header_owner(leaf), -					btrfs_header_generation(leaf), -					key.objectid, 0); -		BUG_ON(ret); -		cond_resched(); -	} -	kfree(new_extent); -	BUG_ON(ext_index + 1 != ref->nritems); -	btrfs_free_leaf_ref(root, ref); -	return 0; -} - -int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, -			  struct btrfs_root *root) -{ -	struct btrfs_root *reloc_root; -	int ret; - -	if (root->reloc_root) { -		reloc_root = root->reloc_root; -		root->reloc_root = NULL; -		list_add(&reloc_root->dead_list, -			 &root->fs_info->dead_reloc_roots); - -		btrfs_set_root_bytenr(&reloc_root->root_item, -				      reloc_root->node->start); -		btrfs_set_root_level(&root->root_item, -				     btrfs_header_level(reloc_root->node)); -		memset(&reloc_root->root_item.drop_progress, 0, -			sizeof(struct btrfs_disk_key)); -		reloc_root->root_item.drop_level = 0; - -		ret = btrfs_update_root(trans, root->fs_info->tree_root, -					&reloc_root->root_key, -					&reloc_root->root_item); -		BUG_ON(ret); -	} -	return 0; -} - -int btrfs_drop_dead_reloc_roots(struct 
btrfs_root *root) -{ -	struct btrfs_trans_handle *trans; -	struct btrfs_root *reloc_root; -	struct btrfs_root *prev_root = NULL; -	struct list_head dead_roots; -	int ret; -	unsigned long nr; - -	INIT_LIST_HEAD(&dead_roots); -	list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); - -	while (!list_empty(&dead_roots)) { -		reloc_root = list_entry(dead_roots.prev, -					struct btrfs_root, dead_list); -		list_del_init(&reloc_root->dead_list); - -		BUG_ON(reloc_root->commit_root != NULL); -		while (1) { -			trans = btrfs_join_transaction(root, 1); -			BUG_ON(!trans); - -			mutex_lock(&root->fs_info->drop_mutex); -			ret = btrfs_drop_snapshot(trans, reloc_root); -			if (ret != -EAGAIN) -				break; -			mutex_unlock(&root->fs_info->drop_mutex); - -			nr = trans->blocks_used; -			ret = btrfs_end_transaction(trans, root); -			BUG_ON(ret); -			btrfs_btree_balance_dirty(root, nr); -		} - -		free_extent_buffer(reloc_root->node); - -		ret = btrfs_del_root(trans, root->fs_info->tree_root, -				     &reloc_root->root_key); -		BUG_ON(ret); -		mutex_unlock(&root->fs_info->drop_mutex); - -		nr = trans->blocks_used; -		ret = btrfs_end_transaction(trans, root); -		BUG_ON(ret); -		btrfs_btree_balance_dirty(root, nr); - -		kfree(prev_root); -		prev_root = reloc_root; -	} -	if (prev_root) { -		btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); -		kfree(prev_root); -	} -	return 0; -} - -int btrfs_add_dead_reloc_root(struct btrfs_root *root) -{ -	list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); -	return 0; -} - -int btrfs_cleanup_reloc_trees(struct btrfs_root *root) -{ -	struct btrfs_root *reloc_root; -	struct btrfs_trans_handle *trans; -	struct btrfs_key location; -	int found; -	int ret; - -	mutex_lock(&root->fs_info->tree_reloc_mutex); -	ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); -	BUG_ON(ret); -	found = !list_empty(&root->fs_info->dead_reloc_roots); -	mutex_unlock(&root->fs_info->tree_reloc_mutex); - -	if (found) { -		trans = btrfs_start_transaction(root, 1); -		BUG_ON(!trans); -		ret = btrfs_commit_transaction(trans, root); -		BUG_ON(ret); -	} - -	location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; -	location.offset = (u64)-1; -	location.type = BTRFS_ROOT_ITEM_KEY; - -	reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); -	BUG_ON(!reloc_root); -	btrfs_orphan_cleanup(reloc_root); -	return 0; -} - -static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, -				    struct btrfs_root *root) +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)  { -	struct btrfs_root *reloc_root; -	struct extent_buffer *eb; -	struct btrfs_root_item *root_item; -	struct btrfs_key root_key; -	int ret; - -	BUG_ON(!root->ref_cows); -	if (root->reloc_root) -		return 0; - -	root_item = kmalloc(sizeof(*root_item), GFP_NOFS); -	BUG_ON(!root_item); - -	ret = btrfs_copy_root(trans, root, root->commit_root, -			      &eb, BTRFS_TREE_RELOC_OBJECTID); -	BUG_ON(ret); - -	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; -	root_key.offset = root->root_key.objectid; -	root_key.type = BTRFS_ROOT_ITEM_KEY; - -	memcpy(root_item, &root->root_item, sizeof(root_item)); -	btrfs_set_root_refs(root_item, 0); -	btrfs_set_root_bytenr(root_item, eb->start); -	btrfs_set_root_level(root_item, btrfs_header_level(eb)); -	btrfs_set_root_generation(root_item, trans->transid); - -	btrfs_tree_unlock(eb); -	free_extent_buffer(eb); - -	ret = btrfs_insert_root(trans, root->fs_info->tree_root, -				&root_key, root_item); -	BUG_ON(ret); -	kfree(root_item); - -	reloc_root = 
btrfs_read_fs_root_no_radix(root->fs_info->tree_root, -						 &root_key); -	BUG_ON(!reloc_root); -	reloc_root->last_trans = trans->transid; -	reloc_root->commit_root = NULL; -	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; - -	root->reloc_root = reloc_root; -	return 0; -} - -/* - * Core function of space balance. - * - * The idea is using reloc trees to relocate tree blocks in reference - * counted roots. There is one reloc tree for each subvol, and all - * reloc trees share same root key objectid. Reloc trees are snapshots - * of the latest committed roots of subvols (root->commit_root). - * - * To relocate a tree block referenced by a subvol, there are two steps. - * COW the block through subvol's reloc tree, then update block pointer - * in the subvol to point to the new block. Since all reloc trees share - * same root key objectid, doing special handing for tree blocks owned - * by them is easy. Once a tree block has been COWed in one reloc tree, - * we can use the resulting new block directly when the same block is - * required to COW again through other reloc trees. By this way, relocated - * tree blocks are shared between reloc trees, so they are also shared - * between subvols. - */ -static noinline int relocate_one_path(struct btrfs_trans_handle *trans, -				      struct btrfs_root *root, -				      struct btrfs_path *path, -				      struct btrfs_key *first_key, -				      struct btrfs_ref_path *ref_path, -				      struct btrfs_block_group_cache *group, -				      struct inode *reloc_inode) -{ -	struct btrfs_root *reloc_root; -	struct extent_buffer *eb = NULL; -	struct btrfs_key *keys; -	u64 *nodes; -	int level; -	int shared_level; -	int lowest_level = 0; -	int ret; - -	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) -		lowest_level = ref_path->owner_objectid; - -	if (!root->ref_cows) { -		path->lowest_level = lowest_level; -		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); -		BUG_ON(ret < 0); -		path->lowest_level = 0; -		btrfs_release_path(root, path); -		return 0; -	} - -	mutex_lock(&root->fs_info->tree_reloc_mutex); -	ret = init_reloc_tree(trans, root); -	BUG_ON(ret); -	reloc_root = root->reloc_root; - -	shared_level = ref_path->shared_level; -	ref_path->shared_level = BTRFS_MAX_LEVEL - 1; - -	keys = ref_path->node_keys; -	nodes = ref_path->new_nodes; -	memset(&keys[shared_level + 1], 0, -	       sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); -	memset(&nodes[shared_level + 1], 0, -	       sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); - -	if (nodes[lowest_level] == 0) { -		path->lowest_level = lowest_level; -		ret = btrfs_search_slot(trans, reloc_root, first_key, path, -					0, 1); -		BUG_ON(ret); -		for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { -			eb = path->nodes[level]; -			if (!eb || eb == reloc_root->node) -				break; -			nodes[level] = eb->start; -			if (level == 0) -				btrfs_item_key_to_cpu(eb, &keys[level], 0); -			else -				btrfs_node_key_to_cpu(eb, &keys[level], 0); -		} -		if (nodes[0] && -		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { -			eb = path->nodes[0]; -			ret = replace_extents_in_leaf(trans, reloc_root, eb, -						      group, reloc_inode); -			BUG_ON(ret); -		} -		btrfs_release_path(reloc_root, path); -	} else { -		ret = btrfs_merge_path(trans, reloc_root, keys, nodes, -				       lowest_level); -		BUG_ON(ret); -	} +	u64 num_devices; +	u64 stripped;  	/* -	 * replace tree blocks in the fs tree with tree blocks in -	 * the reloc tree. 
+	 * if restripe for this chunk_type is on pick target profile and +	 * return, otherwise do the usual balance  	 */ -	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); -	BUG_ON(ret < 0); - -	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { -		ret = btrfs_search_slot(trans, reloc_root, first_key, path, -					0, 0); -		BUG_ON(ret); -		extent_buffer_get(path->nodes[0]); -		eb = path->nodes[0]; -		btrfs_release_path(reloc_root, path); -		ret = invalidate_extent_cache(reloc_root, eb, group, root); -		BUG_ON(ret); -		free_extent_buffer(eb); -	} - -	mutex_unlock(&root->fs_info->tree_reloc_mutex); -	path->lowest_level = 0; -	return 0; -} - -static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, -					struct btrfs_root *root, -					struct btrfs_path *path, -					struct btrfs_key *first_key, -					struct btrfs_ref_path *ref_path) -{ -	int ret; - -	ret = relocate_one_path(trans, root, path, first_key, -				ref_path, NULL, NULL); -	BUG_ON(ret); - -	return 0; -} - -static noinline int del_extent_zero(struct btrfs_trans_handle *trans, -				    struct btrfs_root *extent_root, -				    struct btrfs_path *path, -				    struct btrfs_key *extent_key) -{ -	int ret; - -	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); -	if (ret) -		goto out; -	ret = btrfs_del_item(trans, extent_root, path); -out: -	btrfs_release_path(extent_root, path); -	return ret; -} - -static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, -						struct btrfs_ref_path *ref_path) -{ -	struct btrfs_key root_key; - -	root_key.objectid = ref_path->root_objectid; -	root_key.type = BTRFS_ROOT_ITEM_KEY; -	if (is_cowonly_root(ref_path->root_objectid)) -		root_key.offset = 0; -	else -		root_key.offset = (u64)-1; - -	return btrfs_read_fs_root_no_name(fs_info, &root_key); -} - -static noinline int relocate_one_extent(struct btrfs_root *extent_root, -					struct btrfs_path *path, -					struct btrfs_key *extent_key, -					struct btrfs_block_group_cache *group, -					struct inode *reloc_inode, int pass) -{ -	struct btrfs_trans_handle *trans; -	struct btrfs_root *found_root; -	struct btrfs_ref_path *ref_path = NULL; -	struct disk_extent *new_extents = NULL; -	int nr_extents = 0; -	int loops; -	int ret; -	int level; -	struct btrfs_key first_key; -	u64 prev_block = 0; - - -	trans = btrfs_start_transaction(extent_root, 1); -	BUG_ON(!trans); - -	if (extent_key->objectid == 0) { -		ret = del_extent_zero(trans, extent_root, path, extent_key); -		goto out; -	} +	stripped = get_restripe_target(root->fs_info, flags); +	if (stripped) +		return extended_to_chunk(stripped); -	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); -	if (!ref_path) { -		ret = -ENOMEM; -		goto out; -	} - -	for (loops = 0; ; loops++) { -		if (loops == 0) { -			ret = btrfs_first_ref_path(trans, extent_root, ref_path, -						   extent_key->objectid); -		} else { -			ret = btrfs_next_ref_path(trans, extent_root, ref_path); -		} -		if (ret < 0) -			goto out; -		if (ret > 0) -			break; - -		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || -		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) -			continue; - -		found_root = read_ref_root(extent_root->fs_info, ref_path); -		BUG_ON(!found_root); -		/* -		 * for reference counted tree, only process reference paths -		 * rooted at the latest committed root. 
-		 */ -		if (found_root->ref_cows && -		    ref_path->root_generation != found_root->root_key.offset) -			continue; - -		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { -			if (pass == 0) { -				/* -				 * copy data extents to new locations -				 */ -				u64 group_start = group->key.objectid; -				ret = relocate_data_extent(reloc_inode, -							   extent_key, -							   group_start); -				if (ret < 0) -					goto out; -				break; -			} -			level = 0; -		} else { -			level = ref_path->owner_objectid; -		} - -		if (prev_block != ref_path->nodes[level]) { -			struct extent_buffer *eb; -			u64 block_start = ref_path->nodes[level]; -			u64 block_size = btrfs_level_size(found_root, level); - -			eb = read_tree_block(found_root, block_start, -					     block_size, 0); -			btrfs_tree_lock(eb); -			BUG_ON(level != btrfs_header_level(eb)); - -			if (level == 0) -				btrfs_item_key_to_cpu(eb, &first_key, 0); -			else -				btrfs_node_key_to_cpu(eb, &first_key, 0); - -			btrfs_tree_unlock(eb); -			free_extent_buffer(eb); -			prev_block = block_start; -		} - -		mutex_lock(&extent_root->fs_info->trans_mutex); -		btrfs_record_root_in_trans(found_root); -		mutex_unlock(&extent_root->fs_info->trans_mutex); -		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { -			/* -			 * try to update data extent references while -			 * keeping metadata shared between snapshots. -			 */ -			if (pass == 1) { -				ret = relocate_one_path(trans, found_root, -						path, &first_key, ref_path, -						group, reloc_inode); -				if (ret < 0) -					goto out; -				continue; -			} -			/* -			 * use fallback method to process the remaining -			 * references. -			 */ -			if (!new_extents) { -				u64 group_start = group->key.objectid; -				new_extents = kmalloc(sizeof(*new_extents), -						      GFP_NOFS); -				nr_extents = 1; -				ret = get_new_locations(reloc_inode, -							extent_key, -							group_start, 1, -							&new_extents, -							&nr_extents); -				if (ret) -					goto out; -			} -			ret = replace_one_extent(trans, found_root, -						path, extent_key, -						&first_key, ref_path, -						new_extents, nr_extents); -		} else { -			ret = relocate_tree_block(trans, found_root, path, -						  &first_key, ref_path); -		} -		if (ret < 0) -			goto out; -	} -	ret = 0; -out: -	btrfs_end_transaction(trans, extent_root); -	kfree(new_extents); -	kfree(ref_path); -	return ret; -} -#endif +	/* +	 * we add in the count of missing devices because we want +	 * to make sure that any RAID levels on a degraded FS +	 * continue to be honored. 
+	 */ +	num_devices = root->fs_info->fs_devices->rw_devices + +		root->fs_info->fs_devices->missing_devices; -static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) -{ -	u64 num_devices; -	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | +	stripped = BTRFS_BLOCK_GROUP_RAID0 | +		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |  		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; -	num_devices = root->fs_info->fs_devices->rw_devices;  	if (num_devices == 1) {  		stripped |= BTRFS_BLOCK_GROUP_DUP;  		stripped = flags & ~stripped; @@ -7891,7 +8205,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)  		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |  			     BTRFS_BLOCK_GROUP_RAID10))  			return stripped | BTRFS_BLOCK_GROUP_DUP; -		return flags;  	} else {  		/* they already had raid on here, just return */  		if (flags & stripped) @@ -7904,35 +8217,51 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)  		if (flags & BTRFS_BLOCK_GROUP_DUP)  			return stripped | BTRFS_BLOCK_GROUP_RAID1; -		/* turn single device chunks into raid0 */ -		return stripped | BTRFS_BLOCK_GROUP_RAID0; +		/* this is drive concat, leave it alone */  	} +  	return flags;  } -static int set_block_group_ro(struct btrfs_block_group_cache *cache) +static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)  {  	struct btrfs_space_info *sinfo = cache->space_info;  	u64 num_bytes; +	u64 min_allocable_bytes;  	int ret = -ENOSPC; -	if (cache->ro) -		return 0; + +	/* +	 * We need some metadata space and system metadata space for +	 * allocating chunks in some corner cases until we force to set +	 * it to be readonly. +	 */ +	if ((sinfo->flags & +	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && +	    !force) +		min_allocable_bytes = 1 * 1024 * 1024; +	else +		min_allocable_bytes = 0;  	spin_lock(&sinfo->lock);  	spin_lock(&cache->lock); + +	if (cache->ro) { +		ret = 0; +		goto out; +	} +  	num_bytes = cache->key.offset - cache->reserved - cache->pinned -  		    cache->bytes_super - btrfs_block_group_used(&cache->item);  	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + -	    sinfo->bytes_may_use + sinfo->bytes_readonly + -	    cache->reserved_pinned + num_bytes < sinfo->total_bytes) { +	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + +	    min_allocable_bytes <= sinfo->total_bytes) {  		sinfo->bytes_readonly += num_bytes; -		sinfo->bytes_reserved += cache->reserved_pinned; -		cache->reserved_pinned = 0;  		cache->ro = 1;  		ret = 0;  	} +out:  	spin_unlock(&cache->lock);  	spin_unlock(&sinfo->lock);  	return ret; @@ -7948,27 +8277,97 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,  	BUG_ON(cache->ro); -	trans = btrfs_join_transaction(root, 1); -	BUG_ON(IS_ERR(trans)); +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) +		return PTR_ERR(trans);  	alloc_flags = update_block_group_flags(root, cache->flags); -	if (alloc_flags != cache->flags) -		do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); +	if (alloc_flags != cache->flags) { +		ret = do_chunk_alloc(trans, root, alloc_flags, +				     CHUNK_ALLOC_FORCE); +		if (ret < 0) +			goto out; +	} -	ret = set_block_group_ro(cache); +	ret = set_block_group_ro(cache, 0);  	if (!ret)  		goto out;  	alloc_flags = get_alloc_profile(root, cache->space_info->flags); -	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); +	ret = do_chunk_alloc(trans, root, alloc_flags, +			     CHUNK_ALLOC_FORCE);  	if (ret < 0)  		goto out; -	
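The rewritten update_block_group_flags() above first honors a restripe target if one is configured, and counts missing devices alongside rw_devices so a degraded filesystem keeps its RAID level when a block group is copied for relocation. A compilable sketch of just the degrade/upgrade decision that follows, using illustrative profile bits rather than the real BTRFS_BLOCK_GROUP_* values:

#include <stdint.h>
#include <stdio.h>

/* Illustrative profile bits; the on-disk btrfs values differ. */
enum {
	BG_RAID0  = 1 << 0,
	BG_RAID1  = 1 << 1,
	BG_DUP    = 1 << 2,
	BG_RAID10 = 1 << 3,
	BG_RAID5  = 1 << 4,
	BG_RAID6  = 1 << 5,
};

static uint64_t reduce_profile(uint64_t flags, uint64_t num_devices)
{
	uint64_t stripped = BG_RAID0 | BG_RAID5 | BG_RAID6 |
			    BG_RAID1 | BG_RAID10;

	if (num_devices == 1) {
		stripped |= BG_DUP;
		stripped = flags & ~stripped;
		/* raid0 falls back to single-device chunks */
		if (flags & BG_RAID0)
			return stripped;
		/* mirroring falls back to duplication on one device */
		if (flags & (BG_RAID1 | BG_RAID10))
			return stripped | BG_DUP;
	} else {
		/* already striped or mirrored: keep it */
		if (flags & stripped)
			return flags;
		stripped |= BG_DUP;
		stripped = flags & ~stripped;
		/* with more devices, duplication can become raid1 */
		if (flags & BG_DUP)
			return stripped | BG_RAID1;
		/* plain single/concat chunks are left alone */
	}
	return flags;
}

int main(void)
{
	printf("raid1 on 1 device -> %#llx (dup)\n",
	       (unsigned long long)reduce_profile(BG_RAID1, 1));
	printf("dup on 3 devices  -> %#llx (raid1)\n",
	       (unsigned long long)reduce_profile(BG_DUP, 3));
	return 0;
}

On a single device, striping and mirroring collapse to single or DUP; with several devices, DUP is promoted back to RAID1 and plain concat chunks are left untouched.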
ret = set_block_group_ro(cache); +	ret = set_block_group_ro(cache, 0);  out:  	btrfs_end_transaction(trans, root);  	return ret;  } -int btrfs_set_block_group_rw(struct btrfs_root *root, +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, +			    struct btrfs_root *root, u64 type) +{ +	u64 alloc_flags = get_alloc_profile(root, type); +	return do_chunk_alloc(trans, root, alloc_flags, +			      CHUNK_ALLOC_FORCE); +} + +/* + * helper to account the unused space of all the readonly block group in the + * list. takes mirrors into account. + */ +static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +{ +	struct btrfs_block_group_cache *block_group; +	u64 free_bytes = 0; +	int factor; + +	list_for_each_entry(block_group, groups_list, list) { +		spin_lock(&block_group->lock); + +		if (!block_group->ro) { +			spin_unlock(&block_group->lock); +			continue; +		} + +		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | +					  BTRFS_BLOCK_GROUP_RAID10 | +					  BTRFS_BLOCK_GROUP_DUP)) +			factor = 2; +		else +			factor = 1; + +		free_bytes += (block_group->key.offset - +			       btrfs_block_group_used(&block_group->item)) * +			       factor; + +		spin_unlock(&block_group->lock); +	} + +	return free_bytes; +} + +/* + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ +	int i; +	u64 free_bytes = 0; + +	spin_lock(&sinfo->lock); + +	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) +		if (!list_empty(&sinfo->block_groups[i])) +			free_bytes += __btrfs_get_ro_block_group_free_space( +						&sinfo->block_groups[i]); + +	spin_unlock(&sinfo->lock); + +	return free_bytes; +} + +void btrfs_set_block_group_rw(struct btrfs_root *root,  			      struct btrfs_block_group_cache *cache)  {  	struct btrfs_space_info *sinfo = cache->space_info; @@ -7984,7 +8383,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root,  	cache->ro = 0;  	spin_unlock(&cache->lock);  	spin_unlock(&sinfo->lock); -	return 0;  }  /* @@ -7999,6 +8397,12 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)  	struct btrfs_space_info *space_info;  	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;  	struct btrfs_device *device; +	struct btrfs_trans_handle *trans; +	u64 min_free; +	u64 dev_min = 1; +	u64 dev_nr = 0; +	u64 target; +	int index;  	int full = 0;  	int ret = 0; @@ -8008,8 +8412,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)  	if (!block_group)  		return -1; +	min_free = btrfs_block_group_used(&block_group->item); +  	/* no bytes used, we're good */ -	if (!btrfs_block_group_used(&block_group->item)) +	if (!min_free)  		goto out;  	space_info = block_group->space_info; @@ -8025,10 +8431,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)  	 * all of the extents from this block group.  
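__btrfs_get_ro_block_group_free_space() above, together with the btrfs_account_ro_block_groups_free_space() wrapper, reports how much raw space the read-only block groups still hold, doubling the unused portion for mirrored profiles (RAID1, RAID10, DUP). A small worked example of that factor-based accounting with made-up numbers:

#include <stdint.h>
#include <stdio.h>

struct ro_bg {
	uint64_t size;		/* block group length (key.offset) */
	uint64_t used;		/* bytes_used from the item */
	int	 mirrored;	/* RAID1 / RAID10 / DUP */
};

int main(void)
{
	/* Two read-only groups: one mirrored metadata, one single data. */
	struct ro_bg groups[] = {
		{ .size = 1024 * 1024 * 1024, .used = 600 * 1024 * 1024, .mirrored = 1 },
		{ .size = 1024 * 1024 * 1024, .used = 200 * 1024 * 1024, .mirrored = 0 },
	};
	uint64_t free_bytes = 0;

	for (unsigned i = 0; i < sizeof(groups) / sizeof(groups[0]); i++) {
		int factor = groups[i].mirrored ? 2 : 1;

		/* unused logical space times the number of copies */
		free_bytes += (groups[i].size - groups[i].used) * factor;
	}

	/* (1 GiB - 600 MiB) * 2 + (1 GiB - 200 MiB) = 1672 MiB */
	printf("ro free space: %llu bytes\n", (unsigned long long)free_bytes);
	return 0;
}

The factor matters because a mirrored group consumes two bytes of device space per logical byte, so its slack frees twice as much raw capacity.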
If we can, we're good  	 */  	if ((space_info->total_bytes != block_group->key.offset) && -	   (space_info->bytes_used + space_info->bytes_reserved + -	    space_info->bytes_pinned + space_info->bytes_readonly + -	    btrfs_block_group_used(&block_group->item) < -	    space_info->total_bytes)) { +	    (space_info->bytes_used + space_info->bytes_reserved + +	     space_info->bytes_pinned + space_info->bytes_readonly + +	     min_free < space_info->total_bytes)) {  		spin_unlock(&space_info->lock);  		goto out;  	} @@ -8037,32 +8442,78 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)  	/*  	 * ok we don't have enough space, but maybe we have free space on our  	 * devices to allocate new chunks for relocation, so loop through our -	 * alloc devices and guess if we have enough space.  However, if we -	 * were marked as full, then we know there aren't enough chunks, and we -	 * can just return. +	 * alloc devices and guess if we have enough space.  if this block +	 * group is going to be restriped, run checks against the target +	 * profile instead of the current one.  	 */  	ret = -1; -	if (full) + +	/* +	 * index: +	 *      0: raid10 +	 *      1: raid1 +	 *      2: dup +	 *      3: raid0 +	 *      4: single +	 */ +	target = get_restripe_target(root->fs_info, block_group->flags); +	if (target) { +		index = __get_raid_index(extended_to_chunk(target)); +	} else { +		/* +		 * this is just a balance, so if we were marked as full +		 * we know there is no space for a new chunk +		 */ +		if (full) +			goto out; + +		index = get_block_group_index(block_group); +	} + +	if (index == BTRFS_RAID_RAID10) { +		dev_min = 4; +		/* Divide by 2 */ +		min_free >>= 1; +	} else if (index == BTRFS_RAID_RAID1) { +		dev_min = 2; +	} else if (index == BTRFS_RAID_DUP) { +		/* Multiply by 2 */ +		min_free <<= 1; +	} else if (index == BTRFS_RAID_RAID0) { +		dev_min = fs_devices->rw_devices; +		do_div(min_free, dev_min); +	} + +	/* We need to do this so that we can look at pending chunks */ +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans);  		goto out; +	}  	mutex_lock(&root->fs_info->chunk_mutex);  	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { -		u64 min_free = btrfs_block_group_used(&block_group->item); -		u64 dev_offset, max_avail; +		u64 dev_offset;  		/*  		 * check to make sure we can actually find a chunk with enough  		 * space to fit our block group in.  		 
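The reworked btrfs_can_relocate() above scales its search by the RAID profile it would have to recreate (or the restripe target, if one is set): RAID10 needs four devices each able to hold half the used bytes, RAID1 needs two, DUP doubles the requirement on a single device, and RAID0 divides it across every writable device; the device loop that follows then counts how many devices can satisfy that per-device figure. A sketch of the per-device requirement, with a raid_index enum mirroring the 0=raid10 .. 4=single table in the comment:

#include <stdint.h>
#include <stdio.h>

enum raid_index { RAID10, RAID1, DUP, RAID0, SINGLE };	/* same order as the comment */

struct requirement {
	uint64_t dev_min;	/* devices that must have room */
	uint64_t min_free;	/* bytes needed on each of them */
};

static struct requirement relocate_requirement(enum raid_index index,
					       uint64_t used,
					       uint64_t rw_devices)
{
	struct requirement req = { .dev_min = 1, .min_free = used };

	switch (index) {
	case RAID10:
		req.dev_min = 4;
		req.min_free = used >> 1;	/* data is striped over the mirrors */
		break;
	case RAID1:
		req.dev_min = 2;
		break;
	case DUP:
		req.min_free = used << 1;	/* both copies live on one device */
		break;
	case RAID0:
		req.dev_min = rw_devices;
		req.min_free = used / rw_devices;
		break;
	case SINGLE:
		break;
	}
	return req;
}

int main(void)
{
	struct requirement req = relocate_requirement(RAID10, 512ULL << 20, 6);

	printf("raid10: need %llu devices with %llu MiB free each\n",
	       (unsigned long long)req.dev_min,
	       (unsigned long long)(req.min_free >> 20));
	return 0;
}

Only once dev_min devices report enough free space does the function consider the block group relocatable.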
*/ -		if (device->total_bytes > device->bytes_used + min_free) { -			ret = find_free_dev_extent(NULL, device, min_free, -						   &dev_offset, &max_avail); +		if (device->total_bytes > device->bytes_used + min_free && +		    !device->is_tgtdev_for_dev_replace) { +			ret = find_free_dev_extent(trans, device, min_free, +						   &dev_offset, NULL);  			if (!ret) +				dev_nr++; + +			if (dev_nr >= dev_min)  				break; +  			ret = -1;  		}  	}  	mutex_unlock(&root->fs_info->chunk_mutex); +	btrfs_end_transaction(trans, root);  out:  	btrfs_put_block_group(block_group);  	return ret; @@ -8145,14 +8596,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  	struct btrfs_caching_control *caching_ctl;  	struct rb_node *n; -	down_write(&info->extent_commit_sem); +	down_write(&info->commit_root_sem);  	while (!list_empty(&info->caching_block_groups)) {  		caching_ctl = list_entry(info->caching_block_groups.next,  					 struct btrfs_caching_control, list);  		list_del(&caching_ctl->list);  		put_caching_control(caching_ctl);  	} -	up_write(&info->extent_commit_sem); +	up_write(&info->commit_root_sem);  	spin_lock(&info->block_group_cache_lock);  	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { @@ -8169,6 +8620,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  		if (block_group->cached == BTRFS_CACHE_STARTED)  			wait_block_group_cache_done(block_group); +		/* +		 * We haven't cached this block group, which means we could +		 * possibly have excluded extents on this block group. +		 */ +		if (block_group->cached == BTRFS_CACHE_NO || +		    block_group->cached == BTRFS_CACHE_ERROR) +			free_excluded_extents(info->extent_root, block_group); +  		btrfs_remove_free_space_cache(block_group);  		btrfs_put_block_group(block_group); @@ -8186,17 +8645,31 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  	release_global_block_rsv(info); -	while(!list_empty(&info->space_info)) { +	while (!list_empty(&info->space_info)) { +		int i; +  		space_info = list_entry(info->space_info.next,  					struct btrfs_space_info,  					list); -		if (space_info->bytes_pinned > 0 || -		    space_info->bytes_reserved > 0) { -			WARN_ON(1); -			dump_space_info(space_info, 0, 0); +		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { +			if (WARN_ON(space_info->bytes_pinned > 0 || +			    space_info->bytes_reserved > 0 || +			    space_info->bytes_may_use > 0)) { +				dump_space_info(space_info, 0, 0); +			}  		}  		list_del(&space_info->list); -		kfree(space_info); +		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { +			struct kobject *kobj; +			kobj = space_info->block_group_kobjs[i]; +			space_info->block_group_kobjs[i] = NULL; +			if (kobj) { +				kobject_del(kobj); +				kobject_put(kobj); +			} +		} +		kobject_del(&space_info->kobj); +		kobject_put(&space_info->kobj);  	}  	return 0;  } @@ -8205,10 +8678,71 @@ static void __link_block_group(struct btrfs_space_info *space_info,  			       struct btrfs_block_group_cache *cache)  {  	int index = get_block_group_index(cache); +	bool first = false;  	down_write(&space_info->groups_sem); +	if (list_empty(&space_info->block_groups[index])) +		first = true;  	list_add_tail(&cache->list, &space_info->block_groups[index]);  	up_write(&space_info->groups_sem); + +	if (first) { +		struct raid_kobject *rkobj; +		int ret; + +		rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); +		if (!rkobj) +			goto out_err; +		rkobj->raid_type = index; +		kobject_init(&rkobj->kobj, &btrfs_raid_ktype); +		ret = kobject_add(&rkobj->kobj, &space_info->kobj, +				  "%s", 
get_raid_name(index)); +		if (ret) { +			kobject_put(&rkobj->kobj); +			goto out_err; +		} +		space_info->block_group_kobjs[index] = &rkobj->kobj; +	} + +	return; +out_err: +	pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); +} + +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ +	struct btrfs_block_group_cache *cache; + +	cache = kzalloc(sizeof(*cache), GFP_NOFS); +	if (!cache) +		return NULL; + +	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), +					GFP_NOFS); +	if (!cache->free_space_ctl) { +		kfree(cache); +		return NULL; +	} + +	cache->key.objectid = start; +	cache->key.offset = size; +	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + +	cache->sectorsize = root->sectorsize; +	cache->fs_info = root->fs_info; +	cache->full_stripe_len = btrfs_full_stripe_len(root, +					       &root->fs_info->mapping_tree, +					       start); +	atomic_set(&cache->count, 1); +	spin_lock_init(&cache->lock); +	init_rwsem(&cache->data_rwsem); +	INIT_LIST_HEAD(&cache->list); +	INIT_LIST_HEAD(&cache->cluster_list); +	INIT_LIST_HEAD(&cache->new_bg_list); +	btrfs_init_free_space_ctl(cache); + +	return cache;  }  int btrfs_read_block_groups(struct btrfs_root *root) @@ -8231,15 +8765,14 @@ int btrfs_read_block_groups(struct btrfs_root *root)  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; +	path->reada = 1; -	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); -	if (cache_gen != 0 && -	    btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) +	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); +	if (btrfs_test_opt(root, SPACE_CACHE) && +	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)  		need_clear = 1;  	if (btrfs_test_opt(root, CLEAR_CACHE))  		need_clear = 1; -	if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) -		printk(KERN_INFO "btrfs: disk space caching is enabled\n");  	while (1) {  		ret = find_first_block_group(root, path, &key); @@ -8250,39 +8783,53 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		leaf = path->nodes[0];  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -		cache = kzalloc(sizeof(*cache), GFP_NOFS); + +		cache = btrfs_create_block_group_cache(root, found_key.objectid, +						       found_key.offset);  		if (!cache) {  			ret = -ENOMEM;  			goto error;  		} -		atomic_set(&cache->count, 1); -		spin_lock_init(&cache->lock); -		spin_lock_init(&cache->tree_lock); -		cache->fs_info = info; -		INIT_LIST_HEAD(&cache->list); -		INIT_LIST_HEAD(&cache->cluster_list); - -		if (need_clear) +		if (need_clear) { +			/* +			 * When we mount with old space cache, we need to +			 * set BTRFS_DC_CLEAR and set dirty flag. +			 * +			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we +			 *    truncate the old free space cache inode and +			 *    setup a new one. +			 * b) Setting 'dirty flag' makes sure that we flush +			 *    the new space cache info onto disk. 
+			 */  			cache->disk_cache_state = BTRFS_DC_CLEAR; - -		/* -		 * we only want to have 32k of ram per block group for keeping -		 * track of free space, and if we pass 1/2 of that we want to -		 * start converting things over to using bitmaps -		 */ -		cache->extents_thresh = ((1024 * 32) / 2) / -			sizeof(struct btrfs_free_space); +			if (btrfs_test_opt(root, SPACE_CACHE)) +				cache->dirty = 1; +		}  		read_extent_buffer(leaf, &cache->item,  				   btrfs_item_ptr_offset(leaf, path->slots[0]),  				   sizeof(cache->item)); -		memcpy(&cache->key, &found_key, sizeof(found_key)); +		cache->flags = btrfs_block_group_flags(&cache->item);  		key.objectid = found_key.objectid + found_key.offset; -		btrfs_release_path(root, path); -		cache->flags = btrfs_block_group_flags(&cache->item); -		cache->sectorsize = root->sectorsize; +		btrfs_release_path(path); + +		/* +		 * We need to exclude the super stripes now so that the space +		 * info has super bytes accounted for, otherwise we'll think +		 * we have more space than we actually do. +		 */ +		ret = exclude_super_stripes(root, cache); +		if (ret) { +			/* +			 * We may have excluded something, so call this just in +			 * case. +			 */ +			free_excluded_extents(root, cache); +			btrfs_put_block_group(cache); +			goto error; +		}  		/*  		 * check for two cases, either we are full, and therefore @@ -8292,12 +8839,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		 * time, particularly in the full case.  		 */  		if (found_key.offset == btrfs_block_group_used(&cache->item)) { -			exclude_super_stripes(root, cache);  			cache->last_byte_to_unpin = (u64)-1;  			cache->cached = BTRFS_CACHE_FINISHED;  			free_excluded_extents(root, cache);  		} else if (btrfs_block_group_used(&cache->item) == 0) { -			exclude_super_stripes(root, cache);  			cache->last_byte_to_unpin = (u64)-1;  			cache->cached = BTRFS_CACHE_FINISHED;  			add_new_free_space(cache, root->fs_info, @@ -8307,10 +8852,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)  			free_excluded_extents(root, cache);  		} +		ret = btrfs_add_block_group_cache(root->fs_info, cache); +		if (ret) { +			btrfs_remove_free_space_cache(cache); +			btrfs_put_block_group(cache); +			goto error; +		} +  		ret = update_space_info(info, cache->flags, found_key.offset,  					btrfs_block_group_used(&cache->item),  					&space_info); -		BUG_ON(ret); +		if (ret) { +			btrfs_remove_free_space_cache(cache); +			spin_lock(&info->block_group_cache_lock); +			rb_erase(&cache->cache_node, +				 &info->block_group_cache_tree); +			spin_unlock(&info->block_group_cache_lock); +			btrfs_put_block_group(cache); +			goto error; +		} +  		cache->space_info = space_info;  		spin_lock(&cache->space_info->lock);  		cache->space_info->bytes_readonly += cache->bytes_super; @@ -8318,28 +8879,31 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		__link_block_group(space_info, cache); -		ret = btrfs_add_block_group_cache(root->fs_info, cache); -		BUG_ON(ret); -  		set_avail_alloc_bits(root->fs_info, cache->flags);  		if (btrfs_chunk_readonly(root, cache->key.objectid)) -			set_block_group_ro(cache); +			set_block_group_ro(cache, 1);  	}  	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {  		if (!(get_alloc_profile(root, space_info->flags) &  		      (BTRFS_BLOCK_GROUP_RAID10 |  		       BTRFS_BLOCK_GROUP_RAID1 | +		       BTRFS_BLOCK_GROUP_RAID5 | +		       BTRFS_BLOCK_GROUP_RAID6 |  		       BTRFS_BLOCK_GROUP_DUP)))  			continue;  		/*  		 * avoid allocating from un-mirrored block group 
if there are  		 * mirrored block groups.  		 */ -		list_for_each_entry(cache, &space_info->block_groups[3], list) -			set_block_group_ro(cache); -		list_for_each_entry(cache, &space_info->block_groups[4], list) -			set_block_group_ro(cache); +		list_for_each_entry(cache, +				&space_info->block_groups[BTRFS_RAID_RAID0], +				list) +			set_block_group_ro(cache, 1); +		list_for_each_entry(cache, +				&space_info->block_groups[BTRFS_RAID_SINGLE], +				list) +			set_block_group_ro(cache, 1);  	}  	init_global_block_rsv(info); @@ -8349,6 +8913,38 @@ error:  	return ret;  } +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, +				       struct btrfs_root *root) +{ +	struct btrfs_block_group_cache *block_group, *tmp; +	struct btrfs_root *extent_root = root->fs_info->extent_root; +	struct btrfs_block_group_item item; +	struct btrfs_key key; +	int ret = 0; + +	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, +				 new_bg_list) { +		list_del_init(&block_group->new_bg_list); + +		if (ret) +			continue; + +		spin_lock(&block_group->lock); +		memcpy(&item, &block_group->item, sizeof(item)); +		memcpy(&key, &block_group->key, sizeof(key)); +		spin_unlock(&block_group->lock); + +		ret = btrfs_insert_item(trans, extent_root, &key, &item, +					sizeof(item)); +		if (ret) +			btrfs_abort_transaction(trans, extent_root, ret); +		ret = btrfs_finish_chunk_alloc(trans, extent_root, +					       key.objectid, key.offset); +		if (ret) +			btrfs_abort_transaction(trans, extent_root, ret); +	} +} +  int btrfs_make_block_group(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root, u64 bytes_used,  			   u64 type, u64 chunk_objectid, u64 chunk_offset, @@ -8360,48 +8956,54 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	extent_root = root->fs_info->extent_root; -	root->fs_info->last_trans_log_full_commit = trans->transid; +	btrfs_set_log_full_commit(root->fs_info, trans); -	cache = kzalloc(sizeof(*cache), GFP_NOFS); +	cache = btrfs_create_block_group_cache(root, chunk_offset, size);  	if (!cache)  		return -ENOMEM; -	cache->key.objectid = chunk_offset; -	cache->key.offset = size; -	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; -	cache->sectorsize = root->sectorsize; -	cache->fs_info = root->fs_info; - -	/* -	 * we only want to have 32k of ram per block group for keeping track -	 * of free space, and if we pass 1/2 of that we want to start -	 * converting things over to using bitmaps -	 */ -	cache->extents_thresh = ((1024 * 32) / 2) / -		sizeof(struct btrfs_free_space); -	atomic_set(&cache->count, 1); -	spin_lock_init(&cache->lock); -	spin_lock_init(&cache->tree_lock); -	INIT_LIST_HEAD(&cache->list); -	INIT_LIST_HEAD(&cache->cluster_list); -  	btrfs_set_block_group_used(&cache->item, bytes_used);  	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); -	cache->flags = type;  	btrfs_set_block_group_flags(&cache->item, type); +	cache->flags = type;  	cache->last_byte_to_unpin = (u64)-1;  	cache->cached = BTRFS_CACHE_FINISHED; -	exclude_super_stripes(root, cache); +	ret = exclude_super_stripes(root, cache); +	if (ret) { +		/* +		 * We may have excluded something, so call this just in +		 * case. 
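In the new btrfs_make_block_group() above, the block group item is no longer inserted on the spot; the cache is queued on trans->new_bgs and btrfs_create_pending_block_groups() drains that list later in the transaction, aborting it if an insert or chunk finalization fails. A toy userspace sketch of that defer-then-flush pattern; the structures and the printf stand in for the real transaction and tree insert:

#include <stdio.h>
#include <stdlib.h>

struct new_bg {
	unsigned long long start, size;
	struct new_bg *next;
};

struct transaction {
	struct new_bg *new_bgs;		/* pending block groups */
};

/* Creation time: record the group, defer the on-disk item. */
static int make_block_group(struct transaction *trans,
			    unsigned long long start, unsigned long long size)
{
	struct new_bg *bg = malloc(sizeof(*bg));

	if (!bg)
		return -1;
	bg->start = start;
	bg->size = size;
	bg->next = trans->new_bgs;
	trans->new_bgs = bg;
	return 0;
}

/* Commit time: drain the queue and do the actual inserts. */
static void create_pending_block_groups(struct transaction *trans)
{
	while (trans->new_bgs) {
		struct new_bg *bg = trans->new_bgs;

		trans->new_bgs = bg->next;
		printf("insert block group item %llu+%llu\n",
		       bg->start, bg->size);
		free(bg);
	}
}

int main(void)
{
	struct transaction trans = { .new_bgs = NULL };

	make_block_group(&trans, 1 << 30, 1 << 30);
	make_block_group(&trans, 2u << 30, 1 << 30);
	create_pending_block_groups(&trans);
	return 0;
}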
+		 */ +		free_excluded_extents(root, cache); +		btrfs_put_block_group(cache); +		return ret; +	}  	add_new_free_space(cache, root->fs_info, chunk_offset,  			   chunk_offset + size);  	free_excluded_extents(root, cache); +	ret = btrfs_add_block_group_cache(root->fs_info, cache); +	if (ret) { +		btrfs_remove_free_space_cache(cache); +		btrfs_put_block_group(cache); +		return ret; +	} +  	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,  				&cache->space_info); -	BUG_ON(ret); +	if (ret) { +		btrfs_remove_free_space_cache(cache); +		spin_lock(&root->fs_info->block_group_cache_lock); +		rb_erase(&cache->cache_node, +			 &root->fs_info->block_group_cache_tree); +		spin_unlock(&root->fs_info->block_group_cache_lock); +		btrfs_put_block_group(cache); +		return ret; +	} +	update_global_block_rsv(root->fs_info);  	spin_lock(&cache->space_info->lock);  	cache->space_info->bytes_readonly += cache->bytes_super; @@ -8409,18 +9011,28 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	__link_block_group(cache->space_info, cache); -	ret = btrfs_add_block_group_cache(root->fs_info, cache); -	BUG_ON(ret); - -	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, -				sizeof(cache->item)); -	BUG_ON(ret); +	list_add_tail(&cache->new_bg_list, &trans->new_bgs);  	set_avail_alloc_bits(extent_root->fs_info, type);  	return 0;  } +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ +	u64 extra_flags = chunk_to_extended(flags) & +				BTRFS_EXTENDED_PROFILE_MASK; + +	write_seqlock(&fs_info->profiles_lock); +	if (flags & BTRFS_BLOCK_GROUP_DATA) +		fs_info->avail_data_alloc_bits &= ~extra_flags; +	if (flags & BTRFS_BLOCK_GROUP_METADATA) +		fs_info->avail_metadata_alloc_bits &= ~extra_flags; +	if (flags & BTRFS_BLOCK_GROUP_SYSTEM) +		fs_info->avail_system_alloc_bits &= ~extra_flags; +	write_sequnlock(&fs_info->profiles_lock); +} +  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root, u64 group_start)  { @@ -8430,7 +9042,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	struct btrfs_root *tree_root = root->fs_info->tree_root;  	struct btrfs_key key;  	struct inode *inode; +	struct kobject *kobj = NULL;  	int ret; +	int index;  	int factor;  	root = root->fs_info->extent_root; @@ -8439,7 +9053,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	BUG_ON(!block_group);  	BUG_ON(!block_group->ro); +	/* +	 * Free the reserved super bytes from this block group before +	 * remove it. 
+	 */ +	free_excluded_extents(root, block_group); +  	memcpy(&key, &block_group->key, sizeof(key)); +	index = get_block_group_index(block_group);  	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |  				  BTRFS_BLOCK_GROUP_RAID1 |  				  BTRFS_BLOCK_GROUP_RAID10)) @@ -8463,11 +9084,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	spin_unlock(&cluster->refill_lock);  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	} -	inode = lookup_free_space_inode(root, block_group, path); +	inode = lookup_free_space_inode(tree_root, block_group, path);  	if (!IS_ERR(inode)) { -		btrfs_orphan_add(trans, inode); +		ret = btrfs_orphan_add(trans, inode); +		if (ret) { +			btrfs_add_delayed_iput(inode); +			goto out; +		}  		clear_nlink(inode);  		/* One for the block groups ref */  		spin_lock(&block_group->lock); @@ -8480,7 +9108,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  			spin_unlock(&block_group->lock);  		}  		/* One for our lookup ref */ -		iput(inode); +		btrfs_add_delayed_iput(inode);  	}  	key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -8491,17 +9119,20 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	if (ret < 0)  		goto out;  	if (ret > 0) -		btrfs_release_path(tree_root, path); +		btrfs_release_path(path);  	if (ret == 0) {  		ret = btrfs_del_item(trans, tree_root, path);  		if (ret)  			goto out; -		btrfs_release_path(tree_root, path); +		btrfs_release_path(path);  	}  	spin_lock(&root->fs_info->block_group_cache_lock);  	rb_erase(&block_group->cache_node,  		 &root->fs_info->block_group_cache_tree); + +	if (root->fs_info->first_logical_byte == block_group->key.objectid) +		root->fs_info->first_logical_byte = (u64)-1;  	spin_unlock(&root->fs_info->block_group_cache_lock);  	down_write(&block_group->space_info->groups_sem); @@ -8510,7 +9141,16 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	 * are still on the list after taking the semaphore  	 */  	list_del_init(&block_group->list); +	if (list_empty(&block_group->space_info->block_groups[index])) { +		kobj = block_group->space_info->block_group_kobjs[index]; +		block_group->space_info->block_group_kobjs[index] = NULL; +		clear_avail_alloc_bits(root->fs_info, block_group->flags); +	}  	up_write(&block_group->space_info->groups_sem); +	if (kobj) { +		kobject_del(kobj); +		kobject_put(kobj); +	}  	if (block_group->cached == BTRFS_CACHE_STARTED)  		wait_block_group_cache_done(block_group); @@ -8541,3 +9181,149 @@ out:  	btrfs_free_path(path);  	return ret;  } + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_space_info *space_info; +	struct btrfs_super_block *disk_super; +	u64 features; +	u64 flags; +	int mixed = 0; +	int ret; + +	disk_super = fs_info->super_copy; +	if (!btrfs_super_root(disk_super)) +		return 1; + +	features = btrfs_super_incompat_flags(disk_super); +	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) +		mixed = 1; + +	flags = BTRFS_BLOCK_GROUP_SYSTEM; +	ret = update_space_info(fs_info, flags, 0, 0, &space_info); +	if (ret) +		goto out; + +	if (mixed) { +		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; +		ret = update_space_info(fs_info, flags, 0, 0, &space_info); +	} else { +		flags = BTRFS_BLOCK_GROUP_METADATA; +		ret = update_space_info(fs_info, flags, 0, 0, &space_info); +		if (ret) +			goto out; + +		flags = BTRFS_BLOCK_GROUP_DATA; +		ret = update_space_info(fs_info, flags, 0, 0, &space_info); +	} +out: +	return ret; +} + +int 
btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ +	return unpin_extent_range(root, start, end); +} + +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, +			       u64 num_bytes, u64 *actual_bytes) +{ +	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); +} + +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) +{ +	struct btrfs_fs_info *fs_info = root->fs_info; +	struct btrfs_block_group_cache *cache = NULL; +	u64 group_trimmed; +	u64 start; +	u64 end; +	u64 trimmed = 0; +	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); +	int ret = 0; + +	/* +	 * try to trim all FS space, our block group may start from non-zero. +	 */ +	if (range->len == total_bytes) +		cache = btrfs_lookup_first_block_group(fs_info, range->start); +	else +		cache = btrfs_lookup_block_group(fs_info, range->start); + +	while (cache) { +		if (cache->key.objectid >= (range->start + range->len)) { +			btrfs_put_block_group(cache); +			break; +		} + +		start = max(range->start, cache->key.objectid); +		end = min(range->start + range->len, +				cache->key.objectid + cache->key.offset); + +		if (end - start >= range->minlen) { +			if (!block_group_cache_done(cache)) { +				ret = cache_block_group(cache, 0); +				if (ret) { +					btrfs_put_block_group(cache); +					break; +				} +				ret = wait_block_group_cache_done(cache); +				if (ret) { +					btrfs_put_block_group(cache); +					break; +				} +			} +			ret = btrfs_trim_block_group(cache, +						     &group_trimmed, +						     start, +						     end, +						     range->minlen); + +			trimmed += group_trimmed; +			if (ret) { +				btrfs_put_block_group(cache); +				break; +			} +		} + +		cache = next_block_group(fs_info->tree_root, cache); +	} + +	range->len = trimmed; +	return ret; +} + +/* + * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), + * they are used to prevent the some tasks writing data into the page cache + * by nocow before the subvolume is snapshoted, but flush the data into + * the disk after the snapshot creation. + */ +void btrfs_end_nocow_write(struct btrfs_root *root) +{ +	percpu_counter_dec(&root->subv_writers->counter); +	/* +	 * Make sure counter is updated before we wake up +	 * waiters. +	 */ +	smp_mb(); +	if (waitqueue_active(&root->subv_writers->wait)) +		wake_up(&root->subv_writers->wait); +} + +int btrfs_start_nocow_write(struct btrfs_root *root) +{ +	if (unlikely(atomic_read(&root->will_be_snapshoted))) +		return 0; + +	percpu_counter_inc(&root->subv_writers->counter); +	/* +	 * Make sure counter is updated before we check for snapshot creation. +	 */ +	smp_mb(); +	if (unlikely(atomic_read(&root->will_be_snapshoted))) { +		btrfs_end_nocow_write(root); +		return 0; +	} +	return 1; +}  | 
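Editor's note on the final hunk: btrfs_start_nocow_write()/btrfs_end_nocow_write() gate nocow writers against snapshot creation using a per-subvolume counter, the will_be_snapshoted flag, and paired memory barriers. The program below is a minimal user-space sketch of that flag-plus-counter pattern only; it substitutes C11 atomics and a spin wait for the kernel's percpu counter, smp_mb() and waitqueue, the names (writer_gate, gate_start_write, and so on) are hypothetical, and it is not part of the patch above.

/* Build with: cc -std=c11 -pthread gate.c (single-threaded demo shown). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct writer_gate {
	atomic_int writers;            /* in-flight writers (percpu counter analogue) */
	atomic_bool snapshot_pending;  /* analogue of root->will_be_snapshoted */
};

/* Try to enter as a writer; returns false if a snapshot is pending. */
static bool gate_start_write(struct writer_gate *g)
{
	if (atomic_load(&g->snapshot_pending))
		return false;

	atomic_fetch_add(&g->writers, 1);
	/* Re-check after publishing ourselves, mirroring the second check
	 * after smp_mb() in the patch; back out if we raced with a snapshot. */
	if (atomic_load(&g->snapshot_pending)) {
		atomic_fetch_sub(&g->writers, 1);
		return false;
	}
	return true;
}

static void gate_end_write(struct writer_gate *g)
{
	atomic_fetch_sub(&g->writers, 1);
}

/* Snapshot side: flag the intent, then wait for in-flight writers to drain
 * (the kernel sleeps on a waitqueue instead of spinning). */
static void gate_begin_snapshot(struct writer_gate *g)
{
	atomic_store(&g->snapshot_pending, true);
	while (atomic_load(&g->writers) > 0)
		;
}

int main(void)
{
	struct writer_gate g = { .writers = 0, .snapshot_pending = false };

	if (gate_start_write(&g)) {
		/* ... a nocow write would happen here ... */
		gate_end_write(&g);
	}
	gate_begin_snapshot(&g);
	printf("writers drained: %d\n", atomic_load(&g.writers));
	return 0;
}

The point of the re-check after the increment is the same as in the patch: a writer that races with the flag being set withdraws its count, so the snapshot side only has to wait for the counter to reach zero rather than lock out writers with a heavier mutual-exclusion primitive.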
