Diffstat (limited to 'fs/btrfs')
 -rw-r--r--  fs/btrfs/async-thread.c      |    4
 -rw-r--r--  fs/btrfs/ctree.c             |  121
 -rw-r--r--  fs/btrfs/ctree.h             |   27
 -rw-r--r--  fs/btrfs/disk-io.c           |   15
 -rw-r--r--  fs/btrfs/extent-tree.c       |  517
 -rw-r--r--  fs/btrfs/free-space-cache.c  | 1003
 -rw-r--r--  fs/btrfs/free-space-cache.h  |    8
 -rw-r--r--  fs/btrfs/inode.c             |    2
 -rw-r--r--  fs/btrfs/print-tree.c        |    6
 -rw-r--r--  fs/btrfs/relocation.c        |    3
 -rw-r--r--  fs/btrfs/transaction.c       |   56
 -rw-r--r--  fs/btrfs/transaction.h       |    1
 -rw-r--r--  fs/btrfs/tree-log.c          |    2
 -rw-r--r--  fs/btrfs/volumes.c           |   46
14 files changed, 1359 insertions, 452 deletions
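The bulk of the change is in free-space-cache.c, which adds bitmap-backed free space tracking alongside the existing extent entries. Before the diff itself, here is a minimal userspace sketch of the offset arithmetic performed by the new helpers (offset_to_bit, bytes_to_bits, offset_to_bitmap); it assumes 4 KiB pages and a 4 KiB sectorsize purely for illustration, and is not the kernel implementation, just the same division-based math in plain C.

```c
/*
 * Sketch of the bitmap index math introduced in free-space-cache.c.
 * Assumes 4 KiB pages and a 4 KiB sectorsize purely for illustration;
 * the kernel derives these from PAGE_CACHE_SIZE and block_group->sectorsize.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_BYTES   4096ULL
#define BITS_PER_BITMAP   (PAGE_SIZE_BYTES * 8)   /* one page worth of bits */

/* bit index of 'offset' inside the bitmap that starts at 'bitmap_start' */
static unsigned long offset_to_bit(uint64_t bitmap_start, uint64_t sectorsize,
				   uint64_t offset)
{
	return (unsigned long)((offset - bitmap_start) / sectorsize);
}

/* number of bits needed to cover 'bytes' */
static unsigned long bytes_to_bits(uint64_t bytes, uint64_t sectorsize)
{
	return (unsigned long)(bytes / sectorsize);
}

/* start offset of the bitmap covering 'offset' within a block group */
static uint64_t offset_to_bitmap(uint64_t bg_start, uint64_t sectorsize,
				 uint64_t offset)
{
	uint64_t bytes_per_bitmap = BITS_PER_BITMAP * sectorsize;
	uint64_t nth_bitmap = (offset - bg_start) / bytes_per_bitmap;

	return bg_start + nth_bitmap * bytes_per_bitmap;
}

int main(void)
{
	uint64_t sectorsize = 4096;                        /* assumed */
	uint64_t bg_start = 1024ULL * 1024 * 1024;         /* hypothetical block group start */
	uint64_t offset = bg_start + 300ULL * 1024 * 1024; /* hypothetical free byte */
	uint64_t bmap = offset_to_bitmap(bg_start, sectorsize, offset);

	printf("bitmap start:  %llu\n", (unsigned long long)bmap);
	printf("bit index:     %lu\n", offset_to_bit(bmap, sectorsize, offset));
	printf("bits for 1MiB: %lu\n", bytes_to_bits(1024 * 1024, sectorsize));
	return 0;
}
```

With these example numbers each one-page bitmap covers 32768 bits * 4 KiB = 128 MiB of a block group, which is the motivation visible in the patch for capping the cache at roughly MAX_CACHE_BYTES_PER_GIG (32 KiB) of memory per GiB and converting extent entries to bitmaps once extents_thresh is crossed.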
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 6e4f6c50a12..019e8af449a 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)  	 * list  	 */  	if (worker->idle) { -		spin_lock_irqsave(&worker->workers->lock, flags); +		spin_lock(&worker->workers->lock);  		worker->idle = 0;  		list_move_tail(&worker->worker_list,  			       &worker->workers->worker_list); -		spin_unlock_irqrestore(&worker->workers->lock, flags); +		spin_unlock(&worker->workers->lock);  	}  	if (!worker->working) {  		wake = 1; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 60a45f3a4e9..3fdcc0512d3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)  	btrfs_disk_key_to_cpu(&k1, disk); -	if (k1.objectid > k2->objectid) -		return 1; -	if (k1.objectid < k2->objectid) -		return -1; -	if (k1.type > k2->type) -		return 1; -	if (k1.type < k2->type) -		return -1; -	if (k1.offset > k2->offset) -		return 1; -	if (k1.offset < k2->offset) -		return -1; -	return 0; +	return btrfs_comp_cpu_keys(&k1, k2);  }  /* @@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)  		return 0; -	if (btrfs_header_nritems(mid) > 2) -		return 0; -  	if (btrfs_header_nritems(mid) < 2)  		err_on_enospc = 1; @@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root  	struct extent_buffer *b;  	int slot;  	int ret; +	int err;  	int level;  	int lowest_unlock = 1;  	u8 lowest_level = 0; @@ -1737,8 +1723,6 @@ again:  			p->locks[level] = 1;  		if (cow) { -			int wret; -  			/*  			 * if we don't really need to cow this block  			 * then we don't want to set the path blocking, @@ -1749,12 +1733,12 @@ again:  			btrfs_set_path_blocking(p); -			wret = btrfs_cow_block(trans, root, b, -					       p->nodes[level + 1], -					       p->slots[level + 1], &b); -			if (wret) { +			err = btrfs_cow_block(trans, root, b, +					      p->nodes[level + 1], +					      p->slots[level + 1], &b); +			if (err) {  				free_extent_buffer(b); -				ret = wret; +				ret = err;  				goto done;  			}  		} @@ -1793,41 +1777,45 @@ cow_done:  		ret = bin_search(b, key, level, &slot);  		if (level != 0) { -			if (ret && slot > 0) +			int dec = 0; +			if (ret && slot > 0) { +				dec = 1;  				slot -= 1; +			}  			p->slots[level] = slot; -			ret = setup_nodes_for_search(trans, root, p, b, level, +			err = setup_nodes_for_search(trans, root, p, b, level,  						     ins_len); -			if (ret == -EAGAIN) +			if (err == -EAGAIN)  				goto again; -			else if (ret) +			if (err) { +				ret = err;  				goto done; +			}  			b = p->nodes[level];  			slot = p->slots[level];  			unlock_up(p, level, lowest_unlock); -			/* this is only true while dropping a snapshot */  			if (level == lowest_level) { -				ret = 0; +				if (dec) +					p->slots[level]++;  				goto done;  			} -			ret = read_block_for_search(trans, root, p, +			err = read_block_for_search(trans, root, p,  						    &b, level, slot, key); -			if (ret == -EAGAIN) +			if (err == -EAGAIN)  				goto again; - -			if (ret == -EIO) +			if (err) { +				ret = err;  				goto done; +			}  			if (!p->skip_locking) { -				int lret; -  				btrfs_clear_path_blocking(p, NULL); -				lret = btrfs_try_spin_lock(b); +				err = btrfs_try_spin_lock(b); -				if (!lret) { +				if (!err) {  					btrfs_set_path_blocking(p);  					btrfs_tree_lock(b);  					
btrfs_clear_path_blocking(p, b); @@ -1837,16 +1825,14 @@ cow_done:  			p->slots[level] = slot;  			if (ins_len > 0 &&  			    btrfs_leaf_free_space(root, b) < ins_len) { -				int sret; -  				btrfs_set_path_blocking(p); -				sret = split_leaf(trans, root, key, -						      p, ins_len, ret == 0); +				err = split_leaf(trans, root, key, +						 p, ins_len, ret == 0);  				btrfs_clear_path_blocking(p, NULL); -				BUG_ON(sret > 0); -				if (sret) { -					ret = sret; +				BUG_ON(err > 0); +				if (err) { +					ret = err;  					goto done;  				}  			} @@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		}  		/* delete the leaf if it is mostly empty */ -		if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { +		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {  			/* push_leaf_left fixes the path.  			 * make sure the path still points to our leaf  			 * for possible call to del_ptr below @@ -4042,10 +4028,9 @@ out:   * calling this function.   */  int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, -			struct btrfs_key *key, int lowest_level, +			struct btrfs_key *key, int level,  			int cache_only, u64 min_trans)  { -	int level = lowest_level;  	int slot;  	struct extent_buffer *c; @@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,  		c = path->nodes[level];  next:  		if (slot >= btrfs_header_nritems(c)) { -			level++; -			if (level == BTRFS_MAX_LEVEL) +			int ret; +			int orig_lowest; +			struct btrfs_key cur_key; +			if (level + 1 >= BTRFS_MAX_LEVEL || +			    !path->nodes[level + 1])  				return 1; -			continue; + +			if (path->locks[level + 1]) { +				level++; +				continue; +			} + +			slot = btrfs_header_nritems(c) - 1; +			if (level == 0) +				btrfs_item_key_to_cpu(c, &cur_key, slot); +			else +				btrfs_node_key_to_cpu(c, &cur_key, slot); + +			orig_lowest = path->lowest_level; +			btrfs_release_path(root, path); +			path->lowest_level = level; +			ret = btrfs_search_slot(NULL, root, &cur_key, path, +						0, 0); +			path->lowest_level = orig_lowest; +			if (ret < 0) +				return ret; + +			c = path->nodes[level]; +			slot = path->slots[level]; +			if (ret == 0) +				slot++; +			goto next;  		} +  		if (level == 0)  			btrfs_item_key_to_cpu(c, key, slot);  		else { @@ -4146,7 +4160,8 @@ again:  	 * advance the path if there are now more items available.  	 
*/  	if (nritems > 0 && path->slots[0] < nritems - 1) { -		path->slots[0]++; +		if (ret == 0) +			path->slots[0]++;  		ret = 0;  		goto done;  	} @@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,  			path->slots[0]--;  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -		if (found_key.type == type) -			return 0;  		if (found_key.objectid < min_objectid)  			break; +		if (found_key.type == type) +			return 0;  		if (found_key.objectid == min_objectid &&  		    found_key.type < type)  			break; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 98a87383871..837435ce84c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -481,7 +481,7 @@ struct btrfs_shared_data_ref {  struct btrfs_extent_inline_ref {  	u8 type; -	u64 offset; +	__le64 offset;  } __attribute__ ((__packed__));  /* old style backrefs item */ @@ -689,6 +689,7 @@ struct btrfs_space_info {  	struct list_head block_groups;  	spinlock_t lock;  	struct rw_semaphore groups_sem; +	atomic_t caching_threads;  };  /* @@ -707,6 +708,9 @@ struct btrfs_free_cluster {  	/* first extent starting offset */  	u64 window_start; +	/* if this cluster simply points at a bitmap in the block group */ +	bool points_to_bitmap; +  	struct btrfs_block_group_cache *block_group;  	/*  	 * when a cluster is allocated from a block group, we put the @@ -716,24 +720,37 @@ struct btrfs_free_cluster {  	struct list_head block_group_list;  }; +enum btrfs_caching_type { +	BTRFS_CACHE_NO		= 0, +	BTRFS_CACHE_STARTED	= 1, +	BTRFS_CACHE_FINISHED	= 2, +}; +  struct btrfs_block_group_cache {  	struct btrfs_key key;  	struct btrfs_block_group_item item; +	struct btrfs_fs_info *fs_info;  	spinlock_t lock; -	struct mutex cache_mutex;  	u64 pinned;  	u64 reserved;  	u64 flags; -	int cached; +	u64 sectorsize; +	int extents_thresh; +	int free_extents; +	int total_bitmaps;  	int ro;  	int dirty; +	/* cache tracking stuff */ +	wait_queue_head_t caching_q; +	int cached; +  	struct btrfs_space_info *space_info;  	/* free space cache stuff */  	spinlock_t tree_lock; -	struct rb_root free_space_bytes;  	struct rb_root free_space_offset; +	u64 free_space;  	/* block group cache stuff */  	struct rb_node cache_node; @@ -808,6 +825,7 @@ struct btrfs_fs_info {  	struct mutex drop_mutex;  	struct mutex volume_mutex;  	struct mutex tree_reloc_mutex; +	struct rw_semaphore extent_commit_sem;  	/*  	 * this protects the ordered operations list only while we are @@ -1988,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,  				 u64 bytes);  void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,  			      u64 bytes); +void btrfs_free_pinned_extents(struct btrfs_fs_info *info);  /* ctree.c */  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,  		     int level, int *slot); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d28d29c95f7..e83be2e4602 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1639,6 +1639,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,  	mutex_init(&fs_info->cleaner_mutex);  	mutex_init(&fs_info->volume_mutex);  	mutex_init(&fs_info->tree_reloc_mutex); +	init_rwsem(&fs_info->extent_commit_sem);  	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);  	btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -1799,6 +1800,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,  					   btrfs_super_chunk_root(disk_super),  					   blocksize, generation);  	BUG_ON(!chunk_root->node); +	if 
(!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { +		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", +		       sb->s_id); +		goto fail_chunk_root; +	}  	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);  	chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1826,6 +1832,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,  					  blocksize, generation);  	if (!tree_root->node)  		goto fail_chunk_root; +	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { +		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", +		       sb->s_id); +		goto fail_tree_root; +	}  	btrfs_set_root_node(&tree_root->root_item, tree_root->node);  	tree_root->commit_root = btrfs_root_node(tree_root); @@ -2322,6 +2333,9 @@ int close_ctree(struct btrfs_root *root)  			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);  	} +	fs_info->closing = 2; +	smp_mb(); +  	if (fs_info->delalloc_bytes) {  		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",  		       (unsigned long long)fs_info->delalloc_bytes); @@ -2343,6 +2357,7 @@ int close_ctree(struct btrfs_root *root)  	free_extent_buffer(root->fs_info->csum_root->commit_root);  	btrfs_free_block_groups(root->fs_info); +	btrfs_free_pinned_extents(root->fs_info);  	del_fs_roots(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5aca3997d4..dc84daee6bc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -21,6 +21,7 @@  #include <linux/blkdev.h>  #include <linux/sort.h>  #include <linux/rcupdate.h> +#include <linux/kthread.h>  #include "compat.h"  #include "hash.h"  #include "ctree.h" @@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,  			  struct btrfs_root *extent_root, u64 alloc_bytes,  			  u64 flags, int force); +static noinline int +block_group_cache_done(struct btrfs_block_group_cache *cache) +{ +	smp_mb(); +	return cache->cached == BTRFS_CACHE_FINISHED; +} +  static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)  {  	return (cache->flags & bits) == bits; @@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,  }  /* + * We always set EXTENT_LOCKED for the super mirror extents so we don't + * overwrite them, so those bits need to be unset.  Also, if we are unmounting + * with pinned extents still sitting there because we had a block group caching, + * we need to clear those now, since we are done. 
+ */ +void btrfs_free_pinned_extents(struct btrfs_fs_info *info) +{ +	u64 start, end, last = 0; +	int ret; + +	while (1) { +		ret = find_first_extent_bit(&info->pinned_extents, last, +					    &start, &end, +					    EXTENT_LOCKED|EXTENT_DIRTY); +		if (ret) +			break; + +		clear_extent_bits(&info->pinned_extents, start, end, +				  EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); +		last = end+1; +	} +} + +static int remove_sb_from_cache(struct btrfs_root *root, +				struct btrfs_block_group_cache *cache) +{ +	struct btrfs_fs_info *fs_info = root->fs_info; +	u64 bytenr; +	u64 *logical; +	int stripe_len; +	int i, nr, ret; + +	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { +		bytenr = btrfs_sb_offset(i); +		ret = btrfs_rmap_block(&root->fs_info->mapping_tree, +				       cache->key.objectid, bytenr, +				       0, &logical, &nr, &stripe_len); +		BUG_ON(ret); +		while (nr--) { +			try_lock_extent(&fs_info->pinned_extents, +					logical[nr], +					logical[nr] + stripe_len - 1, GFP_NOFS); +		} +		kfree(logical); +	} + +	return 0; +} + +/*   * this is only called by cache_block_group, since we could have freed extents   * we need to check the pinned_extents for any extents that can't be used yet   * since their free space will be released as soon as the transaction commits.   */ -static int add_new_free_space(struct btrfs_block_group_cache *block_group, +static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,  			      struct btrfs_fs_info *info, u64 start, u64 end)  { -	u64 extent_start, extent_end, size; +	u64 extent_start, extent_end, size, total_added = 0;  	int ret;  	while (start < end) {  		ret = find_first_extent_bit(&info->pinned_extents, start,  					    &extent_start, &extent_end, -					    EXTENT_DIRTY); +					    EXTENT_DIRTY|EXTENT_LOCKED);  		if (ret)  			break; @@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,  			start = extent_end + 1;  		} else if (extent_start > start && extent_start < end) {  			size = extent_start - start; +			total_added += size;  			ret = btrfs_add_free_space(block_group, start,  						   size);  			BUG_ON(ret); @@ -178,84 +237,80 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,  	if (start < end) {  		size = end - start; +		total_added += size;  		ret = btrfs_add_free_space(block_group, start, size);  		BUG_ON(ret);  	} -	return 0; +	return total_added;  } -static int remove_sb_from_cache(struct btrfs_root *root, -				struct btrfs_block_group_cache *cache) -{ -	u64 bytenr; -	u64 *logical; -	int stripe_len; -	int i, nr, ret; - -	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { -		bytenr = btrfs_sb_offset(i); -		ret = btrfs_rmap_block(&root->fs_info->mapping_tree, -				       cache->key.objectid, bytenr, 0, -				       &logical, &nr, &stripe_len); -		BUG_ON(ret); -		while (nr--) { -			btrfs_remove_free_space(cache, logical[nr], -						stripe_len); -		} -		kfree(logical); -	} -	return 0; -} - -static int cache_block_group(struct btrfs_root *root, -			     struct btrfs_block_group_cache *block_group) +static int caching_kthread(void *data)  { +	struct btrfs_block_group_cache *block_group = data; +	struct btrfs_fs_info *fs_info = block_group->fs_info; +	u64 last = 0;  	struct btrfs_path *path;  	int ret = 0;  	struct btrfs_key key;  	struct extent_buffer *leaf;  	int slot; -	u64 last; - -	if (!block_group) -		return 0; +	u64 total_found = 0; -	root = root->fs_info->extent_root; - -	if (block_group->cached) -		return 0; +	BUG_ON(!fs_info);  	path = btrfs_alloc_path();  	if (!path) 
 		return -ENOMEM; -	path->reada = 2; +	atomic_inc(&block_group->space_info->caching_threads); +	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +again: +	/* need to make sure the commit_root doesn't disappear */ +	down_read(&fs_info->extent_commit_sem); +  	/* -	 * we get into deadlocks with paths held by callers of this function. -	 * since the alloc_mutex is protecting things right now, just -	 * skip the locking here +	 * We don't want to deadlock with somebody trying to allocate a new +	 * extent for the extent root while also trying to search the extent +	 * root to add free space.  So we skip locking and search the commit +	 * root, since its read-only  	 */  	path->skip_locking = 1; -	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +	path->search_commit_root = 1; +	path->reada = 2; +  	key.objectid = last;  	key.offset = 0;  	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); -	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);  	if (ret < 0)  		goto err;  	while (1) { +		smp_mb(); +		if (block_group->fs_info->closing > 1) { +			last = (u64)-1; +			break; +		} +  		leaf = path->nodes[0];  		slot = path->slots[0];  		if (slot >= btrfs_header_nritems(leaf)) { -			ret = btrfs_next_leaf(root, path); +			ret = btrfs_next_leaf(fs_info->extent_root, path);  			if (ret < 0)  				goto err; -			if (ret == 0) -				continue; -			else +			else if (ret)  				break; + +			if (need_resched() || +			    btrfs_transaction_in_commit(fs_info)) { +				btrfs_release_path(fs_info->extent_root, path); +				up_read(&fs_info->extent_commit_sem); +				schedule_timeout(1); +				goto again; +			} + +			continue;  		}  		btrfs_item_key_to_cpu(leaf, &key, slot);  		if (key.objectid < block_group->key.objectid) @@ -266,24 +321,59 @@ static int cache_block_group(struct btrfs_root *root,  			break;  		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { -			add_new_free_space(block_group, root->fs_info, last, -					   key.objectid); - +			total_found += add_new_free_space(block_group, +							  fs_info, last, +							  key.objectid);  			last = key.objectid + key.offset;  		} + +		if (total_found > (1024 * 1024 * 2)) { +			total_found = 0; +			wake_up(&block_group->caching_q); +		}  next:  		path->slots[0]++;  	} +	ret = 0; -	add_new_free_space(block_group, root->fs_info, last, -			   block_group->key.objectid + -			   block_group->key.offset); +	total_found += add_new_free_space(block_group, fs_info, last, +					  block_group->key.objectid + +					  block_group->key.offset); + +	spin_lock(&block_group->lock); +	block_group->cached = BTRFS_CACHE_FINISHED; +	spin_unlock(&block_group->lock); -	block_group->cached = 1; -	remove_sb_from_cache(root, block_group); -	ret = 0;  err:  	btrfs_free_path(path); +	up_read(&fs_info->extent_commit_sem); +	atomic_dec(&block_group->space_info->caching_threads); +	wake_up(&block_group->caching_q); + +	return 0; +} + +static int cache_block_group(struct btrfs_block_group_cache *cache) +{ +	struct task_struct *tsk; +	int ret = 0; + +	spin_lock(&cache->lock); +	if (cache->cached != BTRFS_CACHE_NO) { +		spin_unlock(&cache->lock); +		return ret; +	} +	cache->cached = BTRFS_CACHE_STARTED; +	spin_unlock(&cache->lock); + +	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", +			  cache->key.objectid); +	if (IS_ERR(tsk)) { +		ret = PTR_ERR(tsk); +		printk(KERN_ERR "error running thread %d\n", ret); +		BUG(); +	} +  	return ret;  } @@ -2387,13 +2477,29 @@ fail:  
} +static struct btrfs_block_group_cache * +next_block_group(struct btrfs_root *root, +		 struct btrfs_block_group_cache *cache) +{ +	struct rb_node *node; +	spin_lock(&root->fs_info->block_group_cache_lock); +	node = rb_next(&cache->cache_node); +	btrfs_put_block_group(cache); +	if (node) { +		cache = rb_entry(node, struct btrfs_block_group_cache, +				 cache_node); +		atomic_inc(&cache->count); +	} else +		cache = NULL; +	spin_unlock(&root->fs_info->block_group_cache_lock); +	return cache; +} +  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root)  { -	struct btrfs_block_group_cache *cache, *entry; -	struct rb_node *n; +	struct btrfs_block_group_cache *cache;  	int err = 0; -	int werr = 0;  	struct btrfs_path *path;  	u64 last = 0; @@ -2402,39 +2508,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,  		return -ENOMEM;  	while (1) { -		cache = NULL; -		spin_lock(&root->fs_info->block_group_cache_lock); -		for (n = rb_first(&root->fs_info->block_group_cache_tree); -		     n; n = rb_next(n)) { -			entry = rb_entry(n, struct btrfs_block_group_cache, -					 cache_node); -			if (entry->dirty) { -				cache = entry; -				break; -			} +		if (last == 0) { +			err = btrfs_run_delayed_refs(trans, root, +						     (unsigned long)-1); +			BUG_ON(err);  		} -		spin_unlock(&root->fs_info->block_group_cache_lock); -		if (!cache) -			break; +		cache = btrfs_lookup_first_block_group(root->fs_info, last); +		while (cache) { +			if (cache->dirty) +				break; +			cache = next_block_group(root, cache); +		} +		if (!cache) { +			if (last == 0) +				break; +			last = 0; +			continue; +		}  		cache->dirty = 0; -		last += cache->key.offset; +		last = cache->key.objectid + cache->key.offset; -		err = write_one_cache_group(trans, root, -					    path, cache); -		/* -		 * if we fail to write the cache group, we want -		 * to keep it marked dirty in hopes that a later -		 * write will work -		 */ -		if (err) { -			werr = err; -			continue; -		} +		err = write_one_cache_group(trans, root, path, cache); +		BUG_ON(err); +		btrfs_put_block_group(cache);  	} +  	btrfs_free_path(path); -	return werr; +	return 0;  }  int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -2484,6 +2586,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,  	found->force_alloc = 0;  	*space_info = found;  	list_add_rcu(&found->list, &info->space_info); +	atomic_set(&found->caching_threads, 0);  	return 0;  } @@ -2947,13 +3050,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,  	struct btrfs_block_group_cache *cache;  	struct btrfs_fs_info *fs_info = root->fs_info; -	if (pin) { +	if (pin)  		set_extent_dirty(&fs_info->pinned_extents,  				bytenr, bytenr + num - 1, GFP_NOFS); -	} else { -		clear_extent_dirty(&fs_info->pinned_extents, -				bytenr, bytenr + num - 1, GFP_NOFS); -	}  	while (num > 0) {  		cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2969,14 +3068,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,  			spin_unlock(&cache->space_info->lock);  			fs_info->total_pinned += len;  		} else { +			int unpin = 0; + +			/* +			 * in order to not race with the block group caching, we +			 * only want to unpin the extent if we are cached.  If +			 * we aren't cached, we want to start async caching this +			 * block group so we can free the extent the next time +			 * around. 
+			 */  			spin_lock(&cache->space_info->lock);  			spin_lock(&cache->lock); -			cache->pinned -= len; -			cache->space_info->bytes_pinned -= len; +			unpin = (cache->cached == BTRFS_CACHE_FINISHED); +			if (likely(unpin)) { +				cache->pinned -= len; +				cache->space_info->bytes_pinned -= len; +				fs_info->total_pinned -= len; +			}  			spin_unlock(&cache->lock);  			spin_unlock(&cache->space_info->lock); -			fs_info->total_pinned -= len; -			if (cache->cached) + +			if (likely(unpin)) +				clear_extent_dirty(&fs_info->pinned_extents, +						   bytenr, bytenr + len -1, +						   GFP_NOFS); +			else +				cache_block_group(cache); + +			if (unpin)  				btrfs_add_free_space(cache, bytenr, len);  		}  		btrfs_put_block_group(cache); @@ -3030,6 +3149,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)  					    &start, &end, EXTENT_DIRTY);  		if (ret)  			break; +  		set_extent_dirty(copy, start, end, GFP_NOFS);  		last = end + 1;  	} @@ -3058,6 +3178,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,  		cond_resched();  	} +  	return ret;  } @@ -3436,6 +3557,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)  }  /* + * when we wait for progress in the block group caching, its because + * our allocation attempt failed at least once.  So, we must sleep + * and let some progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to + * show up, and then it will check the block group free space numbers + * for our min num_bytes.  Another option is to have it go ahead + * and look in the rbtree for a free extent of a given size, but this + * is a good start. + */ +static noinline int +wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, +				u64 num_bytes) +{ +	DEFINE_WAIT(wait); + +	prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); + +	if (block_group_cache_done(cache)) { +		finish_wait(&cache->caching_q, &wait); +		return 0; +	} +	schedule(); +	finish_wait(&cache->caching_q, &wait); + +	wait_event(cache->caching_q, block_group_cache_done(cache) || +		   (cache->free_space >= num_bytes)); +	return 0; +} + +enum btrfs_loop_type { +	LOOP_CACHED_ONLY = 0, +	LOOP_CACHING_NOWAIT = 1, +	LOOP_CACHING_WAIT = 2, +	LOOP_ALLOC_CHUNK = 3, +	LOOP_NO_EMPTY_SIZE = 4, +}; + +/*   * walks the btree of allocated extents and find a hole of a given size.   * The key ins is changed to record the hole:   * ins->objectid == block start @@ -3460,6 +3620,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,  	struct btrfs_space_info *space_info;  	int last_ptr_loop = 0;  	int loop = 0; +	bool found_uncached_bg = false;  	WARN_ON(num_bytes < root->sectorsize);  	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -3491,15 +3652,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,  	search_start = max(search_start, first_logical_byte(root, 0));  	search_start = max(search_start, hint_byte); -	if (!last_ptr) { +	if (!last_ptr)  		empty_cluster = 0; -		loop = 1; -	}  	if (search_start == hint_byte) {  		block_group = btrfs_lookup_block_group(root->fs_info,  						       search_start); -		if (block_group && block_group_bits(block_group, data)) { +		/* +		 * we don't want to use the block group if it doesn't match our +		 * allocation bits, or if its not cached. 
+		 */ +		if (block_group && block_group_bits(block_group, data) && +		    block_group_cache_done(block_group)) {  			down_read(&space_info->groups_sem);  			if (list_empty(&block_group->list) ||  			    block_group->ro) { @@ -3522,21 +3686,35 @@ search:  	down_read(&space_info->groups_sem);  	list_for_each_entry(block_group, &space_info->block_groups, list) {  		u64 offset; +		int cached;  		atomic_inc(&block_group->count);  		search_start = block_group->key.objectid;  have_block_group: -		if (unlikely(!block_group->cached)) { -			mutex_lock(&block_group->cache_mutex); -			ret = cache_block_group(root, block_group); -			mutex_unlock(&block_group->cache_mutex); -			if (ret) { -				btrfs_put_block_group(block_group); -				break; +		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { +			/* +			 * we want to start caching kthreads, but not too many +			 * right off the bat so we don't overwhelm the system, +			 * so only start them if there are less than 2 and we're +			 * in the initial allocation phase. +			 */ +			if (loop > LOOP_CACHING_NOWAIT || +			    atomic_read(&space_info->caching_threads) < 2) { +				ret = cache_block_group(block_group); +				BUG_ON(ret);  			}  		} +		cached = block_group_cache_done(block_group); +		if (unlikely(!cached)) { +			found_uncached_bg = true; + +			/* if we only want cached bgs, loop */ +			if (loop == LOOP_CACHED_ONLY) +				goto loop; +		} +  		if (unlikely(block_group->ro))  			goto loop; @@ -3615,14 +3793,21 @@ refill_cluster:  					spin_unlock(&last_ptr->refill_lock);  					goto checks;  				} +			} else if (!cached && loop > LOOP_CACHING_NOWAIT) { +				spin_unlock(&last_ptr->refill_lock); + +				wait_block_group_cache_progress(block_group, +				       num_bytes + empty_cluster + empty_size); +				goto have_block_group;  			} +  			/*  			 * at this point we either didn't find a cluster  			 * or we weren't able to allocate a block from our  			 * cluster.  
Free the cluster we've been trying  			 * to use, and go to the next block group  			 */ -			if (loop < 2) { +			if (loop < LOOP_NO_EMPTY_SIZE) {  				btrfs_return_cluster_to_free_space(NULL,  								   last_ptr);  				spin_unlock(&last_ptr->refill_lock); @@ -3633,11 +3818,17 @@ refill_cluster:  		offset = btrfs_find_space_for_alloc(block_group, search_start,  						    num_bytes, empty_size); -		if (!offset) +		if (!offset && (cached || (!cached && +					   loop == LOOP_CACHING_NOWAIT))) {  			goto loop; +		} else if (!offset && (!cached && +				       loop > LOOP_CACHING_NOWAIT)) { +			wait_block_group_cache_progress(block_group, +					num_bytes + empty_size); +			goto have_block_group; +		}  checks:  		search_start = stripe_align(root, offset); -  		/* move on to the next group */  		if (search_start + num_bytes >= search_end) {  			btrfs_add_free_space(block_group, offset, num_bytes); @@ -3683,13 +3874,26 @@ loop:  	}  	up_read(&space_info->groups_sem); -	/* loop == 0, try to find a clustered alloc in every block group -	 * loop == 1, try again after forcing a chunk allocation -	 * loop == 2, set empty_size and empty_cluster to 0 and try again +	/* LOOP_CACHED_ONLY, only search fully cached block groups +	 * LOOP_CACHING_NOWAIT, search partially cached block groups, but +	 *			dont wait foR them to finish caching +	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching +	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again +	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try +	 *			again  	 */ -	if (!ins->objectid && loop < 3 && -	    (empty_size || empty_cluster || allowed_chunk_alloc)) { -		if (loop >= 2) { +	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && +	    (found_uncached_bg || empty_size || empty_cluster || +	     allowed_chunk_alloc)) { +		if (found_uncached_bg) { +			found_uncached_bg = false; +			if (loop < LOOP_CACHING_WAIT) { +				loop++; +				goto search; +			} +		} + +		if (loop == LOOP_ALLOC_CHUNK) {  			empty_size = 0;  			empty_cluster = 0;  		} @@ -3702,7 +3906,7 @@ loop:  			space_info->force_alloc = 1;  		} -		if (loop < 3) { +		if (loop < LOOP_NO_EMPTY_SIZE) {  			loop++;  			goto search;  		} @@ -3798,7 +4002,7 @@ again:  			       num_bytes, data, 1);  		goto again;  	} -	if (ret) { +	if (ret == -ENOSPC) {  		struct btrfs_space_info *sinfo;  		sinfo = __find_space_info(root->fs_info, data); @@ -3806,7 +4010,6 @@ again:  		       "wanted %llu\n", (unsigned long long)data,  		       (unsigned long long)num_bytes);  		dump_space_info(sinfo, num_bytes); -		BUG();  	}  	return ret; @@ -3844,7 +4047,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,  	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,  				     empty_size, hint_byte, search_end, ins,  				     data); -	update_reserved_extents(root, ins->objectid, ins->offset, 1); +	if (!ret) +		update_reserved_extents(root, ins->objectid, ins->offset, 1); +  	return ret;  } @@ -4006,9 +4211,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,  	struct btrfs_block_group_cache *block_group;  	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); -	mutex_lock(&block_group->cache_mutex); -	cache_block_group(root, block_group); -	mutex_unlock(&block_group->cache_mutex); +	cache_block_group(block_group); +	wait_event(block_group->caching_q, +		   block_group_cache_done(block_group));  	ret = btrfs_remove_free_space(block_group, ins->objectid,  				      ins->offset); @@ -4039,7 +4244,8 @@ static int 
alloc_tree_block(struct btrfs_trans_handle *trans,  	ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,  				     empty_size, hint_byte, search_end,  				     ins, 0); -	BUG_ON(ret); +	if (ret) +		return ret;  	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {  		if (parent == 0) @@ -6955,11 +7161,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  			 &info->block_group_cache_tree);  		spin_unlock(&info->block_group_cache_lock); -		btrfs_remove_free_space_cache(block_group);  		down_write(&block_group->space_info->groups_sem);  		list_del(&block_group->list);  		up_write(&block_group->space_info->groups_sem); +		if (block_group->cached == BTRFS_CACHE_STARTED) +			wait_event(block_group->caching_q, +				   block_group_cache_done(block_group)); + +		btrfs_remove_free_space_cache(block_group); +  		WARN_ON(atomic_read(&block_group->count) != 1);  		kfree(block_group); @@ -7025,9 +7236,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		atomic_set(&cache->count, 1);  		spin_lock_init(&cache->lock);  		spin_lock_init(&cache->tree_lock); -		mutex_init(&cache->cache_mutex); +		cache->fs_info = info; +		init_waitqueue_head(&cache->caching_q);  		INIT_LIST_HEAD(&cache->list);  		INIT_LIST_HEAD(&cache->cluster_list); + +		/* +		 * we only want to have 32k of ram per block group for keeping +		 * track of free space, and if we pass 1/2 of that we want to +		 * start converting things over to using bitmaps +		 */ +		cache->extents_thresh = ((1024 * 32) / 2) / +			sizeof(struct btrfs_free_space); +  		read_extent_buffer(leaf, &cache->item,  				   btrfs_item_ptr_offset(leaf, path->slots[0]),  				   sizeof(cache->item)); @@ -7036,6 +7257,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)  		key.objectid = found_key.objectid + found_key.offset;  		btrfs_release_path(root, path);  		cache->flags = btrfs_block_group_flags(&cache->item); +		cache->sectorsize = root->sectorsize; + +		remove_sb_from_cache(root, cache); + +		/* +		 * check for two cases, either we are full, and therefore +		 * don't need to bother with the caching work since we won't +		 * find any space, or we are empty, and we can just add all +		 * the space in and be done with it.  This saves us _alot_ of +		 * time, particularly in the full case. 
+		 */ +		if (found_key.offset == btrfs_block_group_used(&cache->item)) { +			cache->cached = BTRFS_CACHE_FINISHED; +		} else if (btrfs_block_group_used(&cache->item) == 0) { +			cache->cached = BTRFS_CACHE_FINISHED; +			add_new_free_space(cache, root->fs_info, +					   found_key.objectid, +					   found_key.objectid + +					   found_key.offset); +		}  		ret = update_space_info(info, cache->flags, found_key.offset,  					btrfs_block_group_used(&cache->item), @@ -7079,10 +7320,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	cache->key.objectid = chunk_offset;  	cache->key.offset = size;  	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; +	cache->sectorsize = root->sectorsize; + +	/* +	 * we only want to have 32k of ram per block group for keeping track +	 * of free space, and if we pass 1/2 of that we want to start +	 * converting things over to using bitmaps +	 */ +	cache->extents_thresh = ((1024 * 32) / 2) / +		sizeof(struct btrfs_free_space);  	atomic_set(&cache->count, 1);  	spin_lock_init(&cache->lock);  	spin_lock_init(&cache->tree_lock); -	mutex_init(&cache->cache_mutex); +	init_waitqueue_head(&cache->caching_q);  	INIT_LIST_HEAD(&cache->list);  	INIT_LIST_HEAD(&cache->cluster_list); @@ -7091,6 +7341,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,  	cache->flags = type;  	btrfs_set_block_group_flags(&cache->item, type); +	cache->cached = BTRFS_CACHE_FINISHED; +	remove_sb_from_cache(root, cache); + +	add_new_free_space(cache, root->fs_info, chunk_offset, +			   chunk_offset + size); +  	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,  				&cache->space_info);  	BUG_ON(ret); @@ -7149,7 +7405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	rb_erase(&block_group->cache_node,  		 &root->fs_info->block_group_cache_tree);  	spin_unlock(&root->fs_info->block_group_cache_lock); -	btrfs_remove_free_space_cache(block_group); +  	down_write(&block_group->space_info->groups_sem);  	/*  	 * we must use list_del_init so people can check to see if they @@ -7158,11 +7414,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  	list_del_init(&block_group->list);  	up_write(&block_group->space_info->groups_sem); +	if (block_group->cached == BTRFS_CACHE_STARTED) +		wait_event(block_group->caching_q, +			   block_group_cache_done(block_group)); + +	btrfs_remove_free_space_cache(block_group); +  	spin_lock(&block_group->space_info->lock);  	block_group->space_info->total_bytes -= block_group->key.offset;  	block_group->space_info->bytes_readonly -= block_group->key.offset;  	spin_unlock(&block_group->space_info->lock); -	block_group->space_info->full = 0; + +	btrfs_clear_space_info_full(root->fs_info);  	btrfs_put_block_group(block_group);  	btrfs_put_block_group(block_group); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4538e48581a..af99b78b288 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -16,45 +16,46 @@   * Boston, MA 021110-1307, USA.   
*/ +#include <linux/pagemap.h>  #include <linux/sched.h> +#include <linux/math64.h>  #include "ctree.h"  #include "free-space-cache.h"  #include "transaction.h" -struct btrfs_free_space { -	struct rb_node bytes_index; -	struct rb_node offset_index; -	u64 offset; -	u64 bytes; -}; +#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8) +#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024) -static int tree_insert_offset(struct rb_root *root, u64 offset, -			      struct rb_node *node) +static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, +					  u64 offset)  { -	struct rb_node **p = &root->rb_node; -	struct rb_node *parent = NULL; -	struct btrfs_free_space *info; +	BUG_ON(offset < bitmap_start); +	offset -= bitmap_start; +	return (unsigned long)(div64_u64(offset, sectorsize)); +} -	while (*p) { -		parent = *p; -		info = rb_entry(parent, struct btrfs_free_space, offset_index); +static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) +{ +	return (unsigned long)(div64_u64(bytes, sectorsize)); +} -		if (offset < info->offset) -			p = &(*p)->rb_left; -		else if (offset > info->offset) -			p = &(*p)->rb_right; -		else -			return -EEXIST; -	} +static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, +				   u64 offset) +{ +	u64 bitmap_start; +	u64 bytes_per_bitmap; -	rb_link_node(node, parent, p); -	rb_insert_color(node, root); +	bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; +	bitmap_start = offset - block_group->key.objectid; +	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); +	bitmap_start *= bytes_per_bitmap; +	bitmap_start += block_group->key.objectid; -	return 0; +	return bitmap_start;  } -static int tree_insert_bytes(struct rb_root *root, u64 bytes, -			     struct rb_node *node) +static int tree_insert_offset(struct rb_root *root, u64 offset, +			      struct rb_node *node, int bitmap)  {  	struct rb_node **p = &root->rb_node;  	struct rb_node *parent = NULL; @@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,  	while (*p) {  		parent = *p; -		info = rb_entry(parent, struct btrfs_free_space, bytes_index); +		info = rb_entry(parent, struct btrfs_free_space, offset_index); -		if (bytes < info->bytes) +		if (offset < info->offset) {  			p = &(*p)->rb_left; -		else +		} else if (offset > info->offset) {  			p = &(*p)->rb_right; +		} else { +			/* +			 * we could have a bitmap entry and an extent entry +			 * share the same offset.  If this is the case, we want +			 * the extent entry to always be found first if we do a +			 * linear search through the tree, since we want to have +			 * the quickest allocation time, and allocating from an +			 * extent is faster than allocating from a bitmap.  So +			 * if we're inserting a bitmap and we find an entry at +			 * this offset, we want to go right, or after this entry +			 * logically.  If we are inserting an extent and we've +			 * found a bitmap, we want to go left, or before +			 * logically. +			 */ +			if (bitmap) { +				WARN_ON(info->bitmap); +				p = &(*p)->rb_right; +			} else { +				WARN_ON(!info->bitmap); +				p = &(*p)->rb_left; +			} +		}  	}  	rb_link_node(node, parent, p); @@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,  /*   * searches the tree for the given offset.   * - * fuzzy == 1: this is used for allocations where we are given a hint of where - * to look for free space.  
Because the hint may not be completely on an offset - * mark, or the hint may no longer point to free space we need to fudge our - * results a bit.  So we look for free space starting at or after offset with at - * least bytes size.  We prefer to find as close to the given offset as we can. - * Also if the offset is within a free space range, then we will return the free - * space that contains the given offset, which means we can return a free space - * chunk with an offset before the provided offset. - * - * fuzzy == 0: this is just a normal tree search.  Give us the free space that - * starts at the given offset which is at least bytes size, and if its not there - * return NULL. + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset.   */ -static struct btrfs_free_space *tree_search_offset(struct rb_root *root, -						   u64 offset, u64 bytes, -						   int fuzzy) +static struct btrfs_free_space * +tree_search_offset(struct btrfs_block_group_cache *block_group, +		   u64 offset, int bitmap_only, int fuzzy)  { -	struct rb_node *n = root->rb_node; -	struct btrfs_free_space *entry, *ret = NULL; +	struct rb_node *n = block_group->free_space_offset.rb_node; +	struct btrfs_free_space *entry, *prev = NULL; + +	/* find entry that is closest to the 'offset' */ +	while (1) { +		if (!n) { +			entry = NULL; +			break; +		} -	while (n) {  		entry = rb_entry(n, struct btrfs_free_space, offset_index); +		prev = entry; -		if (offset < entry->offset) { -			if (fuzzy && -			    (!ret || entry->offset < ret->offset) && -			    (bytes <= entry->bytes)) -				ret = entry; +		if (offset < entry->offset)  			n = n->rb_left; -		} else if (offset > entry->offset) { -			if (fuzzy && -			    (entry->offset + entry->bytes - 1) >= offset && -			    bytes <= entry->bytes) { -				ret = entry; -				break; -			} +		else if (offset > entry->offset)  			n = n->rb_right; -		} else { -			if (bytes > entry->bytes) { -				n = n->rb_right; -				continue; -			} -			ret = entry; +		else  			break; -		}  	} -	return ret; -} - -/* - * return a chunk at least bytes size, as close to offset that we can get. - */ -static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, -						  u64 offset, u64 bytes) -{ -	struct rb_node *n = root->rb_node; -	struct btrfs_free_space *entry, *ret = NULL; +	if (bitmap_only) { +		if (!entry) +			return NULL; +		if (entry->bitmap) +			return entry; -	while (n) { -		entry = rb_entry(n, struct btrfs_free_space, bytes_index); +		/* +		 * bitmap entry and extent entry may share same offset, +		 * in that case, bitmap entry comes after extent entry. +		 */ +		n = rb_next(n); +		if (!n) +			return NULL; +		entry = rb_entry(n, struct btrfs_free_space, offset_index); +		if (entry->offset != offset) +			return NULL; -		if (bytes < entry->bytes) { +		WARN_ON(!entry->bitmap); +		return entry; +	} else if (entry) { +		if (entry->bitmap) {  			/* -			 * We prefer to get a hole size as close to the size we -			 * are asking for so we don't take small slivers out of -			 * huge holes, but we also want to get as close to the -			 * offset as possible so we don't have a whole lot of -			 * fragmentation. 
+			 * if previous extent entry covers the offset, +			 * we should return it instead of the bitmap entry  			 */ -			if (offset <= entry->offset) { -				if (!ret) -					ret = entry; -				else if (entry->bytes < ret->bytes) -					ret = entry; -				else if (entry->offset < ret->offset) -					ret = entry; +			n = &entry->offset_index; +			while (1) { +				n = rb_prev(n); +				if (!n) +					break; +				prev = rb_entry(n, struct btrfs_free_space, +						offset_index); +				if (!prev->bitmap) { +					if (prev->offset + prev->bytes > offset) +						entry = prev; +					break; +				}  			} -			n = n->rb_left; -		} else if (bytes > entry->bytes) { -			n = n->rb_right; +		} +		return entry; +	} + +	if (!prev) +		return NULL; + +	/* find last entry before the 'offset' */ +	entry = prev; +	if (entry->offset > offset) { +		n = rb_prev(&entry->offset_index); +		if (n) { +			entry = rb_entry(n, struct btrfs_free_space, +					offset_index); +			BUG_ON(entry->offset > offset);  		} else { -			/* -			 * Ok we may have multiple chunks of the wanted size, -			 * so we don't want to take the first one we find, we -			 * want to take the one closest to our given offset, so -			 * keep searching just in case theres a better match. -			 */ -			n = n->rb_right; -			if (offset > entry->offset) -				continue; -			else if (!ret || entry->offset < ret->offset) -				ret = entry; +			if (fuzzy) +				return entry; +			else +				return NULL;  		}  	} -	return ret; +	if (entry->bitmap) { +		n = &entry->offset_index; +		while (1) { +			n = rb_prev(n); +			if (!n) +				break; +			prev = rb_entry(n, struct btrfs_free_space, +					offset_index); +			if (!prev->bitmap) { +				if (prev->offset + prev->bytes > offset) +					return prev; +				break; +			} +		} +		if (entry->offset + BITS_PER_BITMAP * +		    block_group->sectorsize > offset) +			return entry; +	} else if (entry->offset + entry->bytes > offset) +		return entry; + +	if (!fuzzy) +		return NULL; + +	while (1) { +		if (entry->bitmap) { +			if (entry->offset + BITS_PER_BITMAP * +			    block_group->sectorsize > offset) +				break; +		} else { +			if (entry->offset + entry->bytes > offset) +				break; +		} + +		n = rb_next(&entry->offset_index); +		if (!n) +			return NULL; +		entry = rb_entry(n, struct btrfs_free_space, offset_index); +	} +	return entry;  }  static void unlink_free_space(struct btrfs_block_group_cache *block_group,  			      struct btrfs_free_space *info)  {  	rb_erase(&info->offset_index, &block_group->free_space_offset); -	rb_erase(&info->bytes_index, &block_group->free_space_bytes); +	block_group->free_extents--; +	block_group->free_space -= info->bytes;  }  static int link_free_space(struct btrfs_block_group_cache *block_group, @@ -190,17 +246,314 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,  {  	int ret = 0; - -	BUG_ON(!info->bytes); +	BUG_ON(!info->bitmap && !info->bytes);  	ret = tree_insert_offset(&block_group->free_space_offset, info->offset, -				 &info->offset_index); +				 &info->offset_index, (info->bitmap != NULL));  	if (ret)  		return ret; -	ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, -				&info->bytes_index); -	if (ret) -		return ret; +	block_group->free_space += info->bytes; +	block_group->free_extents++; +	return ret; +} + +static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) +{ +	u64 max_bytes, possible_bytes; + +	/* +	 * The goal is to keep the total amount of memory used per 1gb of space +	 * at or below 32k, so we need to adjust how much memory we 
allow to be +	 * used by extent based free space tracking +	 */ +	max_bytes = MAX_CACHE_BYTES_PER_GIG * +		(div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + +	possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + +		(sizeof(struct btrfs_free_space) * +		 block_group->extents_thresh); + +	if (possible_bytes > max_bytes) { +		int extent_bytes = max_bytes - +			(block_group->total_bitmaps * PAGE_CACHE_SIZE); + +		if (extent_bytes <= 0) { +			block_group->extents_thresh = 0; +			return; +		} + +		block_group->extents_thresh = extent_bytes / +			(sizeof(struct btrfs_free_space)); +	} +} + +static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, +			      struct btrfs_free_space *info, u64 offset, +			      u64 bytes) +{ +	unsigned long start, end; +	unsigned long i; + +	start = offset_to_bit(info->offset, block_group->sectorsize, offset); +	end = start + bytes_to_bits(bytes, block_group->sectorsize); +	BUG_ON(end > BITS_PER_BITMAP); + +	for (i = start; i < end; i++) +		clear_bit(i, info->bitmap); + +	info->bytes -= bytes; +	block_group->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, +			    struct btrfs_free_space *info, u64 offset, +			    u64 bytes) +{ +	unsigned long start, end; +	unsigned long i; + +	start = offset_to_bit(info->offset, block_group->sectorsize, offset); +	end = start + bytes_to_bits(bytes, block_group->sectorsize); +	BUG_ON(end > BITS_PER_BITMAP); + +	for (i = start; i < end; i++) +		set_bit(i, info->bitmap); + +	info->bytes += bytes; +	block_group->free_space += bytes; +} + +static int search_bitmap(struct btrfs_block_group_cache *block_group, +			 struct btrfs_free_space *bitmap_info, u64 *offset, +			 u64 *bytes) +{ +	unsigned long found_bits = 0; +	unsigned long bits, i; +	unsigned long next_zero; + +	i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, +			  max_t(u64, *offset, bitmap_info->offset)); +	bits = bytes_to_bits(*bytes, block_group->sectorsize); + +	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); +	     i < BITS_PER_BITMAP; +	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { +		next_zero = find_next_zero_bit(bitmap_info->bitmap, +					       BITS_PER_BITMAP, i); +		if ((next_zero - i) >= bits) { +			found_bits = next_zero - i; +			break; +		} +		i = next_zero; +	} + +	if (found_bits) { +		*offset = (u64)(i * block_group->sectorsize) + +			bitmap_info->offset; +		*bytes = (u64)(found_bits) * block_group->sectorsize; +		return 0; +	} + +	return -1; +} + +static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache +						*block_group, u64 *offset, +						u64 *bytes, int debug) +{ +	struct btrfs_free_space *entry; +	struct rb_node *node; +	int ret; + +	if (!block_group->free_space_offset.rb_node) +		return NULL; + +	entry = tree_search_offset(block_group, +				   offset_to_bitmap(block_group, *offset), +				   0, 1); +	if (!entry) +		return NULL; + +	for (node = &entry->offset_index; node; node = rb_next(node)) { +		entry = rb_entry(node, struct btrfs_free_space, offset_index); +		if (entry->bytes < *bytes) +			continue; + +		if (entry->bitmap) { +			ret = search_bitmap(block_group, entry, offset, bytes); +			if (!ret) +				return entry; +			continue; +		} + +		*offset = entry->offset; +		*bytes = entry->bytes; +		return entry; +	} + +	return NULL; +} + +static void add_new_bitmap(struct btrfs_block_group_cache *block_group, +			   struct btrfs_free_space *info, u64 offset) +{ +	u64 bytes_per_bg = 
BITS_PER_BITMAP * block_group->sectorsize; +	int max_bitmaps = (int)div64_u64(block_group->key.offset + +					 bytes_per_bg - 1, bytes_per_bg); +	BUG_ON(block_group->total_bitmaps >= max_bitmaps); + +	info->offset = offset_to_bitmap(block_group, offset); +	link_free_space(block_group, info); +	block_group->total_bitmaps++; + +	recalculate_thresholds(block_group); +} + +static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, +			      struct btrfs_free_space *bitmap_info, +			      u64 *offset, u64 *bytes) +{ +	u64 end; + +again: +	end = bitmap_info->offset + +		(u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; + +	if (*offset > bitmap_info->offset && *offset + *bytes > end) { +		bitmap_clear_bits(block_group, bitmap_info, *offset, +				  end - *offset + 1); +		*bytes -= end - *offset + 1; +		*offset = end + 1; +	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { +		bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); +		*bytes = 0; +	} + +	if (*bytes) { +		if (!bitmap_info->bytes) { +			unlink_free_space(block_group, bitmap_info); +			kfree(bitmap_info->bitmap); +			kfree(bitmap_info); +			block_group->total_bitmaps--; +			recalculate_thresholds(block_group); +		} + +		bitmap_info = tree_search_offset(block_group, +						 offset_to_bitmap(block_group, +								  *offset), +						 1, 0); +		if (!bitmap_info) +			return -EINVAL; + +		if (!bitmap_info->bitmap) +			return -EAGAIN; + +		goto again; +	} else if (!bitmap_info->bytes) { +		unlink_free_space(block_group, bitmap_info); +		kfree(bitmap_info->bitmap); +		kfree(bitmap_info); +		block_group->total_bitmaps--; +		recalculate_thresholds(block_group); +	} + +	return 0; +} + +static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, +			      struct btrfs_free_space *info) +{ +	struct btrfs_free_space *bitmap_info; +	int added = 0; +	u64 bytes, offset, end; +	int ret; + +	/* +	 * If we are below the extents threshold then we can add this as an +	 * extent, and don't have to deal with the bitmap +	 */ +	if (block_group->free_extents < block_group->extents_thresh && +	    info->bytes > block_group->sectorsize * 4) +		return 0; + +	/* +	 * some block groups are so tiny they can't be enveloped by a bitmap, so +	 * don't even bother to create a bitmap for this +	 */ +	if (BITS_PER_BITMAP * block_group->sectorsize > +	    block_group->key.offset) +		return 0; + +	bytes = info->bytes; +	offset = info->offset; + +again: +	bitmap_info = tree_search_offset(block_group, +					 offset_to_bitmap(block_group, offset), +					 1, 0); +	if (!bitmap_info) { +		BUG_ON(added); +		goto new_bitmap; +	} + +	end = bitmap_info->offset + +		(u64)(BITS_PER_BITMAP * block_group->sectorsize); + +	if (offset >= bitmap_info->offset && offset + bytes > end) { +		bitmap_set_bits(block_group, bitmap_info, offset, +				end - offset); +		bytes -= end - offset; +		offset = end; +		added = 0; +	} else if (offset >= bitmap_info->offset && offset + bytes <= end) { +		bitmap_set_bits(block_group, bitmap_info, offset, bytes); +		bytes = 0; +	} else { +		BUG(); +	} + +	if (!bytes) { +		ret = 1; +		goto out; +	} else +		goto again; + +new_bitmap: +	if (info && info->bitmap) { +		add_new_bitmap(block_group, info, offset); +		added = 1; +		info = NULL; +		goto again; +	} else { +		spin_unlock(&block_group->tree_lock); + +		/* no pre-allocated info, allocate a new one */ +		if (!info) { +			info = kzalloc(sizeof(struct btrfs_free_space), +				       GFP_NOFS); +			if (!info) { +				
spin_lock(&block_group->tree_lock); +				ret = -ENOMEM; +				goto out; +			} +		} + +		/* allocate the bitmap */ +		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); +		spin_lock(&block_group->tree_lock); +		if (!info->bitmap) { +			ret = -ENOMEM; +			goto out; +		} +		goto again; +	} + +out: +	if (info) { +		if (info->bitmap) +			kfree(info->bitmap); +		kfree(info); +	}  	return ret;  } @@ -208,8 +561,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,  int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,  			 u64 offset, u64 bytes)  { -	struct btrfs_free_space *right_info; -	struct btrfs_free_space *left_info; +	struct btrfs_free_space *right_info = NULL; +	struct btrfs_free_space *left_info = NULL;  	struct btrfs_free_space *info = NULL;  	int ret = 0; @@ -227,18 +580,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,  	 * are adding, if there is remove that struct and add a new one to  	 * cover the entire range  	 */ -	right_info = tree_search_offset(&block_group->free_space_offset, -					offset+bytes, 0, 0); -	left_info = tree_search_offset(&block_group->free_space_offset, -				       offset-1, 0, 1); +	right_info = tree_search_offset(block_group, offset + bytes, 0, 0); +	if (right_info && rb_prev(&right_info->offset_index)) +		left_info = rb_entry(rb_prev(&right_info->offset_index), +				     struct btrfs_free_space, offset_index); +	else +		left_info = tree_search_offset(block_group, offset - 1, 0, 0); -	if (right_info) { +	/* +	 * If there was no extent directly to the left or right of this new +	 * extent then we know we're going to have to allocate a new extent, so +	 * before we do that see if we need to drop this into a bitmap +	 */ +	if ((!left_info || left_info->bitmap) && +	    (!right_info || right_info->bitmap)) { +		ret = insert_into_bitmap(block_group, info); + +		if (ret < 0) { +			goto out; +		} else if (ret) { +			ret = 0; +			goto out; +		} +	} + +	if (right_info && !right_info->bitmap) {  		unlink_free_space(block_group, right_info);  		info->bytes += right_info->bytes;  		kfree(right_info);  	} -	if (left_info && left_info->offset + left_info->bytes == offset) { +	if (left_info && !left_info->bitmap && +	    left_info->offset + left_info->bytes == offset) {  		unlink_free_space(block_group, left_info);  		info->offset = left_info->offset;  		info->bytes += left_info->bytes; @@ -248,11 +621,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,  	ret = link_free_space(block_group, info);  	if (ret)  		kfree(info); - +out:  	spin_unlock(&block_group->tree_lock);  	if (ret) { -		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); +		printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);  		BUG_ON(ret == -EEXIST);  	} @@ -263,40 +636,65 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,  			    u64 offset, u64 bytes)  {  	struct btrfs_free_space *info; +	struct btrfs_free_space *next_info = NULL;  	int ret = 0;  	spin_lock(&block_group->tree_lock); -	info = tree_search_offset(&block_group->free_space_offset, offset, 0, -				  1); -	if (info && info->offset == offset) { -		if (info->bytes < bytes) { -			printk(KERN_ERR "Found free space at %llu, size %llu," -			       "trying to use %llu\n", -			       (unsigned long long)info->offset, -			       (unsigned long long)info->bytes, -			       (unsigned long long)bytes); +again: +	info = tree_search_offset(block_group, offset, 0, 0); +	if (!info) { +		WARN_ON(1); +		goto out_lock; +	} + +	if 
(info->bytes < bytes && rb_next(&info->offset_index)) { +		u64 end; +		next_info = rb_entry(rb_next(&info->offset_index), +					     struct btrfs_free_space, +					     offset_index); + +		if (next_info->bitmap) +			end = next_info->offset + BITS_PER_BITMAP * +				block_group->sectorsize - 1; +		else +			end = next_info->offset + next_info->bytes; + +		if (next_info->bytes < bytes || +		    next_info->offset > offset || offset > end) { +			printk(KERN_CRIT "Found free space at %llu, size %llu," +			      " trying to use %llu\n", +			      (unsigned long long)info->offset, +			      (unsigned long long)info->bytes, +			      (unsigned long long)bytes);  			WARN_ON(1);  			ret = -EINVAL; -			spin_unlock(&block_group->tree_lock); -			goto out; +			goto out_lock;  		} -		unlink_free_space(block_group, info); -		if (info->bytes == bytes) { -			kfree(info); -			spin_unlock(&block_group->tree_lock); -			goto out; +		info = next_info; +	} + +	if (info->bytes == bytes) { +		unlink_free_space(block_group, info); +		if (info->bitmap) { +			kfree(info->bitmap); +			block_group->total_bitmaps--;  		} +		kfree(info); +		goto out_lock; +	} +	if (!info->bitmap && info->offset == offset) { +		unlink_free_space(block_group, info);  		info->offset += bytes;  		info->bytes -= bytes; +		link_free_space(block_group, info); +		goto out_lock; +	} -		ret = link_free_space(block_group, info); -		spin_unlock(&block_group->tree_lock); -		BUG_ON(ret); -	} else if (info && info->offset < offset && -		   info->offset + info->bytes >= offset + bytes) { +	if (!info->bitmap && info->offset <= offset && +	    info->offset + info->bytes >= offset + bytes) {  		u64 old_start = info->offset;  		/*  		 * we're freeing space in the middle of the info, @@ -312,7 +710,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,  			info->offset = offset + bytes;  			info->bytes = old_end - info->offset;  			ret = link_free_space(block_group, info); -			BUG_ON(ret); +			WARN_ON(ret); +			if (ret) +				goto out_lock;  		} else {  			/* the hole we're creating ends at the end  			 * of the info struct, just free the info @@ -320,32 +720,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,  			kfree(info);  		}  		spin_unlock(&block_group->tree_lock); -		/* step two, insert a new info struct to cover anything -		 * before the hole + +		/* step two, insert a new info struct to cover +		 * anything before the hole  		 */  		ret = btrfs_add_free_space(block_group, old_start,  					   offset - old_start); -		BUG_ON(ret); -	} else { -		spin_unlock(&block_group->tree_lock); -		if (!info) { -			printk(KERN_ERR "couldn't find space %llu to free\n", -			       (unsigned long long)offset); -			printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", -			       block_group->cached, -			       (unsigned long long)block_group->key.objectid, -			       (unsigned long long)block_group->key.offset); -			btrfs_dump_free_space(block_group, bytes); -		} else if (info) { -			printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " -			       "but wanted offset=%llu bytes=%llu\n", -			       (unsigned long long)info->offset, -			       (unsigned long long)info->bytes, -			       (unsigned long long)offset, -			       (unsigned long long)bytes); -		} -		WARN_ON(1); +		WARN_ON(ret); +		goto out;  	} + +	ret = remove_from_bitmap(block_group, info, &offset, &bytes); +	if (ret == -EAGAIN) +		goto again; +	BUG_ON(ret); +out_lock: +	spin_unlock(&block_group->tree_lock);  out:  	return ret;  } @@ -361,10 +751,13 @@ void 
btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,  		info = rb_entry(n, struct btrfs_free_space, offset_index);  		if (info->bytes >= bytes)  			count++; -		printk(KERN_ERR "entry offset %llu, bytes %llu\n", +		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",  		       (unsigned long long)info->offset, -		       (unsigned long long)info->bytes); +		       (unsigned long long)info->bytes, +		       (info->bitmap) ? "yes" : "no");  	} +	printk(KERN_INFO "block group has cluster?: %s\n", +	       list_empty(&block_group->cluster_list) ? "no" : "yes");  	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"  	       "\n", count);  } @@ -397,26 +790,35 @@ __btrfs_return_cluster_to_free_space(  {  	struct btrfs_free_space *entry;  	struct rb_node *node; +	bool bitmap;  	spin_lock(&cluster->lock);  	if (cluster->block_group != block_group)  		goto out; +	bitmap = cluster->points_to_bitmap; +	cluster->block_group = NULL;  	cluster->window_start = 0; +	list_del_init(&cluster->block_group_list); +	cluster->points_to_bitmap = false; + +	if (bitmap) +		goto out; +  	node = rb_first(&cluster->root); -	while(node) { +	while (node) {  		entry = rb_entry(node, struct btrfs_free_space, offset_index);  		node = rb_next(&entry->offset_index);  		rb_erase(&entry->offset_index, &cluster->root); -		link_free_space(block_group, entry); +		BUG_ON(entry->bitmap); +		tree_insert_offset(&block_group->free_space_offset, +				   entry->offset, &entry->offset_index, 0);  	} -	list_del_init(&cluster->block_group_list); - -	btrfs_put_block_group(cluster->block_group); -	cluster->block_group = NULL;  	cluster->root.rb_node = NULL; +  out:  	spin_unlock(&cluster->lock); +	btrfs_put_block_group(block_group);  	return 0;  } @@ -425,20 +827,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)  	struct btrfs_free_space *info;  	struct rb_node *node;  	struct btrfs_free_cluster *cluster; -	struct btrfs_free_cluster *safe; +	struct list_head *head;  	spin_lock(&block_group->tree_lock); - -	list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, -				 block_group_list) { +	while ((head = block_group->cluster_list.next) != +	       &block_group->cluster_list) { +		cluster = list_entry(head, struct btrfs_free_cluster, +				     block_group_list);  		WARN_ON(cluster->block_group != block_group);  		__btrfs_return_cluster_to_free_space(block_group, cluster); +		if (need_resched()) { +			spin_unlock(&block_group->tree_lock); +			cond_resched(); +			spin_lock(&block_group->tree_lock); +		}  	} -	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { -		info = rb_entry(node, struct btrfs_free_space, bytes_index); +	while ((node = rb_last(&block_group->free_space_offset)) != NULL) { +		info = rb_entry(node, struct btrfs_free_space, offset_index);  		unlink_free_space(block_group, info); +		if (info->bitmap) +			kfree(info->bitmap);  		kfree(info);  		if (need_resched()) {  			spin_unlock(&block_group->tree_lock); @@ -446,6 +856,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)  			spin_lock(&block_group->tree_lock);  		}  	} +  	spin_unlock(&block_group->tree_lock);  } @@ -453,25 +864,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,  			       u64 offset, u64 bytes, u64 empty_size)  {  	struct btrfs_free_space *entry = NULL; +	u64 bytes_search = bytes + empty_size;  	u64 ret = 0;  	spin_lock(&block_group->tree_lock); -	entry = 
tree_search_offset(&block_group->free_space_offset, offset, -				   bytes + empty_size, 1); +	entry = find_free_space(block_group, &offset, &bytes_search, 0);  	if (!entry) -		entry = tree_search_bytes(&block_group->free_space_bytes, -					  offset, bytes + empty_size); -	if (entry) { +		goto out; + +	ret = offset; +	if (entry->bitmap) { +		bitmap_clear_bits(block_group, entry, offset, bytes); +		if (!entry->bytes) { +			unlink_free_space(block_group, entry); +			kfree(entry->bitmap); +			kfree(entry); +			block_group->total_bitmaps--; +			recalculate_thresholds(block_group); +		} +	} else {  		unlink_free_space(block_group, entry); -		ret = entry->offset;  		entry->offset += bytes;  		entry->bytes -= bytes; -  		if (!entry->bytes)  			kfree(entry);  		else  			link_free_space(block_group, entry);  	} + +out:  	spin_unlock(&block_group->tree_lock);  	return ret; @@ -517,6 +938,47 @@ int btrfs_return_cluster_to_free_space(  	return ret;  } +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, +				   struct btrfs_free_cluster *cluster, +				   u64 bytes, u64 min_start) +{ +	struct btrfs_free_space *entry; +	int err; +	u64 search_start = cluster->window_start; +	u64 search_bytes = bytes; +	u64 ret = 0; + +	spin_lock(&block_group->tree_lock); +	spin_lock(&cluster->lock); + +	if (!cluster->points_to_bitmap) +		goto out; + +	if (cluster->block_group != block_group) +		goto out; + +	entry = tree_search_offset(block_group, search_start, 0, 0); + +	if (!entry || !entry->bitmap) +		goto out; + +	search_start = min_start; +	search_bytes = bytes; + +	err = search_bitmap(block_group, entry, &search_start, +			    &search_bytes); +	if (err) +		goto out; + +	ret = search_start; +	bitmap_clear_bits(block_group, entry, ret, bytes); +out: +	spin_unlock(&cluster->lock); +	spin_unlock(&block_group->tree_lock); + +	return ret; +} +  /*   * given a cluster, try to allocate 'bytes' from it, returns 0   * if it couldn't find anything suitably large, or a logical disk offset @@ -530,6 +992,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,  	struct rb_node *node;  	u64 ret = 0; +	if (cluster->points_to_bitmap) +		return btrfs_alloc_from_bitmap(block_group, cluster, bytes, +					       min_start); +  	spin_lock(&cluster->lock);  	if (bytes > cluster->max_size)  		goto out; @@ -567,9 +1033,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,  	}  out:  	spin_unlock(&cluster->lock); +  	return ret;  } +static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, +				struct btrfs_free_space *entry, +				struct btrfs_free_cluster *cluster, +				u64 offset, u64 bytes, u64 min_bytes) +{ +	unsigned long next_zero; +	unsigned long i; +	unsigned long search_bits; +	unsigned long total_bits; +	unsigned long found_bits; +	unsigned long start = 0; +	unsigned long total_found = 0; +	bool found = false; + +	i = offset_to_bit(entry->offset, block_group->sectorsize, +			  max_t(u64, offset, entry->offset)); +	search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); +	total_bits = bytes_to_bits(bytes, block_group->sectorsize); + +again: +	found_bits = 0; +	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); +	     i < BITS_PER_BITMAP; +	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { +		next_zero = find_next_zero_bit(entry->bitmap, +					       BITS_PER_BITMAP, i); +		if (next_zero - i >= search_bits) { +			found_bits = next_zero - i; +			break; +		} +		i = next_zero; +	} + +	if (!found_bits) +		
return -1; + +	if (!found) { +		start = i; +		found = true; +	} + +	total_found += found_bits; + +	if (cluster->max_size < found_bits * block_group->sectorsize) +		cluster->max_size = found_bits * block_group->sectorsize; + +	if (total_found < total_bits) { +		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); +		if (i - start > total_bits * 2) { +			total_found = 0; +			cluster->max_size = 0; +			found = false; +		} +		goto again; +	} + +	cluster->window_start = start * block_group->sectorsize + +		entry->offset; +	cluster->points_to_bitmap = true; + +	return 0; +} +  /*   * here we try to find a cluster of blocks in a block group.  The goal   * is to find at least bytes free and up to empty_size + bytes free. @@ -587,12 +1117,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,  	struct btrfs_free_space *entry = NULL;  	struct rb_node *node;  	struct btrfs_free_space *next; -	struct btrfs_free_space *last; +	struct btrfs_free_space *last = NULL;  	u64 min_bytes;  	u64 window_start;  	u64 window_free;  	u64 max_extent = 0; -	int total_retries = 0; +	bool found_bitmap = false;  	int ret;  	/* for metadata, allow allocates with more holes */ @@ -620,31 +1150,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,  		goto out;  	}  again: -	min_bytes = min(min_bytes, bytes + empty_size); -	entry = tree_search_bytes(&block_group->free_space_bytes, -				  offset, min_bytes); +	entry = tree_search_offset(block_group, offset, found_bitmap, 1);  	if (!entry) {  		ret = -ENOSPC;  		goto out;  	} + +	/* +	 * If found_bitmap is true, we exhausted our search for extent entries, +	 * and we just want to search all of the bitmaps that we can find, and +	 * ignore any extent entries we find. +	 */ +	while (entry->bitmap || found_bitmap || +	       (!entry->bitmap && entry->bytes < min_bytes)) { +		struct rb_node *node = rb_next(&entry->offset_index); + +		if (entry->bitmap && entry->bytes > bytes + empty_size) { +			ret = btrfs_bitmap_cluster(block_group, entry, cluster, +						   offset, bytes + empty_size, +						   min_bytes); +			if (!ret) +				goto got_it; +		} + +		if (!node) { +			ret = -ENOSPC; +			goto out; +		} +		entry = rb_entry(node, struct btrfs_free_space, offset_index); +	} + +	/* +	 * We already searched all the extent entries from the passed in offset +	 * to the end and didn't find enough space for the cluster, and we also +	 * didn't find any bitmaps that met our criteria, just go ahead and exit +	 */ +	if (found_bitmap) { +		ret = -ENOSPC; +		goto out; +	} + +	cluster->points_to_bitmap = false;  	window_start = entry->offset;  	window_free = entry->bytes;  	last = entry;  	max_extent = entry->bytes; -	while(1) { +	while (1) {  		/* out window is just right, lets fill it */  		if (window_free >= bytes + empty_size)  			break;  		node = rb_next(&last->offset_index);  		if (!node) { +			if (found_bitmap) +				goto again;  			ret = -ENOSPC;  			goto out;  		}  		next = rb_entry(node, struct btrfs_free_space, offset_index);  		/* +		 * we found a bitmap, so if this search doesn't result in a +		 * cluster, we know to go and search again for the bitmaps and +		 * start looking for space there +		 */ +		if (next->bitmap) { +			if (!found_bitmap) +				offset = next->offset; +			found_bitmap = true; +			last = next; +			continue; +		} + +		/*  		 * we haven't filled the empty size and the window is  		 * very large.  
reset and try again  		 */ @@ -655,19 +1234,6 @@ again:  			window_free = entry->bytes;  			last = entry;  			max_extent = 0; -			total_retries++; -			if (total_retries % 64 == 0) { -				if (min_bytes >= (bytes + empty_size)) { -					ret = -ENOSPC; -					goto out; -				} -				/* -				 * grow our allocation a bit, we're not having -				 * much luck -				 */ -				min_bytes *= 2; -				goto again; -			}  		} else {  			last = next;  			window_free += next->bytes; @@ -685,11 +1251,19 @@ again:  	 * The cluster includes an rbtree, but only uses the offset index  	 * of each free space cache entry.  	 */ -	while(1) { +	while (1) {  		node = rb_next(&entry->offset_index); -		unlink_free_space(block_group, entry); +		if (entry->bitmap && node) { +			entry = rb_entry(node, struct btrfs_free_space, +					 offset_index); +			continue; +		} else if (entry->bitmap && !node) { +			break; +		} + +		rb_erase(&entry->offset_index, &block_group->free_space_offset);  		ret = tree_insert_offset(&cluster->root, entry->offset, -					 &entry->offset_index); +					 &entry->offset_index, 0);  		BUG_ON(ret);  		if (!node || entry == last) @@ -697,8 +1271,10 @@ again:  		entry = rb_entry(node, struct btrfs_free_space, offset_index);  	} -	ret = 0; +  	cluster->max_size = max_extent; +got_it: +	ret = 0;  	atomic_inc(&block_group->count);  	list_add_tail(&cluster->block_group_list, &block_group->cluster_list);  	cluster->block_group = block_group; @@ -718,6 +1294,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)  	spin_lock_init(&cluster->refill_lock);  	cluster->root.rb_node = NULL;  	cluster->max_size = 0; +	cluster->points_to_bitmap = false;  	INIT_LIST_HEAD(&cluster->block_group_list);  	cluster->block_group = NULL;  } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 266fb876405..890a8e79011 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -19,6 +19,14 @@  #ifndef __BTRFS_FREE_SPACE_CACHE  #define __BTRFS_FREE_SPACE_CACHE +struct btrfs_free_space { +	struct rb_node offset_index; +	u64 offset; +	u64 bytes; +	unsigned long *bitmap; +	struct list_head list; +}; +  int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,  			 u64 bytenr, u64 size);  int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 791eab19e33..56fe83fa60c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,  	if (root->ref_cows)  		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);  	path = btrfs_alloc_path(); -	path->reada = -1;  	BUG_ON(!path); +	path->reada = -1;  	/* FIXME, add redo link to tree so we don't leak on crash */  	key.objectid = inode->i_ino; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6d6523da0a3..0d126be22b6 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)  	}  	printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",  	       (unsigned long long)btrfs_header_bytenr(c), -	       btrfs_header_level(c), nr, +	      level, nr,  	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);  	for (i = 0; i < nr; i++) {  		btrfs_node_key_to_cpu(c, &key, i); @@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)  					btrfs_level_size(root, level - 1),  					btrfs_node_ptr_generation(c, i));  		if 
(btrfs_is_leaf(next) && -		    btrfs_header_level(c) != 1) +		   level != 1)  			BUG();  		if (btrfs_header_level(next) != -			btrfs_header_level(c) - 1) +		       level - 1)  			BUG();  		btrfs_print_tree(root, next);  		free_extent_buffer(next); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 00839793477..e71264d1c2c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -670,6 +670,8 @@ again:  			err = ret;  			goto out;  		} +		if (ret > 0 && path2->slots[level] > 0) +			path2->slots[level]--;  		eb = path2->nodes[level];  		WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != @@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,  		BUG_ON(level == 0);  		path->lowest_level = level;  		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); +		path->lowest_level = 0;  		if (ret < 0) {  			btrfs_free_path(path);  			return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2dbf1c1f56e..cdbb5022da5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)  	}  } +static noinline void switch_commit_root(struct btrfs_root *root) +{ +	free_extent_buffer(root->commit_root); +	root->commit_root = btrfs_root_node(root); +} +  /*   * either allocate a new transaction or hop into the existing one   */ @@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,  	btrfs_write_dirty_block_groups(trans, root); -	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); -	BUG_ON(ret); -  	while (1) {  		old_root_bytenr = btrfs_root_bytenr(&root->root_item);  		if (old_root_bytenr == root->node->start) @@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,  					&root->root_key,  					&root->root_item);  		BUG_ON(ret); -		btrfs_write_dirty_block_groups(trans, root); -		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); +		ret = btrfs_write_dirty_block_groups(trans, root);  		BUG_ON(ret);  	} -	free_extent_buffer(root->commit_root); -	root->commit_root = btrfs_root_node(root); + +	if (root != root->fs_info->extent_root) +		switch_commit_root(root); +  	return 0;  } @@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,  		root = list_entry(next, struct btrfs_root, dirty_list);  		update_cowonly_root(trans, root); - -		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); -		BUG_ON(ret);  	} + +	down_write(&fs_info->extent_commit_sem); +	switch_commit_root(fs_info->extent_root); +	up_write(&fs_info->extent_commit_sem); +  	return 0;  } @@ -544,8 +550,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,  			btrfs_update_reloc_root(trans, root);  			if (root->commit_root != root->node) { -				free_extent_buffer(root->commit_root); -				root->commit_root = btrfs_root_node(root); +				switch_commit_root(root);  				btrfs_set_root_node(&root->root_item,  						    root->node);  			} @@ -852,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root)  	super->root_level = root_item->level;  } +int btrfs_transaction_in_commit(struct btrfs_fs_info *info) +{ +	int ret = 0; +	spin_lock(&info->new_trans_lock); +	if (info->running_transaction) +		ret = info->running_transaction->in_commit; +	spin_unlock(&info->new_trans_lock); +	return ret; +} +  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root)  { @@ -943,9 +958,11 @@ 
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  		mutex_unlock(&root->fs_info->trans_mutex); -		if (flush_on_commit || snap_pending) { -			if (flush_on_commit) -				btrfs_start_delalloc_inodes(root); +		if (flush_on_commit) { +			btrfs_start_delalloc_inodes(root); +			ret = btrfs_wait_ordered_extents(root, 0); +			BUG_ON(ret); +		} else if (snap_pending) {  			ret = btrfs_wait_ordered_extents(root, 1);  			BUG_ON(ret);  		} @@ -1009,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	btrfs_set_root_node(&root->fs_info->tree_root->root_item,  			    root->fs_info->tree_root->node); -	free_extent_buffer(root->fs_info->tree_root->commit_root); -	root->fs_info->tree_root->commit_root = -				btrfs_root_node(root->fs_info->tree_root); +	switch_commit_root(root->fs_info->tree_root);  	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,  			    root->fs_info->chunk_root->node); -	free_extent_buffer(root->fs_info->chunk_root->commit_root); -	root->fs_info->chunk_root->commit_root = -				btrfs_root_node(root->fs_info->chunk_root); +	switch_commit_root(root->fs_info->chunk_root);  	update_super_roots(root); @@ -1057,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	cur_trans->commit_done = 1;  	root->fs_info->last_trans_committed = cur_trans->transid; +  	wake_up(&cur_trans->commit_wait);  	put_transaction(cur_trans); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 961c3ee5a2e..663c6740491 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,  				struct btrfs_root *root);  int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,  					struct extent_io_tree *dirty_pages); +int btrfs_transaction_in_commit(struct btrfs_fs_info *info);  #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c13922206d1..d91b0de7c50 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,  		return -ENOENT;  	inode = read_one_inode(root, key->objectid); -	BUG_ON(!dir); +	BUG_ON(!inode);  	ref_ptr = btrfs_item_ptr_offset(eb, slot);  	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ab80e9cd76..5dbefd11b4a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -721,7 +721,8 @@ error:   */  static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,  					 struct btrfs_device *device, -					 u64 num_bytes, u64 *start) +					 u64 num_bytes, u64 *start, +					 u64 *max_avail)  {  	struct btrfs_key key;  	struct btrfs_root *root = device->dev_root; @@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,  	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);  	if (ret < 0)  		goto error; -	ret = btrfs_previous_item(root, path, 0, key.type); -	if (ret < 0) -		goto error; +	if (ret > 0) { +		ret = btrfs_previous_item(root, path, key.objectid, key.type); +		if (ret < 0) +			goto error; +		if (ret > 0) +			start_found = 1; +	}  	l = path->nodes[0];  	btrfs_item_key_to_cpu(l, &key, path->slots[0]);  	while (1) { @@ -803,6 +808,10 @@ no_more_items:  			if (last_byte < search_start)  				last_byte = search_start;  			hole_size = key.offset - last_byte; + +			if (hole_size > *max_avail) +				*max_avail = hole_size; +  			if (key.offset > last_byte &&  			    hole_size >= num_bytes) {  				*start = last_byte; @@ 
-1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,  	device->fs_devices->total_rw_bytes += diff;  	device->total_bytes = new_size; +	device->disk_total_bytes = new_size;  	btrfs_clear_space_info_full(device->dev_root->fs_info);  	return btrfs_update_device(trans, device); @@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)  			goto done;  		if (ret) {  			ret = 0; -			goto done; +			break;  		}  		l = path->nodes[0]; @@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)  		btrfs_item_key_to_cpu(l, &key, path->slots[0]);  		if (key.objectid != device->devid) -			goto done; +			break;  		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);  		length = btrfs_dev_extent_length(l, dev_extent); @@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  			     max_chunk_size);  again: +	max_avail = 0;  	if (!map || map->num_stripes != num_stripes) {  		kfree(map);  		map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -2219,7 +2230,8 @@ again:  		if (device->in_fs_metadata && avail >= min_free) {  			ret = find_free_dev_extent(trans, device, -						   min_free, &dev_offset); +						   min_free, &dev_offset, +						   &max_avail);  			if (ret == 0) {  				list_move_tail(&device->dev_alloc_list,  					       &private_devs); @@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,  		}  	} -	for (i = 0; i > nr; i++) { -		struct btrfs_multi_bio *multi; -		struct btrfs_bio_stripe *stripe; -		int ret; - -		length = 1; -		ret = btrfs_map_block(map_tree, WRITE, buf[i], -				      &length, &multi, 0); -		BUG_ON(ret); - -		stripe = multi->stripes; -		for (j = 0; j < multi->num_stripes; j++) { -			if (stripe->physical >= physical && -			    physical < stripe->physical + length) -				break; -		} -		BUG_ON(j >= multi->num_stripes); -		kfree(multi); -	} -  	*logical = buf;  	*naddrs = nr;  	*stripe_len = map->stripe_len;  | 
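For reference, the bitmap entries added to struct btrfs_free_space above index free space at sector granularity: offset_to_bit(), bytes_to_bits() and BITS_PER_BITMAP are defined earlier in free-space-cache.c and do not appear in the hunks shown here. The following is only a minimal sketch of what that arithmetic amounts to, not code taken from the patch; bit_to_offset() in particular is added purely for illustration and has no counterpart in the tree.

	#include <linux/math64.h>	/* div64_u64() */
	#include <linux/pagemap.h>	/* PAGE_CACHE_SIZE */
	#include <linux/bug.h>		/* BUG_ON() */

	/* one bitmap entry covers PAGE_CACHE_SIZE * 8 sectors of the block group */
	#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)

	/* which bit inside a bitmap entry starting at bitmap_start describes offset */
	static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
						  u64 offset)
	{
		BUG_ON(offset < bitmap_start);
		offset -= bitmap_start;
		return (unsigned long)div64_u64(offset, sectorsize);
	}

	/* how many bits are needed to cover 'bytes' of space */
	static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
	{
		return (unsigned long)div64_u64(bytes, sectorsize);
	}

	/* inverse mapping, illustration only: bit index back to a byte offset */
	static inline u64 bit_to_offset(u64 bitmap_start, u64 sectorsize,
					unsigned long bit)
	{
		return bitmap_start + (u64)bit * sectorsize;
	}

Under this scheme a single bitmap entry describes BITS_PER_BITMAP * sectorsize bytes of the block group, which is why btrfs_remove_free_space() above takes next_info->offset + BITS_PER_BITMAP * block_group->sectorsize - 1 as the end of a bitmap-backed entry.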
