Diffstat (limited to 'fs/btrfs/free-space-cache.c')
-rw-r--r--  fs/btrfs/free-space-cache.c | 3321
1 file changed, 2241 insertions(+), 1080 deletions(-)
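
For orientation before reading the diff: the io_ctl helpers introduced below serialize free-space entries into the cache file as a per-page crc array, a u64 generation, and then packed (offset, bytes, type) records, with each bitmap payload taking a full page of its own. The following is a minimal, self-contained sketch of that layout, not part of the patch; it assumes 4K pages, mirrors the fields of btrfs_free_space_entry, and uses a hypothetical 8-page cache file purely as an example.

    /* sketch: how many packed entries fit in the first cache page */
    #include <stdint.h>
    #include <stdio.h>

    #define CACHE_PAGE_SIZE 4096u            /* assumes PAGE_CACHE_SIZE == 4K */

    struct free_space_entry {                /* mirrors btrfs_free_space_entry */
            uint64_t offset;                 /* start of the free range (bytes) */
            uint64_t bytes;                  /* length of the free range */
            uint8_t  type;                   /* extent or bitmap marker */
    } __attribute__((packed));

    int main(void)
    {
            /*
             * First page: one u32 crc per page (when crcs are inlined, as
             * io_ctl_set_crc does), then a u64 generation, then entries.
             */
            unsigned num_pages = 8;          /* hypothetical 8-page cache file */
            unsigned header = num_pages * sizeof(uint32_t) + sizeof(uint64_t);
            unsigned first_page_entries =
                    (CACHE_PAGE_SIZE - header) / sizeof(struct free_space_entry);

            printf("entry size: %zu bytes\n", sizeof(struct free_space_entry));
            printf("entries in first page: %u\n", first_page_entries);
            return 0;
    }

With these assumptions each packed entry is 17 bytes, so a first page of an 8-page file holds 238 entries before io_ctl_add_entry crosses to the next page.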
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 22ee0dc2e6b..2b0a627cb5f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,22 +20,25 @@  #include <linux/sched.h>  #include <linux/slab.h>  #include <linux/math64.h> +#include <linux/ratelimit.h>  #include "ctree.h"  #include "free-space-cache.h"  #include "transaction.h"  #include "disk-io.h" +#include "extent_io.h" +#include "inode-map.h"  #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)  #define MAX_CACHE_BYTES_PER_GIG	(32 * 1024) -static void recalculate_thresholds(struct btrfs_block_group_cache -				   *block_group); -static int link_free_space(struct btrfs_block_group_cache *block_group, +static int link_free_space(struct btrfs_free_space_ctl *ctl,  			   struct btrfs_free_space *info); +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, +			      struct btrfs_free_space *info); -struct inode *lookup_free_space_inode(struct btrfs_root *root, -				      struct btrfs_block_group_cache -				      *block_group, struct btrfs_path *path) +static struct inode *__lookup_free_space_inode(struct btrfs_root *root, +					       struct btrfs_path *path, +					       u64 offset)  {  	struct btrfs_key key;  	struct btrfs_key location; @@ -45,22 +48,15 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,  	struct inode *inode = NULL;  	int ret; -	spin_lock(&block_group->lock); -	if (block_group->inode) -		inode = igrab(block_group->inode); -	spin_unlock(&block_group->lock); -	if (inode) -		return inode; -  	key.objectid = BTRFS_FREE_SPACE_OBJECTID; -	key.offset = block_group->key.objectid; +	key.offset = offset;  	key.type = 0;  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  	if (ret < 0)  		return ERR_PTR(ret);  	if (ret > 0) { -		btrfs_release_path(root, path); +		btrfs_release_path(path);  		return ERR_PTR(-ENOENT);  	} @@ -69,7 +65,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,  				struct btrfs_free_space_header);  	btrfs_free_space_key(leaf, header, &disk_key);  	btrfs_disk_key_to_cpu(&location, &disk_key); -	btrfs_release_path(root, path); +	btrfs_release_path(path);  	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);  	if (!inode) @@ -81,8 +77,41 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,  		return ERR_PTR(-ENOENT);  	} +	mapping_set_gfp_mask(inode->i_mapping, +			mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + +	return inode; +} + +struct inode *lookup_free_space_inode(struct btrfs_root *root, +				      struct btrfs_block_group_cache +				      *block_group, struct btrfs_path *path) +{ +	struct inode *inode = NULL; +	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; +  	spin_lock(&block_group->lock); -	if (!root->fs_info->closing) { +	if (block_group->inode) +		inode = igrab(block_group->inode); +	spin_unlock(&block_group->lock); +	if (inode) +		return inode; + +	inode = __lookup_free_space_inode(root, path, +					  block_group->key.objectid); +	if (IS_ERR(inode)) +		return inode; + +	spin_lock(&block_group->lock); +	if (!((BTRFS_I(inode)->flags & flags) == flags)) { +		btrfs_info(root->fs_info, +			"Old style space inode found, converting."); +		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | +			BTRFS_INODE_NODATACOW; +		block_group->disk_cache_state = BTRFS_DC_CLEAR; +	} + +	if (!block_group->iref) {  		block_group->inode = igrab(inode);  		block_group->iref = 1;  	} @@ -91,27 +120,27 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,  	return inode;  } -int 
create_free_space_inode(struct btrfs_root *root, -			    struct btrfs_trans_handle *trans, -			    struct btrfs_block_group_cache *block_group, -			    struct btrfs_path *path) +static int __create_free_space_inode(struct btrfs_root *root, +				     struct btrfs_trans_handle *trans, +				     struct btrfs_path *path, +				     u64 ino, u64 offset)  {  	struct btrfs_key key;  	struct btrfs_disk_key disk_key;  	struct btrfs_free_space_header *header;  	struct btrfs_inode_item *inode_item;  	struct extent_buffer *leaf; -	u64 objectid; +	u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;  	int ret; -	ret = btrfs_find_free_objectid(trans, root, 0, &objectid); -	if (ret < 0) -		return ret; - -	ret = btrfs_insert_empty_inode(trans, root, path, objectid); +	ret = btrfs_insert_empty_inode(trans, root, path, ino);  	if (ret)  		return ret; +	/* We inline crc's for the free disk space cache */ +	if (ino != BTRFS_FREE_INO_OBJECTID) +		flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; +  	leaf = path->nodes[0];  	inode_item = btrfs_item_ptr(leaf, path->slots[0],  				    struct btrfs_inode_item); @@ -124,23 +153,21 @@ int create_free_space_inode(struct btrfs_root *root,  	btrfs_set_inode_uid(leaf, inode_item, 0);  	btrfs_set_inode_gid(leaf, inode_item, 0);  	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); -	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | -			      BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); +	btrfs_set_inode_flags(leaf, inode_item, flags);  	btrfs_set_inode_nlink(leaf, inode_item, 1);  	btrfs_set_inode_transid(leaf, inode_item, trans->transid); -	btrfs_set_inode_block_group(leaf, inode_item, -				    block_group->key.objectid); +	btrfs_set_inode_block_group(leaf, inode_item, offset);  	btrfs_mark_buffer_dirty(leaf); -	btrfs_release_path(root, path); +	btrfs_release_path(path);  	key.objectid = BTRFS_FREE_SPACE_OBJECTID; -	key.offset = block_group->key.objectid; +	key.offset = offset;  	key.type = 0;  	ret = btrfs_insert_empty_item(trans, root, path, &key,  				      sizeof(struct btrfs_free_space_header));  	if (ret < 0) { -		btrfs_release_path(root, path); +		btrfs_release_path(path);  		return ret;  	}  	leaf = path->nodes[0]; @@ -149,29 +176,54 @@ int create_free_space_inode(struct btrfs_root *root,  	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));  	btrfs_set_free_space_key(leaf, header, &disk_key);  	btrfs_mark_buffer_dirty(leaf); -	btrfs_release_path(root, path); +	btrfs_release_path(path);  	return 0;  } +int create_free_space_inode(struct btrfs_root *root, +			    struct btrfs_trans_handle *trans, +			    struct btrfs_block_group_cache *block_group, +			    struct btrfs_path *path) +{ +	int ret; +	u64 ino; + +	ret = btrfs_find_free_objectid(root, &ino); +	if (ret < 0) +		return ret; + +	return __create_free_space_inode(root, trans, path, ino, +					 block_group->key.objectid); +} + +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, +				       struct btrfs_block_rsv *rsv) +{ +	u64 needed_bytes; +	int ret; + +	/* 1 for slack space, 1 for updating the inode */ +	needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + +		btrfs_calc_trans_metadata_size(root, 1); + +	spin_lock(&rsv->lock); +	if (rsv->reserved < needed_bytes) +		ret = -ENOSPC; +	else +		ret = 0; +	spin_unlock(&rsv->lock); +	return ret; +} +  int btrfs_truncate_free_space_cache(struct btrfs_root *root,  				    struct btrfs_trans_handle *trans, -				    struct btrfs_path *path,  				    struct inode *inode)  { -	loff_t oldsize;  	int ret = 
0; -	trans->block_rsv = root->orphan_block_rsv; -	ret = btrfs_block_rsv_check(trans, root, -				    root->orphan_block_rsv, -				    0, 5); -	if (ret) -		return ret; - -	oldsize = i_size_read(inode);  	btrfs_i_size_write(inode, 0); -	truncate_pagecache(inode, oldsize, 0); +	truncate_pagecache(inode, 0);  	/*  	 * We don't need an orphan item because truncating the free space cache @@ -180,11 +232,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,  	ret = btrfs_truncate_inode_items(trans, root, inode,  					 0, BTRFS_EXTENT_DATA_KEY);  	if (ret) { -		WARN_ON(1); +		btrfs_abort_transaction(trans, root, ret);  		return ret;  	} -	return btrfs_update_inode(trans, root, inode); +	ret = btrfs_update_inode(trans, root, inode); +	if (ret) +		btrfs_abort_transaction(trans, root, ret); + +	return ret;  }  static int readahead_cache(struct inode *inode) @@ -206,602 +262,1017 @@ static int readahead_cache(struct inode *inode)  	return 0;  } -int load_free_space_cache(struct btrfs_fs_info *fs_info, -			  struct btrfs_block_group_cache *block_group) +struct io_ctl { +	void *cur, *orig; +	struct page *page; +	struct page **pages; +	struct btrfs_root *root; +	unsigned long size; +	int index; +	int num_pages; +	unsigned check_crcs:1; +}; + +static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, +		       struct btrfs_root *root, int write) +{ +	int num_pages; +	int check_crcs = 0; + +	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> +		    PAGE_CACHE_SHIFT; + +	if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) +		check_crcs = 1; + +	/* Make sure we can fit our crcs into the first page */ +	if (write && check_crcs && +	    (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) +		return -ENOSPC; + +	memset(io_ctl, 0, sizeof(struct io_ctl)); + +	io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); +	if (!io_ctl->pages) +		return -ENOMEM; + +	io_ctl->num_pages = num_pages; +	io_ctl->root = root; +	io_ctl->check_crcs = check_crcs; + +	return 0; +} + +static void io_ctl_free(struct io_ctl *io_ctl) +{ +	kfree(io_ctl->pages); +} + +static void io_ctl_unmap_page(struct io_ctl *io_ctl) +{ +	if (io_ctl->cur) { +		kunmap(io_ctl->page); +		io_ctl->cur = NULL; +		io_ctl->orig = NULL; +	} +} + +static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +{ +	ASSERT(io_ctl->index < io_ctl->num_pages); +	io_ctl->page = io_ctl->pages[io_ctl->index++]; +	io_ctl->cur = kmap(io_ctl->page); +	io_ctl->orig = io_ctl->cur; +	io_ctl->size = PAGE_CACHE_SIZE; +	if (clear) +		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); +} + +static void io_ctl_drop_pages(struct io_ctl *io_ctl) +{ +	int i; + +	io_ctl_unmap_page(io_ctl); + +	for (i = 0; i < io_ctl->num_pages; i++) { +		if (io_ctl->pages[i]) { +			ClearPageChecked(io_ctl->pages[i]); +			unlock_page(io_ctl->pages[i]); +			page_cache_release(io_ctl->pages[i]); +		} +	} +} + +static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, +				int uptodate)  { -	struct btrfs_root *root = fs_info->tree_root; -	struct inode *inode; -	struct btrfs_free_space_header *header; -	struct extent_buffer *leaf;  	struct page *page; -	struct btrfs_path *path; -	u32 *checksums = NULL, *crc; -	char *disk_crcs = NULL; -	struct btrfs_key key; -	struct list_head bitmaps; -	u64 num_entries; -	u64 num_bitmaps; -	u64 generation; -	u32 cur_crc = ~(u32)0; -	pgoff_t index = 0; -	unsigned long first_page_offset; -	int num_checksums; -	int ret = 0; +	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); +	int i; + +	for (i = 0; i < io_ctl->num_pages; i++) { +	
	page = find_or_create_page(inode->i_mapping, i, mask); +		if (!page) { +			io_ctl_drop_pages(io_ctl); +			return -ENOMEM; +		} +		io_ctl->pages[i] = page; +		if (uptodate && !PageUptodate(page)) { +			btrfs_readpage(NULL, page); +			lock_page(page); +			if (!PageUptodate(page)) { +				btrfs_err(BTRFS_I(inode)->root->fs_info, +					   "error reading free space cache"); +				io_ctl_drop_pages(io_ctl); +				return -EIO; +			} +		} +	} + +	for (i = 0; i < io_ctl->num_pages; i++) { +		clear_page_dirty_for_io(io_ctl->pages[i]); +		set_page_extent_mapped(io_ctl->pages[i]); +	} + +	return 0; +} + +static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +{ +	__le64 *val; + +	io_ctl_map_page(io_ctl, 1);  	/* -	 * If we're unmounting then just return, since this does a search on the -	 * normal root and not the commit root and we could deadlock. +	 * Skip the csum areas.  If we don't check crcs then we just have a +	 * 64bit chunk at the front of the first page.  	 */ -	smp_mb(); -	if (fs_info->closing) -		return 0; +	if (io_ctl->check_crcs) { +		io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); +		io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); +	} else { +		io_ctl->cur += sizeof(u64); +		io_ctl->size -= sizeof(u64) * 2; +	} + +	val = io_ctl->cur; +	*val = cpu_to_le64(generation); +	io_ctl->cur += sizeof(u64); +} + +static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +{ +	__le64 *gen;  	/* -	 * If this block group has been marked to be cleared for one reason or -	 * another then we can't trust the on disk cache, so just return. +	 * Skip the crc area.  If we don't check crcs then we just have a 64bit +	 * chunk at the front of the first page.  	 */ -	spin_lock(&block_group->lock); -	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { -		spin_unlock(&block_group->lock); +	if (io_ctl->check_crcs) { +		io_ctl->cur += sizeof(u32) * io_ctl->num_pages; +		io_ctl->size -= sizeof(u64) + +			(sizeof(u32) * io_ctl->num_pages); +	} else { +		io_ctl->cur += sizeof(u64); +		io_ctl->size -= sizeof(u64) * 2; +	} + +	gen = io_ctl->cur; +	if (le64_to_cpu(*gen) != generation) { +		printk_ratelimited(KERN_ERR "BTRFS: space cache generation " +				   "(%Lu) does not match inode (%Lu)\n", *gen, +				   generation); +		io_ctl_unmap_page(io_ctl); +		return -EIO; +	} +	io_ctl->cur += sizeof(u64); +	return 0; +} + +static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +{ +	u32 *tmp; +	u32 crc = ~(u32)0; +	unsigned offset = 0; + +	if (!io_ctl->check_crcs) { +		io_ctl_unmap_page(io_ctl); +		return; +	} + +	if (index == 0) +		offset = sizeof(u32) * io_ctl->num_pages; + +	crc = btrfs_csum_data(io_ctl->orig + offset, crc, +			      PAGE_CACHE_SIZE - offset); +	btrfs_csum_final(crc, (char *)&crc); +	io_ctl_unmap_page(io_ctl); +	tmp = kmap(io_ctl->pages[0]); +	tmp += index; +	*tmp = crc; +	kunmap(io_ctl->pages[0]); +} + +static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +{ +	u32 *tmp, val; +	u32 crc = ~(u32)0; +	unsigned offset = 0; + +	if (!io_ctl->check_crcs) { +		io_ctl_map_page(io_ctl, 0);  		return 0;  	} -	spin_unlock(&block_group->lock); -	INIT_LIST_HEAD(&bitmaps); +	if (index == 0) +		offset = sizeof(u32) * io_ctl->num_pages; + +	tmp = kmap(io_ctl->pages[0]); +	tmp += index; +	val = *tmp; +	kunmap(io_ctl->pages[0]); + +	io_ctl_map_page(io_ctl, 0); +	crc = btrfs_csum_data(io_ctl->orig + offset, crc, +			      PAGE_CACHE_SIZE - offset); +	btrfs_csum_final(crc, (char *)&crc); +	if (val != crc) { +		printk_ratelimited(KERN_ERR "BTRFS: csum 
mismatch on free " +				   "space cache\n"); +		io_ctl_unmap_page(io_ctl); +		return -EIO; +	} -	path = btrfs_alloc_path(); -	if (!path) +	return 0; +} + +static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, +			    void *bitmap) +{ +	struct btrfs_free_space_entry *entry; + +	if (!io_ctl->cur) +		return -ENOSPC; + +	entry = io_ctl->cur; +	entry->offset = cpu_to_le64(offset); +	entry->bytes = cpu_to_le64(bytes); +	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : +		BTRFS_FREE_SPACE_EXTENT; +	io_ctl->cur += sizeof(struct btrfs_free_space_entry); +	io_ctl->size -= sizeof(struct btrfs_free_space_entry); + +	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))  		return 0; -	inode = lookup_free_space_inode(root, block_group, path); -	if (IS_ERR(inode)) { -		btrfs_free_path(path); +	io_ctl_set_crc(io_ctl, io_ctl->index - 1); + +	/* No more pages to map */ +	if (io_ctl->index >= io_ctl->num_pages)  		return 0; + +	/* map the next page */ +	io_ctl_map_page(io_ctl, 1); +	return 0; +} + +static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +{ +	if (!io_ctl->cur) +		return -ENOSPC; + +	/* +	 * If we aren't at the start of the current page, unmap this one and +	 * map the next one if there is any left. +	 */ +	if (io_ctl->cur != io_ctl->orig) { +		io_ctl_set_crc(io_ctl, io_ctl->index - 1); +		if (io_ctl->index >= io_ctl->num_pages) +			return -ENOSPC; +		io_ctl_map_page(io_ctl, 0);  	} -	/* Nothing in the space cache, goodbye */ -	if (!i_size_read(inode)) { -		btrfs_free_path(path); -		goto out; +	memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); +	io_ctl_set_crc(io_ctl, io_ctl->index - 1); +	if (io_ctl->index < io_ctl->num_pages) +		io_ctl_map_page(io_ctl, 0); +	return 0; +} + +static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +{ +	/* +	 * If we're not on the boundary we know we've modified the page and we +	 * need to crc the page. +	 */ +	if (io_ctl->cur != io_ctl->orig) +		io_ctl_set_crc(io_ctl, io_ctl->index - 1); +	else +		io_ctl_unmap_page(io_ctl); + +	while (io_ctl->index < io_ctl->num_pages) { +		io_ctl_map_page(io_ctl, 1); +		io_ctl_set_crc(io_ctl, io_ctl->index - 1); +	} +} + +static int io_ctl_read_entry(struct io_ctl *io_ctl, +			    struct btrfs_free_space *entry, u8 *type) +{ +	struct btrfs_free_space_entry *e; +	int ret; + +	if (!io_ctl->cur) { +		ret = io_ctl_check_crc(io_ctl, io_ctl->index); +		if (ret) +			return ret;  	} +	e = io_ctl->cur; +	entry->offset = le64_to_cpu(e->offset); +	entry->bytes = le64_to_cpu(e->bytes); +	*type = e->type; +	io_ctl->cur += sizeof(struct btrfs_free_space_entry); +	io_ctl->size -= sizeof(struct btrfs_free_space_entry); + +	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) +		return 0; + +	io_ctl_unmap_page(io_ctl); + +	return 0; +} + +static int io_ctl_read_bitmap(struct io_ctl *io_ctl, +			      struct btrfs_free_space *entry) +{ +	int ret; + +	ret = io_ctl_check_crc(io_ctl, io_ctl->index); +	if (ret) +		return ret; + +	memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); +	io_ctl_unmap_page(io_ctl); + +	return 0; +} + +/* + * Since we attach pinned extents after the fact we can have contiguous sections + * of free space that are split up in entries.  This poses a problem with the + * tree logging stuff since it could have allocated across what appears to be 2 + * entries since we would have merged the entries when adding the pinned extents + * back to the free space cache.  So run through the space cache that we just + * loaded and merge contiguous entries.  
This will make the log replay stuff not + * blow up and it will make for nicer allocator behavior. + */ +static void merge_space_tree(struct btrfs_free_space_ctl *ctl) +{ +	struct btrfs_free_space *e, *prev = NULL; +	struct rb_node *n; + +again: +	spin_lock(&ctl->tree_lock); +	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { +		e = rb_entry(n, struct btrfs_free_space, offset_index); +		if (!prev) +			goto next; +		if (e->bitmap || prev->bitmap) +			goto next; +		if (prev->offset + prev->bytes == e->offset) { +			unlink_free_space(ctl, prev); +			unlink_free_space(ctl, e); +			prev->bytes += e->bytes; +			kmem_cache_free(btrfs_free_space_cachep, e); +			link_free_space(ctl, prev); +			prev = NULL; +			spin_unlock(&ctl->tree_lock); +			goto again; +		} +next: +		prev = e; +	} +	spin_unlock(&ctl->tree_lock); +} + +static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, +				   struct btrfs_free_space_ctl *ctl, +				   struct btrfs_path *path, u64 offset) +{ +	struct btrfs_free_space_header *header; +	struct extent_buffer *leaf; +	struct io_ctl io_ctl; +	struct btrfs_key key; +	struct btrfs_free_space *e, *n; +	struct list_head bitmaps; +	u64 num_entries; +	u64 num_bitmaps; +	u64 generation; +	u8 type; +	int ret = 0; + +	INIT_LIST_HEAD(&bitmaps); + +	/* Nothing in the space cache, goodbye */ +	if (!i_size_read(inode)) +		return 0; +  	key.objectid = BTRFS_FREE_SPACE_OBJECTID; -	key.offset = block_group->key.objectid; +	key.offset = offset;  	key.type = 0;  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -	if (ret) { -		btrfs_free_path(path); -		goto out; +	if (ret < 0) +		return 0; +	else if (ret > 0) { +		btrfs_release_path(path); +		return 0;  	} +	ret = -1; +  	leaf = path->nodes[0];  	header = btrfs_item_ptr(leaf, path->slots[0],  				struct btrfs_free_space_header);  	num_entries = btrfs_free_space_entries(leaf, header);  	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);  	generation = btrfs_free_space_generation(leaf, header); -	btrfs_free_path(path); +	btrfs_release_path(path); + +	if (!BTRFS_I(inode)->generation) { +		btrfs_info(root->fs_info, +			   "The free space cache file (%llu) is invalid. 
skip it\n", +			   offset); +		return 0; +	}  	if (BTRFS_I(inode)->generation != generation) { -		printk(KERN_ERR "btrfs: free space inode generation (%llu) did" -		       " not match free space cache generation (%llu) for " -		       "block group %llu\n", -		       (unsigned long long)BTRFS_I(inode)->generation, -		       (unsigned long long)generation, -		       (unsigned long long)block_group->key.objectid); -		goto out; +		btrfs_err(root->fs_info, +			"free space inode generation (%llu) " +			"did not match free space cache generation (%llu)", +			BTRFS_I(inode)->generation, generation); +		return 0;  	}  	if (!num_entries) -		goto out; +		return 0; -	/* Setup everything for doing checksumming */ -	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; -	checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); -	if (!checksums) -		goto out; -	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); -	disk_crcs = kzalloc(first_page_offset, GFP_NOFS); -	if (!disk_crcs) -		goto out; +	ret = io_ctl_init(&io_ctl, inode, root, 0); +	if (ret) +		return ret;  	ret = readahead_cache(inode); -	if (ret) { -		ret = 0; +	if (ret)  		goto out; -	} -	while (1) { -		struct btrfs_free_space_entry *entry; -		struct btrfs_free_space *e; -		void *addr; -		unsigned long offset = 0; -		unsigned long start_offset = 0; -		int need_loop = 0; +	ret = io_ctl_prepare_pages(&io_ctl, inode, 1); +	if (ret) +		goto out; -		if (!num_entries && !num_bitmaps) -			break; +	ret = io_ctl_check_crc(&io_ctl, 0); +	if (ret) +		goto free_cache; -		if (index == 0) { -			start_offset = first_page_offset; -			offset = start_offset; -		} +	ret = io_ctl_check_generation(&io_ctl, generation); +	if (ret) +		goto free_cache; -		page = grab_cache_page(inode->i_mapping, index); -		if (!page) { -			ret = 0; +	while (num_entries) { +		e = kmem_cache_zalloc(btrfs_free_space_cachep, +				      GFP_NOFS); +		if (!e)  			goto free_cache; -		} -		if (!PageUptodate(page)) { -			btrfs_readpage(NULL, page); -			lock_page(page); -			if (!PageUptodate(page)) { -				unlock_page(page); -				page_cache_release(page); -				printk(KERN_ERR "btrfs: error reading free " -				       "space cache: %llu\n", -				       (unsigned long long) -				       block_group->key.objectid); -				goto free_cache; -			} -		} -		addr = kmap(page); - -		if (index == 0) { -			u64 *gen; - -			memcpy(disk_crcs, addr, first_page_offset); -			gen = addr + (sizeof(u32) * num_checksums); -			if (*gen != BTRFS_I(inode)->generation) { -				printk(KERN_ERR "btrfs: space cache generation" -				       " (%llu) does not match inode (%llu) " -				       "for block group %llu\n", -				       (unsigned long long)*gen, -				       (unsigned long long) -				       BTRFS_I(inode)->generation, -				       (unsigned long long) -				       block_group->key.objectid); -				kunmap(page); -				unlock_page(page); -				page_cache_release(page); -				goto free_cache; -			} -			crc = (u32 *)disk_crcs; -		} -		entry = addr + start_offset; - -		/* First lets check our crc before we do anything fun */ -		cur_crc = ~(u32)0; -		cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, -					  PAGE_CACHE_SIZE - start_offset); -		btrfs_csum_final(cur_crc, (char *)&cur_crc); -		if (cur_crc != *crc) { -			printk(KERN_ERR "btrfs: crc mismatch for page %lu in " -			       "block group %llu\n", index, -			       (unsigned long long)block_group->key.objectid); -			kunmap(page); -			unlock_page(page); -			page_cache_release(page); +		ret = io_ctl_read_entry(&io_ctl, e, &type); +		if (ret) { +			
kmem_cache_free(btrfs_free_space_cachep, e);  			goto free_cache;  		} -		crc++; -		while (1) { -			if (!num_entries) -				break; +		if (!e->bytes) { +			kmem_cache_free(btrfs_free_space_cachep, e); +			goto free_cache; +		} -			need_loop = 1; -			e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); -			if (!e) { -				kunmap(page); -				unlock_page(page); -				page_cache_release(page); +		if (type == BTRFS_FREE_SPACE_EXTENT) { +			spin_lock(&ctl->tree_lock); +			ret = link_free_space(ctl, e); +			spin_unlock(&ctl->tree_lock); +			if (ret) { +				btrfs_err(root->fs_info, +					"Duplicate entries in free space cache, dumping"); +				kmem_cache_free(btrfs_free_space_cachep, e);  				goto free_cache;  			} - -			e->offset = le64_to_cpu(entry->offset); -			e->bytes = le64_to_cpu(entry->bytes); -			if (!e->bytes) { -				kunmap(page); -				kfree(e); -				unlock_page(page); -				page_cache_release(page); +		} else { +			ASSERT(num_bitmaps); +			num_bitmaps--; +			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); +			if (!e->bitmap) { +				kmem_cache_free( +					btrfs_free_space_cachep, e);  				goto free_cache;  			} - -			if (entry->type == BTRFS_FREE_SPACE_EXTENT) { -				spin_lock(&block_group->tree_lock); -				ret = link_free_space(block_group, e); -				spin_unlock(&block_group->tree_lock); -				BUG_ON(ret); -			} else { -				e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); -				if (!e->bitmap) { -					kunmap(page); -					kfree(e); -					unlock_page(page); -					page_cache_release(page); -					goto free_cache; -				} -				spin_lock(&block_group->tree_lock); -				ret = link_free_space(block_group, e); -				block_group->total_bitmaps++; -				recalculate_thresholds(block_group); -				spin_unlock(&block_group->tree_lock); -				list_add_tail(&e->list, &bitmaps); +			spin_lock(&ctl->tree_lock); +			ret = link_free_space(ctl, e); +			ctl->total_bitmaps++; +			ctl->op->recalc_thresholds(ctl); +			spin_unlock(&ctl->tree_lock); +			if (ret) { +				btrfs_err(root->fs_info, +					"Duplicate entries in free space cache, dumping"); +				kmem_cache_free(btrfs_free_space_cachep, e); +				goto free_cache;  			} - -			num_entries--; -			offset += sizeof(struct btrfs_free_space_entry); -			if (offset + sizeof(struct btrfs_free_space_entry) >= -			    PAGE_CACHE_SIZE) -				break; -			entry++; +			list_add_tail(&e->list, &bitmaps);  		} -		/* -		 * We read an entry out of this page, we need to move on to the -		 * next page. -		 */ -		if (need_loop) { -			kunmap(page); -			goto next; -		} +		num_entries--; +	} -		/* -		 * We add the bitmaps at the end of the entries in order that -		 * the bitmap entries are added to the cache. -		 */ -		e = list_entry(bitmaps.next, struct btrfs_free_space, list); +	io_ctl_unmap_page(&io_ctl); + +	/* +	 * We add the bitmaps at the end of the entries in order that +	 * the bitmap entries are added to the cache. 
+	 */ +	list_for_each_entry_safe(e, n, &bitmaps, list) {  		list_del_init(&e->list); -		memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); -		kunmap(page); -		num_bitmaps--; -next: -		unlock_page(page); -		page_cache_release(page); -		index++; +		ret = io_ctl_read_bitmap(&io_ctl, e); +		if (ret) +			goto free_cache;  	} +	io_ctl_drop_pages(&io_ctl); +	merge_space_tree(ctl);  	ret = 1;  out: -	kfree(checksums); -	kfree(disk_crcs); -	iput(inode); +	io_ctl_free(&io_ctl);  	return ret; -  free_cache: -	/* This cache is bogus, make sure it gets cleared */ -	spin_lock(&block_group->lock); -	block_group->disk_cache_state = BTRFS_DC_CLEAR; -	spin_unlock(&block_group->lock); -	btrfs_remove_free_space_cache(block_group); +	io_ctl_drop_pages(&io_ctl); +	__btrfs_remove_free_space_cache(ctl);  	goto out;  } -int btrfs_write_out_cache(struct btrfs_root *root, -			  struct btrfs_trans_handle *trans, -			  struct btrfs_block_group_cache *block_group, -			  struct btrfs_path *path) +int load_free_space_cache(struct btrfs_fs_info *fs_info, +			  struct btrfs_block_group_cache *block_group)  { -	struct btrfs_free_space_header *header; -	struct extent_buffer *leaf; +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; +	struct btrfs_root *root = fs_info->tree_root;  	struct inode *inode; -	struct rb_node *node; -	struct list_head *pos, *n; -	struct page *page; -	struct extent_state *cached_state = NULL; -	struct list_head bitmap_list; -	struct btrfs_key key; -	u64 bytes = 0; -	u32 *crc, *checksums; -	pgoff_t index = 0, last_index = 0; -	unsigned long first_page_offset; -	int num_checksums; -	int entries = 0; -	int bitmaps = 0; +	struct btrfs_path *path;  	int ret = 0; +	bool matched; +	u64 used = btrfs_block_group_used(&block_group->item); -	root = root->fs_info->tree_root; - -	INIT_LIST_HEAD(&bitmap_list); - +	/* +	 * If this block group has been marked to be cleared for one reason or +	 * another then we can't trust the on disk cache, so just return. +	 */  	spin_lock(&block_group->lock); -	if (block_group->disk_cache_state < BTRFS_DC_SETUP) { +	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {  		spin_unlock(&block_group->lock);  		return 0;  	}  	spin_unlock(&block_group->lock); -	inode = lookup_free_space_inode(root, block_group, path); -	if (IS_ERR(inode)) +	path = btrfs_alloc_path(); +	if (!path)  		return 0; +	path->search_commit_root = 1; +	path->skip_locking = 1; -	if (!i_size_read(inode)) { -		iput(inode); +	inode = lookup_free_space_inode(root, block_group, path); +	if (IS_ERR(inode)) { +		btrfs_free_path(path);  		return 0;  	} -	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; -	filemap_write_and_wait(inode->i_mapping); -	btrfs_wait_ordered_range(inode, inode->i_size & -				 ~(root->sectorsize - 1), (u64)-1); - -	/* We need a checksum per page. */ -	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; -	crc = checksums  = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); -	if (!crc) { -		iput(inode); -		return 0; +	/* We may have converted the inode and made the cache invalid. */ +	spin_lock(&block_group->lock); +	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { +		spin_unlock(&block_group->lock); +		btrfs_free_path(path); +		goto out;  	} +	spin_unlock(&block_group->lock); -	/* Since the first page has all of our checksums and our generation we -	 * need to calculate the offset into the page that we can start writing -	 * our entries. 
-	 */ -	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); +	ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, +				      path, block_group->key.objectid); +	btrfs_free_path(path); +	if (ret <= 0) +		goto out; -	node = rb_first(&block_group->free_space_offset); -	if (!node) -		goto out_free; +	spin_lock(&ctl->tree_lock); +	matched = (ctl->free_space == (block_group->key.offset - used - +				       block_group->bytes_super)); +	spin_unlock(&ctl->tree_lock); -	/* -	 * Lock all pages first so we can lock the extent safely. -	 * -	 * NOTE: Because we hold the ref the entire time we're going to write to -	 * the page find_get_page should never fail, so we don't do a check -	 * after find_get_page at this point.  Just putting this here so people -	 * know and don't freak out. -	 */ -	while (index <= last_index) { -		page = grab_cache_page(inode->i_mapping, index); -		if (!page) { -			pgoff_t i = 0; - -			while (i < index) { -				page = find_get_page(inode->i_mapping, i); -				unlock_page(page); -				page_cache_release(page); -				page_cache_release(page); -				i++; -			} -			goto out_free; -		} -		index++; +	if (!matched) { +		__btrfs_remove_free_space_cache(ctl); +		btrfs_warn(fs_info, "block group %llu has wrong amount of free space", +			block_group->key.objectid); +		ret = -1;  	} +out: +	if (ret < 0) { +		/* This cache is bogus, make sure it gets cleared */ +		spin_lock(&block_group->lock); +		block_group->disk_cache_state = BTRFS_DC_CLEAR; +		spin_unlock(&block_group->lock); +		ret = 0; -	index = 0; -	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, -			 0, &cached_state, GFP_NOFS); - -	/* Write out the extent entries */ -	do { -		struct btrfs_free_space_entry *entry; -		void *addr; -		unsigned long offset = 0; -		unsigned long start_offset = 0; - -		if (index == 0) { -			start_offset = first_page_offset; -			offset = start_offset; -		} - -		page = find_get_page(inode->i_mapping, index); +		btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", +			block_group->key.objectid); +	} -		addr = kmap(page); -		entry = addr + start_offset; +	iput(inode); +	return ret; +} -		memset(addr, 0, PAGE_CACHE_SIZE); -		while (1) { -			struct btrfs_free_space *e; +static noinline_for_stack +int write_cache_extent_entries(struct io_ctl *io_ctl, +			      struct btrfs_free_space_ctl *ctl, +			      struct btrfs_block_group_cache *block_group, +			      int *entries, int *bitmaps, +			      struct list_head *bitmap_list) +{ +	int ret; +	struct btrfs_free_cluster *cluster = NULL; +	struct rb_node *node = rb_first(&ctl->free_space_offset); -			e = rb_entry(node, struct btrfs_free_space, offset_index); -			entries++; +	/* Get the cluster for this block_group if it exists */ +	if (block_group && !list_empty(&block_group->cluster_list)) { +		cluster = list_entry(block_group->cluster_list.next, +				     struct btrfs_free_cluster, +				     block_group_list); +	} -			entry->offset = cpu_to_le64(e->offset); -			entry->bytes = cpu_to_le64(e->bytes); -			if (e->bitmap) { -				entry->type = BTRFS_FREE_SPACE_BITMAP; -				list_add_tail(&e->list, &bitmap_list); -				bitmaps++; -			} else { -				entry->type = BTRFS_FREE_SPACE_EXTENT; -			} -			node = rb_next(node); -			if (!node) -				break; -			offset += sizeof(struct btrfs_free_space_entry); -			if (offset + sizeof(struct btrfs_free_space_entry) >= -			    PAGE_CACHE_SIZE) -				break; -			entry++; -		} -		*crc = ~(u32)0; -		*crc = btrfs_csum_data(root, addr + start_offset, *crc, -				     
  PAGE_CACHE_SIZE - start_offset); -		kunmap(page); +	if (!node && cluster) { +		node = rb_first(&cluster->root); +		cluster = NULL; +	} -		btrfs_csum_final(*crc, (char *)crc); -		crc++; +	/* Write out the extent entries */ +	while (node) { +		struct btrfs_free_space *e; -		bytes += PAGE_CACHE_SIZE; +		e = rb_entry(node, struct btrfs_free_space, offset_index); +		*entries += 1; -		ClearPageChecked(page); -		set_page_extent_mapped(page); -		SetPageUptodate(page); -		set_page_dirty(page); +		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, +				       e->bitmap); +		if (ret) +			goto fail; -		/* -		 * We need to release our reference we got for grab_cache_page, -		 * except for the first page which will hold our checksums, we -		 * do that below. -		 */ -		if (index != 0) { -			unlock_page(page); -			page_cache_release(page); +		if (e->bitmap) { +			list_add_tail(&e->list, bitmap_list); +			*bitmaps += 1; +		} +		node = rb_next(node); +		if (!node && cluster) { +			node = rb_first(&cluster->root); +			cluster = NULL;  		} - -		page_cache_release(page); - -		index++; -	} while (node); - -	/* Write out the bitmaps */ -	list_for_each_safe(pos, n, &bitmap_list) { -		void *addr; -		struct btrfs_free_space *entry = -			list_entry(pos, struct btrfs_free_space, list); - -		page = find_get_page(inode->i_mapping, index); - -		addr = kmap(page); -		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); -		*crc = ~(u32)0; -		*crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); -		kunmap(page); -		btrfs_csum_final(*crc, (char *)crc); -		crc++; -		bytes += PAGE_CACHE_SIZE; - -		ClearPageChecked(page); -		set_page_extent_mapped(page); -		SetPageUptodate(page); -		set_page_dirty(page); -		unlock_page(page); -		page_cache_release(page); -		page_cache_release(page); -		list_del_init(&entry->list); -		index++; -	} - -	/* Zero out the rest of the pages just to make sure */ -	while (index <= last_index) { -		void *addr; - -		page = find_get_page(inode->i_mapping, index); - -		addr = kmap(page); -		memset(addr, 0, PAGE_CACHE_SIZE); -		kunmap(page); -		ClearPageChecked(page); -		set_page_extent_mapped(page); -		SetPageUptodate(page); -		set_page_dirty(page); -		unlock_page(page); -		page_cache_release(page); -		page_cache_release(page); -		bytes += PAGE_CACHE_SIZE; -		index++; -	} - -	btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state); - -	/* Write the checksums and trans id to the first page */ -	{ -		void *addr; -		u64 *gen; - -		page = find_get_page(inode->i_mapping, 0); - -		addr = kmap(page); -		memcpy(addr, checksums, sizeof(u32) * num_checksums); -		gen = addr + (sizeof(u32) * num_checksums); -		*gen = trans->transid; -		kunmap(page); -		ClearPageChecked(page); -		set_page_extent_mapped(page); -		SetPageUptodate(page); -		set_page_dirty(page); -		unlock_page(page); -		page_cache_release(page); -		page_cache_release(page);  	} -	BTRFS_I(inode)->generation = trans->transid; - -	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, -			     i_size_read(inode) - 1, &cached_state, GFP_NOFS); +	return 0; +fail: +	return -ENOSPC; +} -	filemap_write_and_wait(inode->i_mapping); +static noinline_for_stack int +update_cache_item(struct btrfs_trans_handle *trans, +		  struct btrfs_root *root, +		  struct inode *inode, +		  struct btrfs_path *path, u64 offset, +		  int entries, int bitmaps) +{ +	struct btrfs_key key; +	struct btrfs_free_space_header *header; +	struct extent_buffer *leaf; +	int ret;  	key.objectid = BTRFS_FREE_SPACE_OBJECTID; -	key.offset = block_group->key.objectid; +	key.offset = offset;  	
key.type = 0; -	ret = btrfs_search_slot(trans, root, &key, path, 1, 1); +	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);  	if (ret < 0) { -		ret = 0; -		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, -				 EXTENT_DIRTY | EXTENT_DELALLOC | -				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); -		goto out_free; +		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, +				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, +				 GFP_NOFS); +		goto fail;  	}  	leaf = path->nodes[0];  	if (ret > 0) {  		struct btrfs_key found_key; -		BUG_ON(!path->slots[0]); +		ASSERT(path->slots[0]);  		path->slots[0]--;  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);  		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || -		    found_key.offset != block_group->key.objectid) { -			ret = 0; -			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, -					 EXTENT_DIRTY | EXTENT_DELALLOC | -					 EXTENT_DO_ACCOUNTING, 0, 0, NULL, -					 GFP_NOFS); -			btrfs_release_path(root, path); -			goto out_free; +		    found_key.offset != offset) { +			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, +					 inode->i_size - 1, +					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, +					 NULL, GFP_NOFS); +			btrfs_release_path(path); +			goto fail;  		}  	} + +	BTRFS_I(inode)->generation = trans->transid;  	header = btrfs_item_ptr(leaf, path->slots[0],  				struct btrfs_free_space_header);  	btrfs_set_free_space_entries(leaf, header, entries);  	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);  	btrfs_set_free_space_generation(leaf, header, trans->transid);  	btrfs_mark_buffer_dirty(leaf); -	btrfs_release_path(root, path); +	btrfs_release_path(path); -	ret = 1; +	return 0; + +fail: +	return -1; +} + +static noinline_for_stack int +write_pinned_extent_entries(struct btrfs_root *root, +			    struct btrfs_block_group_cache *block_group, +			    struct io_ctl *io_ctl, +			    int *entries) +{ +	u64 start, extent_start, extent_end, len; +	struct extent_io_tree *unpin = NULL; +	int ret; + +	if (!block_group) +		return 0; + +	/* +	 * We want to add any pinned extents to our free space cache +	 * so we don't leak the space +	 * +	 * We shouldn't have switched the pinned extents yet so this is the +	 * right one +	 */ +	unpin = root->fs_info->pinned_extents; + +	start = block_group->key.objectid; + +	while (start < block_group->key.objectid + block_group->key.offset) { +		ret = find_first_extent_bit(unpin, start, +					    &extent_start, &extent_end, +					    EXTENT_DIRTY, NULL); +		if (ret) +			return 0; + +		/* This pinned extent is out of our range */ +		if (extent_start >= block_group->key.objectid + +		    block_group->key.offset) +			return 0; + +		extent_start = max(extent_start, start); +		extent_end = min(block_group->key.objectid + +				 block_group->key.offset, extent_end + 1); +		len = extent_end - extent_start; + +		*entries += 1; +		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); +		if (ret) +			return -ENOSPC; + +		start = extent_end; +	} + +	return 0; +} + +static noinline_for_stack int +write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) +{ +	struct list_head *pos, *n; +	int ret; + +	/* Write out the bitmaps */ +	list_for_each_safe(pos, n, bitmap_list) { +		struct btrfs_free_space *entry = +			list_entry(pos, struct btrfs_free_space, list); + +		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); +		if (ret) +			return -ENOSPC; +		list_del_init(&entry->list); +	} + +	return 0; +} + +static int flush_dirty_cache(struct inode *inode) +{ +	int ret; + +	ret = 
btrfs_wait_ordered_range(inode, 0, (u64)-1); +	if (ret) +		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, +				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, +				 GFP_NOFS); + +	return ret; +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, +			   struct io_ctl *io_ctl, +			   struct extent_state **cached_state, +			   struct list_head *bitmap_list) +{ +	struct list_head *pos, *n; + +	list_for_each_safe(pos, n, bitmap_list) { +		struct btrfs_free_space *entry = +			list_entry(pos, struct btrfs_free_space, list); +		list_del_init(&entry->list); +	} +	io_ctl_drop_pages(io_ctl); +	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, +			     i_size_read(inode) - 1, cached_state, +			     GFP_NOFS); +} + +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount.  This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. + */ +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, +				   struct btrfs_free_space_ctl *ctl, +				   struct btrfs_block_group_cache *block_group, +				   struct btrfs_trans_handle *trans, +				   struct btrfs_path *path, u64 offset) +{ +	struct extent_state *cached_state = NULL; +	struct io_ctl io_ctl; +	LIST_HEAD(bitmap_list); +	int entries = 0; +	int bitmaps = 0; +	int ret; -out_free: -	if (ret == 0) { -		invalidate_inode_pages2_range(inode->i_mapping, 0, index); +	if (!i_size_read(inode)) +		return -1; + +	ret = io_ctl_init(&io_ctl, inode, root, 1); +	if (ret) +		return -1; + +	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { +		down_write(&block_group->data_rwsem);  		spin_lock(&block_group->lock); -		block_group->disk_cache_state = BTRFS_DC_ERROR; +		if (block_group->delalloc_bytes) { +			block_group->disk_cache_state = BTRFS_DC_WRITTEN; +			spin_unlock(&block_group->lock); +			up_write(&block_group->data_rwsem); +			BTRFS_I(inode)->generation = 0; +			ret = 0; +			goto out; +		}  		spin_unlock(&block_group->lock); +	} + +	/* Lock all pages first so we can lock the extent safely. */ +	io_ctl_prepare_pages(&io_ctl, inode, 0); + +	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, +			 0, &cached_state); + +	io_ctl_set_generation(&io_ctl, trans->transid); + +	/* Write out the extent entries in the free space cache */ +	ret = write_cache_extent_entries(&io_ctl, ctl, +					 block_group, &entries, &bitmaps, +					 &bitmap_list); +	if (ret) +		goto out_nospc; + +	/* +	 * Some spaces that are freed in the current transaction are pinned, +	 * they will be added into free space cache after the transaction is +	 * committed, we shouldn't lose them. +	 */ +	ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); +	if (ret) +		goto out_nospc; + +	/* At last, we write out all the bitmaps. */ +	ret = write_bitmap_entries(&io_ctl, &bitmap_list); +	if (ret) +		goto out_nospc; + +	/* Zero out the rest of the pages just to make sure */ +	io_ctl_zero_remaining_pages(&io_ctl); + +	/* Everything is written out, now we dirty the pages in the file. 
*/ +	ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, +				0, i_size_read(inode), &cached_state); +	if (ret) +		goto out_nospc; + +	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) +		up_write(&block_group->data_rwsem); +	/* +	 * Release the pages and unlock the extent, we will flush +	 * them out later +	 */ +	io_ctl_drop_pages(&io_ctl); + +	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, +			     i_size_read(inode) - 1, &cached_state, GFP_NOFS); + +	/* Flush the dirty pages in the cache file. */ +	ret = flush_dirty_cache(inode); +	if (ret) +		goto out; + +	/* Update the cache item to tell everyone this cache file is valid. */ +	ret = update_cache_item(trans, root, inode, path, offset, +				entries, bitmaps); +out: +	io_ctl_free(&io_ctl); +	if (ret) { +		invalidate_inode_pages2(inode->i_mapping);  		BTRFS_I(inode)->generation = 0;  	} -	kfree(checksums);  	btrfs_update_inode(trans, root, inode); +	return ret; + +out_nospc: +	cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); + +	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) +		up_write(&block_group->data_rwsem); + +	goto out; +} + +int btrfs_write_out_cache(struct btrfs_root *root, +			  struct btrfs_trans_handle *trans, +			  struct btrfs_block_group_cache *block_group, +			  struct btrfs_path *path) +{ +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; +	struct inode *inode; +	int ret = 0; + +	root = root->fs_info->tree_root; + +	spin_lock(&block_group->lock); +	if (block_group->disk_cache_state < BTRFS_DC_SETUP) { +		spin_unlock(&block_group->lock); +		return 0; +	} + +	if (block_group->delalloc_bytes) { +		block_group->disk_cache_state = BTRFS_DC_WRITTEN; +		spin_unlock(&block_group->lock); +		return 0; +	} +	spin_unlock(&block_group->lock); + +	inode = lookup_free_space_inode(root, block_group, path); +	if (IS_ERR(inode)) +		return 0; + +	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, +				      path, block_group->key.objectid); +	if (ret) { +		spin_lock(&block_group->lock); +		block_group->disk_cache_state = BTRFS_DC_ERROR; +		spin_unlock(&block_group->lock); +		ret = 0; +#ifdef DEBUG +		btrfs_err(root->fs_info, +			"failed to write free space cache for block group %llu", +			block_group->key.objectid); +#endif +	} +  	iput(inode);  	return ret;  } -static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, +static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,  					  u64 offset)  { -	BUG_ON(offset < bitmap_start); +	ASSERT(offset >= bitmap_start);  	offset -= bitmap_start; -	return (unsigned long)(div64_u64(offset, sectorsize)); +	return (unsigned long)(div_u64(offset, unit));  } -static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) +static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)  { -	return (unsigned long)(div64_u64(bytes, sectorsize)); +	return (unsigned long)(div_u64(bytes, unit));  } -static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, +static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,  				   u64 offset)  {  	u64 bitmap_start;  	u64 bytes_per_bitmap; -	bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; -	bitmap_start = offset - block_group->key.objectid; +	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; +	bitmap_start = offset - ctl->start;  	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);  	bitmap_start *= bytes_per_bitmap; -	bitmap_start += block_group->key.objectid; +	
bitmap_start += ctl->start;  	return bitmap_start;  } @@ -836,10 +1307,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,  			 * logically.  			 */  			if (bitmap) { -				WARN_ON(info->bitmap); +				if (info->bitmap) { +					WARN_ON_ONCE(1); +					return -EEXIST; +				}  				p = &(*p)->rb_right;  			} else { -				WARN_ON(!info->bitmap); +				if (!info->bitmap) { +					WARN_ON_ONCE(1); +					return -EEXIST; +				}  				p = &(*p)->rb_left;  			}  		} @@ -859,10 +1336,10 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,   * offset.   */  static struct btrfs_free_space * -tree_search_offset(struct btrfs_block_group_cache *block_group, +tree_search_offset(struct btrfs_free_space_ctl *ctl,  		   u64 offset, int bitmap_only, int fuzzy)  { -	struct rb_node *n = block_group->free_space_offset.rb_node; +	struct rb_node *n = ctl->free_space_offset.rb_node;  	struct btrfs_free_space *entry, *prev = NULL;  	/* find entry that is closest to the 'offset' */ @@ -908,18 +1385,13 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,  			 * if previous extent entry covers the offset,  			 * we should return it instead of the bitmap entry  			 */ -			n = &entry->offset_index; -			while (1) { -				n = rb_prev(n); -				if (!n) -					break; +			n = rb_prev(&entry->offset_index); +			if (n) {  				prev = rb_entry(n, struct btrfs_free_space,  						offset_index); -				if (!prev->bitmap) { -					if (prev->offset + prev->bytes > offset) -						entry = prev; -					break; -				} +				if (!prev->bitmap && +				    prev->offset + prev->bytes > offset) +					entry = prev;  			}  		}  		return entry; @@ -935,7 +1407,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,  		if (n) {  			entry = rb_entry(n, struct btrfs_free_space,  					offset_index); -			BUG_ON(entry->offset > offset); +			ASSERT(entry->offset <= offset);  		} else {  			if (fuzzy)  				return entry; @@ -945,21 +1417,15 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,  	}  	if (entry->bitmap) { -		n = &entry->offset_index; -		while (1) { -			n = rb_prev(n); -			if (!n) -				break; +		n = rb_prev(&entry->offset_index); +		if (n) {  			prev = rb_entry(n, struct btrfs_free_space,  					offset_index); -			if (!prev->bitmap) { -				if (prev->offset + prev->bytes > offset) -					return prev; -				break; -			} +			if (!prev->bitmap && +			    prev->offset + prev->bytes > offset) +				return prev;  		} -		if (entry->offset + BITS_PER_BITMAP * -		    block_group->sectorsize > offset) +		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)  			return entry;  	} else if (entry->offset + entry->bytes > offset)  		return entry; @@ -970,7 +1436,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,  	while (1) {  		if (entry->bitmap) {  			if (entry->offset + BITS_PER_BITMAP * -			    block_group->sectorsize > offset) +			    ctl->unit > offset)  				break;  		} else {  			if (entry->offset + entry->bytes > offset) @@ -985,53 +1451,71 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,  	return entry;  } -static void unlink_free_space(struct btrfs_block_group_cache *block_group, +static inline void +__unlink_free_space(struct btrfs_free_space_ctl *ctl, +		    struct btrfs_free_space *info) +{ +	rb_erase(&info->offset_index, &ctl->free_space_offset); +	ctl->free_extents--; +} + +static void unlink_free_space(struct btrfs_free_space_ctl *ctl,  			      struct btrfs_free_space *info)  { -	rb_erase(&info->offset_index, &block_group->free_space_offset); -	
block_group->free_extents--; -	block_group->free_space -= info->bytes; +	__unlink_free_space(ctl, info); +	ctl->free_space -= info->bytes;  } -static int link_free_space(struct btrfs_block_group_cache *block_group, +static int link_free_space(struct btrfs_free_space_ctl *ctl,  			   struct btrfs_free_space *info)  {  	int ret = 0; -	BUG_ON(!info->bitmap && !info->bytes); -	ret = tree_insert_offset(&block_group->free_space_offset, info->offset, +	ASSERT(info->bytes || info->bitmap); +	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,  				 &info->offset_index, (info->bitmap != NULL));  	if (ret)  		return ret; -	block_group->free_space += info->bytes; -	block_group->free_extents++; +	ctl->free_space += info->bytes; +	ctl->free_extents++;  	return ret;  } -static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)  { +	struct btrfs_block_group_cache *block_group = ctl->private;  	u64 max_bytes;  	u64 bitmap_bytes;  	u64 extent_bytes; +	u64 size = block_group->key.offset; +	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; +	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + +	max_bitmaps = max(max_bitmaps, 1); + +	ASSERT(ctl->total_bitmaps <= max_bitmaps);  	/*  	 * The goal is to keep the total amount of memory used per 1gb of space  	 * at or below 32k, so we need to adjust how much memory we allow to be  	 * used by extent based free space tracking  	 */ -	max_bytes = MAX_CACHE_BYTES_PER_GIG * -		(div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); +	if (size < 1024 * 1024 * 1024) +		max_bytes = MAX_CACHE_BYTES_PER_GIG; +	else +		max_bytes = MAX_CACHE_BYTES_PER_GIG * +			div64_u64(size, 1024 * 1024 * 1024);  	/*  	 * we want to account for 1 more bitmap than what we have so we can make  	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as  	 * we add more bitmaps.  	 
*/ -	bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; +	bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;  	if (bitmap_bytes >= max_bytes) { -		block_group->extents_thresh = 0; +		ctl->extents_thresh = 0;  		return;  	} @@ -1042,134 +1526,180 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)  	extent_bytes = max_bytes - bitmap_bytes;  	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); -	block_group->extents_thresh = +	ctl->extents_thresh =  		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));  } -static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, -			      struct btrfs_free_space *info, u64 offset, -			      u64 bytes) +static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, +				       struct btrfs_free_space *info, +				       u64 offset, u64 bytes)  { -	unsigned long start, end; -	unsigned long i; +	unsigned long start, count; -	start = offset_to_bit(info->offset, block_group->sectorsize, offset); -	end = start + bytes_to_bits(bytes, block_group->sectorsize); -	BUG_ON(end > BITS_PER_BITMAP); +	start = offset_to_bit(info->offset, ctl->unit, offset); +	count = bytes_to_bits(bytes, ctl->unit); +	ASSERT(start + count <= BITS_PER_BITMAP); -	for (i = start; i < end; i++) -		clear_bit(i, info->bitmap); +	bitmap_clear(info->bitmap, start, count);  	info->bytes -= bytes; -	block_group->free_space -= bytes;  } -static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, +			      struct btrfs_free_space *info, u64 offset, +			      u64 bytes) +{ +	__bitmap_clear_bits(ctl, info, offset, bytes); +	ctl->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,  			    struct btrfs_free_space *info, u64 offset,  			    u64 bytes)  { -	unsigned long start, end; -	unsigned long i; +	unsigned long start, count; -	start = offset_to_bit(info->offset, block_group->sectorsize, offset); -	end = start + bytes_to_bits(bytes, block_group->sectorsize); -	BUG_ON(end > BITS_PER_BITMAP); +	start = offset_to_bit(info->offset, ctl->unit, offset); +	count = bytes_to_bits(bytes, ctl->unit); +	ASSERT(start + count <= BITS_PER_BITMAP); -	for (i = start; i < end; i++) -		set_bit(i, info->bitmap); +	bitmap_set(info->bitmap, start, count);  	info->bytes += bytes; -	block_group->free_space += bytes; +	ctl->free_space += bytes;  } -static int search_bitmap(struct btrfs_block_group_cache *block_group, +/* + * If we can not find suitable extent, we will use bytes to record + * the size of the max extent. 
+ */ +static int search_bitmap(struct btrfs_free_space_ctl *ctl,  			 struct btrfs_free_space *bitmap_info, u64 *offset,  			 u64 *bytes)  {  	unsigned long found_bits = 0; +	unsigned long max_bits = 0;  	unsigned long bits, i;  	unsigned long next_zero; +	unsigned long extent_bits; -	i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, +	i = offset_to_bit(bitmap_info->offset, ctl->unit,  			  max_t(u64, *offset, bitmap_info->offset)); -	bits = bytes_to_bits(*bytes, block_group->sectorsize); +	bits = bytes_to_bits(*bytes, ctl->unit); -	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); -	     i < BITS_PER_BITMAP; -	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { +	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {  		next_zero = find_next_zero_bit(bitmap_info->bitmap,  					       BITS_PER_BITMAP, i); -		if ((next_zero - i) >= bits) { -			found_bits = next_zero - i; +		extent_bits = next_zero - i; +		if (extent_bits >= bits) { +			found_bits = extent_bits;  			break; +		} else if (extent_bits > max_bits) { +			max_bits = extent_bits;  		}  		i = next_zero;  	}  	if (found_bits) { -		*offset = (u64)(i * block_group->sectorsize) + -			bitmap_info->offset; -		*bytes = (u64)(found_bits) * block_group->sectorsize; +		*offset = (u64)(i * ctl->unit) + bitmap_info->offset; +		*bytes = (u64)(found_bits) * ctl->unit;  		return 0;  	} +	*bytes = (u64)(max_bits) * ctl->unit;  	return -1;  } -static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache -						*block_group, u64 *offset, -						u64 *bytes, int debug) +/* Cache the size of the max extent in bytes */ +static struct btrfs_free_space * +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, +		unsigned long align, u64 *max_extent_size)  {  	struct btrfs_free_space *entry;  	struct rb_node *node; +	u64 tmp; +	u64 align_off;  	int ret; -	if (!block_group->free_space_offset.rb_node) -		return NULL; +	if (!ctl->free_space_offset.rb_node) +		goto out; -	entry = tree_search_offset(block_group, -				   offset_to_bitmap(block_group, *offset), -				   0, 1); +	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);  	if (!entry) -		return NULL; +		goto out;  	for (node = &entry->offset_index; node; node = rb_next(node)) {  		entry = rb_entry(node, struct btrfs_free_space, offset_index); -		if (entry->bytes < *bytes) +		if (entry->bytes < *bytes) { +			if (entry->bytes > *max_extent_size) +				*max_extent_size = entry->bytes;  			continue; +		} + +		/* make sure the space returned is big enough +		 * to match our requested alignment +		 */ +		if (*bytes >= align) { +			tmp = entry->offset - ctl->start + align - 1; +			do_div(tmp, align); +			tmp = tmp * align + ctl->start; +			align_off = tmp - entry->offset; +		} else { +			align_off = 0; +			tmp = entry->offset; +		} + +		if (entry->bytes < *bytes + align_off) { +			if (entry->bytes > *max_extent_size) +				*max_extent_size = entry->bytes; +			continue; +		}  		if (entry->bitmap) { -			ret = search_bitmap(block_group, entry, offset, bytes); -			if (!ret) +			u64 size = *bytes; + +			ret = search_bitmap(ctl, entry, &tmp, &size); +			if (!ret) { +				*offset = tmp; +				*bytes = size;  				return entry; +			} else if (size > *max_extent_size) { +				*max_extent_size = size; +			}  			continue;  		} -		*offset = entry->offset; -		*bytes = entry->bytes; +		*offset = tmp; +		*bytes = entry->bytes - align_off;  		return entry;  	} - +out:  	return NULL;  } -static void add_new_bitmap(struct 
btrfs_block_group_cache *block_group, +static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,  			   struct btrfs_free_space *info, u64 offset)  { -	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; -	int max_bitmaps = (int)div64_u64(block_group->key.offset + -					 bytes_per_bg - 1, bytes_per_bg); -	BUG_ON(block_group->total_bitmaps >= max_bitmaps); - -	info->offset = offset_to_bitmap(block_group, offset); +	info->offset = offset_to_bitmap(ctl, offset);  	info->bytes = 0; -	link_free_space(block_group, info); -	block_group->total_bitmaps++; +	INIT_LIST_HEAD(&info->list); +	link_free_space(ctl, info); +	ctl->total_bitmaps++; -	recalculate_thresholds(block_group); +	ctl->op->recalc_thresholds(ctl);  } -static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, +static void free_bitmap(struct btrfs_free_space_ctl *ctl, +			struct btrfs_free_space *bitmap_info) +{ +	unlink_free_space(ctl, bitmap_info); +	kfree(bitmap_info->bitmap); +	kmem_cache_free(btrfs_free_space_cachep, bitmap_info); +	ctl->total_bitmaps--; +	ctl->op->recalc_thresholds(ctl); +} + +static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,  			      struct btrfs_free_space *bitmap_info,  			      u64 *offset, u64 *bytes)  { @@ -1178,44 +1708,35 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro  	int ret;  again: -	end = bitmap_info->offset + -		(u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; +	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;  	/* -	 * XXX - this can go away after a few releases. -	 * -	 * since the only user of btrfs_remove_free_space is the tree logging -	 * stuff, and the only way to test that is under crash conditions, we -	 * want to have this debug stuff here just in case somethings not -	 * working.  Search the bitmap for the space we are trying to use to -	 * make sure its actually there.  If its not there then we need to stop -	 * because something has gone wrong. +	 * We need to search for bits in this bitmap.  We could only cover some +	 * of the extent in this bitmap thanks to how we add space, so we need +	 * to search for as much of it as we can and clear that amount, and then +	 * go searching for the next bit.  	 
*/  	search_start = *offset; -	search_bytes = *bytes; -	ret = search_bitmap(block_group, bitmap_info, &search_start, -			    &search_bytes); -	BUG_ON(ret < 0 || search_start != *offset); +	search_bytes = ctl->unit; +	search_bytes = min(search_bytes, end - search_start + 1); +	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); +	if (ret < 0 || search_start != *offset) +		return -EINVAL; -	if (*offset > bitmap_info->offset && *offset + *bytes > end) { -		bitmap_clear_bits(block_group, bitmap_info, *offset, -				  end - *offset + 1); -		*bytes -= end - *offset + 1; -		*offset = end + 1; -	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { -		bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); -		*bytes = 0; -	} +	/* We may have found more bits than what we need */ +	search_bytes = min(search_bytes, *bytes); + +	/* Cannot clear past the end of the bitmap */ +	search_bytes = min(search_bytes, end - search_start + 1); + +	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); +	*offset += search_bytes; +	*bytes -= search_bytes;  	if (*bytes) {  		struct rb_node *next = rb_next(&bitmap_info->offset_index); -		if (!bitmap_info->bytes) { -			unlink_free_space(block_group, bitmap_info); -			kfree(bitmap_info->bitmap); -			kfree(bitmap_info); -			block_group->total_bitmaps--; -			recalculate_thresholds(block_group); -		} +		if (!bitmap_info->bytes) +			free_bitmap(ctl, bitmap_info);  		/*  		 * no entry after this bitmap, but we still have bytes to @@ -1241,75 +1762,149 @@ again:  		 * everything over again.  		 */  		search_start = *offset; -		search_bytes = *bytes; -		ret = search_bitmap(block_group, bitmap_info, &search_start, +		search_bytes = ctl->unit; +		ret = search_bitmap(ctl, bitmap_info, &search_start,  				    &search_bytes);  		if (ret < 0 || search_start != *offset)  			return -EAGAIN;  		goto again; -	} else if (!bitmap_info->bytes) { -		unlink_free_space(block_group, bitmap_info); -		kfree(bitmap_info->bitmap); -		kfree(bitmap_info); -		block_group->total_bitmaps--; -		recalculate_thresholds(block_group); -	} +	} else if (!bitmap_info->bytes) +		free_bitmap(ctl, bitmap_info);  	return 0;  } -static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, -			      struct btrfs_free_space *info) +static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, +			       struct btrfs_free_space *info, u64 offset, +			       u64 bytes)  { -	struct btrfs_free_space *bitmap_info; -	int added = 0; -	u64 bytes, offset, end; -	int ret; +	u64 bytes_to_set = 0; +	u64 end; + +	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); + +	bytes_to_set = min(end - offset, bytes); + +	bitmap_set_bits(ctl, info, offset, bytes_to_set); + +	return bytes_to_set; + +} + +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, +		      struct btrfs_free_space *info) +{ +	struct btrfs_block_group_cache *block_group = ctl->private;  	/*  	 * If we are below the extents threshold then we can add this as an  	 * extent, and don't have to deal with the bitmap  	 */ -	if (block_group->free_extents < block_group->extents_thresh && -	    info->bytes > block_group->sectorsize * 4) -		return 0; +	if (ctl->free_extents < ctl->extents_thresh) { +		/* +		 * If this block group has some small extents we don't want to +		 * use up all of our free slots in the cache with them, we want +		 * to reserve them to larger extents, however if we have plenty +		 * of cache left then go ahead and add them, no sense in adding +		 * the overhead of a bitmap if 
we don't have to. +		 */ +		if (info->bytes <= block_group->sectorsize * 4) { +			if (ctl->free_extents * 2 <= ctl->extents_thresh) +				return false; +		} else { +			return false; +		} +	}  	/* -	 * some block groups are so tiny they can't be enveloped by a bitmap, so -	 * don't even bother to create a bitmap for this +	 * The original block groups from mkfs can be really small, like 8 +	 * megabytes, so don't bother with a bitmap for those entries.  However +	 * some block groups can be smaller than what a bitmap would cover but +	 * are still large enough that they could overflow the 32k memory limit, +	 * so allow those block groups to still be allowed to have a bitmap +	 * entry.  	 */ -	if (BITS_PER_BITMAP * block_group->sectorsize > -	    block_group->key.offset) -		return 0; +	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) +		return false; + +	return true; +} + +static struct btrfs_free_space_op free_space_op = { +	.recalc_thresholds	= recalculate_thresholds, +	.use_bitmap		= use_bitmap, +}; + +static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, +			      struct btrfs_free_space *info) +{ +	struct btrfs_free_space *bitmap_info; +	struct btrfs_block_group_cache *block_group = NULL; +	int added = 0; +	u64 bytes, offset, bytes_added; +	int ret;  	bytes = info->bytes;  	offset = info->offset; +	if (!ctl->op->use_bitmap(ctl, info)) +		return 0; + +	if (ctl->op == &free_space_op) +		block_group = ctl->private;  again: -	bitmap_info = tree_search_offset(block_group, -					 offset_to_bitmap(block_group, offset), +	/* +	 * Since we link bitmaps right into the cluster we need to see if we +	 * have a cluster here, and if so and it has our bitmap we need to add +	 * the free space to that bitmap. +	 */ +	if (block_group && !list_empty(&block_group->cluster_list)) { +		struct btrfs_free_cluster *cluster; +		struct rb_node *node; +		struct btrfs_free_space *entry; + +		cluster = list_entry(block_group->cluster_list.next, +				     struct btrfs_free_cluster, +				     block_group_list); +		spin_lock(&cluster->lock); +		node = rb_first(&cluster->root); +		if (!node) { +			spin_unlock(&cluster->lock); +			goto no_cluster_bitmap; +		} + +		entry = rb_entry(node, struct btrfs_free_space, offset_index); +		if (!entry->bitmap) { +			spin_unlock(&cluster->lock); +			goto no_cluster_bitmap; +		} + +		if (entry->offset == offset_to_bitmap(ctl, offset)) { +			bytes_added = add_bytes_to_bitmap(ctl, entry, +							  offset, bytes); +			bytes -= bytes_added; +			offset += bytes_added; +		} +		spin_unlock(&cluster->lock); +		if (!bytes) { +			ret = 1; +			goto out; +		} +	} + +no_cluster_bitmap: +	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),  					 1, 0);  	if (!bitmap_info) { -		BUG_ON(added); +		ASSERT(added == 0);  		goto new_bitmap;  	} -	end = bitmap_info->offset + -		(u64)(BITS_PER_BITMAP * block_group->sectorsize); - -	if (offset >= bitmap_info->offset && offset + bytes > end) { -		bitmap_set_bits(block_group, bitmap_info, offset, -				end - offset); -		bytes -= end - offset; -		offset = end; -		added = 0; -	} else if (offset >= bitmap_info->offset && offset + bytes <= end) { -		bitmap_set_bits(block_group, bitmap_info, offset, bytes); -		bytes = 0; -	} else { -		BUG(); -	} +	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); +	bytes -= bytes_added; +	offset += bytes_added; +	added = 0;  	if (!bytes) {  		ret = 1; @@ -1319,19 +1914,19 @@ again:  new_bitmap:  	if (info && info->bitmap) { -		add_new_bitmap(block_group, info, 
offset); +		add_new_bitmap(ctl, info, offset);  		added = 1;  		info = NULL;  		goto again;  	} else { -		spin_unlock(&block_group->tree_lock); +		spin_unlock(&ctl->tree_lock);  		/* no pre-allocated info, allocate a new one */  		if (!info) { -			info = kzalloc(sizeof(struct btrfs_free_space), -				       GFP_NOFS); +			info = kmem_cache_zalloc(btrfs_free_space_cachep, +						 GFP_NOFS);  			if (!info) { -				spin_lock(&block_group->tree_lock); +				spin_lock(&ctl->tree_lock);  				ret = -ENOMEM;  				goto out;  			} @@ -1339,7 +1934,7 @@ new_bitmap:  		/* allocate the bitmap */  		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); -		spin_lock(&block_group->tree_lock); +		spin_lock(&ctl->tree_lock);  		if (!info->bitmap) {  			ret = -ENOMEM;  			goto out; @@ -1351,81 +1946,98 @@ out:  	if (info) {  		if (info->bitmap)  			kfree(info->bitmap); -		kfree(info); +		kmem_cache_free(btrfs_free_space_cachep, info);  	}  	return ret;  } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, -			 u64 offset, u64 bytes) +static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, +			  struct btrfs_free_space *info, bool update_stat)  { -	struct btrfs_free_space *right_info = NULL; -	struct btrfs_free_space *left_info = NULL; -	struct btrfs_free_space *info = NULL; -	int ret = 0; - -	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); -	if (!info) -		return -ENOMEM; - -	info->offset = offset; -	info->bytes = bytes; - -	spin_lock(&block_group->tree_lock); +	struct btrfs_free_space *left_info; +	struct btrfs_free_space *right_info; +	bool merged = false; +	u64 offset = info->offset; +	u64 bytes = info->bytes;  	/*  	 * first we want to see if there is free space adjacent to the range we  	 * are adding, if there is remove that struct and add a new one to  	 * cover the entire range  	 */ -	right_info = tree_search_offset(block_group, offset + bytes, 0, 0); +	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);  	if (right_info && rb_prev(&right_info->offset_index))  		left_info = rb_entry(rb_prev(&right_info->offset_index),  				     struct btrfs_free_space, offset_index);  	else -		left_info = tree_search_offset(block_group, offset - 1, 0, 0); - -	/* -	 * If there was no extent directly to the left or right of this new -	 * extent then we know we're going to have to allocate a new extent, so -	 * before we do that see if we need to drop this into a bitmap -	 */ -	if ((!left_info || left_info->bitmap) && -	    (!right_info || right_info->bitmap)) { -		ret = insert_into_bitmap(block_group, info); - -		if (ret < 0) { -			goto out; -		} else if (ret) { -			ret = 0; -			goto out; -		} -	} +		left_info = tree_search_offset(ctl, offset - 1, 0, 0);  	if (right_info && !right_info->bitmap) { -		unlink_free_space(block_group, right_info); +		if (update_stat) +			unlink_free_space(ctl, right_info); +		else +			__unlink_free_space(ctl, right_info);  		info->bytes += right_info->bytes; -		kfree(right_info); +		kmem_cache_free(btrfs_free_space_cachep, right_info); +		merged = true;  	}  	if (left_info && !left_info->bitmap &&  	    left_info->offset + left_info->bytes == offset) { -		unlink_free_space(block_group, left_info); +		if (update_stat) +			unlink_free_space(ctl, left_info); +		else +			__unlink_free_space(ctl, left_info);  		info->offset = left_info->offset;  		info->bytes += left_info->bytes; -		kfree(left_info); +		kmem_cache_free(btrfs_free_space_cachep, left_info); +		merged = true;  	} -	ret = link_free_space(block_group, info); +	return merged; +} + +int 
__btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, +			   u64 offset, u64 bytes) +{ +	struct btrfs_free_space *info; +	int ret = 0; + +	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); +	if (!info) +		return -ENOMEM; + +	info->offset = offset; +	info->bytes = bytes; + +	spin_lock(&ctl->tree_lock); + +	if (try_merge_free_space(ctl, info, true)) +		goto link; + +	/* +	 * There was no extent directly to the left or right of this new +	 * extent then we know we're going to have to allocate a new extent, so +	 * before we do that see if we need to drop this into a bitmap +	 */ +	ret = insert_into_bitmap(ctl, info); +	if (ret < 0) { +		goto out; +	} else if (ret) { +		ret = 0; +		goto out; +	} +link: +	ret = link_free_space(ctl, info);  	if (ret) -		kfree(info); +		kmem_cache_free(btrfs_free_space_cachep, info);  out: -	spin_unlock(&block_group->tree_lock); +	spin_unlock(&ctl->tree_lock);  	if (ret) { -		printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); -		BUG_ON(ret == -EEXIST); +		printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret); +		ASSERT(ret != -EEXIST);  	}  	return ret; @@ -1434,116 +2046,89 @@ out:  int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,  			    u64 offset, u64 bytes)  { +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	struct btrfs_free_space *info; -	struct btrfs_free_space *next_info = NULL; -	int ret = 0; +	int ret; +	bool re_search = false; -	spin_lock(&block_group->tree_lock); +	spin_lock(&ctl->tree_lock);  again: -	info = tree_search_offset(block_group, offset, 0, 0); +	ret = 0; +	if (!bytes) +		goto out_lock; + +	info = tree_search_offset(ctl, offset, 0, 0);  	if (!info) {  		/*  		 * oops didn't find an extent that matched the space we wanted  		 * to remove, look for a bitmap instead  		 */ -		info = tree_search_offset(block_group, -					  offset_to_bitmap(block_group, offset), +		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),  					  1, 0);  		if (!info) { -			WARN_ON(1); -			goto out_lock; -		} -	} - -	if (info->bytes < bytes && rb_next(&info->offset_index)) { -		u64 end; -		next_info = rb_entry(rb_next(&info->offset_index), -					     struct btrfs_free_space, -					     offset_index); - -		if (next_info->bitmap) -			end = next_info->offset + BITS_PER_BITMAP * -				block_group->sectorsize - 1; -		else -			end = next_info->offset + next_info->bytes; - -		if (next_info->bytes < bytes || -		    next_info->offset > offset || offset > end) { -			printk(KERN_CRIT "Found free space at %llu, size %llu," -			      " trying to use %llu\n", -			      (unsigned long long)info->offset, -			      (unsigned long long)info->bytes, -			      (unsigned long long)bytes); -			WARN_ON(1); -			ret = -EINVAL; +			/* +			 * If we found a partial bit of our free space in a +			 * bitmap but then couldn't find the other part this may +			 * be a problem, so WARN about it. 
+			 */ +			WARN_ON(re_search);  			goto out_lock;  		} - -		info = next_info;  	} -	if (info->bytes == bytes) { -		unlink_free_space(block_group, info); -		if (info->bitmap) { -			kfree(info->bitmap); -			block_group->total_bitmaps--; -		} -		kfree(info); -		goto out_lock; -	} - -	if (!info->bitmap && info->offset == offset) { -		unlink_free_space(block_group, info); -		info->offset += bytes; -		info->bytes -= bytes; -		link_free_space(block_group, info); -		goto out_lock; -	} +	re_search = false; +	if (!info->bitmap) { +		unlink_free_space(ctl, info); +		if (offset == info->offset) { +			u64 to_free = min(bytes, info->bytes); + +			info->bytes -= to_free; +			info->offset += to_free; +			if (info->bytes) { +				ret = link_free_space(ctl, info); +				WARN_ON(ret); +			} else { +				kmem_cache_free(btrfs_free_space_cachep, info); +			} -	if (!info->bitmap && info->offset <= offset && -	    info->offset + info->bytes >= offset + bytes) { -		u64 old_start = info->offset; -		/* -		 * we're freeing space in the middle of the info, -		 * this can happen during tree log replay -		 * -		 * first unlink the old info and then -		 * insert it again after the hole we're creating -		 */ -		unlink_free_space(block_group, info); -		if (offset + bytes < info->offset + info->bytes) { -			u64 old_end = info->offset + info->bytes; +			offset += to_free; +			bytes -= to_free; +			goto again; +		} else { +			u64 old_end = info->bytes + info->offset; -			info->offset = offset + bytes; -			info->bytes = old_end - info->offset; -			ret = link_free_space(block_group, info); +			info->bytes = offset - info->offset; +			ret = link_free_space(ctl, info);  			WARN_ON(ret);  			if (ret)  				goto out_lock; -		} else { -			/* the hole we're creating ends at the end -			 * of the info struct, just free the info -			 */ -			kfree(info); -		} -		spin_unlock(&block_group->tree_lock); -		/* step two, insert a new info struct to cover -		 * anything before the hole -		 */ -		ret = btrfs_add_free_space(block_group, old_start, -					   offset - old_start); -		WARN_ON(ret); -		goto out; +			/* Not enough bytes in this entry to satisfy us */ +			if (old_end < offset + bytes) { +				bytes -= old_end - offset; +				offset = old_end; +				goto again; +			} else if (old_end == offset + bytes) { +				/* all done */ +				goto out_lock; +			} +			spin_unlock(&ctl->tree_lock); + +			ret = btrfs_add_free_space(block_group, offset + bytes, +						   old_end - (offset + bytes)); +			WARN_ON(ret); +			goto out; +		}  	} -	ret = remove_from_bitmap(block_group, info, &offset, &bytes); -	if (ret == -EAGAIN) +	ret = remove_from_bitmap(ctl, info, &offset, &bytes); +	if (ret == -EAGAIN) { +		re_search = true;  		goto again; -	BUG_ON(ret); +	}  out_lock: -	spin_unlock(&block_group->tree_lock); +	spin_unlock(&ctl->tree_lock);  out:  	return ret;  } @@ -1551,38 +2136,43 @@ out:  void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,  			   u64 bytes)  { +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	struct btrfs_free_space *info;  	struct rb_node *n;  	int count = 0; -	for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { +	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {  		info = rb_entry(n, struct btrfs_free_space, offset_index); -		if (info->bytes >= bytes) +		if (info->bytes >= bytes && !block_group->ro)  			count++; -		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", -		       (unsigned long long)info->offset, -		       (unsigned long long)info->bytes, +		
btrfs_crit(block_group->fs_info, +			   "entry offset %llu, bytes %llu, bitmap %s", +			   info->offset, info->bytes,  		       (info->bitmap) ? "yes" : "no");  	} -	printk(KERN_INFO "block group has cluster?: %s\n", +	btrfs_info(block_group->fs_info, "block group has cluster?: %s",  	       list_empty(&block_group->cluster_list) ? "no" : "yes"); -	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" -	       "\n", count); +	btrfs_info(block_group->fs_info, +		   "%d blocks of free space at or bigger than bytes is", count);  } -u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) +void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)  { -	struct btrfs_free_space *info; -	struct rb_node *n; -	u64 ret = 0; +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; -	for (n = rb_first(&block_group->free_space_offset); n; -	     n = rb_next(n)) { -		info = rb_entry(n, struct btrfs_free_space, offset_index); -		ret += info->bytes; -	} +	spin_lock_init(&ctl->tree_lock); +	ctl->unit = block_group->sectorsize; +	ctl->start = block_group->key.objectid; +	ctl->private = block_group; +	ctl->op = &free_space_op; -	return ret; +	/* +	 * we only want to have 32k of ram per block group for keeping +	 * track of free space, and if we pass 1/2 of that we want to +	 * start converting things over to using bitmaps +	 */ +	ctl->extents_thresh = ((1024 * 32) / 2) / +				sizeof(struct btrfs_free_space);  }  /* @@ -1596,31 +2186,31 @@ __btrfs_return_cluster_to_free_space(  			     struct btrfs_block_group_cache *block_group,  			     struct btrfs_free_cluster *cluster)  { +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	struct btrfs_free_space *entry;  	struct rb_node *node; -	bool bitmap;  	spin_lock(&cluster->lock);  	if (cluster->block_group != block_group)  		goto out; -	bitmap = cluster->points_to_bitmap;  	cluster->block_group = NULL;  	cluster->window_start = 0;  	list_del_init(&cluster->block_group_list); -	cluster->points_to_bitmap = false; - -	if (bitmap) -		goto out;  	node = rb_first(&cluster->root);  	while (node) { +		bool bitmap; +  		entry = rb_entry(node, struct btrfs_free_space, offset_index);  		node = rb_next(&entry->offset_index);  		rb_erase(&entry->offset_index, &cluster->root); -		BUG_ON(entry->bitmap); -		tree_insert_offset(&block_group->free_space_offset, -				   entry->offset, &entry->offset_index, 0); + +		bitmap = (entry->bitmap != NULL); +		if (!bitmap) +			try_merge_free_space(ctl, entry, false); +		tree_insert_offset(&ctl->free_space_offset, +				   entry->offset, &entry->offset_index, bitmap);  	}  	cluster->root = RB_ROOT; @@ -1630,14 +2220,42 @@ out:  	return 0;  } -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +static void __btrfs_remove_free_space_cache_locked( +				struct btrfs_free_space_ctl *ctl)  {  	struct btrfs_free_space *info;  	struct rb_node *node; + +	while ((node = rb_last(&ctl->free_space_offset)) != NULL) { +		info = rb_entry(node, struct btrfs_free_space, offset_index); +		if (!info->bitmap) { +			unlink_free_space(ctl, info); +			kmem_cache_free(btrfs_free_space_cachep, info); +		} else { +			free_bitmap(ctl, info); +		} +		if (need_resched()) { +			spin_unlock(&ctl->tree_lock); +			cond_resched(); +			spin_lock(&ctl->tree_lock); +		} +	} +} + +void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ +	spin_lock(&ctl->tree_lock); +	__btrfs_remove_free_space_cache_locked(ctl); +	spin_unlock(&ctl->tree_lock); +} + +void 
btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	struct btrfs_free_cluster *cluster;  	struct list_head *head; -	spin_lock(&block_group->tree_lock); +	spin_lock(&ctl->tree_lock);  	while ((head = block_group->cluster_list.next) !=  	       &block_group->cluster_list) {  		cluster = list_entry(head, struct btrfs_free_cluster, @@ -1646,63 +2264,57 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)  		WARN_ON(cluster->block_group != block_group);  		__btrfs_return_cluster_to_free_space(block_group, cluster);  		if (need_resched()) { -			spin_unlock(&block_group->tree_lock); -			cond_resched(); -			spin_lock(&block_group->tree_lock); -		} -	} - -	while ((node = rb_last(&block_group->free_space_offset)) != NULL) { -		info = rb_entry(node, struct btrfs_free_space, offset_index); -		unlink_free_space(block_group, info); -		if (info->bitmap) -			kfree(info->bitmap); -		kfree(info); -		if (need_resched()) { -			spin_unlock(&block_group->tree_lock); +			spin_unlock(&ctl->tree_lock);  			cond_resched(); -			spin_lock(&block_group->tree_lock); +			spin_lock(&ctl->tree_lock);  		}  	} +	__btrfs_remove_free_space_cache_locked(ctl); +	spin_unlock(&ctl->tree_lock); -	spin_unlock(&block_group->tree_lock);  }  u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, -			       u64 offset, u64 bytes, u64 empty_size) +			       u64 offset, u64 bytes, u64 empty_size, +			       u64 *max_extent_size)  { +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	struct btrfs_free_space *entry = NULL;  	u64 bytes_search = bytes + empty_size;  	u64 ret = 0; +	u64 align_gap = 0; +	u64 align_gap_len = 0; -	spin_lock(&block_group->tree_lock); -	entry = find_free_space(block_group, &offset, &bytes_search, 0); +	spin_lock(&ctl->tree_lock); +	entry = find_free_space(ctl, &offset, &bytes_search, +				block_group->full_stripe_len, max_extent_size);  	if (!entry)  		goto out;  	ret = offset;  	if (entry->bitmap) { -		bitmap_clear_bits(block_group, entry, offset, bytes); -		if (!entry->bytes) { -			unlink_free_space(block_group, entry); -			kfree(entry->bitmap); -			kfree(entry); -			block_group->total_bitmaps--; -			recalculate_thresholds(block_group); -		} +		bitmap_clear_bits(ctl, entry, offset, bytes); +		if (!entry->bytes) +			free_bitmap(ctl, entry);  	} else { -		unlink_free_space(block_group, entry); -		entry->offset += bytes; -		entry->bytes -= bytes; +		unlink_free_space(ctl, entry); +		align_gap_len = offset - entry->offset; +		align_gap = entry->offset; + +		entry->offset = offset + bytes; +		WARN_ON(entry->bytes < bytes + align_gap_len); + +		entry->bytes -= bytes + align_gap_len;  		if (!entry->bytes) -			kfree(entry); +			kmem_cache_free(btrfs_free_space_cachep, entry);  		else -			link_free_space(block_group, entry); +			link_free_space(ctl, entry);  	} -  out: -	spin_unlock(&block_group->tree_lock); +	spin_unlock(&ctl->tree_lock); +	if (align_gap_len) +		__btrfs_add_free_space(ctl, align_gap, align_gap_len);  	return ret;  } @@ -1718,6 +2330,7 @@ int btrfs_return_cluster_to_free_space(  			       struct btrfs_block_group_cache *block_group,  			       struct btrfs_free_cluster *cluster)  { +	struct btrfs_free_space_ctl *ctl;  	int ret;  	/* first, get a safe pointer to the block group */ @@ -1736,10 +2349,12 @@ int btrfs_return_cluster_to_free_space(  	atomic_inc(&block_group->count);  	spin_unlock(&cluster->lock); +	ctl = block_group->free_space_ctl; + 
 	/* now return any extents the cluster had on it */ -	spin_lock(&block_group->tree_lock); +	spin_lock(&ctl->tree_lock);  	ret = __btrfs_return_cluster_to_free_space(block_group, cluster); -	spin_unlock(&block_group->tree_lock); +	spin_unlock(&ctl->tree_lock);  	/* finally drop our ref */  	btrfs_put_block_group(block_group); @@ -1748,48 +2363,28 @@ int btrfs_return_cluster_to_free_space(  static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,  				   struct btrfs_free_cluster *cluster, -				   u64 bytes, u64 min_start) +				   struct btrfs_free_space *entry, +				   u64 bytes, u64 min_start, +				   u64 *max_extent_size)  { -	struct btrfs_free_space *entry; +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	int err;  	u64 search_start = cluster->window_start;  	u64 search_bytes = bytes;  	u64 ret = 0; -	spin_lock(&block_group->tree_lock); -	spin_lock(&cluster->lock); - -	if (!cluster->points_to_bitmap) -		goto out; - -	if (cluster->block_group != block_group) -		goto out; - -	/* -	 * search_start is the beginning of the bitmap, but at some point it may -	 * be a good idea to point to the actual start of the free area in the -	 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only -	 * to 1 to make sure we get the bitmap entry -	 */ -	entry = tree_search_offset(block_group, -				   offset_to_bitmap(block_group, search_start), -				   1, 0); -	if (!entry || !entry->bitmap) -		goto out; -  	search_start = min_start;  	search_bytes = bytes; -	err = search_bitmap(block_group, entry, &search_start, -			    &search_bytes); -	if (err) -		goto out; +	err = search_bitmap(ctl, entry, &search_start, &search_bytes); +	if (err) { +		if (search_bytes > *max_extent_size) +			*max_extent_size = search_bytes; +		return 0; +	}  	ret = search_start; -	bitmap_clear_bits(block_group, entry, ret, bytes); -out: -	spin_unlock(&cluster->lock); -	spin_unlock(&block_group->tree_lock); +	__bitmap_clear_bits(ctl, entry, ret, bytes);  	return ret;  } @@ -1801,16 +2396,13 @@ out:   */  u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,  			     struct btrfs_free_cluster *cluster, u64 bytes, -			     u64 min_start) +			     u64 min_start, u64 *max_extent_size)  { +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	struct btrfs_free_space *entry = NULL;  	struct rb_node *node;  	u64 ret = 0; -	if (cluster->points_to_bitmap) -		return btrfs_alloc_from_bitmap(block_group, cluster, bytes, -					       min_start); -  	spin_lock(&cluster->lock);  	if (bytes > cluster->max_size)  		goto out; @@ -1823,11 +2415,12 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,  		goto out;  	entry = rb_entry(node, struct btrfs_free_space, offset_index); +	while (1) { +		if (entry->bytes < bytes && entry->bytes > *max_extent_size) +			*max_extent_size = entry->bytes; -	while(1) { -		if (entry->bytes < bytes || entry->offset < min_start) { -			struct rb_node *node; - +		if (entry->bytes < bytes || +		    (!entry->bitmap && entry->offset < min_start)) {  			node = rb_next(&entry->offset_index);  			if (!node)  				break; @@ -1835,50 +2428,83 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,  					 offset_index);  			continue;  		} -		ret = entry->offset; -		entry->offset += bytes; -		entry->bytes -= bytes; +		if (entry->bitmap) { +			ret = btrfs_alloc_from_bitmap(block_group, +						      cluster, entry, bytes, +						      cluster->window_start, +						      max_extent_size); +			if (ret == 0) { 
+				node = rb_next(&entry->offset_index); +				if (!node) +					break; +				entry = rb_entry(node, struct btrfs_free_space, +						 offset_index); +				continue; +			} +			cluster->window_start += bytes; +		} else { +			ret = entry->offset; -		if (entry->bytes == 0) { -			rb_erase(&entry->offset_index, &cluster->root); -			kfree(entry); +			entry->offset += bytes; +			entry->bytes -= bytes;  		} + +		if (entry->bytes == 0) +			rb_erase(&entry->offset_index, &cluster->root);  		break;  	}  out:  	spin_unlock(&cluster->lock); +	if (!ret) +		return 0; + +	spin_lock(&ctl->tree_lock); + +	ctl->free_space -= bytes; +	if (entry->bytes == 0) { +		ctl->free_extents--; +		if (entry->bitmap) { +			kfree(entry->bitmap); +			ctl->total_bitmaps--; +			ctl->op->recalc_thresholds(ctl); +		} +		kmem_cache_free(btrfs_free_space_cachep, entry); +	} + +	spin_unlock(&ctl->tree_lock); +  	return ret;  }  static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,  				struct btrfs_free_space *entry,  				struct btrfs_free_cluster *cluster, -				u64 offset, u64 bytes, u64 min_bytes) +				u64 offset, u64 bytes, +				u64 cont1_bytes, u64 min_bytes)  { +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;  	unsigned long next_zero;  	unsigned long i; -	unsigned long search_bits; -	unsigned long total_bits; +	unsigned long want_bits; +	unsigned long min_bits;  	unsigned long found_bits;  	unsigned long start = 0;  	unsigned long total_found = 0; -	bool found = false; +	int ret; -	i = offset_to_bit(entry->offset, block_group->sectorsize, +	i = offset_to_bit(entry->offset, ctl->unit,  			  max_t(u64, offset, entry->offset)); -	search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); -	total_bits = bytes_to_bits(bytes, block_group->sectorsize); +	want_bits = bytes_to_bits(bytes, ctl->unit); +	min_bits = bytes_to_bits(min_bytes, ctl->unit);  again:  	found_bits = 0; -	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); -	     i < BITS_PER_BITMAP; -	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { +	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {  		next_zero = find_next_zero_bit(entry->bitmap,  					       BITS_PER_BITMAP, i); -		if (next_zero - i >= search_bits) { +		if (next_zero - i >= min_bits) {  			found_bits = next_zero - i;  			break;  		} @@ -1886,77 +2512,218 @@ again:  	}  	if (!found_bits) -		return -1; +		return -ENOSPC; -	if (!found) { +	if (!total_found) {  		start = i; -		found = true; +		cluster->max_size = 0;  	}  	total_found += found_bits; -	if (cluster->max_size < found_bits * block_group->sectorsize) -		cluster->max_size = found_bits * block_group->sectorsize; +	if (cluster->max_size < found_bits * ctl->unit) +		cluster->max_size = found_bits * ctl->unit; -	if (total_found < total_bits) { -		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); -		if (i - start > total_bits * 2) { -			total_found = 0; -			cluster->max_size = 0; -			found = false; -		} +	if (total_found < want_bits || cluster->max_size < cont1_bytes) { +		i = next_zero + 1;  		goto again;  	} -	cluster->window_start = start * block_group->sectorsize + -		entry->offset; -	cluster->points_to_bitmap = true; +	cluster->window_start = start * ctl->unit + entry->offset; +	rb_erase(&entry->offset_index, &ctl->free_space_offset); +	ret = tree_insert_offset(&cluster->root, entry->offset, +				 &entry->offset_index, 1); +	ASSERT(!ret); /* -EEXIST; Logic error */ +	trace_btrfs_setup_cluster(block_group, cluster, +				  total_found * ctl->unit, 1);  	return 0;  }  
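
The bitmap path above (search_bitmap() and btrfs_bitmap_cluster()) comes down to scanning a fixed-size bitmap for a run of set bits at least as long as the request, while remembering the longest run seen so it can be reported back on failure. A minimal userspace sketch of that scan, using illustrative helpers (test_bit_at(), find_set_run()) rather than the kernel's for_each_set_bit_from()/find_next_zero_bit(), might look like this:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define ULONG_BITS (8 * sizeof(unsigned long))

/* Test a single bit in a flat bitmap, least-significant bit first. */
static bool test_bit_at(const unsigned long *map, size_t i)
{
	return (map[i / ULONG_BITS] >> (i % ULONG_BITS)) & 1UL;
}

/*
 * Find the first run of set bits of at least 'want' bits, scanning from
 * 'start'.  On failure, *max_run holds the longest run seen, the same way
 * search_bitmap() hands the largest free chunk back through *bytes.
 */
static bool find_set_run(const unsigned long *map, size_t nbits, size_t start,
			 size_t want, size_t *run_start, size_t *run_len,
			 size_t *max_run)
{
	size_t i = start;

	*max_run = 0;
	while (i < nbits) {
		size_t end;

		if (!test_bit_at(map, i)) {	/* skip over clear bits */
			i++;
			continue;
		}
		end = i;
		while (end < nbits && test_bit_at(map, end))
			end++;			/* measure this run of set bits */

		if (end - i >= want) {
			*run_start = i;
			*run_len = end - i;
			return true;
		}
		if (end - i > *max_run)
			*max_run = end - i;
		i = end;			/* continue after the run */
	}
	return false;
}

int main(void)
{
	/* bits 1-2 and 8-12 set: only the second run is at least 4 bits long */
	unsigned long map[2] = { 0x1f06UL, 0 };
	size_t rs, rl, max;

	if (find_set_run(map, 2 * ULONG_BITS, 0, 4, &rs, &rl, &max))
		printf("run of %zu bits at bit %zu\n", rl, rs);
	return 0;
}

In the patched code the bit index and run length are then converted back to byte addresses using the bitmap entry's base offset and ctl->unit, i.e. *offset = (u64)(i * ctl->unit) + bitmap_info->offset and *bytes = (u64)(found_bits) * ctl->unit.
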
/* + * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. + */ +static noinline int +setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, +			struct btrfs_free_cluster *cluster, +			struct list_head *bitmaps, u64 offset, u64 bytes, +			u64 cont1_bytes, u64 min_bytes) +{ +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; +	struct btrfs_free_space *first = NULL; +	struct btrfs_free_space *entry = NULL; +	struct btrfs_free_space *last; +	struct rb_node *node; +	u64 window_free; +	u64 max_extent; +	u64 total_size = 0; + +	entry = tree_search_offset(ctl, offset, 0, 1); +	if (!entry) +		return -ENOSPC; + +	/* +	 * We don't want bitmaps, so just move along until we find a normal +	 * extent entry. +	 */ +	while (entry->bitmap || entry->bytes < min_bytes) { +		if (entry->bitmap && list_empty(&entry->list)) +			list_add_tail(&entry->list, bitmaps); +		node = rb_next(&entry->offset_index); +		if (!node) +			return -ENOSPC; +		entry = rb_entry(node, struct btrfs_free_space, offset_index); +	} + +	window_free = entry->bytes; +	max_extent = entry->bytes; +	first = entry; +	last = entry; + +	for (node = rb_next(&entry->offset_index); node; +	     node = rb_next(&entry->offset_index)) { +		entry = rb_entry(node, struct btrfs_free_space, offset_index); + +		if (entry->bitmap) { +			if (list_empty(&entry->list)) +				list_add_tail(&entry->list, bitmaps); +			continue; +		} + +		if (entry->bytes < min_bytes) +			continue; + +		last = entry; +		window_free += entry->bytes; +		if (entry->bytes > max_extent) +			max_extent = entry->bytes; +	} + +	if (window_free < bytes || max_extent < cont1_bytes) +		return -ENOSPC; + +	cluster->window_start = first->offset; + +	node = &first->offset_index; + +	/* +	 * now we've found our entries, pull them out of the free space +	 * cache and put them into the cluster rbtree +	 */ +	do { +		int ret; + +		entry = rb_entry(node, struct btrfs_free_space, offset_index); +		node = rb_next(&entry->offset_index); +		if (entry->bitmap || entry->bytes < min_bytes) +			continue; + +		rb_erase(&entry->offset_index, &ctl->free_space_offset); +		ret = tree_insert_offset(&cluster->root, entry->offset, +					 &entry->offset_index, 0); +		total_size += entry->bytes; +		ASSERT(!ret); /* -EEXIST; Logic error */ +	} while (node && entry != last); + +	cluster->max_size = max_extent; +	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); +	return 0; +} + +/* + * This specifically looks for bitmaps that may work in the cluster, we assume + * that we have already failed to find extents that will work. + */ +static noinline int +setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, +		     struct btrfs_free_cluster *cluster, +		     struct list_head *bitmaps, u64 offset, u64 bytes, +		     u64 cont1_bytes, u64 min_bytes) +{ +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; +	struct btrfs_free_space *entry; +	int ret = -ENOSPC; +	u64 bitmap_offset = offset_to_bitmap(ctl, offset); + +	if (ctl->total_bitmaps == 0) +		return -ENOSPC; + +	/* +	 * The bitmap that covers offset won't be in the list unless offset +	 * is just its start offset. 
+	 */ +	entry = list_first_entry(bitmaps, struct btrfs_free_space, list); +	if (entry->offset != bitmap_offset) { +		entry = tree_search_offset(ctl, bitmap_offset, 1, 0); +		if (entry && list_empty(&entry->list)) +			list_add(&entry->list, bitmaps); +	} + +	list_for_each_entry(entry, bitmaps, list) { +		if (entry->bytes < bytes) +			continue; +		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, +					   bytes, cont1_bytes, min_bytes); +		if (!ret) +			return 0; +	} + +	/* +	 * The bitmaps list has all the bitmaps that record free space +	 * starting after offset, so no more search is required. +	 */ +	return -ENOSPC; +} + +/*   * here we try to find a cluster of blocks in a block group.  The goal - * is to find at least bytes free and up to empty_size + bytes free. + * is to find at least bytes+empty_size.   * We might not find them all in one contiguous area.   *   * returns zero and sets up cluster if things worked out, otherwise   * it returns -enospc   */ -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, -			     struct btrfs_root *root, +int btrfs_find_space_cluster(struct btrfs_root *root,  			     struct btrfs_block_group_cache *block_group,  			     struct btrfs_free_cluster *cluster,  			     u64 offset, u64 bytes, u64 empty_size)  { -	struct btrfs_free_space *entry = NULL; -	struct rb_node *node; -	struct btrfs_free_space *next; -	struct btrfs_free_space *last = NULL; +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; +	struct btrfs_free_space *entry, *tmp; +	LIST_HEAD(bitmaps);  	u64 min_bytes; -	u64 window_start; -	u64 window_free; -	u64 max_extent = 0; -	bool found_bitmap = false; +	u64 cont1_bytes;  	int ret; -	/* for metadata, allow allocates with more holes */ +	/* +	 * Choose the minimum extent size we'll require for this +	 * cluster.  For SSD_SPREAD, don't allow any fragmentation. +	 * For metadata, allow allocates with smaller extents.  For +	 * data, keep it dense. +	 */  	if (btrfs_test_opt(root, SSD_SPREAD)) { -		min_bytes = bytes + empty_size; +		cont1_bytes = min_bytes = bytes + empty_size;  	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { -		/* -		 * we want to do larger allocations when we are -		 * flushing out the delayed refs, it helps prevent -		 * making more work as we go along. -		 */ -		if (trans->transaction->delayed_refs.flushing) -			min_bytes = max(bytes, (bytes + empty_size) >> 1); -		else -			min_bytes = max(bytes, (bytes + empty_size) >> 4); -	} else -		min_bytes = max(bytes, (bytes + empty_size) >> 2); +		cont1_bytes = bytes; +		min_bytes = block_group->sectorsize; +	} else { +		cont1_bytes = max(bytes, (bytes + empty_size) >> 2); +		min_bytes = block_group->sectorsize; +	} + +	spin_lock(&ctl->tree_lock); + +	/* +	 * If we know we don't have enough space to make a cluster don't even +	 * bother doing all the work to try and find one. 
+	 */ +	if (ctl->free_space < bytes) { +		spin_unlock(&ctl->tree_lock); +		return -ENOSPC; +	} -	spin_lock(&block_group->tree_lock);  	spin_lock(&cluster->lock);  	/* someone already found a cluster, hooray */ @@ -1964,153 +2731,547 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,  		ret = 0;  		goto out;  	} -again: -	entry = tree_search_offset(block_group, offset, found_bitmap, 1); -	if (!entry) { -		ret = -ENOSPC; -		goto out; + +	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, +				 min_bytes); + +	INIT_LIST_HEAD(&bitmaps); +	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, +				      bytes + empty_size, +				      cont1_bytes, min_bytes); +	if (ret) +		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, +					   offset, bytes + empty_size, +					   cont1_bytes, min_bytes); + +	/* Clear our temporary list */ +	list_for_each_entry_safe(entry, tmp, &bitmaps, list) +		list_del_init(&entry->list); + +	if (!ret) { +		atomic_inc(&block_group->count); +		list_add_tail(&cluster->block_group_list, +			      &block_group->cluster_list); +		cluster->block_group = block_group; +	} else { +		trace_btrfs_failed_cluster_setup(block_group);  	} +out: +	spin_unlock(&cluster->lock); +	spin_unlock(&ctl->tree_lock); -	/* -	 * If found_bitmap is true, we exhausted our search for extent entries, -	 * and we just want to search all of the bitmaps that we can find, and -	 * ignore any extent entries we find. -	 */ -	while (entry->bitmap || found_bitmap || -	       (!entry->bitmap && entry->bytes < min_bytes)) { -		struct rb_node *node = rb_next(&entry->offset_index); - -		if (entry->bitmap && entry->bytes > bytes + empty_size) { -			ret = btrfs_bitmap_cluster(block_group, entry, cluster, -						   offset, bytes + empty_size, -						   min_bytes); -			if (!ret) -				goto got_it; -		} +	return ret; +} -		if (!node) { -			ret = -ENOSPC; -			goto out; -		} -		entry = rb_entry(node, struct btrfs_free_space, offset_index); +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ +	spin_lock_init(&cluster->lock); +	spin_lock_init(&cluster->refill_lock); +	cluster->root = RB_ROOT; +	cluster->max_size = 0; +	INIT_LIST_HEAD(&cluster->block_group_list); +	cluster->block_group = NULL; +} + +static int do_trimming(struct btrfs_block_group_cache *block_group, +		       u64 *total_trimmed, u64 start, u64 bytes, +		       u64 reserved_start, u64 reserved_bytes) +{ +	struct btrfs_space_info *space_info = block_group->space_info; +	struct btrfs_fs_info *fs_info = block_group->fs_info; +	int ret; +	int update = 0; +	u64 trimmed = 0; + +	spin_lock(&space_info->lock); +	spin_lock(&block_group->lock); +	if (!block_group->ro) { +		block_group->reserved += reserved_bytes; +		space_info->bytes_reserved += reserved_bytes; +		update = 1;  	} +	spin_unlock(&block_group->lock); +	spin_unlock(&space_info->lock); -	/* -	 * We already searched all the extent entries from the passed in offset -	 * to the end and didn't find enough space for the cluster, and we also -	 * didn't find any bitmaps that met our criteria, just go ahead and exit -	 */ -	if (found_bitmap) { -		ret = -ENOSPC; -		goto out; +	ret = btrfs_error_discard_extent(fs_info->extent_root, +					 start, bytes, &trimmed); +	if (!ret) +		*total_trimmed += trimmed; + +	btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + +	if (update) { +		spin_lock(&space_info->lock); +		spin_lock(&block_group->lock); +		if (block_group->ro) +			
space_info->bytes_readonly += reserved_bytes; +		block_group->reserved -= reserved_bytes; +		space_info->bytes_reserved -= reserved_bytes; +		spin_unlock(&space_info->lock); +		spin_unlock(&block_group->lock);  	} -	cluster->points_to_bitmap = false; -	window_start = entry->offset; -	window_free = entry->bytes; -	last = entry; -	max_extent = entry->bytes; +	return ret; +} -	while (1) { -		/* out window is just right, lets fill it */ -		if (window_free >= bytes + empty_size) +static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, +			  u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; +	struct btrfs_free_space *entry; +	struct rb_node *node; +	int ret = 0; +	u64 extent_start; +	u64 extent_bytes; +	u64 bytes; + +	while (start < end) { +		spin_lock(&ctl->tree_lock); + +		if (ctl->free_space < minlen) { +			spin_unlock(&ctl->tree_lock);  			break; +		} -		node = rb_next(&last->offset_index); -		if (!node) { -			if (found_bitmap) -				goto again; -			ret = -ENOSPC; -			goto out; +		entry = tree_search_offset(ctl, start, 0, 1); +		if (!entry) { +			spin_unlock(&ctl->tree_lock); +			break;  		} -		next = rb_entry(node, struct btrfs_free_space, offset_index); -		/* -		 * we found a bitmap, so if this search doesn't result in a -		 * cluster, we know to go and search again for the bitmaps and -		 * start looking for space there -		 */ -		if (next->bitmap) { -			if (!found_bitmap) -				offset = next->offset; -			found_bitmap = true; -			last = next; -			continue; +		/* skip bitmaps */ +		while (entry->bitmap) { +			node = rb_next(&entry->offset_index); +			if (!node) { +				spin_unlock(&ctl->tree_lock); +				goto out; +			} +			entry = rb_entry(node, struct btrfs_free_space, +					 offset_index);  		} -		/* -		 * we haven't filled the empty size and the window is -		 * very large.  
reset and try again -		 */ -		if (next->offset - (last->offset + last->bytes) > 128 * 1024 || -		    next->offset - window_start > (bytes + empty_size) * 2) { -			entry = next; -			window_start = entry->offset; -			window_free = entry->bytes; -			last = entry; -			max_extent = entry->bytes; -		} else { -			last = next; -			window_free += next->bytes; -			if (entry->bytes > max_extent) -				max_extent = entry->bytes; +		if (entry->offset >= end) { +			spin_unlock(&ctl->tree_lock); +			break; +		} + +		extent_start = entry->offset; +		extent_bytes = entry->bytes; +		start = max(start, extent_start); +		bytes = min(extent_start + extent_bytes, end) - start; +		if (bytes < minlen) { +			spin_unlock(&ctl->tree_lock); +			goto next;  		} + +		unlink_free_space(ctl, entry); +		kmem_cache_free(btrfs_free_space_cachep, entry); + +		spin_unlock(&ctl->tree_lock); + +		ret = do_trimming(block_group, total_trimmed, start, bytes, +				  extent_start, extent_bytes); +		if (ret) +			break; +next: +		start += bytes; + +		if (fatal_signal_pending(current)) { +			ret = -ERESTARTSYS; +			break; +		} + +		cond_resched();  	} +out: +	return ret; +} -	cluster->window_start = entry->offset; +static int trim_bitmaps(struct btrfs_block_group_cache *block_group, +			u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; +	struct btrfs_free_space *entry; +	int ret = 0; +	int ret2; +	u64 bytes; +	u64 offset = offset_to_bitmap(ctl, start); -	/* -	 * now we've found our entries, pull them out of the free space -	 * cache and put them into the cluster rbtree -	 * -	 * The cluster includes an rbtree, but only uses the offset index -	 * of each free space cache entry. -	 */ -	while (1) { -		node = rb_next(&entry->offset_index); -		if (entry->bitmap && node) { -			entry = rb_entry(node, struct btrfs_free_space, -					 offset_index); -			continue; -		} else if (entry->bitmap && !node) { +	while (offset < end) { +		bool next_bitmap = false; + +		spin_lock(&ctl->tree_lock); + +		if (ctl->free_space < minlen) { +			spin_unlock(&ctl->tree_lock);  			break;  		} -		rb_erase(&entry->offset_index, &block_group->free_space_offset); -		ret = tree_insert_offset(&cluster->root, entry->offset, -					 &entry->offset_index, 0); -		BUG_ON(ret); +		entry = tree_search_offset(ctl, offset, 1, 0); +		if (!entry) { +			spin_unlock(&ctl->tree_lock); +			next_bitmap = true; +			goto next; +		} -		if (!node || entry == last) +		bytes = minlen; +		ret2 = search_bitmap(ctl, entry, &start, &bytes); +		if (ret2 || start >= end) { +			spin_unlock(&ctl->tree_lock); +			next_bitmap = true; +			goto next; +		} + +		bytes = min(bytes, end - start); +		if (bytes < minlen) { +			spin_unlock(&ctl->tree_lock); +			goto next; +		} + +		bitmap_clear_bits(ctl, entry, start, bytes); +		if (entry->bytes == 0) +			free_bitmap(ctl, entry); + +		spin_unlock(&ctl->tree_lock); + +		ret = do_trimming(block_group, total_trimmed, start, bytes, +				  start, bytes); +		if (ret) +			break; +next: +		if (next_bitmap) { +			offset += BITS_PER_BITMAP * ctl->unit; +		} else { +			start += bytes; +			if (start >= offset + BITS_PER_BITMAP * ctl->unit) +				offset += BITS_PER_BITMAP * ctl->unit; +		} + +		if (fatal_signal_pending(current)) { +			ret = -ERESTARTSYS;  			break; +		} -		entry = rb_entry(node, struct btrfs_free_space, offset_index); +		cond_resched();  	} -	cluster->max_size = max_extent; -got_it: -	ret = 0; -	atomic_inc(&block_group->count); -	list_add_tail(&cluster->block_group_list, 
&block_group->cluster_list); -	cluster->block_group = block_group; +	return ret; +} + +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, +			   u64 *trimmed, u64 start, u64 end, u64 minlen) +{ +	int ret; + +	*trimmed = 0; + +	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); +	if (ret) +		return ret; + +	ret = trim_bitmaps(block_group, trimmed, start, end, minlen); + +	return ret; +} + +/* + * Find the left-most item in the cache tree, and then return the + * smallest inode number in the item. + * + * Note: the returned inode number may not be the smallest one in + * the tree, if the left-most item is a bitmap. + */ +u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) +{ +	struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; +	struct btrfs_free_space *entry = NULL; +	u64 ino = 0; + +	spin_lock(&ctl->tree_lock); + +	if (RB_EMPTY_ROOT(&ctl->free_space_offset)) +		goto out; + +	entry = rb_entry(rb_first(&ctl->free_space_offset), +			 struct btrfs_free_space, offset_index); + +	if (!entry->bitmap) { +		ino = entry->offset; + +		unlink_free_space(ctl, entry); +		entry->offset++; +		entry->bytes--; +		if (!entry->bytes) +			kmem_cache_free(btrfs_free_space_cachep, entry); +		else +			link_free_space(ctl, entry); +	} else { +		u64 offset = 0; +		u64 count = 1; +		int ret; + +		ret = search_bitmap(ctl, entry, &offset, &count); +		/* Logic error; Should be empty if it can't find anything */ +		ASSERT(!ret); + +		ino = offset; +		bitmap_clear_bits(ctl, entry, offset, 1); +		if (entry->bytes == 0) +			free_bitmap(ctl, entry); +	}  out: -	spin_unlock(&cluster->lock); -	spin_unlock(&block_group->tree_lock); +	spin_unlock(&ctl->tree_lock); + +	return ino; +} + +struct inode *lookup_free_ino_inode(struct btrfs_root *root, +				    struct btrfs_path *path) +{ +	struct inode *inode = NULL; + +	spin_lock(&root->cache_lock); +	if (root->cache_inode) +		inode = igrab(root->cache_inode); +	spin_unlock(&root->cache_lock); +	if (inode) +		return inode; + +	inode = __lookup_free_space_inode(root, path, 0); +	if (IS_ERR(inode)) +		return inode; + +	spin_lock(&root->cache_lock); +	if (!btrfs_fs_closing(root->fs_info)) +		root->cache_inode = igrab(inode); +	spin_unlock(&root->cache_lock); + +	return inode; +} + +int create_free_ino_inode(struct btrfs_root *root, +			  struct btrfs_trans_handle *trans, +			  struct btrfs_path *path) +{ +	return __create_free_space_inode(root, trans, path, +					 BTRFS_FREE_INO_OBJECTID, 0); +} + +int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ +	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; +	struct btrfs_path *path; +	struct inode *inode; +	int ret = 0; +	u64 root_gen = btrfs_root_generation(&root->root_item); + +	if (!btrfs_test_opt(root, INODE_MAP_CACHE)) +		return 0; + +	/* +	 * If we're unmounting then just return, since this does a search on the +	 * normal root and not the commit root and we could deadlock. 
+	 */ +	if (btrfs_fs_closing(fs_info)) +		return 0; +	path = btrfs_alloc_path(); +	if (!path) +		return 0; + +	inode = lookup_free_ino_inode(root, path); +	if (IS_ERR(inode)) +		goto out; + +	if (root_gen != BTRFS_I(inode)->generation) +		goto out_put; + +	ret = __load_free_space_cache(root, inode, ctl, path, 0); + +	if (ret < 0) +		btrfs_err(fs_info, +			"failed to load free ino cache for root %llu", +			root->root_key.objectid); +out_put: +	iput(inode); +out: +	btrfs_free_path(path);  	return ret;  } +int btrfs_write_out_ino_cache(struct btrfs_root *root, +			      struct btrfs_trans_handle *trans, +			      struct btrfs_path *path, +			      struct inode *inode) +{ +	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; +	int ret; + +	if (!btrfs_test_opt(root, INODE_MAP_CACHE)) +		return 0; + +	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); +	if (ret) { +		btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG +		btrfs_err(root->fs_info, +			"failed to write free ino cache for root %llu", +			root->root_key.objectid); +#endif +	} + +	return ret; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS  /* - * simple code to zero out a cluster + * Use this if you need to make a bitmap or extent entry specifically, it + * doesn't do any of the merging that add_free_space does, this acts a lot like + * how the free space cache loading stuff works, so you can get really weird + * configurations.   */ -void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +int test_add_free_space_entry(struct btrfs_block_group_cache *cache, +			      u64 offset, u64 bytes, bool bitmap)  { -	spin_lock_init(&cluster->lock); -	spin_lock_init(&cluster->refill_lock); -	cluster->root = RB_ROOT; -	cluster->max_size = 0; -	cluster->points_to_bitmap = false; -	INIT_LIST_HEAD(&cluster->block_group_list); -	cluster->block_group = NULL; +	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; +	struct btrfs_free_space *info = NULL, *bitmap_info; +	void *map = NULL; +	u64 bytes_added; +	int ret; + +again: +	if (!info) { +		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); +		if (!info) +			return -ENOMEM; +	} + +	if (!bitmap) { +		spin_lock(&ctl->tree_lock); +		info->offset = offset; +		info->bytes = bytes; +		ret = link_free_space(ctl, info); +		spin_unlock(&ctl->tree_lock); +		if (ret) +			kmem_cache_free(btrfs_free_space_cachep, info); +		return ret; +	} + +	if (!map) { +		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); +		if (!map) { +			kmem_cache_free(btrfs_free_space_cachep, info); +			return -ENOMEM; +		} +	} + +	spin_lock(&ctl->tree_lock); +	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), +					 1, 0); +	if (!bitmap_info) { +		info->bitmap = map; +		map = NULL; +		add_new_bitmap(ctl, info, offset); +		bitmap_info = info; +	} + +	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); +	bytes -= bytes_added; +	offset += bytes_added; +	spin_unlock(&ctl->tree_lock); + +	if (bytes) +		goto again; + +	if (map) +		kfree(map); +	return 0;  } +/* + * Checks to see if the given range is in the free space cache.  This is really + * just used to check the absence of space, so if there is free space in the + * range at all we will return 1. 
+ */ +int test_check_exists(struct btrfs_block_group_cache *cache, +		      u64 offset, u64 bytes) +{ +	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; +	struct btrfs_free_space *info; +	int ret = 0; + +	spin_lock(&ctl->tree_lock); +	info = tree_search_offset(ctl, offset, 0, 0); +	if (!info) { +		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), +					  1, 0); +		if (!info) +			goto out; +	} + +have_info: +	if (info->bitmap) { +		u64 bit_off, bit_bytes; +		struct rb_node *n; +		struct btrfs_free_space *tmp; + +		bit_off = offset; +		bit_bytes = ctl->unit; +		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); +		if (!ret) { +			if (bit_off == offset) { +				ret = 1; +				goto out; +			} else if (bit_off > offset && +				   offset + bytes > bit_off) { +				ret = 1; +				goto out; +			} +		} + +		n = rb_prev(&info->offset_index); +		while (n) { +			tmp = rb_entry(n, struct btrfs_free_space, +				       offset_index); +			if (tmp->offset + tmp->bytes < offset) +				break; +			if (offset + bytes < tmp->offset) { +				n = rb_prev(&info->offset_index); +				continue; +			} +			info = tmp; +			goto have_info; +		} + +		n = rb_next(&info->offset_index); +		while (n) { +			tmp = rb_entry(n, struct btrfs_free_space, +				       offset_index); +			if (offset + bytes < tmp->offset) +				break; +			if (tmp->offset + tmp->bytes < offset) { +				n = rb_next(&info->offset_index); +				continue; +			} +			info = tmp; +			goto have_info; +		} + +		goto out; +	} + +	if (info->offset == offset) { +		ret = 1; +		goto out; +	} + +	if (offset > info->offset && offset < info->offset + info->bytes) +		ret = 1; +out: +	spin_unlock(&ctl->tree_lock); +	return ret; +} +#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */  | 
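
For plain extent entries, test_check_exists() above is ultimately answering an interval question: does the queried range share any bytes with a free-space entry? A hedged, self-contained illustration of that half-open overlap test (ranges_overlap() is an example name, not a kernel helper):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Half-open ranges [a_off, a_off + a_len) and [b_off, b_off + b_len)
 * overlap iff each one starts before the other one ends.
 */
static bool ranges_overlap(uint64_t a_off, uint64_t a_len,
			   uint64_t b_off, uint64_t b_len)
{
	return a_off < b_off + b_len && b_off < a_off + a_len;
}

int main(void)
{
	/* a 4 KiB query at 64 KiB against a free extent [60 KiB, 68 KiB) */
	printf("%d\n", ranges_overlap(65536, 4096, 61440, 8192));
	return 0;
}

Bitmap entries cannot be answered with a single comparison, which is why the function re-runs search_bitmap() and walks the neighbouring rbtree nodes with rb_prev()/rb_next() before concluding that the range holds no free space.
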

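One detail worth spelling out from find_free_space() earlier in this diff is the alignment arithmetic: a candidate offset is rounded up to the next multiple of the requested alignment measured from ctl->start, and the gap created by that rounding (align_gap/align_gap_len in btrfs_find_space_for_alloc()) is handed back afterwards through __btrfs_add_free_space(). A small sketch of just that rounding, with illustrative names and an assumed 64 KiB alignment (the kernel passes block_group->full_stripe_len):

#include <stdint.h>
#include <stdio.h>

/*
 * Round 'offset' up to the next multiple of 'align', counting multiples
 * from 'base' (the block group start); the same arithmetic as
 * tmp = entry->offset - ctl->start + align - 1; do_div(tmp, align);
 * tmp = tmp * align + ctl->start; in find_free_space().
 */
static uint64_t round_up_from_base(uint64_t offset, uint64_t base,
				   uint64_t align)
{
	uint64_t rel = offset - base + align - 1;

	rel -= rel % align;	/* keep the do_div() quotient times align */
	return rel + base;
}

int main(void)
{
	uint64_t base = 1024 * 1024;	/* block group starts at 1 MiB */
	uint64_t align = 64 * 1024;	/* assumed full-stripe length */
	uint64_t offset = base + 5000;	/* unaligned candidate offset */

	printf("%llu\n",
	       (unsigned long long)round_up_from_base(offset, base, align));
	return 0;
}

This prints 1114112, i.e. the candidate is pushed from base + 5000 up to base + 64 KiB. In the patch, align_off is then tmp - entry->offset, the slack introduced by the rounding, which is why an entry must hold at least *bytes + align_off to satisfy the request.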