Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--   fs/btrfs/inode.c | 6791
1 file changed, 4391 insertions(+), 2400 deletions(-)
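A pattern worth noting before the patch body: many hunks below swap open-coded round-up arithmetic such as (end - start + blocksize) & ~(blocksize - 1) for the kernel's ALIGN() macro. A minimal userspace sketch of the equivalence, assuming the alignment is a power of two (which holds for btrfs sector and page sizes); the ALIGN definition below mirrors the kernel macro but is a local stand-in, not the kernel header's:

#include <assert.h>
#include <stdint.h>

/* Local stand-in for the kernel's ALIGN(); valid for power-of-two 'a'. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t blocksize = 4096;

	/* Rounds up to the next multiple of blocksize... */
	assert(ALIGN(1, blocksize) == 4096);
	/* ...and is a no-op on already-aligned values. */
	assert(ALIGN(4096, blocksize) == 4096);
	assert(ALIGN(4097, blocksize) == 8192);

	/* The open-coded form this patch removes computes the same thing. */
	uint64_t start = 4096, end = 8191;
	assert(((end - start + blocksize) & ~(blocksize - 1)) ==
	       ALIGN(end - start + 1, blocksize));
	return 0;
}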
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 558cac2dfa5..3668048e16f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,27 +32,36 @@  #include <linux/writeback.h>  #include <linux/statfs.h>  #include <linux/compat.h> +#include <linux/aio.h>  #include <linux/bit_spinlock.h>  #include <linux/xattr.h>  #include <linux/posix_acl.h>  #include <linux/falloc.h>  #include <linux/slab.h> -#include "compat.h" +#include <linux/ratelimit.h> +#include <linux/mount.h> +#include <linux/btrfs.h> +#include <linux/blkdev.h> +#include <linux/posix_acl_xattr.h>  #include "ctree.h"  #include "disk-io.h"  #include "transaction.h"  #include "btrfs_inode.h" -#include "ioctl.h"  #include "print-tree.h" -#include "volumes.h"  #include "ordered-data.h"  #include "xattr.h"  #include "tree-log.h" +#include "volumes.h"  #include "compression.h"  #include "locking.h" +#include "free-space-cache.h" +#include "inode-map.h" +#include "backref.h" +#include "hash.h" +#include "props.h"  struct btrfs_iget_args { -	u64 ino; +	struct btrfs_key *location;  	struct btrfs_root *root;  }; @@ -67,9 +76,11 @@ static const struct file_operations btrfs_dir_file_operations;  static struct extent_io_ops btrfs_extent_io_ops;  static struct kmem_cache *btrfs_inode_cachep; +static struct kmem_cache *btrfs_delalloc_work_cachep;  struct kmem_cache *btrfs_trans_handle_cachep;  struct kmem_cache *btrfs_transaction_cachep;  struct kmem_cache *btrfs_path_cachep; +struct kmem_cache *btrfs_free_space_cachep;  #define S_SHIFT 12  static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { @@ -82,21 +93,30 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {  	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,  }; -static void btrfs_truncate(struct inode *inode); -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static int btrfs_setsize(struct inode *inode, struct iattr *attr); +static int btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);  static noinline int cow_file_range(struct inode *inode,  				   struct page *locked_page,  				   u64 start, u64 end, int *page_started,  				   unsigned long *nr_written, int unlock); +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, +					   u64 len, u64 orig_start, +					   u64 block_start, u64 block_len, +					   u64 orig_block_len, u64 ram_bytes, +					   int type); + +static int btrfs_dirty_inode(struct inode *inode);  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, -				     struct inode *inode,  struct inode *dir) +				     struct inode *inode,  struct inode *dir, +				     const struct qstr *qstr)  {  	int err;  	err = btrfs_init_acl(trans, inode, dir);  	if (!err) -		err = btrfs_xattr_security_init(trans, inode, dir); +		err = btrfs_xattr_security_init(trans, inode, dir, qstr);  	return err;  } @@ -105,13 +125,13 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,   * the btree.  
The caller should have done a btrfs_drop_extents so that   * no overlapping inline items exist in the btree   */ -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, +static int insert_inline_extent(struct btrfs_trans_handle *trans, +				struct btrfs_path *path, int extent_inserted,  				struct btrfs_root *root, struct inode *inode,  				u64 start, size_t size, size_t compressed_size, +				int compress_type,  				struct page **compressed_pages)  { -	struct btrfs_key key; -	struct btrfs_path *path;  	struct extent_buffer *leaf;  	struct page *page = NULL;  	char *kaddr; @@ -120,34 +140,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,  	int err = 0;  	int ret;  	size_t cur_size = size; -	size_t datasize;  	unsigned long offset; -	int use_compress = 0; -	if (compressed_size && compressed_pages) { -		use_compress = 1; +	if (compressed_size && compressed_pages)  		cur_size = compressed_size; -	} -	path = btrfs_alloc_path(); -	if (!path) -		return -ENOMEM; +	inode_add_bytes(inode, size); -	path->leave_spinning = 1; -	btrfs_set_trans_block_group(trans, inode); +	if (!extent_inserted) { +		struct btrfs_key key; +		size_t datasize; -	key.objectid = inode->i_ino; -	key.offset = start; -	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); -	datasize = btrfs_file_extent_calc_inline_size(cur_size); +		key.objectid = btrfs_ino(inode); +		key.offset = start; +		btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); -	inode_add_bytes(inode, size); -	ret = btrfs_insert_empty_item(trans, root, path, &key, -				      datasize); -	BUG_ON(ret); -	if (ret) { -		err = ret; -		goto fail; +		datasize = btrfs_file_extent_calc_inline_size(cur_size); +		path->leave_spinning = 1; +		ret = btrfs_insert_empty_item(trans, root, path, &key, +					      datasize); +		if (ret) { +			err = ret; +			goto fail; +		}  	}  	leaf = path->nodes[0];  	ei = btrfs_item_ptr(leaf, path->slots[0], @@ -159,7 +174,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,  	btrfs_set_file_extent_ram_bytes(leaf, ei, size);  	ptr = btrfs_file_extent_inline_start(ei); -	if (use_compress) { +	if (compress_type != BTRFS_COMPRESS_NONE) {  		struct page *cpage;  		int i = 0;  		while (compressed_size > 0) { @@ -167,28 +182,28 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,  			cur_size = min_t(unsigned long, compressed_size,  				       PAGE_CACHE_SIZE); -			kaddr = kmap_atomic(cpage, KM_USER0); +			kaddr = kmap_atomic(cpage);  			write_extent_buffer(leaf, kaddr, ptr, cur_size); -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			i++;  			ptr += cur_size;  			compressed_size -= cur_size;  		}  		btrfs_set_file_extent_compression(leaf, ei, -						  BTRFS_COMPRESS_ZLIB); +						  compress_type);  	} else {  		page = find_get_page(inode->i_mapping,  				     start >> PAGE_CACHE_SHIFT);  		btrfs_set_file_extent_compression(leaf, ei, 0); -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		offset = start & (PAGE_CACHE_SIZE - 1);  		write_extent_buffer(leaf, kaddr + offset, ptr, size); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		page_cache_release(page);  	}  	btrfs_mark_buffer_dirty(leaf); -	btrfs_free_path(path); +	btrfs_release_path(path);  	/*  	 * we're an inline extent, so nobody can @@ -200,11 +215,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,  	 * could end up racing with unlink.  	 
*/  	BTRFS_I(inode)->disk_i_size = inode->i_size; -	btrfs_update_inode(trans, root, inode); +	ret = btrfs_update_inode(trans, root, inode); -	return 0; +	return ret;  fail: -	btrfs_free_path(path);  	return err;  } @@ -214,20 +228,22 @@ fail:   * does the checks required to make sure the data is small enough   * to fit as an inline extent.   */ -static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, -				 struct btrfs_root *root, -				 struct inode *inode, u64 start, u64 end, -				 size_t compressed_size, -				 struct page **compressed_pages) +static noinline int cow_file_range_inline(struct btrfs_root *root, +					  struct inode *inode, u64 start, +					  u64 end, size_t compressed_size, +					  int compress_type, +					  struct page **compressed_pages)  { +	struct btrfs_trans_handle *trans;  	u64 isize = i_size_read(inode);  	u64 actual_end = min(end + 1, isize);  	u64 inline_len = actual_end - start; -	u64 aligned_end = (end + root->sectorsize - 1) & -			~((u64)root->sectorsize - 1); -	u64 hint_byte; +	u64 aligned_end = ALIGN(end, root->sectorsize);  	u64 data_len = inline_len;  	int ret; +	struct btrfs_path *path; +	int extent_inserted = 0; +	u32 extent_item_size;  	if (compressed_size)  		data_len = compressed_size; @@ -242,19 +258,53 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,  		return 1;  	} -	ret = btrfs_drop_extents(trans, inode, start, aligned_end, -				 &hint_byte, 1); -	BUG_ON(ret); +	path = btrfs_alloc_path(); +	if (!path) +		return -ENOMEM; + +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) { +		btrfs_free_path(path); +		return PTR_ERR(trans); +	} +	trans->block_rsv = &root->fs_info->delalloc_block_rsv; + +	if (compressed_size && compressed_pages) +		extent_item_size = btrfs_file_extent_calc_inline_size( +		   compressed_size); +	else +		extent_item_size = btrfs_file_extent_calc_inline_size( +		    inline_len); + +	ret = __btrfs_drop_extents(trans, root, inode, path, +				   start, aligned_end, NULL, +				   1, 1, extent_item_size, &extent_inserted); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	}  	if (isize > actual_end)  		inline_len = min_t(u64, isize, actual_end); -	ret = insert_inline_extent(trans, root, inode, start, +	ret = insert_inline_extent(trans, path, extent_inserted, +				   root, inode, start,  				   inline_len, compressed_size, -				   compressed_pages); -	BUG_ON(ret); +				   compress_type, compressed_pages); +	if (ret && ret != -ENOSPC) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	} else if (ret == -ENOSPC) { +		ret = 1; +		goto out; +	} + +	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);  	btrfs_delalloc_release_metadata(inode, end + 1 - start);  	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); -	return 0; +out: +	btrfs_free_path(path); +	btrfs_end_transaction(trans, root); +	return ret;  }  struct async_extent { @@ -263,6 +313,7 @@ struct async_extent {  	u64 compressed_size;  	struct page **pages;  	unsigned long nr_pages; +	int compress_type;  	struct list_head list;  }; @@ -280,16 +331,19 @@ static noinline int add_async_extent(struct async_cow *cow,  				     u64 start, u64 ram_size,  				     u64 compressed_size,  				     struct page **pages, -				     unsigned long nr_pages) +				     unsigned long nr_pages, +				     int compress_type)  {  	struct async_extent *async_extent;  	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); +	BUG_ON(!async_extent); /* -ENOMEM */  	async_extent->start = start; 
 	async_extent->ram_size = ram_size;  	async_extent->compressed_size = compressed_size;  	async_extent->pages = pages;  	async_extent->nr_pages = nr_pages; +	async_extent->compress_type = compress_type;  	list_add_tail(&async_extent->list, &cow->extents);  	return 0;  } @@ -308,7 +362,8 @@ static noinline int add_async_extent(struct async_cow *cow,   * If this code finds it can't get good compression, it puts an   * entry onto the work queue to write the uncompressed bytes.  This   * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that pdflush sent them down. + * are written in the same order that the flusher thread sent them + * down.   */  static noinline int compress_file_range(struct inode *inode,  					struct page *locked_page, @@ -317,7 +372,6 @@ static noinline int compress_file_range(struct inode *inode,  					int *num_added)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct btrfs_trans_handle *trans;  	u64 num_bytes;  	u64 blocksize = root->sectorsize;  	u64 actual_end; @@ -332,6 +386,21 @@ static noinline int compress_file_range(struct inode *inode,  	unsigned long max_uncompressed = 128 * 1024;  	int i;  	int will_compress; +	int compress_type = root->fs_info->compress_type; +	int redirty = 0; + +	/* if this is a small write inside eof, kick off a defrag */ +	if ((end - start + 1) < 16 * 1024 && +	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) +		btrfs_add_inode_defrag(NULL, inode); + +	/* +	 * skip compression for a small file range(<=blocksize) that +	 * isn't an inline extent, since it dosen't save disk space at all. +	 */ +	if ((end - start + 1) <= blocksize && +	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) +		goto cleanup_and_bail_uncompressed;  	actual_end = min_t(u64, isize, end + 1);  again: @@ -365,7 +434,7 @@ again:  	 * a compressed extent to 128k.  	 */  	total_compressed = min(total_compressed, max_uncompressed); -	num_bytes = (end - start + blocksize) & ~(blocksize - 1); +	num_bytes = ALIGN(end - start + 1, blocksize);  	num_bytes = max(blocksize,  num_bytes);  	total_in = 0;  	ret = 0; @@ -377,16 +446,36 @@ again:  	 */  	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&  	    (btrfs_test_opt(root, COMPRESS) || -	     (BTRFS_I(inode)->force_compress))) { +	     (BTRFS_I(inode)->force_compress) || +	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {  		WARN_ON(pages);  		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); +		if (!pages) { +			/* just bail out to the uncompressed code */ +			goto cont; +		} + +		if (BTRFS_I(inode)->force_compress) +			compress_type = BTRFS_I(inode)->force_compress; -		ret = btrfs_zlib_compress_pages(inode->i_mapping, start, -						total_compressed, pages, -						nr_pages, &nr_pages_ret, -						&total_in, -						&total_compressed, -						max_compressed); +		/* +		 * we need to call clear_page_dirty_for_io on each +		 * page in the range.  Otherwise applications with the file +		 * mmap'd can wander in and change the page contents while +		 * we are compressing them. +		 * +		 * If the compression fails for any reason, we set the pages +		 * dirty again later on. 
+		 */ +		extent_range_clear_dirty_for_io(inode, start, end); +		redirty = 1; +		ret = btrfs_compress_pages(compress_type, +					   inode->i_mapping, start, +					   total_compressed, pages, +					   nr_pages, &nr_pages_ret, +					   &total_in, +					   &total_compressed, +					   max_compressed);  		if (!ret) {  			unsigned long offset = total_compressed & @@ -398,50 +487,46 @@ again:  			 * sending it down to disk  			 */  			if (offset) { -				kaddr = kmap_atomic(page, KM_USER0); +				kaddr = kmap_atomic(page);  				memset(kaddr + offset, 0,  				       PAGE_CACHE_SIZE - offset); -				kunmap_atomic(kaddr, KM_USER0); +				kunmap_atomic(kaddr);  			}  			will_compress = 1;  		}  	} +cont:  	if (start == 0) { -		trans = btrfs_join_transaction(root, 1); -		BUG_ON(!trans); -		btrfs_set_trans_block_group(trans, inode); -		trans->block_rsv = &root->fs_info->delalloc_block_rsv; -  		/* lets try to make an inline extent */  		if (ret || total_in < (actual_end - start)) {  			/* we didn't compress the entire range, try  			 * to make an uncompressed inline extent.  			 */ -			ret = cow_file_range_inline(trans, root, inode, -						    start, end, 0, NULL); +			ret = cow_file_range_inline(root, inode, start, end, +						    0, 0, NULL);  		} else {  			/* try making a compressed inline extent */ -			ret = cow_file_range_inline(trans, root, inode, -						    start, end, -						    total_compressed, pages); +			ret = cow_file_range_inline(root, inode, start, end, +						    total_compressed, +						    compress_type, pages);  		} -		if (ret == 0) { +		if (ret <= 0) { +			unsigned long clear_flags = EXTENT_DELALLOC | +				EXTENT_DEFRAG; +			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; +  			/* -			 * inline extent creation worked, we don't need -			 * to create any more async work items.  Unlock -			 * and free up our temp pages. +			 * inline extent creation worked or returned error, +			 * we don't need to create any more async work items. +			 * Unlock and free up our temp pages.  			 */ -			extent_clear_unlock_delalloc(inode, -			     &BTRFS_I(inode)->io_tree, -			     start, end, NULL, -			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | -			     EXTENT_CLEAR_DELALLOC | -			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); - -			btrfs_end_transaction(trans, root); +			extent_clear_unlock_delalloc(inode, start, end, NULL, +						     clear_flags, PAGE_UNLOCK | +						     PAGE_CLEAR_DIRTY | +						     PAGE_SET_WRITEBACK | +						     PAGE_END_WRITEBACK);  			goto free_pages_out;  		} -		btrfs_end_transaction(trans, root);  	}  	if (will_compress) { @@ -450,15 +535,13 @@ again:  		 * up to a block size boundary so the allocator does sane  		 * things  		 */ -		total_compressed = (total_compressed + blocksize - 1) & -			~(blocksize - 1); +		total_compressed = ALIGN(total_compressed, blocksize);  		/*  		 * one last check to make sure the compression is really a  		 * win, compare the page count read with the blocks on disk  		 */ -		total_in = (total_in + PAGE_CACHE_SIZE - 1) & -			~(PAGE_CACHE_SIZE - 1); +		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);  		if (total_compressed >= total_in) {  			will_compress = 0;  		} else { @@ -493,9 +576,10 @@ again:  		 * and will submit them to the elevator.  		 
*/  		add_async_extent(async_cow, start, num_bytes, -				 total_compressed, pages, nr_pages_ret); +				 total_compressed, pages, nr_pages_ret, +				 compress_type); -		if (start + num_bytes < end && start + num_bytes < actual_end) { +		if (start + num_bytes < end) {  			start += num_bytes;  			pages = NULL;  			cond_resched(); @@ -515,12 +599,15 @@ cleanup_and_bail_uncompressed:  			__set_page_dirty_nobuffers(locked_page);  			/* unlocked later on in the async handlers */  		} -		add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); +		if (redirty) +			extent_range_redirty_for_io(inode, start, end); +		add_async_extent(async_cow, start, end - start + 1, +				 0, NULL, 0, BTRFS_COMPRESS_NONE);  		*num_added += 1;  	}  out: -	return 0; +	return ret;  free_pages_out:  	for (i = 0; i < nr_pages_ret; i++) { @@ -543,7 +630,6 @@ static noinline int submit_compressed_extents(struct inode *inode,  {  	struct async_extent *async_extent;  	u64 alloc_hint = 0; -	struct btrfs_trans_handle *trans;  	struct btrfs_key ins;  	struct extent_map *em;  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -554,7 +640,7 @@ static noinline int submit_compressed_extents(struct inode *inode,  	if (list_empty(&async_cow->extents))  		return 0; - +again:  	while (!list_empty(&async_cow->extents)) {  		async_extent = list_entry(async_cow->extents.next,  					  struct async_extent, list); @@ -570,7 +656,7 @@ retry:  			lock_extent(io_tree, async_extent->start,  					 async_extent->start + -					 async_extent->ram_size - 1, GFP_NOFS); +					 async_extent->ram_size - 1);  			/* allocate blocks */  			ret = cow_file_range(inode, async_cow->locked_page, @@ -579,6 +665,8 @@ retry:  					     async_extent->ram_size - 1,  					     &page_started, &nr_written, 0); +			/* JDM XXX */ +  			/*  			 * if page_started, cow_file_range inserted an  			 * inline extent and took care of all the unlocking @@ -592,25 +680,23 @@ retry:  						  async_extent->ram_size - 1,  						  btrfs_get_extent,  						  WB_SYNC_ALL); +			else if (ret) +				unlock_page(async_cow->locked_page);  			kfree(async_extent);  			cond_resched();  			continue;  		}  		lock_extent(io_tree, async_extent->start, -			    async_extent->start + async_extent->ram_size - 1, -			    GFP_NOFS); +			    async_extent->start + async_extent->ram_size - 1); -		trans = btrfs_join_transaction(root, 1); -		ret = btrfs_reserve_extent(trans, root, +		ret = btrfs_reserve_extent(root,  					   async_extent->compressed_size,  					   async_extent->compressed_size, -					   0, alloc_hint, -					   (u64)-1, &ins, 1); -		btrfs_end_transaction(trans, root); - +					   0, alloc_hint, &ins, 1, 1);  		if (ret) {  			int i; +  			for (i = 0; i < async_extent->nr_pages; i++) {  				WARN_ON(async_extent->pages[i]->mapping);  				page_cache_release(async_extent->pages[i]); @@ -618,10 +704,14 @@ retry:  			kfree(async_extent->pages);  			async_extent->nr_pages = 0;  			async_extent->pages = NULL; -			unlock_extent(io_tree, async_extent->start, -				      async_extent->start + -				      async_extent->ram_size - 1, GFP_NOFS); -			goto retry; + +			if (ret == -ENOSPC) { +				unlock_extent(io_tree, async_extent->start, +					      async_extent->start + +					      async_extent->ram_size - 1); +				goto retry; +			} +			goto out_free;  		}  		/* @@ -632,20 +722,30 @@ retry:  					async_extent->start +  					async_extent->ram_size - 1, 0); -		em = alloc_extent_map(GFP_NOFS); +		em = alloc_extent_map(); +		if (!em) { +			ret = -ENOMEM; +			goto out_free_reserve; +		}  		em->start = 
async_extent->start;  		em->len = async_extent->ram_size;  		em->orig_start = em->start; +		em->mod_start = em->start; +		em->mod_len = em->len;  		em->block_start = ins.objectid;  		em->block_len = ins.offset; +		em->orig_block_len = ins.offset; +		em->ram_bytes = async_extent->ram_size;  		em->bdev = root->fs_info->fs_devices->latest_bdev; +		em->compress_type = async_extent->compress_type;  		set_bit(EXTENT_FLAG_PINNED, &em->flags);  		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +		em->generation = -1;  		while (1) {  			write_lock(&em_tree->lock); -			ret = add_extent_mapping(em_tree, em); +			ret = add_extent_mapping(em_tree, em, 1);  			write_unlock(&em_tree->lock);  			if (ret != -EEXIST) {  				free_extent_map(em); @@ -656,40 +756,55 @@ retry:  						async_extent->ram_size - 1, 0);  		} -		ret = btrfs_add_ordered_extent(inode, async_extent->start, -					       ins.objectid, -					       async_extent->ram_size, -					       ins.offset, -					       BTRFS_ORDERED_COMPRESSED); -		BUG_ON(ret); +		if (ret) +			goto out_free_reserve; + +		ret = btrfs_add_ordered_extent_compress(inode, +						async_extent->start, +						ins.objectid, +						async_extent->ram_size, +						ins.offset, +						BTRFS_ORDERED_COMPRESSED, +						async_extent->compress_type); +		if (ret) +			goto out_free_reserve;  		/*  		 * clear dirty, set writeback and unlock the pages.  		 */ -		extent_clear_unlock_delalloc(inode, -				&BTRFS_I(inode)->io_tree, -				async_extent->start, +		extent_clear_unlock_delalloc(inode, async_extent->start,  				async_extent->start +  				async_extent->ram_size - 1, -				NULL, EXTENT_CLEAR_UNLOCK_PAGE | -				EXTENT_CLEAR_UNLOCK | -				EXTENT_CLEAR_DELALLOC | -				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); - +				NULL, EXTENT_LOCKED | EXTENT_DELALLOC, +				PAGE_UNLOCK | PAGE_CLEAR_DIRTY | +				PAGE_SET_WRITEBACK);  		ret = btrfs_submit_compressed_write(inode,  				    async_extent->start,  				    async_extent->ram_size,  				    ins.objectid,  				    ins.offset, async_extent->pages,  				    async_extent->nr_pages); - -		BUG_ON(ret);  		alloc_hint = ins.objectid + ins.offset;  		kfree(async_extent); +		if (ret) +			goto out;  		cond_resched();  	} - -	return 0; +	ret = 0; +out: +	return ret; +out_free_reserve: +	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); +out_free: +	extent_clear_unlock_delalloc(inode, async_extent->start, +				     async_extent->start + +				     async_extent->ram_size - 1, +				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC | +				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, +				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY | +				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); +	kfree(async_extent); +	goto again;  }  static u64 get_extent_allocation_hint(struct inode *inode, u64 start, @@ -744,7 +859,6 @@ static noinline int cow_file_range(struct inode *inode,  				   int unlock)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct btrfs_trans_handle *trans;  	u64 alloc_hint = 0;  	u64 num_bytes;  	unsigned long ram_size; @@ -756,42 +870,43 @@ static noinline int cow_file_range(struct inode *inode,  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;  	int ret = 0; -	BUG_ON(root == root->fs_info->tree_root); -	trans = btrfs_join_transaction(root, 1); -	BUG_ON(!trans); -	btrfs_set_trans_block_group(trans, inode); -	trans->block_rsv = &root->fs_info->delalloc_block_rsv; +	if (btrfs_is_free_space_inode(inode)) { +		WARN_ON_ONCE(1); +		ret = -EINVAL; +		goto out_unlock; +	} -	num_bytes = (end - start + blocksize) & ~(blocksize - 1); +	
num_bytes = ALIGN(end - start + 1, blocksize);  	num_bytes = max(blocksize,  num_bytes);  	disk_num_bytes = num_bytes; -	ret = 0; + +	/* if this is a small write inside eof, kick off defrag */ +	if (num_bytes < 64 * 1024 && +	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) +		btrfs_add_inode_defrag(NULL, inode);  	if (start == 0) {  		/* lets try to make an inline extent */ -		ret = cow_file_range_inline(trans, root, inode, -					    start, end, 0, NULL); +		ret = cow_file_range_inline(root, inode, start, end, 0, 0, +					    NULL);  		if (ret == 0) { -			extent_clear_unlock_delalloc(inode, -				     &BTRFS_I(inode)->io_tree, -				     start, end, NULL, -				     EXTENT_CLEAR_UNLOCK_PAGE | -				     EXTENT_CLEAR_UNLOCK | -				     EXTENT_CLEAR_DELALLOC | -				     EXTENT_CLEAR_DIRTY | -				     EXTENT_SET_WRITEBACK | -				     EXTENT_END_WRITEBACK); +			extent_clear_unlock_delalloc(inode, start, end, NULL, +				     EXTENT_LOCKED | EXTENT_DELALLOC | +				     EXTENT_DEFRAG, PAGE_UNLOCK | +				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | +				     PAGE_END_WRITEBACK);  			*nr_written = *nr_written +  			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;  			*page_started = 1; -			ret = 0;  			goto out; +		} else if (ret < 0) { +			goto out_unlock;  		}  	}  	BUG_ON(disk_num_bytes > -	       btrfs_super_total_bytes(&root->fs_info->super_copy)); +	       btrfs_super_total_bytes(root->fs_info->super_copy));  	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);  	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); @@ -800,25 +915,35 @@ static noinline int cow_file_range(struct inode *inode,  		unsigned long op;  		cur_alloc_size = disk_num_bytes; -		ret = btrfs_reserve_extent(trans, root, cur_alloc_size, +		ret = btrfs_reserve_extent(root, cur_alloc_size,  					   root->sectorsize, 0, alloc_hint, -					   (u64)-1, &ins, 1); -		BUG_ON(ret); +					   &ins, 1, 1); +		if (ret < 0) +			goto out_unlock; -		em = alloc_extent_map(GFP_NOFS); +		em = alloc_extent_map(); +		if (!em) { +			ret = -ENOMEM; +			goto out_reserve; +		}  		em->start = start;  		em->orig_start = em->start;  		ram_size = ins.offset;  		em->len = ins.offset; +		em->mod_start = em->start; +		em->mod_len = em->len;  		em->block_start = ins.objectid;  		em->block_len = ins.offset; +		em->orig_block_len = ins.offset; +		em->ram_bytes = ram_size;  		em->bdev = root->fs_info->fs_devices->latest_bdev;  		set_bit(EXTENT_FLAG_PINNED, &em->flags); +		em->generation = -1;  		while (1) {  			write_lock(&em_tree->lock); -			ret = add_extent_mapping(em_tree, em); +			ret = add_extent_mapping(em_tree, em, 1);  			write_unlock(&em_tree->lock);  			if (ret != -EEXIST) {  				free_extent_map(em); @@ -827,17 +952,21 @@ static noinline int cow_file_range(struct inode *inode,  			btrfs_drop_extent_cache(inode, start,  						start + ram_size - 1, 0);  		} +		if (ret) +			goto out_reserve;  		cur_alloc_size = ins.offset;  		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,  					       ram_size, cur_alloc_size, 0); -		BUG_ON(ret); +		if (ret) +			goto out_reserve;  		if (root->root_key.objectid ==  		    BTRFS_DATA_RELOC_TREE_OBJECTID) {  			ret = btrfs_reloc_clone_csums(inode, start,  						      cur_alloc_size); -			BUG_ON(ret); +			if (ret) +				goto out_reserve;  		}  		if (disk_num_bytes < cur_alloc_size) @@ -850,23 +979,30 @@ static noinline int cow_file_range(struct inode *inode,  		 * Do set the Private2 bit so we know this page was properly  		 * setup for writepage  		 */ -		op = unlock ? 
EXTENT_CLEAR_UNLOCK_PAGE : 0; -		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | -			EXTENT_SET_PRIVATE2; +		op = unlock ? PAGE_UNLOCK : 0; +		op |= PAGE_SET_PRIVATE2; -		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, -					     start, start + ram_size - 1, -					     locked_page, op); +		extent_clear_unlock_delalloc(inode, start, +					     start + ram_size - 1, locked_page, +					     EXTENT_LOCKED | EXTENT_DELALLOC, +					     op);  		disk_num_bytes -= cur_alloc_size;  		num_bytes -= cur_alloc_size;  		alloc_hint = ins.objectid + ins.offset;  		start += cur_alloc_size;  	}  out: -	ret = 0; -	btrfs_end_transaction(trans, root); -  	return ret; + +out_reserve: +	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); +out_unlock: +	extent_clear_unlock_delalloc(inode, start, end, locked_page, +				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | +				     EXTENT_DELALLOC | EXTENT_DEFRAG, +				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY | +				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); +	goto out;  }  /* @@ -881,8 +1017,10 @@ static noinline void async_cow_start(struct btrfs_work *work)  	compress_file_range(async_cow->inode, async_cow->locked_page,  			    async_cow->start, async_cow->end, async_cow,  			    &num_added); -	if (num_added == 0) +	if (num_added == 0) { +		btrfs_add_delayed_iput(async_cow->inode);  		async_cow->inode = NULL; +	}  }  /* @@ -900,10 +1038,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)  	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>  		PAGE_CACHE_SHIFT; -	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); - -	if (atomic_read(&root->fs_info->async_delalloc_pages) < -	    5 * 1042 * 1024 && +	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < +	    5 * 1024 * 1024 &&  	    waitqueue_active(&root->fs_info->async_submit_wait))  		wake_up(&root->fs_info->async_submit_wait); @@ -915,6 +1051,8 @@ static noinline void async_cow_free(struct btrfs_work *work)  {  	struct async_cow *async_cow;  	async_cow = container_of(work, struct async_cow, work); +	if (async_cow->inode) +		btrfs_add_delayed_iput(async_cow->inode);  	kfree(async_cow);  } @@ -926,13 +1064,14 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,  	struct btrfs_root *root = BTRFS_I(inode)->root;  	unsigned long nr_pages;  	u64 cur_end; -	int limit = 10 * 1024 * 1042; +	int limit = 10 * 1024 * 1024;  	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,  			 1, 0, NULL, GFP_NOFS);  	while (start < end) {  		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); -		async_cow->inode = inode; +		BUG_ON(!async_cow); /* -ENOMEM */ +		async_cow->inode = igrab(inode);  		async_cow->root = root;  		async_cow->locked_page = locked_page;  		async_cow->start = start; @@ -945,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,  		async_cow->end = cur_end;  		INIT_LIST_HEAD(&async_cow->extents); -		async_cow->work.func = async_cow_start; -		async_cow->work.ordered_func = async_cow_submit; -		async_cow->work.ordered_free = async_cow_free; -		async_cow->work.flags = 0; +		btrfs_init_work(&async_cow->work, async_cow_start, +				async_cow_submit, async_cow_free);  		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>  			PAGE_CACHE_SHIFT;  		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); -		btrfs_queue_worker(&root->fs_info->delalloc_workers, -				   &async_cow->work); +		btrfs_queue_work(root->fs_info->delalloc_workers, +				 &async_cow->work); 
 		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {  			wait_event(root->fs_info->async_submit_wait, @@ -985,7 +1122,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,  	LIST_HEAD(list);  	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, -				       bytenr + num_bytes - 1, &list); +				       bytenr + num_bytes - 1, &list, 0);  	if (ret == 0 && list_empty(&list))  		return 0; @@ -1021,34 +1158,61 @@ static noinline int run_delalloc_nocow(struct inode *inode,  	u64 extent_offset;  	u64 disk_bytenr;  	u64 num_bytes; +	u64 disk_num_bytes; +	u64 ram_bytes;  	int extent_type; -	int ret; +	int ret, err;  	int type;  	int nocow;  	int check_prev = 1; -	bool nolock = false; +	bool nolock; +	u64 ino = btrfs_ino(inode);  	path = btrfs_alloc_path(); -	BUG_ON(!path); -	if (root == root->fs_info->tree_root) { -		nolock = true; -		trans = btrfs_join_transaction_nolock(root, 1); -	} else { -		trans = btrfs_join_transaction(root, 1); +	if (!path) { +		extent_clear_unlock_delalloc(inode, start, end, locked_page, +					     EXTENT_LOCKED | EXTENT_DELALLOC | +					     EXTENT_DO_ACCOUNTING | +					     EXTENT_DEFRAG, PAGE_UNLOCK | +					     PAGE_CLEAR_DIRTY | +					     PAGE_SET_WRITEBACK | +					     PAGE_END_WRITEBACK); +		return -ENOMEM; +	} + +	nolock = btrfs_is_free_space_inode(inode); + +	if (nolock) +		trans = btrfs_join_transaction_nolock(root); +	else +		trans = btrfs_join_transaction(root); + +	if (IS_ERR(trans)) { +		extent_clear_unlock_delalloc(inode, start, end, locked_page, +					     EXTENT_LOCKED | EXTENT_DELALLOC | +					     EXTENT_DO_ACCOUNTING | +					     EXTENT_DEFRAG, PAGE_UNLOCK | +					     PAGE_CLEAR_DIRTY | +					     PAGE_SET_WRITEBACK | +					     PAGE_END_WRITEBACK); +		btrfs_free_path(path); +		return PTR_ERR(trans);  	} -	BUG_ON(!trans); + +	trans->block_rsv = &root->fs_info->delalloc_block_rsv;  	cow_start = (u64)-1;  	cur_offset = start;  	while (1) { -		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, +		ret = btrfs_lookup_file_extent(trans, root, path, ino,  					       cur_offset, 0); -		BUG_ON(ret < 0); +		if (ret < 0) +			goto error;  		if (ret > 0 && path->slots[0] > 0 && check_prev) {  			leaf = path->nodes[0];  			btrfs_item_key_to_cpu(leaf, &found_key,  					      path->slots[0] - 1); -			if (found_key.objectid == inode->i_ino && +			if (found_key.objectid == ino &&  			    found_key.type == BTRFS_EXTENT_DATA_KEY)  				path->slots[0]--;  		} @@ -1058,7 +1222,7 @@ next_slot:  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {  			ret = btrfs_next_leaf(root, path);  			if (ret < 0) -				BUG_ON(1); +				goto error;  			if (ret > 0)  				break;  			leaf = path->nodes[0]; @@ -1069,7 +1233,7 @@ next_slot:  		num_bytes = 0;  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -		if (found_key.objectid > inode->i_ino || +		if (found_key.objectid > ino ||  		    found_key.type > BTRFS_EXTENT_DATA_KEY ||  		    found_key.offset > end)  			break; @@ -1084,12 +1248,15 @@ next_slot:  				    struct btrfs_file_extent_item);  		extent_type = btrfs_file_extent_type(leaf, fi); +		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);  		if (extent_type == BTRFS_FILE_EXTENT_REG ||  		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {  			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);  			extent_offset = btrfs_file_extent_offset(leaf, fi);  			extent_end = found_key.offset +  				btrfs_file_extent_num_bytes(leaf, fi); +			disk_num_bytes = +				btrfs_file_extent_disk_num_bytes(leaf, fi);  			if (extent_end 
<= start) {  				path->slots[0]++;  				goto next_slot; @@ -1104,7 +1271,7 @@ next_slot:  				goto out_check;  			if (btrfs_extent_readonly(root, disk_bytenr))  				goto out_check; -			if (btrfs_cross_ref_exist(trans, root, inode->i_ino, +			if (btrfs_cross_ref_exist(trans, root, ino,  						  found_key.offset -  						  extent_offset, disk_bytenr))  				goto out_check; @@ -1112,6 +1279,15 @@ next_slot:  			disk_bytenr += cur_offset - found_key.offset;  			num_bytes = min(end + 1, extent_end) - cur_offset;  			/* +			 * if there are pending snapshots for this root, +			 * we fall into common COW way. +			 */ +			if (!nolock) { +				err = btrfs_start_nocow_write(root); +				if (!err) +					goto out_check; +			} +			/*  			 * force cow if csum exists in the range.  			 * this ensure that csum for a given extent are  			 * either valid or do not exist. @@ -1121,7 +1297,8 @@ next_slot:  			nocow = 1;  		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {  			extent_end = found_key.offset + -				btrfs_file_extent_inline_len(leaf, fi); +				btrfs_file_extent_inline_len(leaf, +						     path->slots[0], fi);  			extent_end = ALIGN(extent_end, root->sectorsize);  		} else {  			BUG_ON(1); @@ -1129,6 +1306,8 @@ next_slot:  out_check:  		if (extent_end <= start) {  			path->slots[0]++; +			if (!nolock && nocow) +				btrfs_end_nocow_write(root);  			goto next_slot;  		}  		if (!nocow) { @@ -1141,12 +1320,16 @@ out_check:  			goto next_slot;  		} -		btrfs_release_path(root, path); +		btrfs_release_path(path);  		if (cow_start != (u64)-1) { -			ret = cow_file_range(inode, locked_page, cow_start, -					found_key.offset - 1, page_started, -					nr_written, 1); -			BUG_ON(ret); +			ret = cow_file_range(inode, locked_page, +					     cow_start, found_key.offset - 1, +					     page_started, nr_written, 1); +			if (ret) { +				if (!nolock && nocow) +					btrfs_end_nocow_write(root); +				goto error; +			}  			cow_start = (u64)-1;  		} @@ -1154,17 +1337,24 @@ out_check:  			struct extent_map *em;  			struct extent_map_tree *em_tree;  			em_tree = &BTRFS_I(inode)->extent_tree; -			em = alloc_extent_map(GFP_NOFS); +			em = alloc_extent_map(); +			BUG_ON(!em); /* -ENOMEM */  			em->start = cur_offset; -			em->orig_start = em->start; +			em->orig_start = found_key.offset - extent_offset;  			em->len = num_bytes;  			em->block_len = num_bytes;  			em->block_start = disk_bytenr; +			em->orig_block_len = disk_num_bytes; +			em->ram_bytes = ram_bytes;  			em->bdev = root->fs_info->fs_devices->latest_bdev; +			em->mod_start = em->start; +			em->mod_len = em->len;  			set_bit(EXTENT_FLAG_PINNED, &em->flags); +			set_bit(EXTENT_FLAG_FILLING, &em->flags); +			em->generation = -1;  			while (1) {  				write_lock(&em_tree->lock); -				ret = add_extent_mapping(em_tree, em); +				ret = add_extent_mapping(em_tree, em, 1);  				write_unlock(&em_tree->lock);  				if (ret != -EEXIST) {  					free_extent_map(em); @@ -1180,43 +1370,59 @@ out_check:  		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,  					       num_bytes, num_bytes, type); -		BUG_ON(ret); +		BUG_ON(ret); /* -ENOMEM */  		if (root->root_key.objectid ==  		    BTRFS_DATA_RELOC_TREE_OBJECTID) {  			ret = btrfs_reloc_clone_csums(inode, cur_offset,  						      num_bytes); -			BUG_ON(ret); +			if (ret) { +				if (!nolock && nocow) +					btrfs_end_nocow_write(root); +				goto error; +			}  		} -		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, -				cur_offset, cur_offset + num_bytes - 1, -				locked_page, EXTENT_CLEAR_UNLOCK_PAGE | -				
EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | -				EXTENT_SET_PRIVATE2); +		extent_clear_unlock_delalloc(inode, cur_offset, +					     cur_offset + num_bytes - 1, +					     locked_page, EXTENT_LOCKED | +					     EXTENT_DELALLOC, PAGE_UNLOCK | +					     PAGE_SET_PRIVATE2); +		if (!nolock && nocow) +			btrfs_end_nocow_write(root);  		cur_offset = extent_end;  		if (cur_offset > end)  			break;  	} -	btrfs_release_path(root, path); +	btrfs_release_path(path); -	if (cur_offset <= end && cow_start == (u64)-1) +	if (cur_offset <= end && cow_start == (u64)-1) {  		cow_start = cur_offset; +		cur_offset = end; +	} +  	if (cow_start != (u64)-1) {  		ret = cow_file_range(inode, locked_page, cow_start, end,  				     page_started, nr_written, 1); -		BUG_ON(ret); +		if (ret) +			goto error;  	} -	if (nolock) { -		ret = btrfs_end_transaction_nolock(trans, root); -		BUG_ON(ret); -	} else { -		ret = btrfs_end_transaction(trans, root); -		BUG_ON(ret); -	} +error: +	err = btrfs_end_transaction(trans, root); +	if (!ret) +		ret = err; + +	if (ret && cur_offset < end) +		extent_clear_unlock_delalloc(inode, cur_offset, end, +					     locked_page, EXTENT_LOCKED | +					     EXTENT_DELALLOC | EXTENT_DEFRAG | +					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | +					     PAGE_CLEAR_DIRTY | +					     PAGE_SET_WRITEBACK | +					     PAGE_END_WRITEBACK);  	btrfs_free_path(path); -	return 0; +	return ret;  }  /* @@ -1229,31 +1435,36 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,  	int ret;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {  		ret = run_delalloc_nocow(inode, locked_page, start, end,  					 page_started, 1, nr_written); -	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) +	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {  		ret = run_delalloc_nocow(inode, locked_page, start, end,  					 page_started, 0, nr_written); -	else if (!btrfs_test_opt(root, COMPRESS) && -		 !(BTRFS_I(inode)->force_compress)) +	} else if (!btrfs_test_opt(root, COMPRESS) && +		   !(BTRFS_I(inode)->force_compress) && +		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {  		ret = cow_file_range(inode, locked_page, start, end,  				      page_started, nr_written, 1); -	else +	} else { +		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +			&BTRFS_I(inode)->runtime_flags);  		ret = cow_file_range_async(inode, locked_page, start, end,  					   page_started, nr_written); +	}  	return ret;  } -static int btrfs_split_extent_hook(struct inode *inode, -				   struct extent_state *orig, u64 split) +static void btrfs_split_extent_hook(struct inode *inode, +				    struct extent_state *orig, u64 split)  {  	/* not delalloc, ignore it */  	if (!(orig->state & EXTENT_DELALLOC)) -		return 0; +		return; -	atomic_inc(&BTRFS_I(inode)->outstanding_extents); -	return 0; +	spin_lock(&BTRFS_I(inode)->lock); +	BTRFS_I(inode)->outstanding_extents++; +	spin_unlock(&BTRFS_I(inode)->lock);  }  /* @@ -1262,16 +1473,57 @@ static int btrfs_split_extent_hook(struct inode *inode,   * extents, such as when we are doing sequential writes, so we can properly   * account for the metadata space we'll need.   
*/ -static int btrfs_merge_extent_hook(struct inode *inode, -				   struct extent_state *new, -				   struct extent_state *other) +static void btrfs_merge_extent_hook(struct inode *inode, +				    struct extent_state *new, +				    struct extent_state *other)  {  	/* not delalloc, ignore it */  	if (!(other->state & EXTENT_DELALLOC)) -		return 0; +		return; -	atomic_dec(&BTRFS_I(inode)->outstanding_extents); -	return 0; +	spin_lock(&BTRFS_I(inode)->lock); +	BTRFS_I(inode)->outstanding_extents--; +	spin_unlock(&BTRFS_I(inode)->lock); +} + +static void btrfs_add_delalloc_inodes(struct btrfs_root *root, +				      struct inode *inode) +{ +	spin_lock(&root->delalloc_lock); +	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { +		list_add_tail(&BTRFS_I(inode)->delalloc_inodes, +			      &root->delalloc_inodes); +		set_bit(BTRFS_INODE_IN_DELALLOC_LIST, +			&BTRFS_I(inode)->runtime_flags); +		root->nr_delalloc_inodes++; +		if (root->nr_delalloc_inodes == 1) { +			spin_lock(&root->fs_info->delalloc_root_lock); +			BUG_ON(!list_empty(&root->delalloc_root)); +			list_add_tail(&root->delalloc_root, +				      &root->fs_info->delalloc_roots); +			spin_unlock(&root->fs_info->delalloc_root_lock); +		} +	} +	spin_unlock(&root->delalloc_lock); +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, +				     struct inode *inode) +{ +	spin_lock(&root->delalloc_lock); +	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { +		list_del_init(&BTRFS_I(inode)->delalloc_inodes); +		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, +			  &BTRFS_I(inode)->runtime_flags); +		root->nr_delalloc_inodes--; +		if (!root->nr_delalloc_inodes) { +			spin_lock(&root->fs_info->delalloc_root_lock); +			BUG_ON(list_empty(&root->delalloc_root)); +			list_del_init(&root->delalloc_root); +			spin_unlock(&root->fs_info->delalloc_root_lock); +		} +	} +	spin_unlock(&root->delalloc_lock);  }  /* @@ -1279,91 +1531,99 @@ static int btrfs_merge_extent_hook(struct inode *inode,   * bytes in this file, and to maintain the list of inodes that   * have pending delalloc work to be done.   
*/ -static int btrfs_set_bit_hook(struct inode *inode, -			      struct extent_state *state, int *bits) +static void btrfs_set_bit_hook(struct inode *inode, +			       struct extent_state *state, unsigned long *bits)  {  	/*  	 * set_bit and clear bit hooks normally require _irqsave/restore -	 * but in this case, we are only testeing for the DELALLOC +	 * but in this case, we are only testing for the DELALLOC  	 * bit, which is only set or cleared with irqs on  	 */  	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {  		struct btrfs_root *root = BTRFS_I(inode)->root;  		u64 len = state->end + 1 - state->start; -		int do_list = (root->root_key.objectid != -			       BTRFS_ROOT_TREE_OBJECTID); +		bool do_list = !btrfs_is_free_space_inode(inode); -		if (*bits & EXTENT_FIRST_DELALLOC) +		if (*bits & EXTENT_FIRST_DELALLOC) {  			*bits &= ~EXTENT_FIRST_DELALLOC; -		else -			atomic_inc(&BTRFS_I(inode)->outstanding_extents); +		} else { +			spin_lock(&BTRFS_I(inode)->lock); +			BTRFS_I(inode)->outstanding_extents++; +			spin_unlock(&BTRFS_I(inode)->lock); +		} -		spin_lock(&root->fs_info->delalloc_lock); +		__percpu_counter_add(&root->fs_info->delalloc_bytes, len, +				     root->fs_info->delalloc_batch); +		spin_lock(&BTRFS_I(inode)->lock);  		BTRFS_I(inode)->delalloc_bytes += len; -		root->fs_info->delalloc_bytes += len; -		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { -			list_add_tail(&BTRFS_I(inode)->delalloc_inodes, -				      &root->fs_info->delalloc_inodes); -		} -		spin_unlock(&root->fs_info->delalloc_lock); +		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, +					 &BTRFS_I(inode)->runtime_flags)) +			btrfs_add_delalloc_inodes(root, inode); +		spin_unlock(&BTRFS_I(inode)->lock);  	} -	return 0;  }  /*   * extent_io.c clear_bit_hook, see set_bit_hook for why   */ -static int btrfs_clear_bit_hook(struct inode *inode, -				struct extent_state *state, int *bits) +static void btrfs_clear_bit_hook(struct inode *inode, +				 struct extent_state *state, +				 unsigned long *bits)  {  	/*  	 * set_bit and clear bit hooks normally require _irqsave/restore -	 * but in this case, we are only testeing for the DELALLOC +	 * but in this case, we are only testing for the DELALLOC  	 * bit, which is only set or cleared with irqs on  	 */  	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {  		struct btrfs_root *root = BTRFS_I(inode)->root;  		u64 len = state->end + 1 - state->start; -		int do_list = (root->root_key.objectid != -			       BTRFS_ROOT_TREE_OBJECTID); +		bool do_list = !btrfs_is_free_space_inode(inode); -		if (*bits & EXTENT_FIRST_DELALLOC) +		if (*bits & EXTENT_FIRST_DELALLOC) {  			*bits &= ~EXTENT_FIRST_DELALLOC; -		else if (!(*bits & EXTENT_DO_ACCOUNTING)) -			atomic_dec(&BTRFS_I(inode)->outstanding_extents); +		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) { +			spin_lock(&BTRFS_I(inode)->lock); +			BTRFS_I(inode)->outstanding_extents--; +			spin_unlock(&BTRFS_I(inode)->lock); +		} -		if (*bits & EXTENT_DO_ACCOUNTING) +		/* +		 * We don't reserve metadata space for space cache inodes so we +		 * don't need to call dellalloc_release_metadata if there is an +		 * error. 
+		 */ +		if (*bits & EXTENT_DO_ACCOUNTING && +		    root != root->fs_info->tree_root)  			btrfs_delalloc_release_metadata(inode, len);  		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID -		    && do_list) +		    && do_list && !(state->state & EXTENT_NORESERVE))  			btrfs_free_reserved_data_space(inode, len); -		spin_lock(&root->fs_info->delalloc_lock); -		root->fs_info->delalloc_bytes -= len; +		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len, +				     root->fs_info->delalloc_batch); +		spin_lock(&BTRFS_I(inode)->lock);  		BTRFS_I(inode)->delalloc_bytes -= len; -  		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && -		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { -			list_del_init(&BTRFS_I(inode)->delalloc_inodes); -		} -		spin_unlock(&root->fs_info->delalloc_lock); +		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST, +			     &BTRFS_I(inode)->runtime_flags)) +			btrfs_del_delalloc_inode(root, inode); +		spin_unlock(&BTRFS_I(inode)->lock);  	} -	return 0;  }  /*   * extent_io.c merge_bio_hook, this must check the chunk tree to make sure   * we don't create bios that span stripes or chunks   */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,  			 size_t size, struct bio *bio,  			 unsigned long bio_flags)  {  	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; -	struct btrfs_mapping_tree *map_tree; -	u64 logical = (u64)bio->bi_sector << 9; +	u64 logical = (u64)bio->bi_iter.bi_sector << 9;  	u64 length = 0;  	u64 map_length;  	int ret; @@ -1371,15 +1631,15 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,  	if (bio_flags & EXTENT_BIO_COMPRESSED)  		return 0; -	length = bio->bi_size; -	map_tree = &root->fs_info->mapping_tree; +	length = bio->bi_iter.bi_size;  	map_length = length; -	ret = btrfs_map_block(map_tree, READ, logical, +	ret = btrfs_map_block(root->fs_info, rw, logical,  			      &map_length, NULL, 0); - +	/* Will always return 0 with map_multi == NULL */ +	BUG_ON(ret < 0);  	if (map_length < length + size)  		return 1; -	return ret; +	return 0;  }  /* @@ -1399,7 +1659,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,  	int ret = 0;  	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); -	BUG_ON(ret); +	BUG_ON(ret); /* -ENOMEM */  	return 0;  } @@ -1416,7 +1676,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,  			  u64 bio_offset)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; -	return btrfs_map_bio(root, rw, bio, mirror_num, 1); +	int ret; + +	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); +	if (ret) +		bio_endio(bio, ret); +	return ret;  }  /* @@ -1430,36 +1695,54 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  	struct btrfs_root *root = BTRFS_I(inode)->root;  	int ret = 0;  	int skip_sum; +	int metadata = 0; +	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);  	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; -	if (root == root->fs_info->tree_root) -		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); -	else -		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); -	BUG_ON(ret); +	if (btrfs_is_free_space_inode(inode)) +		metadata = 2;  	if (!(rw & REQ_WRITE)) { +		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); +		if (ret) +			goto out; +  		if (bio_flags & EXTENT_BIO_COMPRESSED) { -			return btrfs_submit_compressed_read(inode, bio, -						    mirror_num, bio_flags); -		} else if (!skip_sum) -			
btrfs_lookup_bio_sums(root, inode, bio, NULL); +			ret = btrfs_submit_compressed_read(inode, bio, +							   mirror_num, +							   bio_flags); +			goto out; +		} else if (!skip_sum) { +			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); +			if (ret) +				goto out; +		}  		goto mapit; -	} else if (!skip_sum) { +	} else if (async && !skip_sum) {  		/* csum items have already been cloned */  		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)  			goto mapit;  		/* we're doing a write, do the async checksumming */ -		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, +		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,  				   inode, rw, bio, mirror_num,  				   bio_flags, bio_offset,  				   __btrfs_submit_bio_start,  				   __btrfs_submit_bio_done); +		goto out; +	} else if (!skip_sum) { +		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); +		if (ret) +			goto out;  	}  mapit: -	return btrfs_map_bio(root, rw, bio, mirror_num, 0); +	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + +out: +	if (ret < 0) +		bio_endio(bio, ret); +	return ret;  }  /* @@ -1472,11 +1755,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,  {  	struct btrfs_ordered_sum *sum; -	btrfs_set_trans_block_group(trans, inode); -  	list_for_each_entry(sum, list, list) { +		trans->adding_csums = 1;  		btrfs_csum_file_blocks(trans,  		       BTRFS_I(inode)->root->fs_info->csum_root, sum); +		trans->adding_csums = 0;  	}  	return 0;  } @@ -1484,8 +1767,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,  			      struct extent_state **cached_state)  { -	if ((end & (PAGE_CACHE_SIZE - 1)) == 0) -		WARN_ON(1); +	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);  	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,  				   cached_state, GFP_NOFS);  } @@ -1505,6 +1787,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)  	struct inode *inode;  	u64 page_start;  	u64 page_end; +	int ret;  	fixup = container_of(work, struct btrfs_writepage_fixup, work);  	page = fixup->page; @@ -1520,7 +1803,7 @@ again:  	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;  	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, -			 &cached_state, GFP_NOFS); +			 &cached_state);  	/* already ordered? 
We're done */  	if (PagePrivate2(page)) @@ -1532,18 +1815,28 @@ again:  				     page_end, &cached_state, GFP_NOFS);  		unlock_page(page);  		btrfs_start_ordered_extent(inode, ordered, 1); +		btrfs_put_ordered_extent(ordered);  		goto again;  	} -	BUG(); +	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); +	if (ret) { +		mapping_set_error(page->mapping, ret); +		end_extent_writepage(page, ret, page_start, page_end); +		ClearPageChecked(page); +		goto out; +	 } +  	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);  	ClearPageChecked(page); +	set_page_dirty(page);  out:  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,  			     &cached_state, GFP_NOFS);  out_page:  	unlock_page(page);  	page_cache_release(page); +	kfree(fixup);  }  /* @@ -1576,10 +1869,10 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)  	SetPageChecked(page);  	page_cache_get(page); -	fixup->work.func = btrfs_writepage_fixup_worker; +	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);  	fixup->page = page; -	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); -	return -EAGAIN; +	btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); +	return -EBUSY;  }  static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -1594,13 +1887,12 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	struct btrfs_path *path;  	struct extent_buffer *leaf;  	struct btrfs_key ins; -	u64 hint; +	int extent_inserted = 0;  	int ret;  	path = btrfs_alloc_path(); -	BUG_ON(!path); - -	path->leave_spinning = 1; +	if (!path) +		return -ENOMEM;  	/*  	 * we may be replacing one extent in the tree with another. @@ -1611,15 +1903,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	 * the caller is expected to unpin it and allow it to be merged  	 * with the others.  	 
*/ -	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, -				 &hint, 0); -	BUG_ON(ret); +	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, +				   file_pos + num_bytes, NULL, 0, +				   1, sizeof(*fi), &extent_inserted); +	if (ret) +		goto out; -	ins.objectid = inode->i_ino; -	ins.offset = file_pos; -	ins.type = BTRFS_EXTENT_DATA_KEY; -	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); -	BUG_ON(ret); +	if (!extent_inserted) { +		ins.objectid = btrfs_ino(inode); +		ins.offset = file_pos; +		ins.type = BTRFS_EXTENT_DATA_KEY; + +		path->leave_spinning = 1; +		ret = btrfs_insert_empty_item(trans, root, path, &ins, +					      sizeof(*fi)); +		if (ret) +			goto out; +	}  	leaf = path->nodes[0];  	fi = btrfs_item_ptr(leaf, path->slots[0],  			    struct btrfs_file_extent_item); @@ -1634,10 +1934,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	btrfs_set_file_extent_encryption(leaf, fi, encryption);  	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); -	btrfs_unlock_up_safe(path, 1); -	btrfs_set_lock_blocking(leaf); -  	btrfs_mark_buffer_dirty(leaf); +	btrfs_release_path(path);  	inode_add_bytes(inode, num_bytes); @@ -1646,112 +1944,849 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	ins.type = BTRFS_EXTENT_ITEM_KEY;  	ret = btrfs_alloc_reserved_file_extent(trans, root,  					root->root_key.objectid, -					inode->i_ino, file_pos, &ins); -	BUG_ON(ret); +					btrfs_ino(inode), file_pos, &ins); +out:  	btrfs_free_path(path); +	return ret; +} + +/* snapshot-aware defrag */ +struct sa_defrag_extent_backref { +	struct rb_node node; +	struct old_sa_defrag_extent *old; +	u64 root_id; +	u64 inum; +	u64 file_pos; +	u64 extent_offset; +	u64 num_bytes; +	u64 generation; +}; + +struct old_sa_defrag_extent { +	struct list_head list; +	struct new_sa_defrag_extent *new; + +	u64 extent_offset; +	u64 bytenr; +	u64 offset; +	u64 len; +	int count; +}; + +struct new_sa_defrag_extent { +	struct rb_root root; +	struct list_head head; +	struct btrfs_path *path; +	struct inode *inode; +	u64 file_pos; +	u64 len; +	u64 bytenr; +	u64 disk_len; +	u8 compress_type; +}; + +static int backref_comp(struct sa_defrag_extent_backref *b1, +			struct sa_defrag_extent_backref *b2) +{ +	if (b1->root_id < b2->root_id) +		return -1; +	else if (b1->root_id > b2->root_id) +		return 1; + +	if (b1->inum < b2->inum) +		return -1; +	else if (b1->inum > b2->inum) +		return 1; + +	if (b1->file_pos < b2->file_pos) +		return -1; +	else if (b1->file_pos > b2->file_pos) +		return 1; + +	/* +	 * [------------------------------] ===> (a range of space) +	 *     |<--->|   |<---->| =============> (fs/file tree A) +	 * |<---------------------------->| ===> (fs/file tree B) +	 * +	 * A range of space can refer to two file extents in one tree while +	 * refer to only one file extent in another tree. +	 * +	 * So we may process a disk offset more than one time(two extents in A) +	 * and locate at the same extent(one extent in B), then insert two same +	 * backrefs(both refer to the extent in B). 
+	 */
 	return 0;
 }
 
+static void backref_insert(struct rb_root *root,
+			   struct sa_defrag_extent_backref *backref)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct sa_defrag_extent_backref *entry;
+	int ret;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
+
+		ret = backref_comp(backref, entry);
+		if (ret < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&backref->node, parent, p);
+	rb_insert_color(&backref->node, root);
+}
+
 /*
- * helper function for btrfs_finish_ordered_io, this
- * just reads in some of the csum leaves to prime them into ram
- * before we start the transaction.  It limits the amount of btree
- * reads required while inside the transaction.
+ * Note the backref might have changed, and in this case we just return 0.
  */
+static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
+				       void *ctx)
+{
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_fs_info *fs_info;
+	struct old_sa_defrag_extent *old = ctx;
+	struct new_sa_defrag_extent *new = old->new;
+	struct btrfs_path *path = new->path;
+	struct btrfs_key key;
+	struct btrfs_root *root;
+	struct sa_defrag_extent_backref *backref;
+	struct extent_buffer *leaf;
+	struct inode *inode = new->inode;
+	int slot;
+	int ret;
+	u64 extent_offset;
+	u64 num_bytes;
+
+	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
+	    inum == btrfs_ino(inode))
+		return 0;
+
+	key.objectid = root_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	fs_info = BTRFS_I(inode)->root->fs_info;
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		if (PTR_ERR(root) == -ENOENT)
+			return 0;
+		WARN_ON(1);
+		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
+			 inum, offset, root_id);
+		return PTR_ERR(root);
+	}
+
+	key.objectid = inum;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	if (offset > (u64)-1 << 32)
+		key.offset = 0;
+	else
+		key.offset = offset;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (WARN_ON(ret < 0))
+		return ret;
+	ret = 0;
+
+	while (1) {
+		cond_resched();
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = 0;
+				goto out;
+			}
+			continue;
+		}
+
+		path->slots[0]++;
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid > inum)
+			goto out;
+
+		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
+			continue;
+
+		/*
+		 * 'offset' refers to the exact key.offset,
+		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
+		 * (key.offset - extent_offset).
+		 */
+		if (key.offset != offset)
+			continue;
+
+		extent_offset = btrfs_file_extent_offset(leaf, extent);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+
+		if (extent_offset >= old->extent_offset + old->offset +
+		    old->len || extent_offset + num_bytes <=
+		    old->extent_offset + old->offset)
+			continue;
+		break;
+	}
+
+	backref = kmalloc(sizeof(*backref), GFP_NOFS);
+	if (!backref) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	backref->root_id = root_id;
+	backref->inum = inum;
+	backref->file_pos = offset;
+	backref->num_bytes = num_bytes;
+	backref->extent_offset = extent_offset;
+	backref->generation = btrfs_file_extent_generation(leaf, extent);
+	backref->old = old;
+	backref_insert(&new->root, backref);
+	old->count++;
+out:
+	btrfs_release_path(path);
+	WARN_ON(ret);
+	return ret;
+}
+
+static noinline bool record_extent_backrefs(struct btrfs_path *path,
+				   struct new_sa_defrag_extent *new)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+	struct old_sa_defrag_extent *old, *tmp;
+	int ret;
+
+	new->path = path;
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		ret = iterate_inodes_from_logical(old->bytenr +
+						  old->extent_offset, fs_info,
+						  path, record_one_backref,
+						  old);
+		if (ret < 0 && ret != -ENOENT)
+			return false;
+
+		/* no backref to be processed for this extent */
+		if (!old->count) {
+			list_del(&old->list);
+			kfree(old);
+		}
+	}
+
+	if (list_empty(&new->head))
+		return false;
+
+	return true;
+}
+
+static int relink_is_mergable(struct extent_buffer *leaf,
+			      struct btrfs_file_extent_item *fi,
+			      struct new_sa_defrag_extent *new)
+{
+	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
+		return 0;
+
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
+		return 0;
+
+	if (btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Note the backref might have changed, and in this case we just return 0.
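+ * A stale backref is detected by comparing the file extent generation
+ * recorded at scan time against what is on disk when we relink.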
+ */ +static noinline int relink_extent_backref(struct btrfs_path *path, +				 struct sa_defrag_extent_backref *prev, +				 struct sa_defrag_extent_backref *backref) +{ +	struct btrfs_file_extent_item *extent; +	struct btrfs_file_extent_item *item; +	struct btrfs_ordered_extent *ordered; +	struct btrfs_trans_handle *trans; +	struct btrfs_fs_info *fs_info; +	struct btrfs_root *root; +	struct btrfs_key key; +	struct extent_buffer *leaf; +	struct old_sa_defrag_extent *old = backref->old; +	struct new_sa_defrag_extent *new = old->new; +	struct inode *src_inode = new->inode; +	struct inode *inode; +	struct extent_state *cached = NULL; +	int ret = 0; +	u64 start; +	u64 len; +	u64 lock_start; +	u64 lock_end; +	bool merge = false; +	int index; + +	if (prev && prev->root_id == backref->root_id && +	    prev->inum == backref->inum && +	    prev->file_pos + prev->num_bytes == backref->file_pos) +		merge = true; + +	/* step 1: get root */ +	key.objectid = backref->root_id; +	key.type = BTRFS_ROOT_ITEM_KEY; +	key.offset = (u64)-1; + +	fs_info = BTRFS_I(src_inode)->root->fs_info; +	index = srcu_read_lock(&fs_info->subvol_srcu); + +	root = btrfs_read_fs_root_no_name(fs_info, &key); +	if (IS_ERR(root)) { +		srcu_read_unlock(&fs_info->subvol_srcu, index); +		if (PTR_ERR(root) == -ENOENT) +			return 0; +		return PTR_ERR(root); +	} + +	if (btrfs_root_readonly(root)) { +		srcu_read_unlock(&fs_info->subvol_srcu, index); +		return 0; +	} + +	/* step 2: get inode */ +	key.objectid = backref->inum; +	key.type = BTRFS_INODE_ITEM_KEY; +	key.offset = 0; + +	inode = btrfs_iget(fs_info->sb, &key, root, NULL); +	if (IS_ERR(inode)) { +		srcu_read_unlock(&fs_info->subvol_srcu, index); +		return 0; +	} + +	srcu_read_unlock(&fs_info->subvol_srcu, index); + +	/* step 3: relink backref */ +	lock_start = backref->file_pos; +	lock_end = backref->file_pos + backref->num_bytes - 1; +	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, +			 0, &cached); + +	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); +	if (ordered) { +		btrfs_put_ordered_extent(ordered); +		goto out_unlock; +	} + +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		goto out_unlock; +	} + +	key.objectid = backref->inum; +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = backref->file_pos; + +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) { +		goto out_free_path; +	} else if (ret > 0) { +		ret = 0; +		goto out_free_path; +	} + +	extent = btrfs_item_ptr(path->nodes[0], path->slots[0], +				struct btrfs_file_extent_item); + +	if (btrfs_file_extent_generation(path->nodes[0], extent) != +	    backref->generation) +		goto out_free_path; + +	btrfs_release_path(path); + +	start = backref->file_pos; +	if (backref->extent_offset < old->extent_offset + old->offset) +		start += old->extent_offset + old->offset - +			 backref->extent_offset; + +	len = min(backref->extent_offset + backref->num_bytes, +		  old->extent_offset + old->offset + old->len); +	len -= max(backref->extent_offset, old->extent_offset + old->offset); + +	ret = btrfs_drop_extents(trans, root, inode, start, +				 start + len, 1); +	if (ret) +		goto out_free_path; +again: +	key.objectid = btrfs_ino(inode); +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = start; + +	path->leave_spinning = 1; +	if (merge) { +		struct btrfs_file_extent_item *fi; +		u64 extent_len; +		struct btrfs_key found_key; + +		ret = btrfs_search_slot(trans, root, &key, path, 0, 1); +		if (ret < 0) +			goto out_free_path; + +		path->slots[0]--; +		
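+		/*
+		 * The slot just before 'start' now points at the preceding
+		 * file extent item.  If it ends exactly at 'start' and is
+		 * mergable with the defragged extent (same bytenr and
+		 * compression, no encryption), grow it in place instead of
+		 * inserting a new item.
+		 */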
leaf = path->nodes[0]; +		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + +		fi = btrfs_item_ptr(leaf, path->slots[0], +				    struct btrfs_file_extent_item); +		extent_len = btrfs_file_extent_num_bytes(leaf, fi); + +		if (extent_len + found_key.offset == start && +		    relink_is_mergable(leaf, fi, new)) { +			btrfs_set_file_extent_num_bytes(leaf, fi, +							extent_len + len); +			btrfs_mark_buffer_dirty(leaf); +			inode_add_bytes(inode, len); + +			ret = 1; +			goto out_free_path; +		} else { +			merge = false; +			btrfs_release_path(path); +			goto again; +		} +	} + +	ret = btrfs_insert_empty_item(trans, root, path, &key, +					sizeof(*extent)); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_free_path; +	} + +	leaf = path->nodes[0]; +	item = btrfs_item_ptr(leaf, path->slots[0], +				struct btrfs_file_extent_item); +	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); +	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); +	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); +	btrfs_set_file_extent_num_bytes(leaf, item, len); +	btrfs_set_file_extent_ram_bytes(leaf, item, new->len); +	btrfs_set_file_extent_generation(leaf, item, trans->transid); +	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); +	btrfs_set_file_extent_compression(leaf, item, new->compress_type); +	btrfs_set_file_extent_encryption(leaf, item, 0); +	btrfs_set_file_extent_other_encoding(leaf, item, 0); + +	btrfs_mark_buffer_dirty(leaf); +	inode_add_bytes(inode, len); +	btrfs_release_path(path); + +	ret = btrfs_inc_extent_ref(trans, root, new->bytenr, +			new->disk_len, 0, +			backref->root_id, backref->inum, +			new->file_pos, 0);	/* start - extent_offset */ +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_free_path; +	} + +	ret = 1; +out_free_path: +	btrfs_release_path(path); +	path->leave_spinning = 0; +	btrfs_end_transaction(trans, root); +out_unlock: +	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, +			     &cached, GFP_NOFS); +	iput(inode); +	return ret; +} + +static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) +{ +	struct old_sa_defrag_extent *old, *tmp; + +	if (!new) +		return; + +	list_for_each_entry_safe(old, tmp, &new->head, list) { +		list_del(&old->list); +		kfree(old); +	} +	kfree(new); +} + +static void relink_file_extents(struct new_sa_defrag_extent *new) +{ +	struct btrfs_path *path; +	struct sa_defrag_extent_backref *backref; +	struct sa_defrag_extent_backref *prev = NULL; +	struct inode *inode; +	struct btrfs_root *root; +	struct rb_node *node; +	int ret; + +	inode = new->inode; +	root = BTRFS_I(inode)->root; + +	path = btrfs_alloc_path(); +	if (!path) +		return; + +	if (!record_extent_backrefs(path, new)) { +		btrfs_free_path(path); +		goto out; +	} +	btrfs_release_path(path); + +	while (1) { +		node = rb_first(&new->root); +		if (!node) +			break; +		rb_erase(node, &new->root); + +		backref = rb_entry(node, struct sa_defrag_extent_backref, node); + +		ret = relink_extent_backref(path, prev, backref); +		WARN_ON(ret < 0); + +		kfree(prev); + +		if (ret == 1) +			prev = backref; +		else +			prev = NULL; +		cond_resched(); +	} +	kfree(prev); + +	btrfs_free_path(path); +out: +	free_sa_defrag_extent(new); + +	atomic_dec(&root->fs_info->defrag_running); +	wake_up(&root->fs_info->transaction_wait); +} + +static struct new_sa_defrag_extent * +record_old_file_extents(struct inode *inode, +			struct btrfs_ordered_extent *ordered) +{ +	struct btrfs_root *root = 
BTRFS_I(inode)->root; +	struct btrfs_path *path; +	struct btrfs_key key; +	struct old_sa_defrag_extent *old; +	struct new_sa_defrag_extent *new; +	int ret; + +	new = kmalloc(sizeof(*new), GFP_NOFS); +	if (!new) +		return NULL; + +	new->inode = inode; +	new->file_pos = ordered->file_offset; +	new->len = ordered->len; +	new->bytenr = ordered->start; +	new->disk_len = ordered->disk_len; +	new->compress_type = ordered->compress_type; +	new->root = RB_ROOT; +	INIT_LIST_HEAD(&new->head); + +	path = btrfs_alloc_path(); +	if (!path) +		goto out_kfree; + +	key.objectid = btrfs_ino(inode); +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = new->file_pos; + +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) +		goto out_free_path; +	if (ret > 0 && path->slots[0] > 0) +		path->slots[0]--; + +	/* find out all the old extents for the file range */ +	while (1) { +		struct btrfs_file_extent_item *extent; +		struct extent_buffer *l; +		int slot; +		u64 num_bytes; +		u64 offset; +		u64 end; +		u64 disk_bytenr; +		u64 extent_offset; + +		l = path->nodes[0]; +		slot = path->slots[0]; + +		if (slot >= btrfs_header_nritems(l)) { +			ret = btrfs_next_leaf(root, path); +			if (ret < 0) +				goto out_free_path; +			else if (ret > 0) +				break; +			continue; +		} + +		btrfs_item_key_to_cpu(l, &key, slot); + +		if (key.objectid != btrfs_ino(inode)) +			break; +		if (key.type != BTRFS_EXTENT_DATA_KEY) +			break; +		if (key.offset >= new->file_pos + new->len) +			break; + +		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); + +		num_bytes = btrfs_file_extent_num_bytes(l, extent); +		if (key.offset + num_bytes < new->file_pos) +			goto next; + +		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); +		if (!disk_bytenr) +			goto next; + +		extent_offset = btrfs_file_extent_offset(l, extent); + +		old = kmalloc(sizeof(*old), GFP_NOFS); +		if (!old) +			goto out_free_path; + +		offset = max(new->file_pos, key.offset); +		end = min(new->file_pos + new->len, key.offset + num_bytes); + +		old->bytenr = disk_bytenr; +		old->extent_offset = extent_offset; +		old->offset = offset - key.offset; +		old->len = end - offset; +		old->new = new; +		old->count = 0; +		list_add_tail(&old->list, &new->head); +next: +		path->slots[0]++; +		cond_resched(); +	} + +	btrfs_free_path(path); +	atomic_inc(&root->fs_info->defrag_running); + +	return new; + +out_free_path: +	btrfs_free_path(path); +out_kfree: +	free_sa_defrag_extent(new); +	return NULL; +} + +static void btrfs_release_delalloc_bytes(struct btrfs_root *root, +					 u64 start, u64 len) +{ +	struct btrfs_block_group_cache *cache; + +	cache = btrfs_lookup_block_group(root->fs_info, start); +	ASSERT(cache); + +	spin_lock(&cache->lock); +	cache->delalloc_bytes -= len; +	spin_unlock(&cache->lock); + +	btrfs_put_block_group(cache); +} +  /* as ordered data IO finishes, this gets called so we can finish   * an ordered extent if the range of bytes in the file it covers are   * fully written.   
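 * It inserts the file extent item, adds the pending csums, updates the
 * on-disk i_size and releases the reservations that were taken when the
 * delalloc range was created.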
*/ -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  { +	struct inode *inode = ordered_extent->inode;  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_trans_handle *trans = NULL; -	struct btrfs_ordered_extent *ordered_extent = NULL;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct extent_state *cached_state = NULL; -	int compressed = 0; -	int ret; -	bool nolock = false; +	struct new_sa_defrag_extent *new = NULL; +	int compress_type = 0; +	int ret = 0; +	u64 logical_len = ordered_extent->len; +	bool nolock; +	bool truncated = false; -	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, -					     end - start + 1); -	if (!ret) -		return 0; -	BUG_ON(!ordered_extent); +	nolock = btrfs_is_free_space_inode(inode); + +	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { +		ret = -EIO; +		goto out; +	} -	nolock = (root == root->fs_info->tree_root); +	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { +		truncated = true; +		logical_len = ordered_extent->truncated_len; +		/* Truncated the entire extent, don't bother adding */ +		if (!logical_len) +			goto out; +	}  	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { -		BUG_ON(!list_empty(&ordered_extent->list)); -		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); -		if (!ret) { -			if (nolock) -				trans = btrfs_join_transaction_nolock(root, 1); -			else -				trans = btrfs_join_transaction(root, 1); -			BUG_ON(!trans); -			btrfs_set_trans_block_group(trans, inode); -			trans->block_rsv = &root->fs_info->delalloc_block_rsv; -			ret = btrfs_update_inode(trans, root, inode); -			BUG_ON(ret); +		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ +		btrfs_ordered_update_i_size(inode, 0, ordered_extent); +		if (nolock) +			trans = btrfs_join_transaction_nolock(root); +		else +			trans = btrfs_join_transaction(root); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans); +			trans = NULL; +			goto out;  		} +		trans->block_rsv = &root->fs_info->delalloc_block_rsv; +		ret = btrfs_update_inode_fallback(trans, root, inode); +		if (ret) /* -ENOMEM or corruption */ +			btrfs_abort_transaction(trans, root, ret);  		goto out;  	}  	lock_extent_bits(io_tree, ordered_extent->file_offset,  			 ordered_extent->file_offset + ordered_extent->len - 1, -			 0, &cached_state, GFP_NOFS); +			 0, &cached_state); + +	ret = test_range_bit(io_tree, ordered_extent->file_offset, +			ordered_extent->file_offset + ordered_extent->len - 1, +			EXTENT_DEFRAG, 1, cached_state); +	if (ret) { +		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); +		if (0 && last_snapshot >= BTRFS_I(inode)->generation) +			/* the inode is shared */ +			new = record_old_file_extents(inode, ordered_extent); + +		clear_extent_bit(io_tree, ordered_extent->file_offset, +			ordered_extent->file_offset + ordered_extent->len - 1, +			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); +	}  	if (nolock) -		trans = btrfs_join_transaction_nolock(root, 1); +		trans = btrfs_join_transaction_nolock(root);  	else -		trans = btrfs_join_transaction(root, 1); -	btrfs_set_trans_block_group(trans, inode); +		trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		trans = NULL; +		goto out_unlock; +	} +  	trans->block_rsv = &root->fs_info->delalloc_block_rsv;  	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) -		compressed = 1; +		compress_type = 
ordered_extent->compress_type;  	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { -		BUG_ON(compressed); +		BUG_ON(compress_type);  		ret = btrfs_mark_extent_written(trans, inode,  						ordered_extent->file_offset,  						ordered_extent->file_offset + -						ordered_extent->len); -		BUG_ON(ret); +						logical_len);  	} else {  		BUG_ON(root == root->fs_info->tree_root);  		ret = insert_reserved_file_extent(trans, inode,  						ordered_extent->file_offset,  						ordered_extent->start,  						ordered_extent->disk_len, -						ordered_extent->len, -						ordered_extent->len, -						compressed, 0, 0, +						logical_len, logical_len, +						compress_type, 0, 0,  						BTRFS_FILE_EXTENT_REG); -		unpin_extent_cache(&BTRFS_I(inode)->extent_tree, -				   ordered_extent->file_offset, -				   ordered_extent->len); -		BUG_ON(ret); +		if (!ret) +			btrfs_release_delalloc_bytes(root, +						     ordered_extent->start, +						     ordered_extent->disk_len); +	} +	unpin_extent_cache(&BTRFS_I(inode)->extent_tree, +			   ordered_extent->file_offset, ordered_extent->len, +			   trans->transid); +	if (ret < 0) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_unlock;  	} -	unlock_extent_cached(io_tree, ordered_extent->file_offset, -			     ordered_extent->file_offset + -			     ordered_extent->len - 1, &cached_state, GFP_NOFS);  	add_pending_csums(trans, inode, ordered_extent->file_offset,  			  &ordered_extent->list);  	btrfs_ordered_update_i_size(inode, 0, ordered_extent); -	ret = btrfs_update_inode(trans, root, inode); -	BUG_ON(ret); +	ret = btrfs_update_inode_fallback(trans, root, inode); +	if (ret) { /* -ENOMEM or corruption */ +		btrfs_abort_transaction(trans, root, ret); +		goto out_unlock; +	} +	ret = 0; +out_unlock: +	unlock_extent_cached(io_tree, ordered_extent->file_offset, +			     ordered_extent->file_offset + +			     ordered_extent->len - 1, &cached_state, GFP_NOFS);  out: -	if (nolock) { -		if (trans) -			btrfs_end_transaction_nolock(trans, root); -	} else { +	if (root != root->fs_info->tree_root)  		btrfs_delalloc_release_metadata(inode, ordered_extent->len); -		if (trans) -			btrfs_end_transaction(trans, root); +	if (trans) +		btrfs_end_transaction(trans, root); + +	if (ret || truncated) { +		u64 start, end; + +		if (truncated) +			start = ordered_extent->file_offset + logical_len; +		else +			start = ordered_extent->file_offset; +		end = ordered_extent->file_offset + ordered_extent->len - 1; +		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); + +		/* Drop the cache for the part of the extent we didn't write. */ +		btrfs_drop_extent_cache(inode, start, end, 0); + +		/* +		 * If the ordered extent had an IOERR or something else went +		 * wrong we need to return the space for this ordered extent +		 * back to the allocator.  We only free the extent in the +		 * truncated case if we didn't write out the extent at all. +		 */ +		if ((ret || !logical_len) && +		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && +		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) +			btrfs_free_reserved_extent(root, ordered_extent->start, +						   ordered_extent->disk_len, 1); +	} + + +	/* +	 * This needs to be done to make sure anybody waiting knows we are done +	 * updating everything for this ordered extent. 
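+	 * btrfs_remove_ordered_extent() drops the extent from the inode's
+	 * ordered tree, marks it complete and wakes any waiters.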
+	 */ +	btrfs_remove_ordered_extent(inode, ordered_extent); + +	/* for snapshot-aware defrag */ +	if (new) { +		if (ret) { +			free_sa_defrag_extent(new); +			atomic_dec(&root->fs_info->defrag_running); +		} else { +			relink_file_extents(new); +		}  	}  	/* once for us */ @@ -1759,174 +2794,60 @@ out:  	/* once for the tree */  	btrfs_put_ordered_extent(ordered_extent); -	return 0; +	return ret;  } -static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, -				struct extent_state *state, int uptodate) +static void finish_ordered_fn(struct btrfs_work *work)  { -	ClearPagePrivate2(page); -	return btrfs_finish_ordered_io(page->mapping->host, start, end); +	struct btrfs_ordered_extent *ordered_extent; +	ordered_extent = container_of(work, struct btrfs_ordered_extent, work); +	btrfs_finish_ordered_io(ordered_extent);  } -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data.  This - * io_failure_record is used to record state as we go through all the - * mirrors.  If another mirror has good data, the page is set up to date - * and things continue.  If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. - */ -struct io_failure_record { -	struct page *page; -	u64 start; -	u64 len; -	u64 logical; -	unsigned long bio_flags; -	int last_mirror; -}; - -static int btrfs_io_failed_hook(struct bio *failed_bio, -			 struct page *page, u64 start, u64 end, -			 struct extent_state *state) +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, +				struct extent_state *state, int uptodate)  { -	struct io_failure_record *failrec = NULL; -	u64 private; -	struct extent_map *em;  	struct inode *inode = page->mapping->host; -	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; -	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; -	struct bio *bio; -	int num_copies; -	int ret; -	int rw; -	u64 logical; - -	ret = get_state_private(failure_tree, start, &private); -	if (ret) { -		failrec = kmalloc(sizeof(*failrec), GFP_NOFS); -		if (!failrec) -			return -ENOMEM; -		failrec->start = start; -		failrec->len = end - start + 1; -		failrec->last_mirror = 0; -		failrec->bio_flags = 0; - -		read_lock(&em_tree->lock); -		em = lookup_extent_mapping(em_tree, start, failrec->len); -		if (em->start > start || em->start + em->len < start) { -			free_extent_map(em); -			em = NULL; -		} -		read_unlock(&em_tree->lock); +	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct btrfs_ordered_extent *ordered_extent = NULL; +	struct btrfs_workqueue *workers; -		if (!em || IS_ERR(em)) { -			kfree(failrec); -			return -EIO; -		} -		logical = start - em->start; -		logical = em->block_start + logical; -		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { -			logical = em->block_start; -			failrec->bio_flags = EXTENT_BIO_COMPRESSED; -		} -		failrec->logical = logical; -		free_extent_map(em); -		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | -				EXTENT_DIRTY, GFP_NOFS); -		set_state_private(failure_tree, start, -				 (u64)(unsigned long)failrec); -	} else { -		failrec = (struct io_failure_record *)(unsigned long)private; -	} -	num_copies = btrfs_num_copies( -			      &BTRFS_I(inode)->root->fs_info->mapping_tree, -			      failrec->logical, failrec->len); -	failrec->last_mirror++; -	if (!state) { -		spin_lock(&BTRFS_I(inode)->io_tree.lock); -		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, -						    
failrec->start, -						    EXTENT_LOCKED); -		if (state && state->start != failrec->start) -			state = NULL; -		spin_unlock(&BTRFS_I(inode)->io_tree.lock); -	} -	if (!state || failrec->last_mirror > num_copies) { -		set_state_private(failure_tree, failrec->start, 0); -		clear_extent_bits(failure_tree, failrec->start, -				  failrec->start + failrec->len - 1, -				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); -		kfree(failrec); -		return -EIO; -	} -	bio = bio_alloc(GFP_NOFS, 1); -	bio->bi_private = state; -	bio->bi_end_io = failed_bio->bi_end_io; -	bio->bi_sector = failrec->logical >> 9; -	bio->bi_bdev = failed_bio->bi_bdev; -	bio->bi_size = 0; +	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); -	bio_add_page(bio, page, failrec->len, start - page_offset(page)); -	if (failed_bio->bi_rw & REQ_WRITE) -		rw = WRITE; -	else -		rw = READ; +	ClearPagePrivate2(page); +	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, +					    end - start + 1, uptodate)) +		return 0; -	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, -						      failrec->last_mirror, -						      failrec->bio_flags, 0); -	return 0; -} +	btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -static int btrfs_clean_io_failures(struct inode *inode, u64 start) -{ -	u64 private; -	u64 private_failure; -	struct io_failure_record *failure; -	int ret; +	if (btrfs_is_free_space_inode(inode)) +		workers = root->fs_info->endio_freespace_worker; +	else +		workers = root->fs_info->endio_write_workers; +	btrfs_queue_work(workers, &ordered_extent->work); -	private = 0; -	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, -			     (u64)-1, 1, EXTENT_DIRTY)) { -		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, -					start, &private_failure); -		if (ret == 0) { -			failure = (struct io_failure_record *)(unsigned long) -				   private_failure; -			set_state_private(&BTRFS_I(inode)->io_failure_tree, -					  failure->start, 0); -			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, -					  failure->start, -					  failure->start + failure->len - 1, -					  EXTENT_DIRTY | EXTENT_LOCKED, -					  GFP_NOFS); -			kfree(failure); -		} -	}  	return 0;  }  /*   * when reads are done, we need to check csums to verify the data is correct - * if there's a match, we allow the bio to finish.  If not, we go through - * the io_failure_record routines to find good copies + * if there's a match, we allow the bio to finish.  If not, the code in + * extent_io.c will try to find good copies for us.   
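+ * Returning -EIO from here is what kicks off that repair path.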
*/ -static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, -			       struct extent_state *state) +static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, +				      u64 phy_offset, struct page *page, +				      u64 start, u64 end, int mirror)  { -	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); +	size_t offset = start - page_offset(page);  	struct inode *inode = page->mapping->host;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	char *kaddr; -	u64 private = ~(u32)0; -	int ret;  	struct btrfs_root *root = BTRFS_I(inode)->root; +	u32 csum_expected;  	u32 csum = ~(u32)0; +	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, +	                              DEFAULT_RATELIMIT_BURST);  	if (PageChecked(page)) {  		ClearPageChecked(page); @@ -1934,7 +2855,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,  	}  	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) -		return 0; +		goto good;  	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&  	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { @@ -1943,40 +2864,27 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,  		return 0;  	} -	if (state && state->start == start) { -		private = state->private; -		ret = 0; -	} else { -		ret = get_state_private(io_tree, start, &private); -	} -	kaddr = kmap_atomic(page, KM_USER0); -	if (ret) -		goto zeroit; +	phy_offset >>= inode->i_sb->s_blocksize_bits; +	csum_expected = *(((u32 *)io_bio->csum) + phy_offset); -	csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1); +	kaddr = kmap_atomic(page); +	csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);  	btrfs_csum_final(csum, (char *)&csum); -	if (csum != private) +	if (csum != csum_expected)  		goto zeroit; -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  good: -	/* if the io failure tree for this inode is non-empty, -	 * check to see if we've recovered from a failed IO -	 */ -	btrfs_clean_io_failures(inode, start);  	return 0;  zeroit: -	if (printk_ratelimit()) { -		printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " -		       "private %llu\n", page->mapping->host->i_ino, -		       (unsigned long long)start, csum, -		       (unsigned long long)private); -	} +	if (__ratelimit(&_rs)) +		btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", +			btrfs_ino(page->mapping->host), start, csum, csum_expected);  	memset(kaddr + offset, 1, end - start + 1);  	flush_dcache_page(page); -	kunmap_atomic(kaddr, KM_USER0); -	if (private == 0) +	kunmap_atomic(kaddr); +	if (csum_expected == 0)  		return 0;  	return -EIO;  } @@ -1986,6 +2894,8 @@ struct delayed_iput {  	struct inode *inode;  }; +/* JDM: If this is fs-wide, why can't we add a pointer to + * btrfs_inode instead and avoid the allocation? 
*/  void btrfs_add_delayed_iput(struct inode *inode)  {  	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -2015,7 +2925,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)  	if (empty)  		return; -	down_read(&root->fs_info->cleanup_work_sem);  	spin_lock(&fs_info->delayed_iput_lock);  	list_splice_init(&fs_info->delayed_iputs, &list);  	spin_unlock(&fs_info->delayed_iput_lock); @@ -2026,123 +2935,52 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)  		iput(delayed->inode);  		kfree(delayed);  	} -	up_read(&root->fs_info->cleanup_work_sem);  }  /* - * calculate extra metadata reservation when snapshotting a subvolume - * contains orphan files. + * This is called in transaction commit time. If there are no orphan + * files in the subvolume, it removes orphan item and frees block_rsv + * structure.   */ -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, -				struct btrfs_pending_snapshot *pending, -				u64 *bytes_to_reserve) +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, +			      struct btrfs_root *root)  { -	struct btrfs_root *root;  	struct btrfs_block_rsv *block_rsv; -	u64 num_bytes; -	int index; +	int ret; -	root = pending->root; -	if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) +	if (atomic_read(&root->orphan_inodes) || +	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)  		return; -	block_rsv = root->orphan_block_rsv; - -	/* orphan block reservation for the snapshot */ -	num_bytes = block_rsv->size; - -	/* -	 * after the snapshot is created, COWing tree blocks may use more -	 * space than it frees. So we should make sure there is enough -	 * reserved space. -	 */ -	index = trans->transid & 0x1; -	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { -		num_bytes += block_rsv->size - -			     (block_rsv->reserved + block_rsv->freed[index]); +	spin_lock(&root->orphan_lock); +	if (atomic_read(&root->orphan_inodes)) { +		spin_unlock(&root->orphan_lock); +		return;  	} -	*bytes_to_reserve += num_bytes; -} - -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, -				struct btrfs_pending_snapshot *pending) -{ -	struct btrfs_root *root = pending->root; -	struct btrfs_root *snap = pending->snap; -	struct btrfs_block_rsv *block_rsv; -	u64 num_bytes; -	int index; -	int ret; - -	if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) +	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { +		spin_unlock(&root->orphan_lock);  		return; - -	/* refill source subvolume's orphan block reservation */ -	block_rsv = root->orphan_block_rsv; -	index = trans->transid & 0x1; -	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { -		num_bytes = block_rsv->size - -			    (block_rsv->reserved + block_rsv->freed[index]); -		ret = btrfs_block_rsv_migrate(&pending->block_rsv, -					      root->orphan_block_rsv, -					      num_bytes); -		BUG_ON(ret);  	} -	/* setup orphan block reservation for the snapshot */ -	block_rsv = btrfs_alloc_block_rsv(snap); -	BUG_ON(!block_rsv); - -	btrfs_add_durable_block_rsv(root->fs_info, block_rsv); -	snap->orphan_block_rsv = block_rsv; - -	num_bytes = root->orphan_block_rsv->size; -	ret = btrfs_block_rsv_migrate(&pending->block_rsv, -				      block_rsv, num_bytes); -	BUG_ON(ret); - -#if 0 -	/* insert orphan item for the snapshot */ -	WARN_ON(!root->orphan_item_inserted); -	ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, -				       snap->root_key.objectid); -	BUG_ON(ret); -	snap->orphan_item_inserted = 1; -#endif -} - -enum 
btrfs_orphan_cleanup_state { -	ORPHAN_CLEANUP_STARTED	= 1, -	ORPHAN_CLEANUP_DONE	= 2, -}; - -/* - * This is called in transaction commmit time. If there are no orphan - * files in the subvolume, it removes orphan item and frees block_rsv - * structure. - */ -void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, -			      struct btrfs_root *root) -{ -	int ret; - -	if (!list_empty(&root->orphan_list) || -	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) -		return; +	block_rsv = root->orphan_block_rsv; +	root->orphan_block_rsv = NULL; +	spin_unlock(&root->orphan_lock); -	if (root->orphan_item_inserted && +	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&  	    btrfs_root_refs(&root->root_item) > 0) {  		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,  					    root->root_key.objectid); -		BUG_ON(ret); -		root->orphan_item_inserted = 0; +		if (ret) +			btrfs_abort_transaction(trans, root, ret); +		else +			clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, +				  &root->state);  	} -	if (root->orphan_block_rsv) { -		WARN_ON(root->orphan_block_rsv->size > 0); -		btrfs_free_block_rsv(root, root->orphan_block_rsv); -		root->orphan_block_rsv = NULL; +	if (block_rsv) { +		WARN_ON(block_rsv->size > 0); +		btrfs_free_block_rsv(root, block_rsv);  	}  } @@ -2162,8 +3000,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  	int ret;  	if (!root->orphan_block_rsv) { -		block_rsv = btrfs_alloc_block_rsv(root); -		BUG_ON(!block_rsv); +		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); +		if (!block_rsv) +			return -ENOMEM;  	}  	spin_lock(&root->orphan_lock); @@ -2174,8 +3013,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  		block_rsv = NULL;  	} -	if (list_empty(&BTRFS_I(inode)->i_orphan)) { -		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); +	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +			      &BTRFS_I(inode)->runtime_flags)) {  #if 0  		/*  		 * For proper ENOSPC handling, we should do orphan @@ -2188,36 +3027,48 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)  			insert = 1;  #endif  		insert = 1; -	} else { -		WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved); +		atomic_inc(&root->orphan_inodes);  	} -	if (!BTRFS_I(inode)->orphan_meta_reserved) { -		BTRFS_I(inode)->orphan_meta_reserved = 1; +	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, +			      &BTRFS_I(inode)->runtime_flags))  		reserve = 1; -	}  	spin_unlock(&root->orphan_lock); -	if (block_rsv) -		btrfs_add_durable_block_rsv(root->fs_info, block_rsv); -  	/* grab metadata reservation from transaction handle */  	if (reserve) {  		ret = btrfs_orphan_reserve_metadata(trans, inode); -		BUG_ON(ret); +		BUG_ON(ret); /* -ENOSPC in reservation; Logic error? 
JDM */  	}  	/* insert an orphan item to track this unlinked/truncated file */  	if (insert >= 1) { -		ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); -		BUG_ON(ret); +		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); +		if (ret) { +			atomic_dec(&root->orphan_inodes); +			if (reserve) { +				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, +					  &BTRFS_I(inode)->runtime_flags); +				btrfs_orphan_release_metadata(inode); +			} +			if (ret != -EEXIST) { +				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +					  &BTRFS_I(inode)->runtime_flags); +				btrfs_abort_transaction(trans, root, ret); +				return ret; +			} +		} +		ret = 0;  	}  	/* insert an orphan item to track subvolume contains orphan files */  	if (insert >= 2) {  		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,  					       root->root_key.objectid); -		BUG_ON(ret); +		if (ret && ret != -EEXIST) { +			btrfs_abort_transaction(trans, root, ret); +			return ret; +		}  	}  	return 0;  } @@ -2226,7 +3077,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)   * We have done the truncate/delete so we can go ahead and remove the orphan   * item for this particular inode.   */ -int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +static int btrfs_orphan_del(struct btrfs_trans_handle *trans, +			    struct inode *inode)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	int delete_item = 0; @@ -2234,46 +3086,50 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)  	int ret = 0;  	spin_lock(&root->orphan_lock); -	if (!list_empty(&BTRFS_I(inode)->i_orphan)) { -		list_del_init(&BTRFS_I(inode)->i_orphan); +	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +			       &BTRFS_I(inode)->runtime_flags))  		delete_item = 1; -	} -	if (BTRFS_I(inode)->orphan_meta_reserved) { -		BTRFS_I(inode)->orphan_meta_reserved = 0; +	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, +			       &BTRFS_I(inode)->runtime_flags))  		release_rsv = 1; -	}  	spin_unlock(&root->orphan_lock); -	if (trans && delete_item) { -		ret = btrfs_del_orphan_item(trans, root, inode->i_ino); -		BUG_ON(ret); +	if (delete_item) { +		atomic_dec(&root->orphan_inodes); +		if (trans) +			ret = btrfs_del_orphan_item(trans, root, +						    btrfs_ino(inode));  	}  	if (release_rsv)  		btrfs_orphan_release_metadata(inode); -	return 0; +	return ret;  }  /*   * this cleans up any orphans that may be left on the list from the last use   * of this root.   
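 * Each orphan item found here names an inode that was unlinked or
 * truncated but never fully cleaned up; we iget() it and either redo the
 * truncate or let the final iput() delete it.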
*/ -void btrfs_orphan_cleanup(struct btrfs_root *root) +int btrfs_orphan_cleanup(struct btrfs_root *root)  {  	struct btrfs_path *path;  	struct extent_buffer *leaf;  	struct btrfs_key key, found_key;  	struct btrfs_trans_handle *trans;  	struct inode *inode; +	u64 last_objectid = 0;  	int ret = 0, nr_unlink = 0, nr_truncate = 0;  	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) -		return; +		return 0;  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	}  	path->reada = -1;  	key.objectid = BTRFS_ORPHAN_OBJECTID; @@ -2282,18 +3138,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)  	while (1) {  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -		if (ret < 0) { -			printk(KERN_ERR "Error searching slot for orphan: %d" -			       "\n", ret); -			break; -		} +		if (ret < 0) +			goto out;  		/*  		 * if ret == 0 means we found what we were searching for, which -		 * is weird, but possible, so only screw with path if we didnt +		 * is weird, but possible, so only screw with path if we didn't  		 * find the key and see if we have stuff that matches  		 */  		if (ret > 0) { +			ret = 0;  			if (path->slots[0] == 0)  				break;  			path->slots[0]--; @@ -2310,53 +3164,127 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)  			break;  		/* release the path since we're done with it */ -		btrfs_release_path(root, path); +		btrfs_release_path(path);  		/*  		 * this is where we are basically btrfs_lookup, without the  		 * crossing root thing.  we store the inode number in the  		 * offset of the orphan item.  		 */ + +		if (found_key.offset == last_objectid) { +			btrfs_err(root->fs_info, +				"Error removing orphan entry, stopping orphan cleanup"); +			ret = -EINVAL; +			goto out; +		} + +		last_objectid = found_key.offset; +  		found_key.objectid = found_key.offset;  		found_key.type = BTRFS_INODE_ITEM_KEY;  		found_key.offset = 0;  		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); -		BUG_ON(IS_ERR(inode)); +		ret = PTR_ERR_OR_ZERO(inode); +		if (ret && ret != -ESTALE) +			goto out; -		/* -		 * add this inode to the orphan list so btrfs_orphan_del does -		 * the proper thing when we hit it -		 */ -		spin_lock(&root->orphan_lock); -		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); -		spin_unlock(&root->orphan_lock); +		if (ret == -ESTALE && root == root->fs_info->tree_root) { +			struct btrfs_root *dead_root; +			struct btrfs_fs_info *fs_info = root->fs_info; +			int is_dead_root = 0; +			/* +			 * this is an orphan in the tree root. Currently these +			 * could come from 2 sources: +			 *  a) a snapshot deletion in progress +			 *  b) a free space cache inode +			 * We need to distinguish those two, as the snapshot +			 * orphan must not get deleted. 
+			 * find_dead_roots already ran before us, so if this +			 * is a snapshot deletion, we should find the root +			 * in the dead_roots list +			 */ +			spin_lock(&fs_info->trans_lock); +			list_for_each_entry(dead_root, &fs_info->dead_roots, +					    root_list) { +				if (dead_root->root_key.objectid == +				    found_key.objectid) { +					is_dead_root = 1; +					break; +				} +			} +			spin_unlock(&fs_info->trans_lock); +			if (is_dead_root) { +				/* prevent this orphan from being found again */ +				key.offset = found_key.objectid - 1; +				continue; +			} +		}  		/* -		 * if this is a bad inode, means we actually succeeded in -		 * removing the inode, but not the orphan record, which means -		 * we need to manually delete the orphan since iput will just -		 * do a destroy_inode +		 * Inode is already gone but the orphan item is still there, +		 * kill the orphan item.  		 */ -		if (is_bad_inode(inode)) { -			trans = btrfs_start_transaction(root, 0); -			btrfs_orphan_del(trans, inode); +		if (ret == -ESTALE) { +			trans = btrfs_start_transaction(root, 1); +			if (IS_ERR(trans)) { +				ret = PTR_ERR(trans); +				goto out; +			} +			btrfs_debug(root->fs_info, "auto deleting %Lu", +				found_key.objectid); +			ret = btrfs_del_orphan_item(trans, root, +						    found_key.objectid);  			btrfs_end_transaction(trans, root); -			iput(inode); +			if (ret) +				goto out;  			continue;  		} +		/* +		 * add this inode to the orphan list so btrfs_orphan_del does +		 * the proper thing when we hit it +		 */ +		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +			&BTRFS_I(inode)->runtime_flags); +		atomic_inc(&root->orphan_inodes); +  		/* if we have links, this was a truncate, lets do that */  		if (inode->i_nlink) { +			if (WARN_ON(!S_ISREG(inode->i_mode))) { +				iput(inode); +				continue; +			}  			nr_truncate++; -			btrfs_truncate(inode); + +			/* 1 for the orphan item deletion. 
*/ +			trans = btrfs_start_transaction(root, 1); +			if (IS_ERR(trans)) { +				iput(inode); +				ret = PTR_ERR(trans); +				goto out; +			} +			ret = btrfs_orphan_add(trans, inode); +			btrfs_end_transaction(trans, root); +			if (ret) { +				iput(inode); +				goto out; +			} + +			ret = btrfs_truncate(inode); +			if (ret) +				btrfs_orphan_del(NULL, inode);  		} else {  			nr_unlink++;  		}  		/* this will do delete_inode and everything for us */  		iput(inode); +		if (ret) +			goto out;  	} -	btrfs_free_path(path); +	/* release the path since we're done with it */ +	btrfs_release_path(path);  	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; @@ -2364,15 +3292,24 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)  		btrfs_block_rsv_release(root, root->orphan_block_rsv,  					(u64)-1); -	if (root->orphan_block_rsv || root->orphan_item_inserted) { -		trans = btrfs_join_transaction(root, 1); -		btrfs_end_transaction(trans, root); +	if (root->orphan_block_rsv || +	    test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { +		trans = btrfs_join_transaction(root); +		if (!IS_ERR(trans)) +			btrfs_end_transaction(trans, root);  	}  	if (nr_unlink) -		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); +		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);  	if (nr_truncate) -		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); +		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate); + +out: +	if (ret) +		btrfs_crit(root->fs_info, +			"could not do orphan cleanup %d", ret); +	btrfs_free_path(path); +	return ret;  }  /* @@ -2382,13 +3319,24 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)   * slot is the slot the inode is in, objectid is the objectid of the inode   */  static noinline int acls_after_inode_item(struct extent_buffer *leaf, -					  int slot, u64 objectid) +					  int slot, u64 objectid, +					  int *first_xattr_slot)  {  	u32 nritems = btrfs_header_nritems(leaf);  	struct btrfs_key found_key; +	static u64 xattr_access = 0; +	static u64 xattr_default = 0;  	int scanned = 0; +	if (!xattr_access) { +		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, +					strlen(POSIX_ACL_XATTR_ACCESS)); +		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, +					strlen(POSIX_ACL_XATTR_DEFAULT)); +	} +  	slot++; +	*first_xattr_slot = -1;  	while (slot < nritems) {  		btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -2397,8 +3345,13 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,  			return 0;  		/* we found an xattr, assume we've got an acl */ -		if (found_key.type == BTRFS_XATTR_ITEM_KEY) -			return 1; +		if (found_key.type == BTRFS_XATTR_ITEM_KEY) { +			if (*first_xattr_slot == -1) +				*first_xattr_slot = slot; +			if (found_key.offset == xattr_access || +			    found_key.offset == xattr_default) +				return 1; +		}  		/*  		 * we found a key greater than an xattr key, there can't @@ -2423,6 +3376,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,  	 * something larger than an xattr.  
We have to assume the inode  	 * has acls  	 */ +	if (*first_xattr_slot == -1) +		*first_xattr_slot = slot;  	return 1;  } @@ -2437,13 +3392,21 @@ static void btrfs_read_locked_inode(struct inode *inode)  	struct btrfs_timespec *tspec;  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_key location; +	unsigned long ptr;  	int maybe_acls; -	u64 alloc_group_block;  	u32 rdev;  	int ret; +	bool filled = false; +	int first_xattr_slot; + +	ret = btrfs_fill_inode(inode, &rdev); +	if (!ret) +		filled = true;  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) +		goto make_bad; +  	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));  	ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -2451,13 +3414,16 @@ static void btrfs_read_locked_inode(struct inode *inode)  		goto make_bad;  	leaf = path->nodes[0]; + +	if (filled) +		goto cache_index; +  	inode_item = btrfs_item_ptr(leaf, path->slots[0],  				    struct btrfs_inode_item); -  	inode->i_mode = btrfs_inode_mode(leaf, inode_item); -	inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); -	inode->i_uid = btrfs_inode_uid(leaf, inode_item); -	inode->i_gid = btrfs_inode_gid(leaf, inode_item); +	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); +	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); +	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));  	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));  	tspec = btrfs_inode_atime(inode_item); @@ -2474,7 +3440,19 @@ static void btrfs_read_locked_inode(struct inode *inode)  	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));  	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); -	BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); +	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + +	/* +	 * If we were modified in the current generation and evicted from memory +	 * and then re-read we need to do a full sync since we don't have any +	 * idea about which extents were modified before we were evicted from +	 * cache. 
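+	 * The flag forces the next fsync to copy the whole inode into the
+	 * log tree instead of relying on the in-memory record of modified
+	 * extents.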
+	 */ +	if (BTRFS_I(inode)->last_trans == root->fs_info->generation) +		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +			&BTRFS_I(inode)->runtime_flags); + +	inode->i_version = btrfs_inode_sequence(leaf, inode_item);  	inode->i_generation = BTRFS_I(inode)->generation;  	inode->i_rdev = 0;  	rdev = btrfs_inode_rdev(leaf, inode_item); @@ -2482,21 +3460,50 @@ static void btrfs_read_locked_inode(struct inode *inode)  	BTRFS_I(inode)->index_cnt = (u64)-1;  	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); -	alloc_group_block = btrfs_inode_block_group(leaf, inode_item); +cache_index: +	path->slots[0]++; +	if (inode->i_nlink != 1 || +	    path->slots[0] >= btrfs_header_nritems(leaf)) +		goto cache_acl; +	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); +	if (location.objectid != btrfs_ino(inode)) +		goto cache_acl; + +	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); +	if (location.type == BTRFS_INODE_REF_KEY) { +		struct btrfs_inode_ref *ref; + +		ref = (struct btrfs_inode_ref *)ptr; +		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); +	} else if (location.type == BTRFS_INODE_EXTREF_KEY) { +		struct btrfs_inode_extref *extref; + +		extref = (struct btrfs_inode_extref *)ptr; +		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, +								     extref); +	} +cache_acl:  	/*  	 * try to precache a NULL acl entry for files that don't have  	 * any xattrs or acls  	 */ -	maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); +	maybe_acls = acls_after_inode_item(leaf, path->slots[0], +					   btrfs_ino(inode), &first_xattr_slot); +	if (first_xattr_slot != -1) { +		path->slots[0] = first_xattr_slot; +		ret = btrfs_load_inode_props(inode, path); +		if (ret) +			btrfs_err(root->fs_info, +				  "error loading props for ino %llu (root %llu): %d", +				  btrfs_ino(inode), +				  root->root_key.objectid, ret); +	} +	btrfs_free_path(path); +  	if (!maybe_acls)  		cache_no_acl(inode); -	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, -						alloc_group_block, 0); -	btrfs_free_path(path); -	inode_item = NULL; -  	switch (inode->i_mode & S_IFMT) {  	case S_IFREG:  		inode->i_mapping->a_ops = &btrfs_aops; @@ -2539,40 +3546,47 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  			    struct btrfs_inode_item *item,  			    struct inode *inode)  { -	btrfs_set_inode_uid(leaf, item, inode->i_uid); -	btrfs_set_inode_gid(leaf, item, inode->i_gid); -	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); -	btrfs_set_inode_mode(leaf, item, inode->i_mode); -	btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - -	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), -			       inode->i_atime.tv_sec); -	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), -				inode->i_atime.tv_nsec); - -	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), -			       inode->i_mtime.tv_sec); -	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), -				inode->i_mtime.tv_nsec); - -	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), -			       inode->i_ctime.tv_sec); -	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), -				inode->i_ctime.tv_nsec); - -	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); -	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); -	btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); -	btrfs_set_inode_transid(leaf, item, trans->transid); -	btrfs_set_inode_rdev(leaf, item, inode->i_rdev); -	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); -	btrfs_set_inode_block_group(leaf, item, 
BTRFS_I(inode)->block_group); +	struct btrfs_map_token token; + +	btrfs_init_map_token(&token); + +	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); +	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); +	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, +				   &token); +	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); +	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + +	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), +				     inode->i_atime.tv_sec, &token); +	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), +				      inode->i_atime.tv_nsec, &token); + +	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), +				     inode->i_mtime.tv_sec, &token); +	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), +				      inode->i_mtime.tv_nsec, &token); + +	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), +				     inode->i_ctime.tv_sec, &token); +	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), +				      inode->i_ctime.tv_nsec, &token); + +	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), +				     &token); +	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, +					 &token); +	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); +	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); +	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); +	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); +	btrfs_set_token_inode_block_group(leaf, item, 0, &token);  }  /*   * copy everything in the in-memory inode into the btree.   */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,  				struct btrfs_root *root, struct inode *inode)  {  	struct btrfs_inode_item *inode_item; @@ -2581,20 +3595,21 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,  	int ret;  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) +		return -ENOMEM; +  	path->leave_spinning = 1; -	ret = btrfs_lookup_inode(trans, root, path, -				 &BTRFS_I(inode)->location, 1); +	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, +				 1);  	if (ret) {  		if (ret > 0)  			ret = -ENOENT;  		goto failed;  	} -	btrfs_unlock_up_safe(path, 1);  	leaf = path->nodes[0];  	inode_item = btrfs_item_ptr(leaf, path->slots[0], -				  struct btrfs_inode_item); +				    struct btrfs_inode_item);  	fill_inode_item(trans, leaf, inode_item, inode);  	btrfs_mark_buffer_dirty(leaf); @@ -2605,16 +3620,55 @@ failed:  	return ret;  } +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +				struct btrfs_root *root, struct inode *inode) +{ +	int ret; + +	/* +	 * If the inode is a free space inode, we can deadlock during commit +	 * if we put it into the delayed code. 
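+	 * (The delayed-inode machinery is itself flushed at commit time,
+	 * which is exactly when the free space inode gets written.)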
+	 *
+	 * The data relocation inode should also be updated directly,
+	 * without delay.
+	 */
+	if (!btrfs_is_free_space_inode(inode)
+	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+		btrfs_update_root_times(trans, root);
+
+		ret = btrfs_delayed_update_inode(trans, root, inode);
+		if (!ret)
+			btrfs_set_inode_last_trans(trans, inode);
+		return ret;
+	}
+
+	return btrfs_update_inode_item(trans, root, inode);
+}
+
+noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct inode *inode)
+{
+	int ret;
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret == -ENOSPC)
+		return btrfs_update_inode_item(trans, root, inode);
+	return ret;
+}
 
 /*
  * unlink helper that gets used here in inode.c and in the tree logging
  * recovery code.  It removes a link in a directory with a given name, and
  * also drops the back refs in the inode to the directory
  */
-int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       struct inode *dir, struct inode *inode,
-		       const char *name, int name_len)
+static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *dir, struct inode *inode,
+				const char *name, int name_len)
 {
 	struct btrfs_path *path;
 	int ret = 0;
@@ -2622,15 +3676,17 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_dir_item *di;
 	struct btrfs_key key;
 	u64 index;
+	u64 ino = btrfs_ino(inode);
+	u64 dir_ino = btrfs_ino(dir);
 
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
-		goto err;
+		goto out;
 	}
 
 	path->leave_spinning = 1;
-	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
 				    name, name_len, -1);
 	if (IS_ERR(di)) {
 		ret = PTR_ERR(di);
@@ -2645,265 +3701,154 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 	if (ret)
 		goto err;
-	btrfs_release_path(root, path);
+	btrfs_release_path(path);
 
-	ret = btrfs_del_inode_ref(trans, root, name, name_len,
-				  inode->i_ino,
-				  dir->i_ino, &index);
-	if (ret) {
-		printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
-		       "inode %lu parent %lu\n", name_len, name,
-		       inode->i_ino, dir->i_ino);
-		goto err;
+	/*
+	 * If we don't have the dir index, we have to get it by looking up
+	 * the inode ref; and since that lookup hands us the inode ref, we
+	 * remove it directly, so delayed deletion would gain nothing.
+	 *
+	 * But if we do have the dir index, there is no need to search for
+	 * the inode ref.  The inode ref sits close to the inode item, so
+	 * it is better to delay its deletion and do it when we update the
+	 * inode item.
+	 */ +	if (BTRFS_I(inode)->dir_index) { +		ret = btrfs_delayed_delete_inode_ref(inode); +		if (!ret) { +			index = BTRFS_I(inode)->dir_index; +			goto skip_backref; +		}  	} -	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, -					 index, name, name_len, -1); -	if (IS_ERR(di)) { -		ret = PTR_ERR(di); +	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, +				  dir_ino, &index); +	if (ret) { +		btrfs_info(root->fs_info, +			"failed to delete reference to %.*s, inode %llu parent %llu", +			name_len, name, ino, dir_ino); +		btrfs_abort_transaction(trans, root, ret);  		goto err;  	} -	if (!di) { -		ret = -ENOENT; +skip_backref: +	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret);  		goto err;  	} -	ret = btrfs_delete_one_dir_name(trans, root, path, di); -	btrfs_release_path(root, path);  	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, -					 inode, dir->i_ino); -	BUG_ON(ret != 0 && ret != -ENOENT); +					 inode, dir_ino); +	if (ret != 0 && ret != -ENOENT) { +		btrfs_abort_transaction(trans, root, ret); +		goto err; +	}  	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,  					   dir, index);  	if (ret == -ENOENT)  		ret = 0; +	else if (ret) +		btrfs_abort_transaction(trans, root, ret);  err:  	btrfs_free_path(path);  	if (ret)  		goto out;  	btrfs_i_size_write(dir, dir->i_size - name_len * 2); +	inode_inc_iversion(inode); +	inode_inc_iversion(dir);  	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; -	btrfs_update_inode(trans, root, dir); -	btrfs_drop_nlink(inode); -	ret = btrfs_update_inode(trans, root, inode); +	ret = btrfs_update_inode(trans, root, dir);  out:  	return ret;  } -/* helper to check if there is any shared block in the path */ -static int check_path_shared(struct btrfs_root *root, -			     struct btrfs_path *path) +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, +		       struct btrfs_root *root, +		       struct inode *dir, struct inode *inode, +		       const char *name, int name_len)  { -	struct extent_buffer *eb; -	int level; -	u64 refs = 1; -	int uninitialized_var(ret); - -	for (level = 0; level < BTRFS_MAX_LEVEL; level++) { -		if (!path->nodes[level]) -			break; -		eb = path->nodes[level]; -		if (!btrfs_block_can_be_shared(root, eb)) -			continue; -		ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, -					       &refs, NULL); -		if (refs > 1) -			return 1; +	int ret; +	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); +	if (!ret) { +		drop_nlink(inode); +		ret = btrfs_update_inode(trans, root, inode);  	} -	return ret; /* XXX callers? */ +	return ret;  }  /*   * helper to start transaction for unlink and rmdir.   * - * unlink and rmdir are special in btrfs, they do not always free space. - * so in enospc case, we should make sure they will free space before - * allowing them to use the global metadata reservation. + * unlink and rmdir are special in btrfs, they do not always free space, so + * if we cannot make our reservations the normal way try and see if there is + * plenty of slack room in the global reserve to migrate, otherwise we cannot + * allow the unlink to occur.   
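
/*
 * [Editor's aside, not part of the patch] The comment above describes a
 * two-tier reservation: first the normal five-item reservation, then a
 * conditional migration from the global reserve, with failure only when both
 * are exhausted. A compilable sketch of that strategy follows; the pools,
 * sizes and function names are invented for illustration.
 */
#include <errno.h>
#include <stdio.h>

static long normal_pool = 0;	/* set to 0 to simulate ENOSPC */
static long global_pool = 1024;

static int reserve(long *pool, long bytes)
{
	if (*pool < bytes)
		return -ENOSPC;
	*pool -= bytes;
	return 0;
}

/* Try the normal reservation first; fall back to the global pool. */
static int start_unlink_trans(long bytes)
{
	if (reserve(&normal_pool, bytes) == 0)
		return 0;
	return reserve(&global_pool, bytes);
}

int main(void)
{
	/* five metadata items' worth of space, as in the real helper */
	printf("unlink trans: %d\n", start_unlink_trans(5 * 16));
	return 0;
}
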
*/ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, -						       struct dentry *dentry) +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)  {  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(dir)->root; -	struct btrfs_path *path; -	struct btrfs_inode_ref *ref; -	struct btrfs_dir_item *di; -	struct inode *inode = dentry->d_inode; -	u64 index; -	int check_link = 1; -	int err = -ENOSPC;  	int ret; -	trans = btrfs_start_transaction(root, 10); +	/* +	 * 1 for the possible orphan item +	 * 1 for the dir item +	 * 1 for the dir index +	 * 1 for the inode ref +	 * 1 for the inode +	 */ +	trans = btrfs_start_transaction(root, 5);  	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)  		return trans; -	if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) -		return ERR_PTR(-ENOSPC); - -	/* check if there is someone else holds reference */ -	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) -		return ERR_PTR(-ENOSPC); - -	if (atomic_read(&inode->i_count) > 2) -		return ERR_PTR(-ENOSPC); - -	if (xchg(&root->fs_info->enospc_unlink, 1)) -		return ERR_PTR(-ENOSPC); - -	path = btrfs_alloc_path(); -	if (!path) { -		root->fs_info->enospc_unlink = 0; -		return ERR_PTR(-ENOMEM); -	} - -	trans = btrfs_start_transaction(root, 0); -	if (IS_ERR(trans)) { -		btrfs_free_path(path); -		root->fs_info->enospc_unlink = 0; -		return trans; -	} - -	path->skip_locking = 1; -	path->search_commit_root = 1; - -	ret = btrfs_lookup_inode(trans, root, path, -				&BTRFS_I(dir)->location, 0); -	if (ret < 0) { -		err = ret; -		goto out; -	} -	if (ret == 0) { -		if (check_path_shared(root, path)) -			goto out; -	} else { -		check_link = 0; -	} -	btrfs_release_path(root, path); - -	ret = btrfs_lookup_inode(trans, root, path, -				&BTRFS_I(inode)->location, 0); -	if (ret < 0) { -		err = ret; -		goto out; -	} -	if (ret == 0) { -		if (check_path_shared(root, path)) -			goto out; -	} else { -		check_link = 0; -	} -	btrfs_release_path(root, path); +	if (PTR_ERR(trans) == -ENOSPC) { +		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); -	if (ret == 0 && S_ISREG(inode->i_mode)) { -		ret = btrfs_lookup_file_extent(trans, root, path, -					       inode->i_ino, (u64)-1, 0); -		if (ret < 0) { -			err = ret; -			goto out; +		trans = btrfs_start_transaction(root, 0); +		if (IS_ERR(trans)) +			return trans; +		ret = btrfs_cond_migrate_bytes(root->fs_info, +					       &root->fs_info->trans_block_rsv, +					       num_bytes, 5); +		if (ret) { +			btrfs_end_transaction(trans, root); +			return ERR_PTR(ret);  		} -		BUG_ON(ret == 0); -		if (check_path_shared(root, path)) -			goto out; -		btrfs_release_path(root, path); -	} - -	if (!check_link) { -		err = 0; -		goto out; -	} - -	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, -				dentry->d_name.name, dentry->d_name.len, 0); -	if (IS_ERR(di)) { -		err = PTR_ERR(di); -		goto out; -	} -	if (di) { -		if (check_path_shared(root, path)) -			goto out; -	} else { -		err = 0; -		goto out; -	} -	btrfs_release_path(root, path); - -	ref = btrfs_lookup_inode_ref(trans, root, path, -				dentry->d_name.name, dentry->d_name.len, -				inode->i_ino, dir->i_ino, 0); -	if (IS_ERR(ref)) { -		err = PTR_ERR(ref); -		goto out; -	} -	BUG_ON(!ref); -	if (check_path_shared(root, path)) -		goto out; -	index = btrfs_inode_ref_index(path->nodes[0], ref); -	btrfs_release_path(root, path); - -	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index, -				dentry->d_name.name, dentry->d_name.len, 0); -	if (IS_ERR(di)) { 
-		err = PTR_ERR(di); -		goto out; -	} -	BUG_ON(ret == -ENOENT); -	if (check_path_shared(root, path)) -		goto out; - -	err = 0; -out: -	btrfs_free_path(path); -	if (err) { -		btrfs_end_transaction(trans, root); -		root->fs_info->enospc_unlink = 0; -		return ERR_PTR(err); +		trans->block_rsv = &root->fs_info->trans_block_rsv; +		trans->bytes_reserved = num_bytes;  	} - -	trans->block_rsv = &root->fs_info->global_block_rsv;  	return trans;  } -static void __unlink_end_trans(struct btrfs_trans_handle *trans, -			       struct btrfs_root *root) -{ -	if (trans->block_rsv == &root->fs_info->global_block_rsv) { -		BUG_ON(!root->fs_info->enospc_unlink); -		root->fs_info->enospc_unlink = 0; -	} -	btrfs_end_transaction_throttle(trans, root); -} -  static int btrfs_unlink(struct inode *dir, struct dentry *dentry)  {  	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct btrfs_trans_handle *trans;  	struct inode *inode = dentry->d_inode;  	int ret; -	unsigned long nr = 0; -	trans = __unlink_start_trans(dir, dentry); +	trans = __unlink_start_trans(dir);  	if (IS_ERR(trans))  		return PTR_ERR(trans); -	btrfs_set_trans_block_group(trans, dir); -  	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);  	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,  				 dentry->d_name.name, dentry->d_name.len); -	BUG_ON(ret); +	if (ret) +		goto out;  	if (inode->i_nlink == 0) {  		ret = btrfs_orphan_add(trans, inode); -		BUG_ON(ret); +		if (ret) +			goto out;  	} -	nr = trans->blocks_used; -	__unlink_end_trans(trans, root); -	btrfs_btree_balance_dirty(root, nr); +out: +	btrfs_end_transaction(trans, root); +	btrfs_btree_balance_dirty(root);  	return ret;  } @@ -2918,55 +3863,73 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,  	struct btrfs_key key;  	u64 index;  	int ret; +	u64 dir_ino = btrfs_ino(dir);  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; -	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, +	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,  				   name, name_len, -1); -	BUG_ON(!di || IS_ERR(di)); +	if (IS_ERR_OR_NULL(di)) { +		if (!di) +			ret = -ENOENT; +		else +			ret = PTR_ERR(di); +		goto out; +	}  	leaf = path->nodes[0];  	btrfs_dir_item_key_to_cpu(leaf, di, &key);  	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);  	ret = btrfs_delete_one_dir_name(trans, root, path, di); -	BUG_ON(ret); -	btrfs_release_path(root, path); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	} +	btrfs_release_path(path);  	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,  				 objectid, root->root_key.objectid, -				 dir->i_ino, &index, name, name_len); +				 dir_ino, &index, name, name_len);  	if (ret < 0) { -		BUG_ON(ret != -ENOENT); -		di = btrfs_search_dir_index_item(root, path, dir->i_ino, +		if (ret != -ENOENT) { +			btrfs_abort_transaction(trans, root, ret); +			goto out; +		} +		di = btrfs_search_dir_index_item(root, path, dir_ino,  						 name, name_len); -		BUG_ON(!di || IS_ERR(di)); +		if (IS_ERR_OR_NULL(di)) { +			if (!di) +				ret = -ENOENT; +			else +				ret = PTR_ERR(di); +			btrfs_abort_transaction(trans, root, ret); +			goto out; +		}  		leaf = path->nodes[0];  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); -		btrfs_release_path(root, path); +		btrfs_release_path(path);  		index = key.offset;  	} +	btrfs_release_path(path); -	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, -					 index, name, name_len, -1); -	BUG_ON(!di || IS_ERR(di)); - -	leaf = path->nodes[0]; -	
btrfs_dir_item_key_to_cpu(leaf, di, &key); -	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); -	ret = btrfs_delete_one_dir_name(trans, root, path, di); -	BUG_ON(ret); -	btrfs_release_path(root, path); +	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out; +	}  	btrfs_i_size_write(dir, dir->i_size - name_len * 2); +	inode_inc_iversion(dir);  	dir->i_mtime = dir->i_ctime = CURRENT_TIME; -	ret = btrfs_update_inode(trans, root, dir); -	BUG_ON(ret); - +	ret = btrfs_update_inode_fallback(trans, root, dir); +	if (ret) +		btrfs_abort_transaction(trans, root, ret); +out:  	btrfs_free_path(path); -	return 0; +	return ret;  }  static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) @@ -2975,19 +3938,17 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)  	int err = 0;  	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct btrfs_trans_handle *trans; -	unsigned long nr = 0; -	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || -	    inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) +	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)  		return -ENOTEMPTY; +	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) +		return -EPERM; -	trans = __unlink_start_trans(dir, dentry); +	trans = __unlink_start_trans(dir);  	if (IS_ERR(trans))  		return PTR_ERR(trans); -	btrfs_set_trans_block_group(trans, dir); - -	if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { +	if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {  		err = btrfs_unlink_subvol(trans, root, dir,  					  BTRFS_I(inode)->location.objectid,  					  dentry->d_name.name, @@ -3005,185 +3966,12 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)  	if (!err)  		btrfs_i_size_write(inode, 0);  out: -	nr = trans->blocks_used; -	__unlink_end_trans(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_end_transaction(trans, root); +	btrfs_btree_balance_dirty(root);  	return err;  } -#if 0 -/* - * when truncating bytes in a file, it is possible to avoid reading - * the leaves that contain only checksum items.  This can be the - * majority of the IO required to delete a large file, but it must - * be done carefully. - * - * The keys in the level just above the leaves are checked to make sure - * the lowest key in a given leaf is a csum key, and starts at an offset - * after the new  size. - * - * Then the key for the next leaf is checked to make sure it also has - * a checksum item for the same file.  If it does, we know our target leaf - * contains only checksum items, and it can be safely freed without reading - * it. - * - * This is just an optimization targeted at large files.  It may do - * nothing.  It will return 0 unless things went badly. 
- */ -static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, -				     struct btrfs_root *root, -				     struct btrfs_path *path, -				     struct inode *inode, u64 new_size) -{ -	struct btrfs_key key; -	int ret; -	int nritems; -	struct btrfs_key found_key; -	struct btrfs_key other_key; -	struct btrfs_leaf_ref *ref; -	u64 leaf_gen; -	u64 leaf_start; - -	path->lowest_level = 1; -	key.objectid = inode->i_ino; -	key.type = BTRFS_CSUM_ITEM_KEY; -	key.offset = new_size; -again: -	ret = btrfs_search_slot(trans, root, &key, path, -1, 1); -	if (ret < 0) -		goto out; - -	if (path->nodes[1] == NULL) { -		ret = 0; -		goto out; -	} -	ret = 0; -	btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); -	nritems = btrfs_header_nritems(path->nodes[1]); - -	if (!nritems) -		goto out; - -	if (path->slots[1] >= nritems) -		goto next_node; - -	/* did we find a key greater than anything we want to delete? */ -	if (found_key.objectid > inode->i_ino || -	   (found_key.objectid == inode->i_ino && found_key.type > key.type)) -		goto out; - -	/* we check the next key in the node to make sure the leave contains -	 * only checksum items.  This comparison doesn't work if our -	 * leaf is the last one in the node -	 */ -	if (path->slots[1] + 1 >= nritems) { -next_node: -		/* search forward from the last key in the node, this -		 * will bring us into the next node in the tree -		 */ -		btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); - -		/* unlikely, but we inc below, so check to be safe */ -		if (found_key.offset == (u64)-1) -			goto out; - -		/* search_forward needs a path with locks held, do the -		 * search again for the original key.  It is possible -		 * this will race with a balance and return a path that -		 * we could modify, but this drop is just an optimization -		 * and is allowed to miss some leaves. -		 */ -		btrfs_release_path(root, path); -		found_key.offset++; - -		/* setup a max key for search_forward */ -		other_key.offset = (u64)-1; -		other_key.type = key.type; -		other_key.objectid = key.objectid; - -		path->keep_locks = 1; -		ret = btrfs_search_forward(root, &found_key, &other_key, -					   path, 0, 0); -		path->keep_locks = 0; -		if (ret || found_key.objectid != key.objectid || -		    found_key.type != key.type) { -			ret = 0; -			goto out; -		} - -		key.offset = found_key.offset; -		btrfs_release_path(root, path); -		cond_resched(); -		goto again; -	} - -	/* we know there's one more slot after us in the tree, -	 * read that key so we can verify it is also a checksum item -	 */ -	btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); - -	if (found_key.objectid < inode->i_ino) -		goto next_key; - -	if (found_key.type != key.type || found_key.offset < new_size) -		goto next_key; - -	/* -	 * if the key for the next leaf isn't a csum key from this objectid, -	 * we can't be sure there aren't good items inside this leaf. 
-	 * Bail out
-	 */
-	if (other_key.objectid != inode->i_ino || other_key.type != key.type)
-		goto out;
-
-	leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
-	leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
-	/*
-	 * it is safe to delete this leaf, it contains only
-	 * csum items from this inode at an offset >= new_size
-	 */
-	ret = btrfs_del_leaf(trans, root, path, leaf_start);
-	BUG_ON(ret);
-
-	if (root->ref_cows && leaf_gen < trans->transid) {
-		ref = btrfs_alloc_leaf_ref(root, 0);
-		if (ref) {
-			ref->root_gen = root->root_key.offset;
-			ref->bytenr = leaf_start;
-			ref->owner = 0;
-			ref->generation = leaf_gen;
-			ref->nritems = 0;
-
-			btrfs_sort_leaf_ref(ref);
-
-			ret = btrfs_add_leaf_ref(root, ref, 0);
-			WARN_ON(ret);
-			btrfs_free_leaf_ref(root, ref);
-		} else {
-			WARN_ON(1);
-		}
-	}
-next_key:
-	btrfs_release_path(root, path);
-
-	if (other_key.objectid == inode->i_ino &&
-	    other_key.type == key.type && other_key.offset > key.offset) {
-		key.offset = other_key.offset;
-		cond_resched();
-		goto again;
-	}
-	ret = 0;
-out:
-	/* fixup any changes we've made to the path */
-	path->lowest_level = 0;
-	path->keep_locks = 0;
-	btrfs_release_path(root, path);
-	return ret;
-}
-
-#endif
-
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -3209,27 +3997,44 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 extent_num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 item_end = 0;
-	u64 mask = root->sectorsize - 1;
+	u64 last_size = (u64)-1;
 	u32 found_type = (u8)-1;
 	int found_extent;
 	int del_item;
 	int pending_del_nr = 0;
 	int pending_del_slot = 0;
 	int extent_type = -1;
-	int encoding;
 	int ret;
 	int err = 0;
+	u64 ino = btrfs_ino(inode);

 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);

-	if (root->ref_cows || root == root->fs_info->tree_root)
-		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
-
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
		return -ENOMEM;
 	path->reada = -1;

-	key.objectid = inode->i_ino;
+	/*
+	 * We want to drop from the next block forward in case this new size is
+	 * not block aligned since we will be keeping the last block of the
+	 * extent just the way it is.
+	 */
+	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+	    root == root->fs_info->tree_root)
+		btrfs_drop_extent_cache(inode, ALIGN(new_size,
+					root->sectorsize), (u64)-1, 0);
+
+	/*
+	 * This function is also used to drop the items in the log tree before
+	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
+	 * it is used to drop the logged items.  So we shouldn't kill the
+	 * delayed items.
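
/*
 * [Editor's aside, not part of the patch] The drop start above is rounded up
 * to the next sector boundary so the partially used last block survives the
 * truncate untouched. The arithmetic behind ALIGN(), shown as a standalone
 * example with an assumed 4K sector size:
 */
#include <stdint.h>
#include <stdio.h>

/* Round x up to the next multiple of a; a must be a power of two. */
static uint64_t align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	/* new_size 10000 sits inside block 2 (8192..12287), so the cache
	 * is dropped from 12288 on and the tail block is kept as-is. */
	printf("drop from %llu\n", (unsigned long long)align_up(10000, 4096));
	return 0;
}
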
+	 */ +	if (min_type == 0 && root == BTRFS_I(inode)->root) +		btrfs_kill_delayed_inode_items(inode); + +	key.objectid = ino;  	key.offset = (u64)-1;  	key.type = (u8)-1; @@ -3255,9 +4060,8 @@ search_again:  		leaf = path->nodes[0];  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);  		found_type = btrfs_key_type(&found_key); -		encoding = 0; -		if (found_key.objectid != inode->i_ino) +		if (found_key.objectid != ino)  			break;  		if (found_type < min_type) @@ -3268,16 +4072,12 @@ search_again:  			fi = btrfs_item_ptr(leaf, path->slots[0],  					    struct btrfs_file_extent_item);  			extent_type = btrfs_file_extent_type(leaf, fi); -			encoding = btrfs_file_extent_compression(leaf, fi); -			encoding |= btrfs_file_extent_encryption(leaf, fi); -			encoding |= btrfs_file_extent_other_encoding(leaf, fi); -  			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {  				item_end +=  				    btrfs_file_extent_num_bytes(leaf, fi);  			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {  				item_end += btrfs_file_extent_inline_len(leaf, -									 fi); +							 path->slots[0], fi);  			}  			item_end--;  		} @@ -3296,21 +4096,27 @@ search_again:  		if (found_type != BTRFS_EXTENT_DATA_KEY)  			goto delete; +		if (del_item) +			last_size = found_key.offset; +		else +			last_size = new_size; +  		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {  			u64 num_dec;  			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); -			if (!del_item && !encoding) { +			if (!del_item) {  				u64 orig_num_bytes =  					btrfs_file_extent_num_bytes(leaf, fi); -				extent_num_bytes = new_size - -					found_key.offset + root->sectorsize - 1; -				extent_num_bytes = extent_num_bytes & -					~((u64)root->sectorsize - 1); +				extent_num_bytes = ALIGN(new_size - +						found_key.offset, +						root->sectorsize);  				btrfs_set_file_extent_num_bytes(leaf, fi,  							 extent_num_bytes);  				num_dec = (orig_num_bytes -  					   extent_num_bytes); -				if (root->ref_cows && extent_start != 0) +				if (test_bit(BTRFS_ROOT_REF_COWS, +					     &root->state) && +				    extent_start != 0)  					inode_sub_bytes(inode, num_dec);  				btrfs_mark_buffer_dirty(leaf);  			} else { @@ -3324,7 +4130,8 @@ search_again:  				num_dec = btrfs_file_extent_num_bytes(leaf, fi);  				if (extent_start != 0) {  					found_extent = 1; -					if (root->ref_cows) +					if (test_bit(BTRFS_ROOT_REF_COWS, +						     &root->state))  						inode_sub_bytes(inode, num_dec);  				}  			} @@ -3339,16 +4146,20 @@ search_again:  			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {  				u32 size = new_size - found_key.offset; -				if (root->ref_cows) { +				if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))  					inode_sub_bytes(inode, item_end + 1 -  							new_size); -				} + +				/* +				 * update the ram bytes to properly reflect +				 * the new size of our item +				 */ +				btrfs_set_file_extent_ram_bytes(leaf, fi, size);  				size =  				    btrfs_file_extent_calc_inline_size(size); -				ret = btrfs_truncate_item(trans, root, path, -							  size, 1); -				BUG_ON(ret); -			} else if (root->ref_cows) { +				btrfs_truncate_item(root, path, size, 1); +			} else if (test_bit(BTRFS_ROOT_REF_COWS, +					    &root->state)) {  				inode_sub_bytes(inode, item_end + 1 -  						found_key.offset);  			} @@ -3370,13 +4181,14 @@ delete:  		} else {  			break;  		} -		if (found_extent && (root->ref_cows || -				     root == root->fs_info->tree_root)) { +		if (found_extent && +		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || +		     root == 
root->fs_info->tree_root)) {
 			btrfs_set_path_blocking(path);
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes, 0,
 						btrfs_header_owner(leaf),
-						inode->i_ino, extent_offset);
+						ino, extent_offset, 0);
 			BUG_ON(ret);
 		}
@@ -3385,18 +4197,18 @@ delete:

 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot) {
-			if (root->ref_cows) {
-				err = -EAGAIN;
-				goto out;
-			}
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						pending_del_slot,
 						pending_del_nr);
-				BUG_ON(ret);
+				if (ret) {
+					btrfs_abort_transaction(trans,
+								root, ret);
+					goto error;
+				}
 				pending_del_nr = 0;
 			}
-			btrfs_release_path(root, path);
+			btrfs_release_path(path);
 			goto search_again;
 		} else {
 			path->slots[0]--;
 		}
@@ -3406,19 +4218,31 @@ out:
 	if (pending_del_nr) {
 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
 				      pending_del_nr);
-		BUG_ON(ret);
+		if (ret)
+			btrfs_abort_transaction(trans, root, ret);
 	}
+error:
+	if (last_size != (u64)-1)
+		btrfs_ordered_update_i_size(inode, last_size, NULL);
 	btrfs_free_path(path);
 	return err;
 }

 /*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range relative to the
+ *	offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset, cow the page and zero the
+ * part we want to zero.  This is used with truncate and hole punching.
 */
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front)
 {
-	struct inode *inode = mapping->host;
+	struct address_space *mapping = inode->i_mapping;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
@@ -3428,21 +4252,23 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	struct page *page;
+	gfp_t mask = btrfs_alloc_write_mask(mapping);
 	int ret = 0;
 	u64 page_start;
 	u64 page_end;

-	if ((offset & (blocksize - 1)) == 0
+	if ((offset & (blocksize - 1)) == 0 &&
+	    (!len || ((len & (blocksize - 1)) == 0)))
 		goto out;
+
 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;

-	ret = -ENOMEM;
 again:
-	page = grab_cache_page(mapping, index);
+	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+		ret = -ENOMEM;
 		goto out;
 	}
@@ -3464,8 +4290,7 @@ again:
 	}
 	wait_on_page_writeback(page);

-	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
-			 GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
 	set_page_extent_mapped(page);

 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@ -3480,7 +4305,8 @@ again:
 	}

 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			  0, 0, &cached_state, GFP_NOFS);

 	ret =
btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3491,10 +4317,14 @@ again:
 		goto out_unlock;
 	}

-	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
+		if (!len)
+			len = PAGE_CACHE_SIZE - offset;
 		kaddr = kmap(page);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+		if (front)
+			memset(kaddr, 0, offset);
+		else
+			memset(kaddr + offset, 0, len);
 		flush_dcache_page(page);
 		kunmap(page);
 	}
@@ -3512,35 +4342,93 @@ out:
 	return ret;
 }

-int btrfs_cont_expand(struct inode *inode, loff_t size)
+static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+			     u64 offset, u64 len)
 {
 	struct btrfs_trans_handle *trans;
+	int ret;
+
+	/*
+	 * Still need to make sure the inode looks like it's been updated so
+	 * that any holes get logged if we fsync.
+	 */
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
+		BTRFS_I(inode)->last_trans = root->fs_info->generation;
+		BTRFS_I(inode)->last_sub_trans = root->log_transid;
+		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+		return 0;
+	}
+
+	/*
+	 * 1 - for the one we're dropping
+	 * 1 - for the one we're adding
+	 * 1 - for updating the inode.
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, len, 0, len, 0, 0, 0);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	else
+		btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+/*
+ * This function puts in dummy file extents for the area we're creating a hole
+ * for.  So if we are truncating this file to a larger size we need to insert
+ * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
+ * for the range between oldsize and size.
+ */
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
+{
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
-	u64 mask = root->sectorsize - 1;
-	u64 hole_start = (inode->i_size + mask) & ~mask;
-	u64 block_end = (size + mask) & ~mask;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	u64 hole_start = ALIGN(oldsize, root->sectorsize);
+	u64 block_end = ALIGN(size, root->sectorsize);
 	u64 last_byte;
 	u64 cur_offset;
 	u64 hole_size;
 	int err = 0;

+	/*
+	 * If our size started in the middle of a page we need to zero out the
+	 * rest of the page before we expand the i_size, otherwise we could
+	 * expose stale data.
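
/*
 * [Editor's aside, not part of the patch] The zeroing described above covers
 * the bytes between the old size and the end of its page. A standalone sketch
 * of the range computation, assuming a 4K page size; the names are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096ULL

/* Report the in-page range that must be zeroed before i_size grows;
 * the range is empty when the old size is already page aligned. */
static void stale_tail(uint64_t oldsize, uint64_t *off, uint64_t *len)
{
	*off = oldsize & (TOY_PAGE_SIZE - 1);
	*len = *off ? TOY_PAGE_SIZE - *off : 0;
}

int main(void)
{
	uint64_t off, len;

	stale_tail(10000, &off, &len);
	printf("zero %llu bytes at page offset %llu\n",
	       (unsigned long long)len, (unsigned long long)off);
	return 0;
}
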
+	 */ +	err = btrfs_truncate_page(inode, oldsize, 0, 0); +	if (err) +		return err; +  	if (size <= hole_start)  		return 0;  	while (1) {  		struct btrfs_ordered_extent *ordered; -		btrfs_wait_ordered_range(inode, hole_start, -					 block_end - hole_start); +  		lock_extent_bits(io_tree, hole_start, block_end - 1, 0, -				 &cached_state, GFP_NOFS); -		ordered = btrfs_lookup_ordered_extent(inode, hole_start); +				 &cached_state); +		ordered = btrfs_lookup_ordered_range(inode, hole_start, +						     block_end - hole_start);  		if (!ordered)  			break;  		unlock_extent_cached(io_tree, hole_start, block_end - 1,  				     &cached_state, GFP_NOFS); +		btrfs_start_ordered_extent(inode, ordered, 1);  		btrfs_put_ordered_extent(ordered);  	} @@ -3548,162 +4436,275 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)  	while (1) {  		em = btrfs_get_extent(inode, NULL, 0, cur_offset,  				block_end - cur_offset, 0); -		BUG_ON(IS_ERR(em) || !em); +		if (IS_ERR(em)) { +			err = PTR_ERR(em); +			em = NULL; +			break; +		}  		last_byte = min(extent_map_end(em), block_end); -		last_byte = (last_byte + mask) & ~mask; +		last_byte = ALIGN(last_byte , root->sectorsize);  		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { -			u64 hint_byte = 0; +			struct extent_map *hole_em;  			hole_size = last_byte - cur_offset; -			trans = btrfs_start_transaction(root, 2); -			if (IS_ERR(trans)) { -				err = PTR_ERR(trans); +			err = maybe_insert_hole(root, inode, cur_offset, +						hole_size); +			if (err)  				break; +			btrfs_drop_extent_cache(inode, cur_offset, +						cur_offset + hole_size - 1, 0); +			hole_em = alloc_extent_map(); +			if (!hole_em) { +				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +					&BTRFS_I(inode)->runtime_flags); +				goto next;  			} -			btrfs_set_trans_block_group(trans, inode); - -			err = btrfs_drop_extents(trans, inode, cur_offset, -						 cur_offset + hole_size, -						 &hint_byte, 1); -			BUG_ON(err); - -			err = btrfs_insert_file_extent(trans, root, -					inode->i_ino, cur_offset, 0, -					0, hole_size, 0, hole_size, -					0, 0, 0); -			BUG_ON(err); +			hole_em->start = cur_offset; +			hole_em->len = hole_size; +			hole_em->orig_start = cur_offset; + +			hole_em->block_start = EXTENT_MAP_HOLE; +			hole_em->block_len = 0; +			hole_em->orig_block_len = 0; +			hole_em->ram_bytes = hole_size; +			hole_em->bdev = root->fs_info->fs_devices->latest_bdev; +			hole_em->compress_type = BTRFS_COMPRESS_NONE; +			hole_em->generation = root->fs_info->generation; -			btrfs_drop_extent_cache(inode, hole_start, -					last_byte - 1, 0); - -			btrfs_end_transaction(trans, root); +			while (1) { +				write_lock(&em_tree->lock); +				err = add_extent_mapping(em_tree, hole_em, 1); +				write_unlock(&em_tree->lock); +				if (err != -EEXIST) +					break; +				btrfs_drop_extent_cache(inode, cur_offset, +							cur_offset + +							hole_size - 1, 0); +			} +			free_extent_map(hole_em);  		} +next:  		free_extent_map(em);  		em = NULL;  		cur_offset = last_byte;  		if (cur_offset >= block_end)  			break;  	} -  	free_extent_map(em);  	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,  			     GFP_NOFS);  	return err;  } -static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) +static int btrfs_setsize(struct inode *inode, struct iattr *attr)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_trans_handle *trans; -	unsigned long nr; +	loff_t oldsize = i_size_read(inode); +	loff_t newsize = attr->ia_size; +	int mask = attr->ia_valid;  	int ret; -	if 
(attr->ia_size == inode->i_size)
-		return 0;
-
-	if (attr->ia_size > inode->i_size) {
-		unsigned long limit;
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-		if (attr->ia_size > inode->i_sb->s_maxbytes)
-			return -EFBIG;
-		if (limit != RLIM_INFINITY && attr->ia_size > limit) {
-			send_sig(SIGXFSZ, current, 0);
-			return -EFBIG;
-		}
+	/*
+	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+	 * special case where we need to update the times despite not having
+	 * these flags set.  For all other operations the VFS sets these flags
+	 * explicitly if it wants a timestamp update.
+	 */
+	if (newsize != oldsize) {
+		inode_inc_iversion(inode);
+		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+			inode->i_ctime = inode->i_mtime =
+				current_fs_time(inode->i_sb);
 	}

-	trans = btrfs_start_transaction(root, 5);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
+	if (newsize > oldsize) {
+		truncate_pagecache(inode, newsize);
+		ret = btrfs_cont_expand(inode, oldsize, newsize);
+		if (ret)
+			return ret;

-	btrfs_set_trans_block_group(trans, inode);
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);

-	ret = btrfs_orphan_add(trans, inode);
-	BUG_ON(ret);
+		i_size_write(inode, newsize);
+		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+		ret = btrfs_update_inode(trans, root, inode);
+		btrfs_end_transaction(trans, root);
+	} else {

-	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+		/*
+		 * We're truncating a file that used to have good data down to
+		 * zero. Make sure it gets into the ordered flush list so that
+		 * any new writes get down to disk quickly.
+		 */
+		if (newsize == 0)
+			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+				&BTRFS_I(inode)->runtime_flags);

-	if (attr->ia_size > inode->i_size) {
-		ret = btrfs_cont_expand(inode, attr->ia_size);
-		if (ret) {
-			btrfs_truncate(inode);
+		/*
+		 * 1 for the orphan item we're going to add
+		 * 1 for the orphan item deletion.
+		 */
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
+		/*
+		 * We need to do this in case we fail at _any_ point during the
+		 * actual truncate.  Once we do the truncate_setsize we could
+		 * invalidate pages, which forces any outstanding ordered io to
+		 * be instantly completed, which will give us extents that need
+		 * to be truncated.  If we fail to get an orphan inode down we
+		 * could have left over extents that were never meant to live,
+		 * so we need to guarantee from this point on that everything
+		 * will be consistent.
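
/*
 * [Editor's aside, not part of the patch] The point of the comment above is
 * ordering: record the orphan before any fallible truncate step, so a crash
 * leaves a persistent marker for mount-time cleanup. A toy model of that
 * "persist intent first" shape; all helper names here are hypothetical.
 */
#include <errno.h>
#include <stdio.h>

static int add_orphan_marker(void)  { return 0; }	/* persists intent */
static int do_truncate_steps(void)  { return -EIO; }	/* may fail midway */
static void del_orphan_marker(void) { }

static int safe_truncate(void)
{
	int ret = add_orphan_marker();	/* before anything can fail */

	if (ret)
		return ret;
	ret = do_truncate_steps();
	if (!ret)
		del_orphan_marker();	/* success: intent fulfilled */
	return ret;			/* failure: marker survives */
}

int main(void)
{
	printf("truncate: %d (marker kept on failure)\n", safe_truncate());
	return 0;
}
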
+		 */
+		ret = btrfs_orphan_add(trans, inode);
+		btrfs_end_transaction(trans, root);
+		if (ret)
 			return ret;

-		}
-		i_size_write(inode, attr->ia_size);
-		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+		/* we don't support swapfiles, so vmtruncate shouldn't fail */
+		truncate_setsize(inode, newsize);

-		trans = btrfs_start_transaction(root, 0);
-		BUG_ON(IS_ERR(trans));
-		btrfs_set_trans_block_group(trans, inode);
-		trans->block_rsv = root->orphan_block_rsv;
-		BUG_ON(!trans->block_rsv);
+		/* Disable non-locked read DIO to avoid the endless truncate */
+		btrfs_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+		btrfs_inode_resume_unlocked_dio(inode);

-		ret = btrfs_update_inode(trans, root, inode);
-		BUG_ON(ret);
-		if (inode->i_nlink > 0) {
-			ret = btrfs_orphan_del(trans, inode);
-			BUG_ON(ret);
+		ret = btrfs_truncate(inode);
+		if (ret && inode->i_nlink) {
+			int err;
+
+			/*
+			 * failed to truncate: disk_i_size is only adjusted
+			 * down as we remove extents, so it should represent
+			 * the true size of the inode; reset the in-memory
+			 * size and delete our orphan entry.
+			 */
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans)) {
+				btrfs_orphan_del(NULL, inode);
+				return ret;
+			}
+			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+			err = btrfs_orphan_del(trans, inode);
+			if (err)
+				btrfs_abort_transaction(trans, root, err);
+			btrfs_end_transaction(trans, root);
 		}
-		nr = trans->blocks_used;
-		btrfs_end_transaction(trans, root);
-		btrfs_btree_balance_dirty(root, nr);
-		return 0;
 	}

-	/*
-	 * We're truncating a file that used to have good data down to
-	 * zero. Make sure it gets into the ordered flush list so that
-	 * any new writes get down to disk quickly.
-	 */
-	if (attr->ia_size == 0)
-		BTRFS_I(inode)->ordered_data_close = 1;
-
-	/* we don't support swapfiles, so vmtruncate shouldn't fail */
-	ret = vmtruncate(inode, attr->ia_size);
-	BUG_ON(ret);
-
-	return 0;
+	return ret;
 }

 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int err;

+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
 	err = inode_change_ok(inode, attr);
 	if (err)
 		return err;

 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-		err = btrfs_setattr_size(inode, attr);
+		err = btrfs_setsize(inode, attr);
 		if (err)
 			return err;
 	}

 	if (attr->ia_valid) {
 		setattr_copy(inode, attr);
-		mark_inode_dirty(inode);
+		inode_inc_iversion(inode);
+		err = btrfs_dirty_inode(inode);

-		if (attr->ia_valid & ATTR_MODE)
-			err = btrfs_acl_chmod(inode);
+		if (!err && attr->ia_valid & ATTR_MODE)
+			err = posix_acl_chmod(inode, inode->i_mode);
 	}

 	return err;
 }

+/*
+ * While truncating the inode pages during eviction, we get the VFS calling
+ * btrfs_invalidatepage() against each page of the inode. This is slow because
+ * the calls to btrfs_invalidatepage() result in a huge number of calls to
+ * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
+ * extent_state structures over and over, wasting lots of time.
+ *
+ * Therefore, if the inode is being evicted, let btrfs_invalidatepage() skip
+ * all those expensive operations on a per-page basis and do only the ordered
+ * io finishing, while we release here the extent_map and extent_state
+ * structures, without the excessive merging and splitting.
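
/*
 * [Editor's aside, not part of the patch] The win described above comes from
 * draining the whole tree in one pass at eviction instead of splitting and
 * merging ranges page by page. A user-space model of that drain loop over a
 * simple list standing in for the rbtree; all types here are invented.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_em {
	unsigned long start;
	struct toy_em *next;
};

/* Detach and free the first entry until the container is empty. */
static void drain_all(struct toy_em **tree)
{
	while (*tree) {
		struct toy_em *em = *tree;

		*tree = em->next;	/* remove_extent_mapping() analogue */
		free(em);		/* free_extent_map() analogue */
	}
}

int main(void)
{
	struct toy_em *tree = NULL;

	for (unsigned long i = 0; i < 3; i++) {
		struct toy_em *em = malloc(sizeof(*em));

		if (!em)
			return 1;
		em->start = i * 4096;
		em->next = tree;
		tree = em;
	}
	drain_all(&tree);
	printf("tree empty: %s\n", tree ? "no" : "yes");
	return 0;
}
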
+ */ +static void evict_inode_truncate_pages(struct inode *inode) +{ +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; +	struct rb_node *node; + +	ASSERT(inode->i_state & I_FREEING); +	truncate_inode_pages_final(&inode->i_data); + +	write_lock(&map_tree->lock); +	while (!RB_EMPTY_ROOT(&map_tree->map)) { +		struct extent_map *em; + +		node = rb_first(&map_tree->map); +		em = rb_entry(node, struct extent_map, rb_node); +		clear_bit(EXTENT_FLAG_PINNED, &em->flags); +		clear_bit(EXTENT_FLAG_LOGGING, &em->flags); +		remove_extent_mapping(map_tree, em); +		free_extent_map(em); +	} +	write_unlock(&map_tree->lock); + +	spin_lock(&io_tree->lock); +	while (!RB_EMPTY_ROOT(&io_tree->state)) { +		struct extent_state *state; +		struct extent_state *cached_state = NULL; + +		node = rb_first(&io_tree->state); +		state = rb_entry(node, struct extent_state, rb_node); +		atomic_inc(&state->refs); +		spin_unlock(&io_tree->lock); + +		lock_extent_bits(io_tree, state->start, state->end, +				 0, &cached_state); +		clear_extent_bit(io_tree, state->start, state->end, +				 EXTENT_LOCKED | EXTENT_DIRTY | +				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | +				 EXTENT_DEFRAG, 1, 1, +				 &cached_state, GFP_NOFS); +		free_extent_state(state); + +		spin_lock(&io_tree->lock); +	} +	spin_unlock(&io_tree->lock); +} +  void btrfs_evict_inode(struct inode *inode)  {  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	unsigned long nr; +	struct btrfs_block_rsv *rsv, *global_rsv; +	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);  	int ret; -	truncate_inode_pages(&inode->i_data, 0); -	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || -			       root == root->fs_info->tree_root)) +	trace_btrfs_inode_evict(inode); + +	evict_inode_truncate_pages(inode); + +	if (inode->i_nlink && +	    ((btrfs_root_refs(&root->root_item) != 0 && +	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || +	     btrfs_is_free_space_inode(inode)))  		goto no_delete;  	if (is_bad_inode(inode)) { @@ -3714,53 +4715,103 @@ void btrfs_evict_inode(struct inode *inode)  	btrfs_wait_ordered_range(inode, 0, (u64)-1);  	if (root->fs_info->log_root_recovering) { -		BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); +		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +				 &BTRFS_I(inode)->runtime_flags));  		goto no_delete;  	}  	if (inode->i_nlink > 0) { -		BUG_ON(btrfs_root_refs(&root->root_item) != 0); +		BUG_ON(btrfs_root_refs(&root->root_item) != 0 && +		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);  		goto no_delete;  	} +	ret = btrfs_commit_inode_delayed_inode(inode); +	if (ret) { +		btrfs_orphan_del(NULL, inode); +		goto no_delete; +	} + +	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); +	if (!rsv) { +		btrfs_orphan_del(NULL, inode); +		goto no_delete; +	} +	rsv->size = min_size; +	rsv->failfast = 1; +	global_rsv = &root->fs_info->global_block_rsv; +  	btrfs_i_size_write(inode, 0); +	/* +	 * This is a bit simpler than btrfs_truncate since we've already +	 * reserved our space for our orphan item in the unlink, so we just +	 * need to reserve some slack space in case we add bytes and update +	 * inode item when doing the truncate. 
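
/*
 * [Editor's aside, not part of the patch] The loop that follows keeps
 * restarting the truncate with a fresh small reservation whenever a pass dies
 * with -ENOSPC, rather than reserving everything up front. A toy model of
 * that retry shape; the batch sizes and names are invented.
 */
#include <errno.h>
#include <stdio.h>

static int items = 10;	/* items left to delete */

/* One bounded pass, as if the per-pass reservation ran out. */
static int truncate_pass(void)
{
	for (int batch = 4; items && batch; batch--)
		items--;
	return items ? -ENOSPC : 0;
}

static int evict_truncate(void)
{
	int ret;

	do {
		/* each pass gets a fresh "reservation" */
		ret = truncate_pass();
	} while (ret == -ENOSPC);
	return ret;
}

int main(void)
{
	printf("evict: %d, items left: %d\n", evict_truncate(), items);
	return 0;
}
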
+	 */  	while (1) { -		trans = btrfs_start_transaction(root, 0); -		BUG_ON(IS_ERR(trans)); -		btrfs_set_trans_block_group(trans, inode); -		trans->block_rsv = root->orphan_block_rsv; +		ret = btrfs_block_rsv_refill(root, rsv, min_size, +					     BTRFS_RESERVE_FLUSH_LIMIT); + +		/* +		 * Try and steal from the global reserve since we will +		 * likely not use this space anyway, we want to try as +		 * hard as possible to get this to work. +		 */ +		if (ret) +			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); -		ret = btrfs_block_rsv_check(trans, root, -					    root->orphan_block_rsv, 0, 5);  		if (ret) { -			BUG_ON(ret != -EAGAIN); -			ret = btrfs_commit_transaction(trans, root); -			BUG_ON(ret); -			continue; +			btrfs_warn(root->fs_info, +				"Could not get space for a delete, will truncate on mount %d", +				ret); +			btrfs_orphan_del(NULL, inode); +			btrfs_free_block_rsv(root, rsv); +			goto no_delete; +		} + +		trans = btrfs_join_transaction(root); +		if (IS_ERR(trans)) { +			btrfs_orphan_del(NULL, inode); +			btrfs_free_block_rsv(root, rsv); +			goto no_delete;  		} +		trans->block_rsv = rsv; +  		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); -		if (ret != -EAGAIN) +		if (ret != -ENOSPC)  			break; -		nr = trans->blocks_used; +		trans->block_rsv = &root->fs_info->trans_block_rsv;  		btrfs_end_transaction(trans, root);  		trans = NULL; -		btrfs_btree_balance_dirty(root, nr); - +		btrfs_btree_balance_dirty(root);  	} +	btrfs_free_block_rsv(root, rsv); + +	/* +	 * Errors here aren't a big deal, it just means we leave orphan items +	 * in the tree.  They will be cleaned up on the next mount. +	 */  	if (ret == 0) { -		ret = btrfs_orphan_del(trans, inode); -		BUG_ON(ret); +		trans->block_rsv = root->orphan_block_rsv; +		btrfs_orphan_del(trans, inode); +	} else { +		btrfs_orphan_del(NULL, inode);  	} -	nr = trans->blocks_used; +	trans->block_rsv = &root->fs_info->trans_block_rsv; +	if (!(root == root->fs_info->tree_root || +	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) +		btrfs_return_ino(root, btrfs_ino(inode)); +  	btrfs_end_transaction(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  no_delete: -	end_writeback(inode); +	btrfs_remove_delayed_node(inode); +	clear_inode(inode);  	return;  } @@ -3779,14 +4830,15 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,  	int ret = 0;  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) +		return -ENOMEM; -	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, +	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,  				    namelen, 0);  	if (IS_ERR(di))  		ret = PTR_ERR(di); -	if (!di || IS_ERR(di)) +	if (IS_ERR_OR_NULL(di))  		goto out_err;  	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); @@ -3823,9 +4875,9 @@ static int fixup_tree_root_location(struct btrfs_root *root,  	}  	err = -ENOENT; -	ret = btrfs_find_root_ref(root->fs_info->tree_root, path, -				  BTRFS_I(dir)->root->root_key.objectid, -				  location->objectid); +	ret = btrfs_find_item(root->fs_info->tree_root, path, +				BTRFS_I(dir)->root->root_key.objectid, +				location->objectid, BTRFS_ROOT_REF_KEY, NULL);  	if (ret) {  		if (ret < 0)  			err = ret; @@ -3834,7 +4886,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,  	leaf = path->nodes[0];  	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); -	if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || +	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||  	   
 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)  		goto out; @@ -3844,7 +4896,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,  	if (ret)  		goto out; -	btrfs_release_path(root->fs_info->tree_root, path); +	btrfs_release_path(path);  	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);  	if (IS_ERR(new_root)) { @@ -3852,11 +4904,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,  		goto out;  	} -	if (btrfs_root_refs(&new_root->root_item) == 0) { -		err = -ENOENT; -		goto out; -	} -  	*sub_root = new_root;  	location->objectid = btrfs_root_dirid(&new_root->root_item);  	location->type = BTRFS_INODE_ITEM_KEY; @@ -3873,33 +4920,33 @@ static void inode_tree_add(struct inode *inode)  	struct btrfs_inode *entry;  	struct rb_node **p;  	struct rb_node *parent; -again: -	p = &root->inode_tree.rb_node; -	parent = NULL; +	struct rb_node *new = &BTRFS_I(inode)->rb_node; +	u64 ino = btrfs_ino(inode);  	if (inode_unhashed(inode))  		return; - +	parent = NULL;  	spin_lock(&root->inode_lock); +	p = &root->inode_tree.rb_node;  	while (*p) {  		parent = *p;  		entry = rb_entry(parent, struct btrfs_inode, rb_node); -		if (inode->i_ino < entry->vfs_inode.i_ino) +		if (ino < btrfs_ino(&entry->vfs_inode))  			p = &parent->rb_left; -		else if (inode->i_ino > entry->vfs_inode.i_ino) +		else if (ino > btrfs_ino(&entry->vfs_inode))  			p = &parent->rb_right;  		else {  			WARN_ON(!(entry->vfs_inode.i_state &  				  (I_WILL_FREE | I_FREEING))); -			rb_erase(parent, &root->inode_tree); +			rb_replace_node(parent, new, &root->inode_tree);  			RB_CLEAR_NODE(parent);  			spin_unlock(&root->inode_lock); -			goto again; +			return;  		}  	} -	rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); -	rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); +	rb_link_node(new, parent, p); +	rb_insert_color(new, &root->inode_tree);  	spin_unlock(&root->inode_lock);  } @@ -3916,14 +4963,7 @@ static void inode_tree_del(struct inode *inode)  	}  	spin_unlock(&root->inode_lock); -	/* -	 * Free space cache has inodes in the tree root, but the tree root has a -	 * root_refs of 0, so this could end up dropping the tree root as a -	 * snapshot, so we need the extra !root->fs_info->tree_root check to -	 * make sure we don't drop it. 
-	 */ -	if (empty && btrfs_root_refs(&root->root_item) == 0 && -	    root != root->fs_info->tree_root) { +	if (empty && btrfs_root_refs(&root->root_item) == 0) {  		synchronize_srcu(&root->fs_info->subvol_srcu);  		spin_lock(&root->inode_lock);  		empty = RB_EMPTY_ROOT(&root->inode_tree); @@ -3933,7 +4973,7 @@ static void inode_tree_del(struct inode *inode)  	}  } -int btrfs_invalidate_inodes(struct btrfs_root *root) +void btrfs_invalidate_inodes(struct btrfs_root *root)  {  	struct rb_node *node;  	struct rb_node *prev; @@ -3941,7 +4981,8 @@ int btrfs_invalidate_inodes(struct btrfs_root *root)  	struct inode *inode;  	u64 objectid = 0; -	WARN_ON(btrfs_root_refs(&root->root_item) != 0); +	if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) +		WARN_ON(btrfs_root_refs(&root->root_item) != 0);  	spin_lock(&root->inode_lock);  again: @@ -3951,9 +4992,9 @@ again:  		prev = node;  		entry = rb_entry(node, struct btrfs_inode, rb_node); -		if (objectid < entry->vfs_inode.i_ino) +		if (objectid < btrfs_ino(&entry->vfs_inode))  			node = node->rb_left; -		else if (objectid > entry->vfs_inode.i_ino) +		else if (objectid > btrfs_ino(&entry->vfs_inode))  			node = node->rb_right;  		else  			break; @@ -3961,7 +5002,7 @@ again:  	if (!node) {  		while (prev) {  			entry = rb_entry(prev, struct btrfs_inode, rb_node); -			if (objectid <= entry->vfs_inode.i_ino) { +			if (objectid <= btrfs_ino(&entry->vfs_inode)) {  				node = prev;  				break;  			} @@ -3970,7 +5011,7 @@ again:  	}  	while (node) {  		entry = rb_entry(node, struct btrfs_inode, rb_node); -		objectid = entry->vfs_inode.i_ino + 1; +		objectid = btrfs_ino(&entry->vfs_inode) + 1;  		inode = igrab(&entry->vfs_inode);  		if (inode) {  			spin_unlock(&root->inode_lock); @@ -3993,35 +5034,37 @@ again:  		node = rb_next(node);  	}  	spin_unlock(&root->inode_lock); -	return 0;  }  static int btrfs_init_locked_inode(struct inode *inode, void *p)  {  	struct btrfs_iget_args *args = p; -	inode->i_ino = args->ino; +	inode->i_ino = args->location->objectid; +	memcpy(&BTRFS_I(inode)->location, args->location, +	       sizeof(*args->location));  	BTRFS_I(inode)->root = args->root; -	btrfs_set_inode_space_info(args->root, inode);  	return 0;  }  static int btrfs_find_actor(struct inode *inode, void *opaque)  {  	struct btrfs_iget_args *args = opaque; -	return args->ino == inode->i_ino && +	return args->location->objectid == BTRFS_I(inode)->location.objectid &&  		args->root == BTRFS_I(inode)->root;  }  static struct inode *btrfs_iget_locked(struct super_block *s, -				       u64 objectid, +				       struct btrfs_key *location,  				       struct btrfs_root *root)  {  	struct inode *inode;  	struct btrfs_iget_args args; -	args.ino = objectid; +	unsigned long hashval = btrfs_inode_hash(location->objectid, root); + +	args.location = location;  	args.root = root; -	inode = iget5_locked(s, objectid, btrfs_find_actor, +	inode = iget5_locked(s, hashval, btrfs_find_actor,  			     btrfs_init_locked_inode,  			     (void *)&args);  	return inode; @@ -4035,19 +5078,22 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,  {  	struct inode *inode; -	inode = btrfs_iget_locked(s, location->objectid, root); +	inode = btrfs_iget_locked(s, location, root);  	if (!inode)  		return ERR_PTR(-ENOMEM);  	if (inode->i_state & I_NEW) { -		BTRFS_I(inode)->root = root; -		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));  		btrfs_read_locked_inode(inode); - -		inode_tree_add(inode); -		unlock_new_inode(inode); -		if (new) -			*new 
= 1; +		if (!is_bad_inode(inode)) { +			inode_tree_add(inode); +			unlock_new_inode(inode); +			if (new) +				*new = 1; +		} else { +			unlock_new_inode(inode); +			iput(inode); +			inode = ERR_PTR(-ESTALE); +		}  	}  	return inode; @@ -4064,10 +5110,10 @@ static struct inode *new_simple_dir(struct super_block *s,  	BTRFS_I(inode)->root = root;  	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); -	BTRFS_I(inode)->dummy_inode = 1; +	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);  	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; -	inode->i_op = &simple_dir_inode_operations; +	inode->i_op = &btrfs_dir_ro_inode_operations;  	inode->i_fop = &simple_dir_operations;  	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;  	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -4082,20 +5128,17 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)  	struct btrfs_root *sub_root = root;  	struct btrfs_key location;  	int index; -	int ret; - -	dentry->d_op = &btrfs_dentry_operations; +	int ret = 0;  	if (dentry->d_name.len > BTRFS_NAME_LEN)  		return ERR_PTR(-ENAMETOOLONG);  	ret = btrfs_inode_by_name(dir, dentry, &location); -  	if (ret < 0)  		return ERR_PTR(ret);  	if (location.objectid == 0) -		return NULL; +		return ERR_PTR(-ENOENT);  	if (location.type == BTRFS_INODE_ITEM_KEY) {  		inode = btrfs_iget(dir->i_sb, &location, root, NULL); @@ -4117,62 +5160,78 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)  	}  	srcu_read_unlock(&root->fs_info->subvol_srcu, index); -	if (root != sub_root) { +	if (!IS_ERR(inode) && root != sub_root) {  		down_read(&root->fs_info->cleanup_work_sem);  		if (!(inode->i_sb->s_flags & MS_RDONLY)) -			btrfs_orphan_cleanup(sub_root); +			ret = btrfs_orphan_cleanup(sub_root);  		up_read(&root->fs_info->cleanup_work_sem); +		if (ret) { +			iput(inode); +			inode = ERR_PTR(ret); +		}  	}  	return inode;  } -static int btrfs_dentry_delete(struct dentry *dentry) +static int btrfs_dentry_delete(const struct dentry *dentry)  {  	struct btrfs_root *root; +	struct inode *inode = dentry->d_inode; -	if (!dentry->d_inode && !IS_ROOT(dentry)) -		dentry = dentry->d_parent; +	if (!inode && !IS_ROOT(dentry)) +		inode = dentry->d_parent->d_inode; -	if (dentry->d_inode) { -		root = BTRFS_I(dentry->d_inode)->root; +	if (inode) { +		root = BTRFS_I(inode)->root;  		if (btrfs_root_refs(&root->root_item) == 0)  			return 1; + +		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) +			return 1;  	}  	return 0;  } +static void btrfs_dentry_release(struct dentry *dentry) +{ +	kfree(dentry->d_fsdata); +} +  static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, -				   struct nameidata *nd) +				   unsigned int flags)  {  	struct inode *inode;  	inode = btrfs_lookup_dentry(dir, dentry); -	if (IS_ERR(inode)) -		return ERR_CAST(inode); +	if (IS_ERR(inode)) { +		if (PTR_ERR(inode) == -ENOENT) +			inode = NULL; +		else +			return ERR_CAST(inode); +	} -	return d_splice_alias(inode, dentry); +	return d_materialise_unique(dentry, inode);  } -static unsigned char btrfs_filetype_table[] = { +unsigned char btrfs_filetype_table[] = {  	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK  }; -static int btrfs_real_readdir(struct file *filp, void *dirent, -			      filldir_t filldir) +static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct btrfs_root *root = 
BTRFS_I(inode)->root;  	struct btrfs_item *item;  	struct btrfs_dir_item *di;  	struct btrfs_key key;  	struct btrfs_key found_key;  	struct btrfs_path *path; +	struct list_head ins_list; +	struct list_head del_list;  	int ret; -	u32 nritems;  	struct extent_buffer *leaf;  	int slot; -	int advance;  	unsigned char d_type;  	int over = 0;  	u32 di_cur; @@ -4182,71 +5241,63 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,  	char tmp_name[32];  	char *name_ptr;  	int name_len; +	int is_curr = 0;	/* ctx->pos points to the current index? */  	/* FIXME, use a real flag for deciding about the key type */  	if (root->fs_info->tree_root == root)  		key_type = BTRFS_DIR_ITEM_KEY; -	/* special case for "." */ -	if (filp->f_pos == 0) { -		over = filldir(dirent, ".", 1, -			       1, inode->i_ino, -			       DT_DIR); -		if (over) -			return 0; -		filp->f_pos = 1; -	} -	/* special case for .., just use the back ref */ -	if (filp->f_pos == 1) { -		u64 pino = parent_ino(filp->f_path.dentry); -		over = filldir(dirent, "..", 2, -			       2, pino, DT_DIR); -		if (over) -			return 0; -		filp->f_pos = 2; -	} +	if (!dir_emit_dots(file, ctx)) +		return 0; +  	path = btrfs_alloc_path(); -	path->reada = 2; +	if (!path) +		return -ENOMEM; + +	path->reada = 1; + +	if (key_type == BTRFS_DIR_INDEX_KEY) { +		INIT_LIST_HEAD(&ins_list); +		INIT_LIST_HEAD(&del_list); +		btrfs_get_delayed_items(inode, &ins_list, &del_list); +	}  	btrfs_set_key_type(&key, key_type); -	key.offset = filp->f_pos; -	key.objectid = inode->i_ino; +	key.offset = ctx->pos; +	key.objectid = btrfs_ino(inode);  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  	if (ret < 0)  		goto err; -	advance = 0;  	while (1) {  		leaf = path->nodes[0]; -		nritems = btrfs_header_nritems(leaf);  		slot = path->slots[0]; -		if (advance || slot >= nritems) { -			if (slot >= nritems - 1) { -				ret = btrfs_next_leaf(root, path); -				if (ret) -					break; -				leaf = path->nodes[0]; -				nritems = btrfs_header_nritems(leaf); -				slot = path->slots[0]; -			} else { -				slot++; -				path->slots[0]++; -			} +		if (slot >= btrfs_header_nritems(leaf)) { +			ret = btrfs_next_leaf(root, path); +			if (ret < 0) +				goto err; +			else if (ret > 0) +				break; +			continue;  		} -		advance = 1; -		item = btrfs_item_nr(leaf, slot); +		item = btrfs_item_nr(slot);  		btrfs_item_key_to_cpu(leaf, &found_key, slot);  		if (found_key.objectid != key.objectid)  			break;  		if (btrfs_key_type(&found_key) != key_type)  			break; -		if (found_key.offset < filp->f_pos) -			continue; +		if (found_key.offset < ctx->pos) +			goto next; +		if (key_type == BTRFS_DIR_INDEX_KEY && +		    btrfs_should_delete_dir_index(&del_list, +						  found_key.offset)) +			goto next; -		filp->f_pos = found_key.offset; +		ctx->pos = found_key.offset; +		is_curr = 1;  		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);  		di_cur = 0; @@ -4255,6 +5306,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,  		while (di_cur < di_total) {  			struct btrfs_key location; +			if (verify_dir_item(root, leaf, di)) +				break; +  			name_len = btrfs_dir_name_len(leaf, di);  			if (name_len <= sizeof(tmp_name)) {  				name_ptr = tmp_name; @@ -4271,17 +5325,23 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,  			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];  			btrfs_dir_item_key_to_cpu(leaf, di, &location); +  			/* is this a reference to our own snapshot? If so -			 * skip it +			 * skip it. 
+			 * +			 * In contrast to old kernels, we insert the snapshot's +			 * dir item and dir index after it has been created, so +			 * we won't find a reference to our own snapshot. We +			 * still keep the following code for backward +			 * compatibility.  			 */  			if (location.type == BTRFS_ROOT_ITEM_KEY &&  			    location.objectid == root->root_key.objectid) {  				over = 0;  				goto skip;  			} -			over = filldir(dirent, name_ptr, name_len, -				       found_key.offset, location.objectid, -				       d_type); +			over = !dir_emit(ctx, name_ptr, name_len, +				       location.objectid, d_type);  skip:  			if (name_ptr != tmp_name) @@ -4294,20 +5354,49 @@ skip:  			di_cur += di_len;  			di = (struct btrfs_dir_item *)((char *)di + di_len);  		} +next: +		path->slots[0]++; +	} + +	if (key_type == BTRFS_DIR_INDEX_KEY) { +		if (is_curr) +			ctx->pos++; +		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); +		if (ret) +			goto nopos;  	}  	/* Reached end of directory/root. Bump pos past the last item. */ -	if (key_type == BTRFS_DIR_INDEX_KEY) -		/* -		 * 32-bit glibc will use getdents64, but then strtol - -		 * so the last number we can serve is this. -		 */ -		filp->f_pos = 0x7fffffff; -	else -		filp->f_pos++; +	ctx->pos++; + +	/* +	 * Stop new entries from being returned after we return the last +	 * entry. +	 * +	 * New directory entries are assigned a strictly increasing +	 * offset.  This means that new entries created during readdir +	 * are *guaranteed* to be seen in the future by that readdir. +	 * This has broken buggy programs which operate on names as +	 * they're returned by readdir.  Until we re-use freed offsets +	 * we have this hack to stop new entries from being returned +	 * under the assumption that they'll never reach this huge +	 * offset. +	 * +	 * This is being careful not to overflow 32bit loff_t unless the +	 * last entry requires it because doing so has broken 32bit apps +	 * in the past. +	 */ +	if (key_type == BTRFS_DIR_INDEX_KEY) { +		if (ctx->pos >= INT_MAX) +			ctx->pos = LLONG_MAX; +		else +			ctx->pos = INT_MAX; +	}  nopos:  	ret = 0;  err: +	if (key_type == BTRFS_DIR_INDEX_KEY) +		btrfs_put_delayed_items(&ins_list, &del_list);  	btrfs_free_path(path);  	return ret;  } @@ -4319,22 +5408,20 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)  	int ret = 0;  	bool nolock = false; -	if (BTRFS_I(inode)->dummy_inode) +	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))  		return 0; -	smp_mb(); -	nolock = (root->fs_info->closing && root == root->fs_info->tree_root); +	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) +		nolock = true;  	if (wbc->sync_mode == WB_SYNC_ALL) {  		if (nolock) -			trans = btrfs_join_transaction_nolock(root, 1); +			trans = btrfs_join_transaction_nolock(root);  		else -			trans = btrfs_join_transaction(root, 1); -		btrfs_set_trans_block_group(trans, inode); -		if (nolock) -			ret = btrfs_end_transaction_nolock(trans, root); -		else -			ret = btrfs_commit_transaction(trans, root); +			trans = btrfs_join_transaction(root); +		if (IS_ERR(trans)) +			return PTR_ERR(trans); +		ret = btrfs_commit_transaction(trans, root);  	}  	return ret;  } @@ -4345,43 +5432,57 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)   * FIXME, needs more benchmarking...there are no reasons other than performance   * to keep or drop this code.   
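 *
 * Note the shape of the function below: it first tries the cheap
 * btrfs_join_transaction() path to update the inode item, and only
 * pays for a full, space-reserving btrfs_start_transaction() when
 * that update fails with -ENOSPC.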
*/ -void btrfs_dirty_inode(struct inode *inode) +static int btrfs_dirty_inode(struct inode *inode)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_trans_handle *trans;  	int ret; -	if (BTRFS_I(inode)->dummy_inode) -		return; +	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) +		return 0; -	trans = btrfs_join_transaction(root, 1); -	btrfs_set_trans_block_group(trans, inode); +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) +		return PTR_ERR(trans);  	ret = btrfs_update_inode(trans, root, inode);  	if (ret && ret == -ENOSPC) {  		/* whoops, let's try again with the full transaction */  		btrfs_end_transaction(trans, root);  		trans = btrfs_start_transaction(root, 1); -		if (IS_ERR(trans)) { -			if (printk_ratelimit()) { -				printk(KERN_ERR "btrfs: fail to " -				       "dirty  inode %lu error %ld\n", -				       inode->i_ino, PTR_ERR(trans)); -			} -			return; -		} -		btrfs_set_trans_block_group(trans, inode); +		if (IS_ERR(trans)) +			return PTR_ERR(trans);  		ret = btrfs_update_inode(trans, root, inode); -		if (ret) { -			if (printk_ratelimit()) { -				printk(KERN_ERR "btrfs: fail to " -				       "dirty  inode %lu error %d\n", -				       inode->i_ino, ret); -			} -		}  	}  	btrfs_end_transaction(trans, root); +	if (BTRFS_I(inode)->delayed_node) +		btrfs_balance_delayed_items(root); + +	return ret; +} + +/* + * This is a copy of file_update_time.  We need this so we can return error on + * ENOSPC for updating the inode in the case of file write and mmap writes. + */ +static int btrfs_update_time(struct inode *inode, struct timespec *now, +			     int flags) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; + +	if (btrfs_root_readonly(root)) +		return -EROFS; + +	if (flags & S_VERSION) +		inode_inc_iversion(inode); +	if (flags & S_CTIME) +		inode->i_ctime = *now; +	if (flags & S_MTIME) +		inode->i_mtime = *now; +	if (flags & S_ATIME) +		inode->i_atime = *now; +	return btrfs_dirty_inode(inode);  }  /* @@ -4397,7 +5498,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)  	struct extent_buffer *leaf;  	int ret; -	key.objectid = inode->i_ino; +	key.objectid = btrfs_ino(inode);  	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);  	key.offset = (u64)-1; @@ -4429,7 +5530,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)  	leaf = path->nodes[0];  	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -	if (found_key.objectid != inode->i_ino || +	if (found_key.objectid != btrfs_ino(inode) ||  	    btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {  		BTRFS_I(inode)->index_cnt = 2;  		goto out; @@ -4450,9 +5551,12 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)  	int ret = 0;  	if (BTRFS_I(dir)->index_cnt == (u64)-1) { -		ret = btrfs_set_inode_index_count(dir); -		if (ret) -			return ret; +		ret = btrfs_inode_delayed_dir_index_count(dir); +		if (ret) { +			ret = btrfs_set_inode_index_count(dir); +			if (ret) +				return ret; +		}  	}  	*index = BTRFS_I(dir)->index_cnt; @@ -4466,7 +5570,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  				     struct inode *dir,  				     const char *name, int name_len,  				     u64 ref_objectid, u64 objectid, -				     u64 alloc_hint, int mode, u64 *index) +				     umode_t mode, u64 *index)  {  	struct inode *inode;  	struct btrfs_inode_item *inode_item; @@ -4475,23 +5579,37 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	struct btrfs_inode_ref *ref;  	struct btrfs_key key[2];  	u32 sizes[2]; +	int nitems = name ? 
2 : 1;  	unsigned long ptr;  	int ret; -	int owner;  	path = btrfs_alloc_path(); -	BUG_ON(!path); +	if (!path) +		return ERR_PTR(-ENOMEM);  	inode = new_inode(root->fs_info->sb); -	if (!inode) +	if (!inode) { +		btrfs_free_path(path);  		return ERR_PTR(-ENOMEM); +	} + +	/* +	 * we have to initialize this early, so we can reclaim the inode +	 * number if we fail afterwards in this function. +	 */ +	inode->i_ino = objectid; + +	if (dir && name) { +		trace_btrfs_inode_request(dir); -	if (dir) {  		ret = btrfs_set_inode_index(dir, index);  		if (ret) { +			btrfs_free_path(path);  			iput(inode);  			return ERR_PTR(ret);  		} +	} else if (dir) { +		*index = 0;  	}  	/*  	 * index_cnt is ignored for everything but a dir, @@ -4499,47 +5617,61 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	 * number  	 */  	BTRFS_I(inode)->index_cnt = 2; +	BTRFS_I(inode)->dir_index = *index;  	BTRFS_I(inode)->root = root;  	BTRFS_I(inode)->generation = trans->transid; -	btrfs_set_inode_space_info(root, inode); +	inode->i_generation = BTRFS_I(inode)->generation; -	if (mode & S_IFDIR) -		owner = 0; -	else -		owner = 1; -	BTRFS_I(inode)->block_group = -			btrfs_find_block_group(root, 0, alloc_hint, owner); +	/* +	 * We could have gotten an inode number from somebody who was fsynced +	 * and then removed in this same transaction, so let's just set full +	 * sync since it will be a full sync anyway and this will blow away the +	 * old info in the log. +	 */ +	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);  	key[0].objectid = objectid;  	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);  	key[0].offset = 0; -	key[1].objectid = objectid; -	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); -	key[1].offset = ref_objectid; -  	sizes[0] = sizeof(struct btrfs_inode_item); -	sizes[1] = name_len + sizeof(*ref); + +	if (name) { +		/* +		 * Start new inodes with an inode_ref. This is slightly more +		 * efficient for small numbers of hard links since they will +		 * be packed into one item. Extended refs will kick in if we +		 * add more hard links than can fit in the ref item. 
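+		 * Roughly, a single INODE_REF item packs back-to-back
+		 * (index, name_len, name) records, one per link; the
+		 * write_extent_buffer() below lays down the record for
+		 * this first name.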
+		 */ +		key[1].objectid = objectid; +		btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); +		key[1].offset = ref_objectid; + +		sizes[1] = name_len + sizeof(*ref); +	}  	path->leave_spinning = 1; -	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); +	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);  	if (ret != 0)  		goto fail;  	inode_init_owner(inode, dir, mode); -	inode->i_ino = objectid;  	inode_set_bytes(inode, 0);  	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;  	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],  				  struct btrfs_inode_item); +	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, +			     sizeof(*inode_item));  	fill_inode_item(trans, path->nodes[0], inode_item, inode); -	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, -			     struct btrfs_inode_ref); -	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); -	btrfs_set_inode_ref_index(path->nodes[0], ref, *index); -	ptr = (unsigned long)(ref + 1); -	write_extent_buffer(path->nodes[0], name, ptr, name_len); +	if (name) { +		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, +				     struct btrfs_inode_ref); +		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); +		btrfs_set_inode_ref_index(path->nodes[0], ref, *index); +		ptr = (unsigned long)(ref + 1); +		write_extent_buffer(path->nodes[0], name, ptr, name_len); +	}  	btrfs_mark_buffer_dirty(path->nodes[0]);  	btrfs_free_path(path); @@ -4551,18 +5683,31 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	btrfs_inherit_iflags(inode, dir); -	if ((mode & S_IFREG)) { +	if (S_ISREG(mode)) {  		if (btrfs_test_opt(root, NODATASUM))  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;  		if (btrfs_test_opt(root, NODATACOW)) -			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; +			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | +				BTRFS_INODE_NODATASUM;  	} -	insert_inode_hash(inode); +	btrfs_insert_inode_hash(inode);  	inode_tree_add(inode); + +	trace_btrfs_inode_new(inode); +	btrfs_set_inode_last_trans(trans, inode); + +	btrfs_update_root_times(trans, root); + +	ret = btrfs_inode_inherit_props(trans, inode, dir); +	if (ret) +		btrfs_err(root->fs_info, +			  "error inheriting props for ino %llu (root %llu): %d", +			  btrfs_ino(inode), root->root_key.objectid, ret); +  	return inode;  fail: -	if (dir) +	if (dir && name)  		BTRFS_I(dir)->index_cnt--;  	btrfs_free_path(path);  	iput(inode); @@ -4587,58 +5732,81 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,  	int ret = 0;  	struct btrfs_key key;  	struct btrfs_root *root = BTRFS_I(parent_inode)->root; +	u64 ino = btrfs_ino(inode); +	u64 parent_ino = btrfs_ino(parent_inode); -	if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { +	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {  		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));  	} else { -		key.objectid = inode->i_ino; +		key.objectid = ino;  		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);  		key.offset = 0;  	} -	if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { +	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {  		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,  					 key.objectid, root->root_key.objectid, -					 parent_inode->i_ino, -					 index, name, name_len); +					 parent_ino, index, name, name_len);  	} else if (add_backref) { -		ret = btrfs_insert_inode_ref(trans, root, -					     name, name_len, inode->i_ino, -					     parent_inode->i_ino, index); +		ret = 
btrfs_insert_inode_ref(trans, root, name, name_len, ino, +					     parent_ino, index);  	} -	if (ret == 0) { -		ret = btrfs_insert_dir_item(trans, root, name, name_len, -					    parent_inode->i_ino, &key, -					    btrfs_inode_type(inode), index); -		BUG_ON(ret); +	/* Nothing to clean up yet */ +	if (ret) +		return ret; + +	ret = btrfs_insert_dir_item(trans, root, name, name_len, +				    parent_inode, &key, +				    btrfs_inode_type(inode), index); +	if (ret == -EEXIST || ret == -EOVERFLOW) +		goto fail_dir_item; +	else if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		return ret; +	} + +	btrfs_i_size_write(parent_inode, parent_inode->i_size + +			   name_len * 2); +	inode_inc_iversion(parent_inode); +	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; +	ret = btrfs_update_inode(trans, root, parent_inode); +	if (ret) +		btrfs_abort_transaction(trans, root, ret); +	return ret; + +fail_dir_item: +	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { +		u64 local_index; +		int err; +		err = btrfs_del_root_ref(trans, root->fs_info->tree_root, +				 key.objectid, root->root_key.objectid, +				 parent_ino, &local_index, name, name_len); + +	} else if (add_backref) { +		u64 local_index; +		int err; -		btrfs_i_size_write(parent_inode, parent_inode->i_size + -				   name_len * 2); -		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; -		ret = btrfs_update_inode(trans, root, parent_inode); +		err = btrfs_del_inode_ref(trans, root, name, name_len, +					  ino, parent_ino, &local_index);  	}  	return ret;  }  static int btrfs_add_nondir(struct btrfs_trans_handle *trans, -			    struct dentry *dentry, struct inode *inode, -			    int backref, u64 index) +			    struct inode *dir, struct dentry *dentry, +			    struct inode *inode, int backref, u64 index)  { -	int err = btrfs_add_link(trans, dentry->d_parent->d_inode, -				 inode, dentry->d_name.name, -				 dentry->d_name.len, backref, index); -	if (!err) { -		d_instantiate(dentry, inode); -		return 0; -	} +	int err = btrfs_add_link(trans, dir, inode, +				 dentry->d_name.name, dentry->d_name.len, +				 backref, index);  	if (err > 0)  		err = -EEXIST;  	return err;  }  static int btrfs_mknod(struct inode *dir, struct dentry *dentry, -			int mode, dev_t rdev) +			umode_t mode, dev_t rdev)  {  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(dir)->root; @@ -4646,16 +5814,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,  	int err;  	int drop_inode = 0;  	u64 objectid; -	unsigned long nr = 0;  	u64 index = 0;  	if (!new_valid_dev(rdev))  		return -EINVAL; -	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); -	if (err) -		return err; -  	/*  	 * 2 for inode item and ref  	 * 2 for dir items @@ -4665,37 +5828,44 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,  	if (IS_ERR(trans))  		return PTR_ERR(trans); -	btrfs_set_trans_block_group(trans, dir); +	err = btrfs_find_free_ino(root, &objectid); +	if (err) +		goto out_unlock;  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, -				dentry->d_name.len, -				dentry->d_parent->d_inode->i_ino, objectid, -				BTRFS_I(dir)->block_group, mode, &index); -	err = PTR_ERR(inode); -	if (IS_ERR(inode)) +				dentry->d_name.len, btrfs_ino(dir), objectid, +				mode, &index); +	if (IS_ERR(inode)) { +		err = PTR_ERR(inode);  		goto out_unlock; +	} -	err = btrfs_init_inode_security(trans, inode, dir); +	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);  	if (err) {  		drop_inode = 1;  		
goto out_unlock;  	} -	btrfs_set_trans_block_group(trans, inode); -	err = btrfs_add_nondir(trans, dentry, inode, 0, index); +	/* +	* If the active LSM wants to access the inode during +	* d_instantiate it needs these. Smack checks to see +	* if the filesystem supports xattrs by looking at the +	* ops vector. +	*/ + +	inode->i_op = &btrfs_special_inode_operations; +	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);  	if (err)  		drop_inode = 1;  	else { -		inode->i_op = &btrfs_special_inode_operations;  		init_special_inode(inode, inode->i_mode, rdev);  		btrfs_update_inode(trans, root, inode); +		d_instantiate(dentry, inode);  	} -	btrfs_update_inode_block_group(trans, inode); -	btrfs_update_inode_block_group(trans, dir);  out_unlock: -	nr = trans->blocks_used; -	btrfs_end_transaction_throttle(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_end_transaction(trans, root); +	btrfs_balance_delayed_items(root); +	btrfs_btree_balance_dirty(root);  	if (drop_inode) {  		inode_dec_link_count(inode);  		iput(inode); @@ -4704,20 +5874,16 @@ out_unlock:  }  static int btrfs_create(struct inode *dir, struct dentry *dentry, -			int mode, struct nameidata *nd) +			umode_t mode, bool excl)  {  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct inode *inode = NULL; -	int drop_inode = 0; +	int drop_inode_on_err = 0;  	int err; -	unsigned long nr = 0;  	u64 objectid;  	u64 index = 0; -	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); -	if (err) -		return err;  	/*  	 * 2 for inode item and ref  	 * 2 for dir items @@ -4727,44 +5893,53 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,  	if (IS_ERR(trans))  		return PTR_ERR(trans); -	btrfs_set_trans_block_group(trans, dir); +	err = btrfs_find_free_ino(root, &objectid); +	if (err) +		goto out_unlock;  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, -				dentry->d_name.len, -				dentry->d_parent->d_inode->i_ino, -				objectid, BTRFS_I(dir)->block_group, mode, -				&index); -	err = PTR_ERR(inode); -	if (IS_ERR(inode)) +				dentry->d_name.len, btrfs_ino(dir), objectid, +				mode, &index); +	if (IS_ERR(inode)) { +		err = PTR_ERR(inode);  		goto out_unlock; +	} +	drop_inode_on_err = 1; -	err = btrfs_init_inode_security(trans, inode, dir); -	if (err) { -		drop_inode = 1; +	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); +	if (err)  		goto out_unlock; -	} -	btrfs_set_trans_block_group(trans, inode); -	err = btrfs_add_nondir(trans, dentry, inode, 0, index); +	err = btrfs_update_inode(trans, root, inode);  	if (err) -		drop_inode = 1; -	else { -		inode->i_mapping->a_ops = &btrfs_aops; -		inode->i_mapping->backing_dev_info = &root->fs_info->bdi; -		inode->i_fop = &btrfs_file_operations; -		inode->i_op = &btrfs_file_inode_operations; -		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -	} -	btrfs_update_inode_block_group(trans, inode); -	btrfs_update_inode_block_group(trans, dir); +		goto out_unlock; + +	/* +	* If the active LSM wants to access the inode during +	* d_instantiate it needs these. Smack checks to see +	* if the filesystem supports xattrs by looking at the +	* ops vector. 
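+	* That ordering is why i_op and i_fop are assigned here,
+	* before the btrfs_add_nondir() and d_instantiate() calls
+	* further down.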
+	*/ +	inode->i_fop = &btrfs_file_operations; +	inode->i_op = &btrfs_file_inode_operations; + +	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); +	if (err) +		goto out_unlock; + +	inode->i_mapping->a_ops = &btrfs_aops; +	inode->i_mapping->backing_dev_info = &root->fs_info->bdi; +	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; +	d_instantiate(dentry, inode); +  out_unlock: -	nr = trans->blocks_used; -	btrfs_end_transaction_throttle(trans, root); -	if (drop_inode) { +	btrfs_end_transaction(trans, root); +	if (err && drop_inode_on_err) {  		inode_dec_link_count(inode);  		iput(inode);  	} -	btrfs_btree_balance_dirty(root, nr); +	btrfs_balance_delayed_items(root); +	btrfs_btree_balance_dirty(root);  	return err;  } @@ -4775,59 +5950,73 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,  	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct inode *inode = old_dentry->d_inode;  	u64 index; -	unsigned long nr = 0;  	int err;  	int drop_inode = 0; -	if (inode->i_nlink == 0) -		return -ENOENT; -  	/* do not allow sys_link's with other subvols of the same device */  	if (root->objectid != BTRFS_I(inode)->root->objectid) -		return -EPERM; +		return -EXDEV; -	btrfs_inc_nlink(inode); +	if (inode->i_nlink >= BTRFS_LINK_MAX) +		return -EMLINK;  	err = btrfs_set_inode_index(dir, &index);  	if (err)  		goto fail;  	/* -	 * 1 item for inode ref +	 * 2 items for inode and inode ref  	 * 2 items for dir items +	 * 1 item for parent inode  	 */ -	trans = btrfs_start_transaction(root, 3); +	trans = btrfs_start_transaction(root, 5);  	if (IS_ERR(trans)) {  		err = PTR_ERR(trans);  		goto fail;  	} -	btrfs_set_trans_block_group(trans, dir); +	/* There are several dir indexes for this inode, clear the cache. */ +	BTRFS_I(inode)->dir_index = 0ULL; +	inc_nlink(inode); +	inode_inc_iversion(inode); +	inode->i_ctime = CURRENT_TIME;  	ihold(inode); +	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); -	err = btrfs_add_nondir(trans, dentry, inode, 1, index); +	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);  	if (err) {  		drop_inode = 1;  	} else { -		btrfs_update_inode_block_group(trans, dir); +		struct dentry *parent = dentry->d_parent;  		err = btrfs_update_inode(trans, root, inode); -		BUG_ON(err); -		btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); +		if (err) +			goto fail; +		if (inode->i_nlink == 1) { +			/* +			 * If new hard link count is 1, it's a file created +			 * with open(2) O_TMPFILE flag. 
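+			 * Such an inode sits on the orphan list so it is
+			 * reclaimed if it is never linked; now that it has
+			 * gained a name, the orphan item must be removed.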
+			 */ +			err = btrfs_orphan_del(trans, inode); +			if (err) +				goto fail; +		} +		d_instantiate(dentry, inode); +		btrfs_log_new_name(trans, inode, NULL, parent);  	} -	nr = trans->blocks_used; -	btrfs_end_transaction_throttle(trans, root); +	btrfs_end_transaction(trans, root); +	btrfs_balance_delayed_items(root);  fail:  	if (drop_inode) {  		inode_dec_link_count(inode);  		iput(inode);  	} -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	return err;  } -static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  {  	struct inode *inode = NULL;  	struct btrfs_trans_handle *trans; @@ -4836,11 +6025,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)  	int drop_on_err = 0;  	u64 objectid = 0;  	u64 index = 0; -	unsigned long nr = 1; - -	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); -	if (err) -		return err;  	/*  	 * 2 items for inode and ref @@ -4850,13 +6034,14 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)  	trans = btrfs_start_transaction(root, 5);  	if (IS_ERR(trans))  		return PTR_ERR(trans); -	btrfs_set_trans_block_group(trans, dir); + +	err = btrfs_find_free_ino(root, &objectid); +	if (err) +		goto out_fail;  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, -				dentry->d_name.len, -				dentry->d_parent->d_inode->i_ino, objectid, -				BTRFS_I(dir)->block_group, S_IFDIR | mode, -				&index); +				dentry->d_name.len, btrfs_ino(dir), objectid, +				S_IFDIR | mode, &index);  	if (IS_ERR(inode)) {  		err = PTR_ERR(inode);  		goto out_fail; @@ -4864,36 +6049,32 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)  	drop_on_err = 1; -	err = btrfs_init_inode_security(trans, inode, dir); +	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);  	if (err)  		goto out_fail;  	inode->i_op = &btrfs_dir_inode_operations;  	inode->i_fop = &btrfs_dir_file_operations; -	btrfs_set_trans_block_group(trans, inode);  	btrfs_i_size_write(inode, 0);  	err = btrfs_update_inode(trans, root, inode);  	if (err)  		goto out_fail; -	err = btrfs_add_link(trans, dentry->d_parent->d_inode, -				 inode, dentry->d_name.name, -				 dentry->d_name.len, 0, index); +	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, +			     dentry->d_name.len, 0, index);  	if (err)  		goto out_fail;  	d_instantiate(dentry, inode);  	drop_on_err = 0; -	btrfs_update_inode_block_group(trans, inode); -	btrfs_update_inode_block_group(trans, dir);  out_fail: -	nr = trans->blocks_used; -	btrfs_end_transaction_throttle(trans, root); +	btrfs_end_transaction(trans, root);  	if (drop_on_err)  		iput(inode); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_balance_delayed_items(root); +	btrfs_btree_balance_dirty(root);  	return err;  } @@ -4917,7 +6098,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,  		em->block_start += start_diff;  		em->block_len -= start_diff;  	} -	return add_extent_mapping(em_tree, em); +	return add_extent_mapping(em_tree, em, 0);  }  static noinline int uncompress_inline(struct btrfs_path *path, @@ -4931,29 +6112,25 @@ static noinline int uncompress_inline(struct btrfs_path *path,  	size_t max_size;  	unsigned long inline_size;  	unsigned long ptr; +	int compress_type;  	WARN_ON(pg_offset != 0); +	compress_type = btrfs_file_extent_compression(leaf, item);  	max_size = btrfs_file_extent_ram_bytes(leaf, item);  	inline_size = 
btrfs_file_extent_inline_item_len(leaf, -					btrfs_item_nr(leaf, path->slots[0])); +					btrfs_item_nr(path->slots[0]));  	tmp = kmalloc(inline_size, GFP_NOFS); +	if (!tmp) +		return -ENOMEM;  	ptr = btrfs_file_extent_inline_start(item);  	read_extent_buffer(leaf, tmp, ptr, inline_size);  	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); -	ret = btrfs_zlib_decompress(tmp, page, extent_offset, -				    inline_size, max_size); -	if (ret) { -		char *kaddr = kmap_atomic(page, KM_USER0); -		unsigned long copy_size = min_t(u64, -				  PAGE_CACHE_SIZE - pg_offset, -				  max_size - extent_offset); -		memset(kaddr + pg_offset, 0, copy_size); -		kunmap_atomic(kaddr, KM_USER0); -	} +	ret = btrfs_decompress(compress_type, tmp, page, +			       extent_offset, inline_size, max_size);  	kfree(tmp); -	return 0; +	return ret;  }  /* @@ -4971,10 +6148,9 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,  {  	int ret;  	int err = 0; -	u64 bytenr;  	u64 extent_start = 0;  	u64 extent_end = 0; -	u64 objectid = inode->i_ino; +	u64 objectid = btrfs_ino(inode);  	u32 found_type;  	struct btrfs_path *path = NULL;  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4985,7 +6161,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct btrfs_trans_handle *trans = NULL; -	int compressed; +	const bool new_inline = !page || create;  again:  	read_lock(&em_tree->lock); @@ -5002,7 +6178,7 @@ again:  		else  			goto out;  	} -	em = alloc_extent_map(GFP_NOFS); +	em = alloc_extent_map();  	if (!em) {  		err = -ENOMEM;  		goto out; @@ -5015,7 +6191,15 @@ again:  	if (!path) {  		path = btrfs_alloc_path(); -		BUG_ON(!path); +		if (!path) { +			err = -ENOMEM; +			goto out; +		} +		/* +		 * Chances are we'll be called again, so go ahead and do +		 * readahead +		 */ +		path->reada = 1;  	}  	ret = btrfs_lookup_file_extent(trans, root, path, @@ -5039,23 +6223,28 @@ again:  	found_type = btrfs_key_type(&found_key);  	if (found_key.objectid != objectid ||  	    found_type != BTRFS_EXTENT_DATA_KEY) { -		goto not_found; +		/* +		 * If we back up past the first extent we want to move forward +		 * and see if there is an extent in front of us, otherwise we'll +		 * say there is a hole for our whole search range which can +		 * cause problems. 
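+		 * Setting extent_end to the search start makes the
+		 * "start >= extent_end" test below fall through to the
+		 * next-slot path rather than reporting a hole.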
+		 */ +		extent_end = start; +		goto next;  	}  	found_type = btrfs_file_extent_type(leaf, item);  	extent_start = found_key.offset; -	compressed = btrfs_file_extent_compression(leaf, item);  	if (found_type == BTRFS_FILE_EXTENT_REG ||  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {  		extent_end = extent_start +  		       btrfs_file_extent_num_bytes(leaf, item);  	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {  		size_t size; -		size = btrfs_file_extent_inline_len(leaf, item); -		extent_end = (extent_start + size + root->sectorsize - 1) & -			~((u64)root->sectorsize - 1); +		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); +		extent_end = ALIGN(extent_start + size, root->sectorsize);  	} - +next:  	if (start >= extent_end) {  		path->slots[0]++;  		if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -5075,33 +6264,15 @@ again:  		if (start + len <= found_key.offset)  			goto not_found;  		em->start = start; +		em->orig_start = start;  		em->len = found_key.offset - start;  		goto not_found_em;  	} +	btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); +  	if (found_type == BTRFS_FILE_EXTENT_REG ||  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) { -		em->start = extent_start; -		em->len = extent_end - extent_start; -		em->orig_start = extent_start - -				 btrfs_file_extent_offset(leaf, item); -		bytenr = btrfs_file_extent_disk_bytenr(leaf, item); -		if (bytenr == 0) { -			em->block_start = EXTENT_MAP_HOLE; -			goto insert; -		} -		if (compressed) { -			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); -			em->block_start = bytenr; -			em->block_len = btrfs_file_extent_disk_num_bytes(leaf, -									 item); -		} else { -			bytenr += btrfs_file_extent_offset(leaf, item); -			em->block_start = bytenr; -			em->block_len = em->len; -			if (found_type == BTRFS_FILE_EXTENT_PREALLOC) -				set_bit(EXTENT_FLAG_PREALLOC, &em->flags); -		}  		goto insert;  	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {  		unsigned long ptr; @@ -5110,31 +6281,28 @@ again:  		size_t extent_offset;  		size_t copy_size; -		em->block_start = EXTENT_MAP_INLINE; -		if (!page || create) { -			em->start = extent_start; -			em->len = extent_end - extent_start; +		if (new_inline)  			goto out; -		} -		size = btrfs_file_extent_inline_len(leaf, item); +		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);  		extent_offset = page_offset(page) + pg_offset - extent_start;  		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,  				size - extent_offset);  		em->start = extent_start + extent_offset; -		em->len = (copy_size + root->sectorsize - 1) & -			~((u64)root->sectorsize - 1); -		em->orig_start = EXTENT_MAP_INLINE; -		if (compressed) -			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +		em->len = ALIGN(copy_size, root->sectorsize); +		em->orig_block_len = em->len; +		em->orig_start = em->start;  		ptr = btrfs_file_extent_inline_start(item) + extent_offset;  		if (create == 0 && !PageUptodate(page)) { -			if (btrfs_file_extent_compression(leaf, item) == -			    BTRFS_COMPRESS_ZLIB) { +			if (btrfs_file_extent_compression(leaf, item) != +			    BTRFS_COMPRESS_NONE) {  				ret = uncompress_inline(path, inode, page,  							pg_offset,  							extent_offset, item); -				BUG_ON(ret); +				if (ret) { +					err = ret; +					goto out; +				}  			} else {  				map = kmap(page);  				read_extent_buffer(leaf, map + pg_offset, ptr, @@ -5148,13 +6316,17 @@ again:  			}  			flush_dcache_page(page);  		} else if (create && PageUptodate(page)) { -			WARN_ON(1); +			BUG();  			if (!trans) {  		
		kunmap(page);  				free_extent_map(em);  				em = NULL; -				btrfs_release_path(root, path); -				trans = btrfs_join_transaction(root, 1); + +				btrfs_release_path(path); +				trans = btrfs_join_transaction(root); + +				if (IS_ERR(trans)) +					return ERR_CAST(trans);  				goto again;  			}  			map = kmap(page); @@ -5164,33 +6336,28 @@ again:  			btrfs_mark_buffer_dirty(leaf);  		}  		set_extent_uptodate(io_tree, em->start, -				    extent_map_end(em) - 1, GFP_NOFS); +				    extent_map_end(em) - 1, NULL, GFP_NOFS);  		goto insert; -	} else { -		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); -		WARN_ON(1);  	}  not_found:  	em->start = start; +	em->orig_start = start;  	em->len = len;  not_found_em:  	em->block_start = EXTENT_MAP_HOLE;  	set_bit(EXTENT_FLAG_VACANCY, &em->flags);  insert: -	btrfs_release_path(root, path); +	btrfs_release_path(path);  	if (em->start > start || extent_map_end(em) <= start) { -		printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " -		       "[%llu %llu]\n", (unsigned long long)em->start, -		       (unsigned long long)em->len, -		       (unsigned long long)start, -		       (unsigned long long)len); +		btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", +			em->start, em->len, start, len);  		err = -EIO;  		goto out;  	}  	err = 0;  	write_lock(&em_tree->lock); -	ret = add_extent_mapping(em_tree, em); +	ret = add_extent_mapping(em_tree, em, 0);  	/* it is possible that someone inserted the extent into the tree  	 * while we had the lock dropped.  It is also possible that  	 * an overlapping map exists in the tree @@ -5231,6 +6398,9 @@ insert:  	}  	write_unlock(&em_tree->lock);  out: + +	trace_btrfs_get_extent(root, em); +  	if (path)  		btrfs_free_path(path);  	if (trans) { @@ -5242,68 +6412,167 @@ out:  		free_extent_map(em);  		return ERR_PTR(err);  	} +	BUG_ON(!em); /* Error is always set */  	return em;  } -static struct extent_map *btrfs_new_extent_direct(struct inode *inode, -						  u64 start, u64 len) +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, +					   size_t pg_offset, u64 start, u64 len, +					   int create)  { -	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct btrfs_trans_handle *trans;  	struct extent_map *em; -	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; -	struct btrfs_key ins; -	u64 alloc_hint; -	int ret; +	struct extent_map *hole_em = NULL; +	u64 range_start = start; +	u64 end; +	u64 found; +	u64 found_end; +	int err = 0; -	btrfs_drop_extent_cache(inode, start, start + len - 1, 0); +	em = btrfs_get_extent(inode, page, pg_offset, start, len, create); +	if (IS_ERR(em)) +		return em; +	if (em) { +		/* +		 * if our em maps to +		 * -  a hole or +		 * -  a pre-alloc extent, +		 * there might actually be delalloc bytes behind it. 
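+		 * Delalloc data lives only in the io_tree until writeback
+		 * allocates on-disk extents, so the extent map alone cannot
+		 * see it; hence the count_range_bits() search further down.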
+		 */ +		if (em->block_start != EXTENT_MAP_HOLE && +		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) +			return em; +		else +			hole_em = em; +	} -	trans = btrfs_join_transaction(root, 0); -	if (!trans) -		return ERR_PTR(-ENOMEM); +	/* check to see if we've wrapped (len == -1 or similar) */ +	end = start + len; +	if (end < start) +		end = (u64)-1; +	else +		end -= 1; -	trans->block_rsv = &root->fs_info->delalloc_block_rsv; +	em = NULL; -	alloc_hint = get_extent_allocation_hint(inode, start, len); -	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, -				   alloc_hint, (u64)-1, &ins, 1); -	if (ret) { -		em = ERR_PTR(ret); +	/* ok, we didn't find anything, let's look for delalloc */ +	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, +				 end, len, EXTENT_DELALLOC, 1); +	found_end = range_start + found; +	if (found_end < range_start) +		found_end = (u64)-1; + +	/* +	 * we didn't find anything useful, return +	 * the original results from get_extent() +	 */ +	if (range_start > end || found_end <= start) { +		em = hole_em; +		hole_em = NULL;  		goto out;  	} -	em = alloc_extent_map(GFP_NOFS); -	if (!em) { -		em = ERR_PTR(-ENOMEM); -		goto out; +	/* adjust the range_start to make sure it doesn't +	 * go backwards from the start they passed in +	 */ +	range_start = max(start, range_start); +	found = found_end - range_start; + +	if (found > 0) { +		u64 hole_start = start; +		u64 hole_len = len; + +		em = alloc_extent_map(); +		if (!em) { +			err = -ENOMEM; +			goto out; +		} +		/* +		 * when btrfs_get_extent can't find anything it +		 * returns one huge hole +		 * +		 * make sure what it found really fits our range, and +		 * adjust to make sure it is based on the start from +		 * the caller +		 */ +		if (hole_em) { +			u64 calc_end = extent_map_end(hole_em); + +			if (calc_end <= start || (hole_em->start > end)) { +				free_extent_map(hole_em); +				hole_em = NULL; +			} else { +				hole_start = max(hole_em->start, start); +				hole_len = calc_end - hole_start; +			} +		} +		em->bdev = NULL; +		if (hole_em && range_start > hole_start) { +			/* our hole starts before our delalloc, so we +			 * have to return just the parts of the hole +			 * that go until the delalloc starts +			 */ +			em->len = min(hole_len, +				      range_start - hole_start); +			em->start = hole_start; +			em->orig_start = hole_start; +			/* +			 * don't adjust block start at all, +			 * it is fixed at EXTENT_MAP_HOLE +			 */ +			em->block_start = hole_em->block_start; +			em->block_len = hole_len; +			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) +				set_bit(EXTENT_FLAG_PREALLOC, &em->flags); +		} else { +			em->start = range_start; +			em->len = found; +			em->orig_start = range_start; +			em->block_start = EXTENT_MAP_DELALLOC; +			em->block_len = found; +		} +	} else if (hole_em) { +		return hole_em; +	} +out: -	em->start = start; -	em->orig_start = em->start; -	em->len = ins.offset; +	free_extent_map(hole_em); +	if (err) { +		free_extent_map(em); +		return ERR_PTR(err); +	} +	return em; +} -	em->block_start = ins.objectid; -	em->block_len = ins.offset; -	em->bdev = root->fs_info->fs_devices->latest_bdev; -	set_bit(EXTENT_FLAG_PINNED, &em->flags); +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, +						  u64 start, u64 len) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct extent_map *em; +	struct btrfs_key ins; +	u64 alloc_hint; +	int ret; -	while (1) { -		write_lock(&em_tree->lock); -		ret = add_extent_mapping(em_tree, em); -		
write_unlock(&em_tree->lock); -		if (ret != -EEXIST) -			break; -		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); +	alloc_hint = get_extent_allocation_hint(inode, start, len); +	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, +				   alloc_hint, &ins, 1, 1); +	if (ret) +		return ERR_PTR(ret); + +	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, +			      ins.offset, ins.offset, ins.offset, 0); +	if (IS_ERR(em)) { +		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); +		return em;  	}  	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,  					   ins.offset, ins.offset, 0);  	if (ret) { -		btrfs_free_reserved_extent(root, ins.objectid, ins.offset); -		em = ERR_PTR(ret); +		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); +		free_extent_map(em); +		return ERR_PTR(ret);  	} -out: -	btrfs_end_transaction(trans, root); +  	return em;  } @@ -5311,13 +6580,16 @@ out:   * returns 1 when the nocow is safe, < 1 on error, 0 if the   * block must be cow'd   */ -static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, -				      struct inode *inode, u64 offset, u64 len) +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +			      u64 *orig_start, u64 *orig_block_len, +			      u64 *ram_bytes)  { +	struct btrfs_trans_handle *trans;  	struct btrfs_path *path;  	int ret;  	struct extent_buffer *leaf;  	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct btrfs_file_extent_item *fi;  	struct btrfs_key key;  	u64 disk_bytenr; @@ -5326,12 +6598,13 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,  	u64 num_bytes;  	int slot;  	int found_type; +	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; -	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, +	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),  				       offset, 0);  	if (ret < 0)  		goto out; @@ -5348,7 +6621,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,  	ret = 0;  	leaf = path->nodes[0];  	btrfs_item_key_to_cpu(leaf, &key, slot); -	if (key.objectid != inode->i_ino || +	if (key.objectid != btrfs_ino(inode) ||  	    key.type != BTRFS_EXTENT_DATA_KEY) {  		/* not our file or wrong item type, must cow */  		goto out; @@ -5366,25 +6639,66 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,  		/* not a regular extent, must cow */  		goto out;  	} -	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); -	backref_offset = btrfs_file_extent_offset(leaf, fi); + +	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) +		goto out;  	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); -	if (extent_end < offset + len) { -		/* extent doesn't include our full range, must cow */ +	if (extent_end <= offset) +		goto out; + +	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); +	if (disk_bytenr == 0)  		goto out; + +	if (btrfs_file_extent_compression(leaf, fi) || +	    btrfs_file_extent_encryption(leaf, fi) || +	    btrfs_file_extent_other_encoding(leaf, fi)) +		goto out; + +	backref_offset = btrfs_file_extent_offset(leaf, fi); + +	if (orig_start) { +		*orig_start = key.offset - backref_offset; +		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); +		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);  	}  	if (btrfs_extent_readonly(root, disk_bytenr))  		goto out; +	num_bytes = 
min(offset + *len, extent_end) - offset; +	if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { +		u64 range_end; + +		range_end = round_up(offset + num_bytes, root->sectorsize) - 1; +		ret = test_range_bit(io_tree, offset, range_end, +				     EXTENT_DELALLOC, 0, NULL); +		if (ret) { +			ret = -EAGAIN; +			goto out; +		} +	} + +	btrfs_release_path(path); +  	/*  	 * look for other files referencing this extent, if we  	 * find any we must cow  	 */ -	if (btrfs_cross_ref_exist(trans, root, inode->i_ino, -				  key.offset - backref_offset, disk_bytenr)) +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) { +		ret = 0;  		goto out; +	} + +	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), +				    key.offset - backref_offset, disk_bytenr); +	btrfs_end_transaction(trans, root); +	if (ret) { +		ret = 0; +		goto out; +	}  	/*  	 * adjust disk_bytenr and num_bytes to cover just the bytes @@ -5394,31 +6708,229 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,  	 */  	disk_bytenr += backref_offset;  	disk_bytenr += offset - key.offset; -	num_bytes = min(offset + len, extent_end) - offset;  	if (csum_exist_in_range(root, disk_bytenr, num_bytes))  				goto out;  	/*  	 * all of the above have passed, it is safe to overwrite this extent  	 * without cow  	 */ +	*len = num_bytes;  	ret = 1;  out:  	btrfs_free_path(path);  	return ret;  } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) +{ +	struct radix_tree_root *root = &inode->i_mapping->page_tree; +	int found = false; +	void **pagep = NULL; +	struct page *page = NULL; +	int start_idx; +	int end_idx; + +	start_idx = start >> PAGE_CACHE_SHIFT; + +	/* +	 * end is the last byte in the last page.  end == start is legal +	 */ +	end_idx = end >> PAGE_CACHE_SHIFT; + +	rcu_read_lock(); + +	/* Most of the code in this while loop is lifted from +	 * find_get_page.  It's been modified to begin searching from a +	 * page and return just the first page found in that range.  If the +	 * found idx is less than or equal to the end idx then we know that +	 * a page exists.  If no pages are found or if those pages are +	 * outside of the range then we're fine (yay!) */ +	while (page == NULL && +	       radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { +		page = radix_tree_deref_slot(pagep); +		if (unlikely(!page)) +			break; + +		if (radix_tree_exception(page)) { +			if (radix_tree_deref_retry(page)) { +				page = NULL; +				continue; +			} +			/* +			 * Otherwise, shmem/tmpfs must be storing a swap entry +			 * here as an exceptional entry: so return it without +			 * attempting to raise page count. +			 */ +			page = NULL; +			break; /* TODO: Is this relevant for this use case? */ +		} + +		if (!page_cache_get_speculative(page)) { +			page = NULL; +			continue; +		} + +		/* +		 * Has the page moved? +		 * This is part of the lockless pagecache protocol. See +		 * include/linux/pagemap.h for details. 
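+		 * We only hold a speculative reference here; if the slot
+		 * was reused under us, drop that reference and retry the
+		 * gang lookup from the same index.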
+		 */ +		if (unlikely(page != *pagep)) { +			page_cache_release(page); +			page = NULL; +		} +	} + +	if (page) { +		if (page->index <= end_idx) +			found = true; +		page_cache_release(page); +	} + +	rcu_read_unlock(); +	return found; +} + +static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, +			      struct extent_state **cached_state, int writing) +{ +	struct btrfs_ordered_extent *ordered; +	int ret = 0; + +	while (1) { +		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, +				 0, cached_state); +		/* +		 * We're concerned with the entire range that we're going to be +		 * doing DIO to, so we need to make sure there are no ordered +		 * extents in this range. +		 */ +		ordered = btrfs_lookup_ordered_range(inode, lockstart, +						     lockend - lockstart + 1); + +		/* +		 * We need to make sure there are no buffered pages in this +		 * range either, we could have raced between the invalidate in +		 * generic_file_direct_write and locking the extent.  The +		 * invalidate needs to happen so that reads after a write do not +		 * get stale data. +		 */ +		if (!ordered && +		    (!writing || +		     !btrfs_page_exists_in_range(inode, lockstart, lockend))) +			break; + +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, +				     cached_state, GFP_NOFS); + +		if (ordered) { +			btrfs_start_ordered_extent(inode, ordered, 1); +			btrfs_put_ordered_extent(ordered); +		} else { +			/* Screw you mmap */ +			ret = filemap_write_and_wait_range(inode->i_mapping, +							   lockstart, +							   lockend); +			if (ret) +				break; + +			/* +			 * If we found a page that couldn't be invalidated just +			 * fall back to buffered. +			 */ +			ret = invalidate_inode_pages2_range(inode->i_mapping, +					lockstart >> PAGE_CACHE_SHIFT, +					lockend >> PAGE_CACHE_SHIFT); +			if (ret) +				break; +		} + +		cond_resched(); +	} + +	return ret; +} + +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, +					   u64 len, u64 orig_start, +					   u64 block_start, u64 block_len, +					   u64 orig_block_len, u64 ram_bytes, +					   int type) +{ +	struct extent_map_tree *em_tree; +	struct extent_map *em; +	struct btrfs_root *root = BTRFS_I(inode)->root; +	int ret; + +	em_tree = &BTRFS_I(inode)->extent_tree; +	em = alloc_extent_map(); +	if (!em) +		return ERR_PTR(-ENOMEM); + +	em->start = start; +	em->orig_start = orig_start; +	em->mod_start = start; +	em->mod_len = len; +	em->len = len; +	em->block_len = block_len; +	em->block_start = block_start; +	em->bdev = root->fs_info->fs_devices->latest_bdev; +	em->orig_block_len = orig_block_len; +	em->ram_bytes = ram_bytes; +	em->generation = -1; +	set_bit(EXTENT_FLAG_PINNED, &em->flags); +	if (type == BTRFS_ORDERED_PREALLOC) +		set_bit(EXTENT_FLAG_FILLING, &em->flags); + +	do { +		btrfs_drop_extent_cache(inode, em->start, +				em->start + em->len - 1, 0); +		write_lock(&em_tree->lock); +		ret = add_extent_mapping(em_tree, em, 1); +		write_unlock(&em_tree->lock); +	} while (ret == -EEXIST); + +	if (ret) { +		free_extent_map(em); +		return ERR_PTR(ret); +	} + +	return em; +} + +  static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  				   struct buffer_head *bh_result, int create)  {  	struct extent_map *em;  	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct extent_state *cached_state = NULL;  	u64 start = iblock << inode->i_blkbits; +	u64 lockstart, lockend;  	u64 len = bh_result->b_size; -	struct btrfs_trans_handle *trans; +	int unlock_bits = EXTENT_LOCKED; +	int ret = 
0; + +	if (create) +		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; +	else +		len = min_t(u64, len, root->sectorsize); + +	lockstart = start; +	lockend = start + len - 1; + +	/* +	 * If this errors out it's because we couldn't invalidate pagecache for +	 * this range and we need to fallback to buffered. +	 */ +	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) +		return -ENOTBLK;  	em = btrfs_get_extent(inode, NULL, 0, start, len, 0); -	if (IS_ERR(em)) -		return PTR_ERR(em); +	if (IS_ERR(em)) { +		ret = PTR_ERR(em); +		goto unlock_err; +	}  	/*  	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered @@ -5437,17 +6949,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||  	    em->block_start == EXTENT_MAP_INLINE) {  		free_extent_map(em); -		return -ENOTBLK; +		ret = -ENOTBLK; +		goto unlock_err;  	}  	/* Just a good old fashioned hole, return */  	if (!create && (em->block_start == EXTENT_MAP_HOLE ||  			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {  		free_extent_map(em); -		/* DIO will do one hole at a time, so just unlock a sector */ -		unlock_extent(&BTRFS_I(inode)->io_tree, start, -			      start + root->sectorsize - 1, GFP_NOFS); -		return 0; +		goto unlock_err;  	}  	/* @@ -5460,8 +6970,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	 *  	 */  	if (!create) { -		len = em->len - (start - em->start); -		goto map; +		len = min(len, em->len - (start - em->start)); +		lockstart = start + len; +		goto unlock;  	}  	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || @@ -5469,7 +6980,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	     em->block_start != EXTENT_MAP_HOLE)) {  		int type;  		int ret; -		u64 block_start; +		u64 block_start, orig_start, orig_block_len, ram_bytes;  		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))  			type = BTRFS_ORDERED_PREALLOC; @@ -5478,28 +6989,29 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  		len = min(len, em->len - (start - em->start));  		block_start = em->block_start + (start - em->start); -		/* -		 * we're not going to log anything, but we do need -		 * to make sure the current transaction stays open -		 * while we look for nocow cross refs -		 */ -		trans = btrfs_join_transaction(root, 0); -		if (!trans) -			goto must_cow; +		if (can_nocow_extent(inode, start, &len, &orig_start, +				     &orig_block_len, &ram_bytes) == 1) { +			if (type == BTRFS_ORDERED_PREALLOC) { +				free_extent_map(em); +				em = create_pinned_em(inode, start, len, +						       orig_start, +						       block_start, len, +						       orig_block_len, +						       ram_bytes, type); +				if (IS_ERR(em)) +					goto unlock_err; +			} -		if (can_nocow_odirect(trans, inode, start, len) == 1) {  			ret = btrfs_add_ordered_extent_dio(inode, start,  					   block_start, len, len, type); -			btrfs_end_transaction(trans, root);  			if (ret) {  				free_extent_map(em); -				return ret; +				goto unlock_err;  			}  			goto unlock;  		} -		btrfs_end_transaction(trans, root);  	} -must_cow: +  	/*  	 * this will cow the extent, reset the len in case we changed  	 * it above @@ -5507,48 +7019,74 @@ must_cow:  	len = bh_result->b_size;  	free_extent_map(em);  	em = btrfs_new_extent_direct(inode, start, len); -	if (IS_ERR(em)) -		return PTR_ERR(em); +	if (IS_ERR(em)) { +		ret = PTR_ERR(em); +		goto unlock_err; +	}  	len = min(len, em->len - (start - em->start));  unlock: -	
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, -			  EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, -			  0, NULL, GFP_NOFS); -map:  	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>  		inode->i_blkbits;  	bh_result->b_size = len;  	bh_result->b_bdev = em->bdev;  	set_buffer_mapped(bh_result); -	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) -		set_buffer_new(bh_result); +	if (create) { +		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) +			set_buffer_new(bh_result); + +		/* +		 * Need to update the i_size under the extent lock so buffered +		 * readers will get the updated i_size when we unlock. +		 */ +		if (start + len > i_size_read(inode)) +			i_size_write(inode, start + len); + +		spin_lock(&BTRFS_I(inode)->lock); +		BTRFS_I(inode)->outstanding_extents++; +		spin_unlock(&BTRFS_I(inode)->lock); + +		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, +				     lockstart + len - 1, EXTENT_DELALLOC, NULL, +				     &cached_state, GFP_NOFS); +		BUG_ON(ret); +	} + +	/* +	 * In the case of write we need to clear and unlock the entire range, +	 * in the case of read we need to unlock only the end area that we +	 * aren't using if there is any left over space. +	 */ +	if (lockstart < lockend) { +		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, +				 lockend, unlock_bits, 1, 0, +				 &cached_state, GFP_NOFS); +	} else { +		free_extent_state(cached_state); +	}  	free_extent_map(em);  	return 0; -} -struct btrfs_dio_private { -	struct inode *inode; -	u64 logical_offset; -	u64 disk_bytenr; -	u64 bytes; -	u32 *csums; -	void *private; -}; +unlock_err: +	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, +			 unlock_bits, 1, 0, &cached_state, GFP_NOFS); +	return ret; +}  static void btrfs_endio_direct_read(struct bio *bio, int err)  { -	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; -	struct bio_vec *bvec = bio->bi_io_vec;  	struct btrfs_dio_private *dip = bio->bi_private; +	struct bio_vec *bvec;  	struct inode *inode = dip->inode;  	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct bio *dio_bio; +	u32 *csums = (u32 *)dip->csum;  	u64 start; -	u32 *private = dip->csums; +	int i;  	start = dip->logical_offset; -	do { +	bio_for_each_segment_all(bvec, bio, i) {  		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {  			struct page *page = bvec->bv_page;  			char *kaddr; @@ -5556,35 +7094,36 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)  			unsigned long flags;  			local_irq_save(flags); -			kaddr = kmap_atomic(page, KM_IRQ0); -			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, +			kaddr = kmap_atomic(page); +			csum = btrfs_csum_data(kaddr + bvec->bv_offset,  					       csum, bvec->bv_len);  			btrfs_csum_final(csum, (char *)&csum); -			kunmap_atomic(kaddr, KM_IRQ0); +			kunmap_atomic(kaddr);  			local_irq_restore(flags);  			flush_dcache_page(bvec->bv_page); -			if (csum != *private) { -				printk(KERN_ERR "btrfs csum failed ino %lu off" -				      " %llu csum %u private %u\n", -				      inode->i_ino, (unsigned long long)start, -				      csum, *private); +			if (csum != csums[i]) { +				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", +					  btrfs_ino(inode), start, csum, +					  csums[i]);  				err = -EIO;  			}  		}  		start += bvec->bv_len; -		private++; -		bvec++; -	} while (bvec <= bvec_end); +	}  	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, -		      dip->logical_offset + dip->bytes - 1, GFP_NOFS); -	
bio->bi_private = dip->private; +		      dip->logical_offset + dip->bytes - 1); +	dio_bio = dip->dio_bio; -	kfree(dip->csums);  	kfree(dip); -	dio_end_io(bio, err); + +	/* If we had a csum failure make sure to clear the uptodate flag */ +	if (err) +		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); +	dio_end_io(dio_bio, err); +	bio_put(bio);  }  static void btrfs_endio_direct_write(struct bio *bio, int err) @@ -5592,167 +7131,320 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)  	struct btrfs_dio_private *dip = bio->bi_private;  	struct inode *inode = dip->inode;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct btrfs_trans_handle *trans;  	struct btrfs_ordered_extent *ordered = NULL; -	struct extent_state *cached_state = NULL; +	u64 ordered_offset = dip->logical_offset; +	u64 ordered_bytes = dip->bytes; +	struct bio *dio_bio;  	int ret;  	if (err)  		goto out_done; - -	ret = btrfs_dec_test_ordered_pending(inode, &ordered, -					     dip->logical_offset, dip->bytes); +again: +	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, +						   &ordered_offset, +						   ordered_bytes, !err);  	if (!ret) -		goto out_done; +		goto out_test; -	BUG_ON(!ordered); - -	trans = btrfs_join_transaction(root, 1); -	if (!trans) { -		err = -ENOMEM; -		goto out; +	btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); +	btrfs_queue_work(root->fs_info->endio_write_workers, +			 &ordered->work); +out_test: +	/* +	 * our bio might span multiple ordered extents.  If we haven't +	 * completed the accounting for the whole dio, go back and try again +	 */ +	if (ordered_offset < dip->logical_offset + dip->bytes) { +		ordered_bytes = dip->logical_offset + dip->bytes - +			ordered_offset; +		ordered = NULL; +		goto again;  	} -	trans->block_rsv = &root->fs_info->delalloc_block_rsv; +out_done: +	dio_bio = dip->dio_bio; -	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { -		ret = btrfs_ordered_update_i_size(inode, 0, ordered); -		if (!ret) -			ret = btrfs_update_inode(trans, root, inode); -		err = ret; -		goto out; +	kfree(dip); + +	/* If we had an error make sure to clear the uptodate flag */ +	if (err) +		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); +	dio_end_io(dio_bio, err); +	bio_put(bio); +} + +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, +				    struct bio *bio, int mirror_num, +				    unsigned long bio_flags, u64 offset) +{ +	int ret; +	struct btrfs_root *root = BTRFS_I(inode)->root; +	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); +	BUG_ON(ret); /* -ENOMEM */ +	return 0; +} + +static void btrfs_end_dio_bio(struct bio *bio, int err) +{ +	struct btrfs_dio_private *dip = bio->bi_private; + +	if (err) { +		btrfs_err(BTRFS_I(dip->inode)->root->fs_info, +			  "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", +		      btrfs_ino(dip->inode), bio->bi_rw, +		      (unsigned long long)bio->bi_iter.bi_sector, +		      bio->bi_iter.bi_size, err); +		dip->errors = 1; + +		/* +		 * before the atomic variable reaches zero, we must make sure +		 * dip->errors is perceived to be set. 
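+		 * The barrier pairs the store of dip->errors with the
+		 * atomic_dec_and_test() on pending_bios below, so the bio
+		 * that drops the last reference sees the error.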
+		 */ +		smp_mb__before_atomic();  	} -	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, -			 ordered->file_offset + ordered->len - 1, 0, -			 &cached_state, GFP_NOFS); +	/* if there are more bios still pending for this dio, just exit */ +	if (!atomic_dec_and_test(&dip->pending_bios)) +		goto out; -	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { -		ret = btrfs_mark_extent_written(trans, inode, -						ordered->file_offset, -						ordered->file_offset + -						ordered->len); -		if (ret) { -			err = ret; -			goto out_unlock; -		} +	if (dip->errors) { +		bio_io_error(dip->orig_bio);  	} else { -		ret = insert_reserved_file_extent(trans, inode, -						  ordered->file_offset, -						  ordered->start, -						  ordered->disk_len, -						  ordered->len, -						  ordered->len, -						  0, 0, 0, -						  BTRFS_FILE_EXTENT_REG); -		unpin_extent_cache(&BTRFS_I(inode)->extent_tree, -				   ordered->file_offset, ordered->len); -		if (ret) { -			err = ret; -			WARN_ON(1); -			goto out_unlock; -		} +		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); +		bio_endio(dip->orig_bio, 0);  	} - -	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); -	btrfs_ordered_update_i_size(inode, 0, ordered); -	btrfs_update_inode(trans, root, inode); -out_unlock: -	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, -			     ordered->file_offset + ordered->len - 1, -			     &cached_state, GFP_NOFS);  out: -	btrfs_delalloc_release_metadata(inode, ordered->len); -	btrfs_end_transaction(trans, root); -	btrfs_put_ordered_extent(ordered); -	btrfs_put_ordered_extent(ordered); -out_done: -	bio->bi_private = dip->private; +	bio_put(bio); +} -	kfree(dip->csums); -	kfree(dip); -	dio_end_io(bio, err); +static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, +				       u64 first_sector, gfp_t gfp_flags) +{ +	int nr_vecs = bio_get_nr_vecs(bdev); +	return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);  } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, -				    struct bio *bio, int mirror_num, -				    unsigned long bio_flags, u64 offset) +static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, +					 int rw, u64 file_offset, int skip_sum, +					 int async_submit)  { +	struct btrfs_dio_private *dip = bio->bi_private; +	int write = rw & REQ_WRITE; +	struct btrfs_root *root = BTRFS_I(inode)->root;  	int ret; + +	if (async_submit) +		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); + +	bio_get(bio); + +	if (!write) { +		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); +		if (ret) +			goto err; +	} + +	if (skip_sum) +		goto map; + +	if (write && async_submit) { +		ret = btrfs_wq_submit_bio(root->fs_info, +				   inode, rw, bio, 0, 0, +				   file_offset, +				   __btrfs_submit_bio_start_direct_io, +				   __btrfs_submit_bio_done); +		goto err; +	} else if (write) { +		/* +		 * If we aren't doing async submit, calculate the csum of the +		 * bio now. 
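+		 * (In the async case above the same csum work is done by
+		 * __btrfs_submit_bio_start_direct_io from a worker thread
+		 * before the bio is actually submitted.)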
+		 */
+		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
+		if (ret)
+			goto err;
+	} else if (!skip_sum) {
+		ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
+						file_offset);
+		if (ret)
+			goto err;
+	}
+
+map:
+	ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
+err:
+	bio_put(bio);
+	return ret;
+}
+
+static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
+				    int skip_sum)
+{
+	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
-	BUG_ON(ret);
+	struct bio *bio;
+	struct bio *orig_bio = dip->orig_bio;
+	struct bio_vec *bvec = orig_bio->bi_io_vec;
+	u64 start_sector = orig_bio->bi_iter.bi_sector;
+	u64 file_offset = dip->logical_offset;
+	u64 submit_len = 0;
+	u64 map_length;
+	int nr_pages = 0;
+	int ret = 0;
+	int async_submit = 0;
+
+	map_length = orig_bio->bi_iter.bi_size;
+	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
+			      &map_length, NULL, 0);
+	if (ret) {
+		bio_put(orig_bio);
+		return -EIO;
+	}
+
+	if (map_length >= orig_bio->bi_iter.bi_size) {
+		bio = orig_bio;
+		goto submit;
+	}
+
+	/* async crcs make it difficult to collect full stripe writes. */
+	if (btrfs_get_alloc_profile(root, 1) &
+	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+		async_submit = 0;
+	else
+		async_submit = 1;
+
+	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
+	if (!bio)
+		return -ENOMEM;
+	bio->bi_private = dip;
+	bio->bi_end_io = btrfs_end_dio_bio;
+	atomic_inc(&dip->pending_bios);
+
+	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
+		if (unlikely(map_length < submit_len + bvec->bv_len ||
+		    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+				 bvec->bv_offset) < bvec->bv_len)) {
+			/*
+			 * inc the count before we submit the bio so
+			 * we know the end IO handler won't happen before
+			 * we inc the count. Otherwise, the dip might get freed
+			 * before we're done setting it up
+			 */
+			atomic_inc(&dip->pending_bios);
+			ret = __btrfs_submit_dio_bio(bio, inode, rw,
+						     file_offset, skip_sum,
+						     async_submit);
+			if (ret) {
+				bio_put(bio);
+				atomic_dec(&dip->pending_bios);
+				goto out_err;
+			}
+
+			start_sector += submit_len >> 9;
+			file_offset += submit_len;
+
+			submit_len = 0;
+			nr_pages = 0;
+
+			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
+						  start_sector, GFP_NOFS);
+			if (!bio)
+				goto out_err;
+			bio->bi_private = dip;
+			bio->bi_end_io = btrfs_end_dio_bio;
+
+			map_length = orig_bio->bi_iter.bi_size;
+			ret = btrfs_map_block(root->fs_info, rw,
+					      start_sector << 9,
+					      &map_length, NULL, 0);
+			if (ret) {
+				bio_put(bio);
+				goto out_err;
+			}
+		} else {
+			submit_len += bvec->bv_len;
+			nr_pages++;
+			bvec++;
+		}
+	}
+
+submit:
+	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
+				     async_submit);
+	if (!ret)
+		return 0;
+
+	bio_put(bio);
out_err:
+	dip->errors = 1;
+	/*
+	 * before the atomic variable goes to zero, we must
+	 * make sure dip->errors is perceived to be set.
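+	 * If this was the last pending bio, the atomic_dec_and_test()
+	 * below fails the original bio right here, since no end_io
+	 * handler is ever going to run for it.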
+	 */ +	smp_mb__before_atomic(); +	if (atomic_dec_and_test(&dip->pending_bios)) +		bio_io_error(dip->orig_bio); + +	/* bio_end_io() will handle error, so we needn't return it */  	return 0;  } -static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, -				loff_t file_offset) +static void btrfs_submit_direct(int rw, struct bio *dio_bio, +				struct inode *inode, loff_t file_offset)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_dio_private *dip; -	struct bio_vec *bvec = bio->bi_io_vec; +	struct bio *io_bio;  	int skip_sum; +	int sum_len;  	int write = rw & REQ_WRITE;  	int ret = 0; +	u16 csum_size;  	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; -	dip = kmalloc(sizeof(*dip), GFP_NOFS); -	if (!dip) { +	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); +	if (!io_bio) {  		ret = -ENOMEM;  		goto free_ordered;  	} -	dip->csums = NULL; -	if (!skip_sum) { -		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); -		if (!dip->csums) { -			ret = -ENOMEM; -			goto free_ordered; -		} +	if (!skip_sum && !write) { +		csum_size = btrfs_super_csum_size(root->fs_info->super_copy); +		sum_len = dio_bio->bi_iter.bi_size >> +			inode->i_sb->s_blocksize_bits; +		sum_len *= csum_size; +	} else { +		sum_len = 0; +	} + +	dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS); +	if (!dip) { +		ret = -ENOMEM; +		goto free_io_bio;  	} -	dip->private = bio->bi_private; +	dip->private = dio_bio->bi_private;  	dip->inode = inode;  	dip->logical_offset = file_offset; - -	dip->bytes = 0; -	do { -		dip->bytes += bvec->bv_len; -		bvec++; -	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); - -	dip->disk_bytenr = (u64)bio->bi_sector << 9; -	bio->bi_private = dip; +	dip->bytes = dio_bio->bi_iter.bi_size; +	dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; +	io_bio->bi_private = dip; +	dip->errors = 0; +	dip->orig_bio = io_bio; +	dip->dio_bio = dio_bio; +	atomic_set(&dip->pending_bios, 0);  	if (write) -		bio->bi_end_io = btrfs_endio_direct_write; +		io_bio->bi_end_io = btrfs_endio_direct_write;  	else -		bio->bi_end_io = btrfs_endio_direct_read; - -	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); -	if (ret) -		goto out_err; +		io_bio->bi_end_io = btrfs_endio_direct_read; -	if (write && !skip_sum) { -		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, -				   inode, rw, bio, 0, 0, -				   dip->logical_offset, -				   __btrfs_submit_bio_start_direct_io, -				   __btrfs_submit_bio_done); -		if (ret) -			goto out_err; +	ret = btrfs_submit_direct_hook(rw, dip, skip_sum); +	if (!ret)  		return; -	} else if (!skip_sum) -		btrfs_lookup_bio_sums_dio(root, inode, bio, -					  dip->logical_offset, dip->csums); -	ret = btrfs_map_bio(root, rw, bio, 0, 1); -	if (ret) -		goto out_err; -	return; -out_err: -	kfree(dip->csums); -	kfree(dip); +free_io_bio: +	bio_put(io_bio); +  free_ordered:  	/*  	 * If this is a write, we need to clean up the reserved space and kill @@ -5760,147 +7452,139 @@ free_ordered:  	 */  	if (write) {  		struct btrfs_ordered_extent *ordered; -		ordered = btrfs_lookup_ordered_extent(inode, -						      dip->logical_offset); +		ordered = btrfs_lookup_ordered_extent(inode, file_offset);  		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&  		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))  			btrfs_free_reserved_extent(root, ordered->start, -						   ordered->disk_len); +						   ordered->disk_len, 1);  		btrfs_put_ordered_extent(ordered);  		btrfs_put_ordered_extent(ordered);  	} -	bio_endio(bio, ret); +	bio_endio(dio_bio, ret);  }  static 
ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
-			const struct iovec *iov, loff_t offset,
-			unsigned long nr_segs)
+			const struct iov_iter *iter, loff_t offset)
 {
 	int seg;
-	size_t size;
-	unsigned long addr;
+	int i;
 	unsigned blocksize_mask = root->sectorsize - 1;
 	ssize_t retval = -EINVAL;
-	loff_t end = offset;
 
 	if (offset & blocksize_mask)
 		goto out;
 
-	/* Check the memory alignment.  Blocks cannot straddle pages */
-	for (seg = 0; seg < nr_segs; seg++) {
-		addr = (unsigned long)iov[seg].iov_base;
-		size = iov[seg].iov_len;
-		end += size;
-		if ((addr & blocksize_mask) || (size & blocksize_mask))
-			goto out;
+	if (iov_iter_alignment(iter) & blocksize_mask)
+		goto out;
+
+	/* If this is a write we don't need to check anymore */
+	if (rw & WRITE)
+		return 0;
+	/*
+	 * Check to make sure we don't have duplicate iov_base's in this
+	 * iovec, if so return EINVAL, otherwise we'll get csum errors
+	 * when reading back.
+	 */
+	for (seg = 0; seg < iter->nr_segs; seg++) {
+		for (i = seg + 1; i < iter->nr_segs; i++) {
+			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+				goto out;
+		}
 	}
 	retval = 0;
 out:
 	return retval;
 }
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
-			const struct iovec *iov, loff_t offset,
-			unsigned long nr_segs)
+			struct iov_iter *iter, loff_t offset)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	struct btrfs_ordered_extent *ordered;
-	struct extent_state *cached_state = NULL;
-	u64 lockstart, lockend;
+	size_t count = 0;
+	int flags = 0;
+	bool wakeup = true;
+	bool relock = false;
 	ssize_t ret;
-	int writing = rw & WRITE;
-	int write_bits = 0;
-	size_t count = iov_length(iov, nr_segs);
 
-	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
-			    offset, nr_segs)) {
+	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
 		return 0;
-	}
 
-	lockstart = offset;
-	lockend = offset + count - 1;
+	atomic_inc(&inode->i_dio_count);
+	smp_mb__after_atomic();
 
-	if (writing) {
-		ret = btrfs_delalloc_reserve_space(inode, count);
-		if (ret)
-			goto out;
-	}
+	/*
+	 * The generic stuff only does filemap_write_and_wait_range, which
+	 * isn't enough if we've written compressed pages to this area, so
+	 * we need to flush the dirty pages again to make absolutely sure
+	 * that any outstanding dirty pages are on disk.
+	 */
+	count = iov_iter_count(iter);
+	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+		     &BTRFS_I(inode)->runtime_flags))
+		filemap_fdatawrite_range(inode->i_mapping, offset, count);
 
-	while (1) {
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 0, &cached_state, GFP_NOFS);
+	if (rw & WRITE) {
 		/*
-		 * We're concerned with the entire range that we're going to be
-		 * doing DIO to, so we need to make sure theres no ordered
-		 * extents in this range.
+		 * If the write DIO is beyond the EOF, we need to update
+		 * the isize, but it is protected by i_mutex, so we
+		 * cannot unlock the i_mutex in this case.
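+		 * A write that stays entirely below EOF can drop i_mutex
+		 * for the duration of the DIO; the "relock" flag below
+		 * makes sure we retake it before returning.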
*/ -		ordered = btrfs_lookup_ordered_range(inode, lockstart, -						     lockend - lockstart + 1); -		if (!ordered) -			break; -		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, -				     &cached_state, GFP_NOFS); -		btrfs_start_ordered_extent(inode, ordered, 1); -		btrfs_put_ordered_extent(ordered); -		cond_resched(); -	} - -	/* -	 * we don't use btrfs_set_extent_delalloc because we don't want -	 * the dirty or uptodate bits -	 */ -	if (writing) { -		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; -		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, -				     EXTENT_DELALLOC, 0, NULL, &cached_state, -				     GFP_NOFS); -		if (ret) { -			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, -					 lockend, EXTENT_LOCKED | write_bits, -					 1, 0, &cached_state, GFP_NOFS); -			goto out; +		if (offset + count <= inode->i_size) { +			mutex_unlock(&inode->i_mutex); +			relock = true;  		} +		ret = btrfs_delalloc_reserve_space(inode, count); +		if (ret) +			goto out; +	} else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, +				     &BTRFS_I(inode)->runtime_flags))) { +		inode_dio_done(inode); +		flags = DIO_LOCKING | DIO_SKIP_HOLES; +		wakeup = false;  	} -	free_extent_state(cached_state); -	cached_state = NULL; -  	ret = __blockdev_direct_IO(rw, iocb, inode, -		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, -		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, -		   btrfs_submit_direct, 0); - -	if (ret < 0 && ret != -EIOCBQUEUED) { -		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, -			      offset + iov_length(iov, nr_segs) - 1, -			      EXTENT_LOCKED | write_bits, 1, 0, -			      &cached_state, GFP_NOFS); -	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { -		/* -		 * We're falling back to buffered, unlock the section we didn't -		 * do IO on. 
-		 */ -		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, -			      offset + iov_length(iov, nr_segs) - 1, -			      EXTENT_LOCKED | write_bits, 1, 0, -			      &cached_state, GFP_NOFS); +			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, +			iter, offset, btrfs_get_blocks_direct, NULL, +			btrfs_submit_direct, flags); +	if (rw & WRITE) { +		if (ret < 0 && ret != -EIOCBQUEUED) +			btrfs_delalloc_release_space(inode, count); +		else if (ret >= 0 && (size_t)ret < count) +			btrfs_delalloc_release_space(inode, +						     count - (size_t)ret); +		else +			btrfs_delalloc_release_metadata(inode, 0);  	}  out: -	free_extent_state(cached_state); +	if (wakeup) +		inode_dio_done(inode); +	if (relock) +		mutex_lock(&inode->i_mutex); +  	return ret;  } +#define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC) +  static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,  		__u64 start, __u64 len)  { -	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); +	int	ret; + +	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); +	if (ret) +		return ret; + +	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);  }  int btrfs_readpage(struct file *file, struct page *page)  {  	struct extent_io_tree *tree;  	tree = &BTRFS_I(page->mapping->host)->io_tree; -	return extent_read_full_page(tree, page, btrfs_get_extent); +	return extent_read_full_page(tree, page, btrfs_get_extent, 0);  }  static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -5917,8 +7601,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)  	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);  } -int btrfs_writepages(struct address_space *mapping, -		     struct writeback_control *wbc) +static int btrfs_writepages(struct address_space *mapping, +			    struct writeback_control *wbc)  {  	struct extent_io_tree *tree; @@ -5959,14 +7643,16 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)  	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);  } -static void btrfs_invalidatepage(struct page *page, unsigned long offset) +static void btrfs_invalidatepage(struct page *page, unsigned int offset, +				 unsigned int length)  { +	struct inode *inode = page->mapping->host;  	struct extent_io_tree *tree;  	struct btrfs_ordered_extent *ordered;  	struct extent_state *cached_state = NULL;  	u64 page_start = page_offset(page);  	u64 page_end = page_start + PAGE_CACHE_SIZE - 1; - +	int inode_evicting = inode->i_state & I_FREEING;  	/*  	 * we have the page locked, so new writeback can't start, @@ -5977,41 +7663,65 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  	 */  	wait_on_page_writeback(page); -	tree = &BTRFS_I(page->mapping->host)->io_tree; +	tree = &BTRFS_I(inode)->io_tree;  	if (offset) {  		btrfs_releasepage(page, GFP_NOFS);  		return;  	} -	lock_extent_bits(tree, page_start, page_end, 0, &cached_state, -			 GFP_NOFS); -	ordered = btrfs_lookup_ordered_extent(page->mapping->host, -					   page_offset(page)); + +	if (!inode_evicting) +		lock_extent_bits(tree, page_start, page_end, 0, &cached_state); +	ordered = btrfs_lookup_ordered_extent(inode, page_start);  	if (ordered) {  		/*  		 * IO on this page will never be started, so we need  		 * to account for any ordered extents now  		 */ -		clear_extent_bit(tree, page_start, page_end, -				 EXTENT_DIRTY | EXTENT_DELALLOC | -				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, -				 &cached_state, GFP_NOFS); +		if (!inode_evicting) 
+			clear_extent_bit(tree, page_start, page_end, +					 EXTENT_DIRTY | EXTENT_DELALLOC | +					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | +					 EXTENT_DEFRAG, 1, 0, &cached_state, +					 GFP_NOFS);  		/*  		 * whoever cleared the private bit is responsible  		 * for the finish_ordered_io  		 */  		if (TestClearPagePrivate2(page)) { -			btrfs_finish_ordered_io(page->mapping->host, -						page_start, page_end); +			struct btrfs_ordered_inode_tree *tree; +			u64 new_len; + +			tree = &BTRFS_I(inode)->ordered_tree; + +			spin_lock_irq(&tree->lock); +			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); +			new_len = page_start - ordered->file_offset; +			if (new_len < ordered->truncated_len) +				ordered->truncated_len = new_len; +			spin_unlock_irq(&tree->lock); + +			if (btrfs_dec_test_ordered_pending(inode, &ordered, +							   page_start, +							   PAGE_CACHE_SIZE, 1)) +				btrfs_finish_ordered_io(ordered);  		}  		btrfs_put_ordered_extent(ordered); -		cached_state = NULL; -		lock_extent_bits(tree, page_start, page_end, 0, &cached_state, -				 GFP_NOFS); +		if (!inode_evicting) { +			cached_state = NULL; +			lock_extent_bits(tree, page_start, page_end, 0, +					 &cached_state); +		} +	} + +	if (!inode_evicting) { +		clear_extent_bit(tree, page_start, page_end, +				 EXTENT_LOCKED | EXTENT_DIRTY | +				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | +				 EXTENT_DEFRAG, 1, 1, +				 &cached_state, GFP_NOFS); + +		__btrfs_releasepage(page, GFP_NOFS);  	} -	clear_extent_bit(tree, page_start, page_end, -		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | -		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); -	__btrfs_releasepage(page, GFP_NOFS);  	ClearPageChecked(page);  	if (PagePrivate(page)) { @@ -6039,7 +7749,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	struct page *page = vmf->page; -	struct inode *inode = fdentry(vma->vm_file)->d_inode; +	struct inode *inode = file_inode(vma->vm_file);  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct btrfs_ordered_extent *ordered; @@ -6048,16 +7758,24 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	unsigned long zero_start;  	loff_t size;  	int ret; +	int reserved = 0;  	u64 page_start;  	u64 page_end; +	sb_start_pagefault(inode->i_sb);  	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); +	if (!ret) { +		ret = file_update_time(vma->vm_file); +		reserved = 1; +	}  	if (ret) {  		if (ret == -ENOMEM)  			ret = VM_FAULT_OOM;  		else /* -ENOSPC, -EIO, etc */  			ret = VM_FAULT_SIGBUS; -		goto out; +		if (reserved) +			goto out; +		goto out_noreserve;  	}  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ @@ -6074,8 +7792,7 @@ again:  	}  	wait_on_page_writeback(page); -	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, -			 GFP_NOFS); +	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);  	set_page_extent_mapped(page);  	/* @@ -6100,7 +7817,8 @@ again:  	 * prepare_pages in the normal write path.  	 
*/
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			  0, 0, &cached_state, GFP_NOFS);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6131,42 +7849,94 @@ again:
 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
out_unlock:
-	if (!ret)
+	if (!ret) {
+		sb_end_pagefault(inode->i_sb);
 		return VM_FAULT_LOCKED;
+	}
 	unlock_page(page);
-	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out:
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+out_noreserve:
+	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
 
-static void btrfs_truncate(struct inode *inode)
+static int btrfs_truncate(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
+	struct btrfs_block_rsv *rsv;
+	int ret = 0;
+	int err = 0;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr;
 	u64 mask = root->sectorsize - 1;
+	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-	if (!S_ISREG(inode->i_mode)) {
-		WARN_ON(1);
-		return;
-	}
-
-	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+	ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+				       (u64)-1);
 	if (ret)
-		return;
+		return ret;
+
+	/*
+	 * Yes, ladies and gentlemen, this is indeed ugly.  The fact is we have
+	 * 3 things going on here
+	 *
+	 * 1) We need to reserve space for our orphan item and the space to
+	 * delete our orphan item.  Lord knows we don't want to have a dangling
+	 * orphan item because we didn't reserve space to remove it.
+	 *
+	 * 2) We need to reserve space to update our inode.
+	 *
+	 * 3) We need to have something to cache all the space that is going to
+	 * be free'd up by the truncate operation, but also have some slack
+	 * space reserved in case it uses space during the truncate (thank you
+	 * very much snapshotting).
+	 *
+	 * And we need these to all be separate.  The fact is we can use a lot
+	 * of space doing the truncate, and we have no earthly idea how much
+	 * space we will use, so we need the truncate reservation to be
+	 * separate so it doesn't end up using space reserved for updating the
+	 * inode or removing the orphan item.  We also need to be able to stop
+	 * the transaction and start a new one, which means we need to be able
+	 * to update the inode several times, and we have no way of knowing how
+	 * many times that will be, so we can't just reserve 1 item for the
+	 * entirety of the operation, so that has to be done separately as
+	 * well.  Then there is the orphan item, which does indeed need to be
+	 * held on to for the whole operation, and we need nobody to touch this
+	 * reserved space except the orphan code.
+	 *
+	 * So that leaves us with
+	 *
+	 * 1) root->orphan_block_rsv - for the orphan deletion.
+	 * 2) rsv - for the truncate reservation, which we will steal from the
+	 * transaction reservation.
+	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
+	 * updating the inode.
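+	 * rsv is sized to min_size (one item) and marked failfast, so
+	 * btrfs_truncate_inode_items bails out with -ENOSPC as soon as
+	 * it runs dry; the loop below then refills rsv from the
+	 * transaction reservation and tries again.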
+	 */ +	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); +	if (!rsv) +		return -ENOMEM; +	rsv->size = min_size; +	rsv->failfast = 1; -	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); -	btrfs_ordered_update_i_size(inode, inode->i_size, NULL); +	/* +	 * 1 for the truncate slack space +	 * 1 for updating the inode. +	 */ +	trans = btrfs_start_transaction(root, 2); +	if (IS_ERR(trans)) { +		err = PTR_ERR(trans); +		goto out; +	} -	trans = btrfs_start_transaction(root, 0); -	BUG_ON(IS_ERR(trans)); -	btrfs_set_trans_block_group(trans, inode); -	trans->block_rsv = root->orphan_block_rsv; +	/* Migrate the slack space for the truncate to our reserve */ +	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, +				      min_size); +	BUG_ON(ret);  	/*  	 * setattr is responsible for setting the ordered_data_close flag, @@ -6185,54 +7955,76 @@ static void btrfs_truncate(struct inode *inode)  	 * using truncate to replace the contents of the file will  	 * end up with a zero length file after a crash.  	 */ -	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) +	if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, +					   &BTRFS_I(inode)->runtime_flags))  		btrfs_add_ordered_operation(trans, root, inode); -	while (1) { -		if (!trans) { -			trans = btrfs_start_transaction(root, 0); -			BUG_ON(IS_ERR(trans)); -			btrfs_set_trans_block_group(trans, inode); -			trans->block_rsv = root->orphan_block_rsv; -		} - -		ret = btrfs_block_rsv_check(trans, root, -					    root->orphan_block_rsv, 0, 5); -		if (ret) { -			BUG_ON(ret != -EAGAIN); -			ret = btrfs_commit_transaction(trans, root); -			BUG_ON(ret); -			trans = NULL; -			continue; -		} +	/* +	 * So if we truncate and then write and fsync we normally would just +	 * write the extents that changed, which is a problem if we need to +	 * first truncate that entire inode.  So set this flag so we write out +	 * all of the extents in the inode to the sync log so we're completely +	 * safe. 
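+	 * (The tree-log code tests this flag at fsync time and falls
+	 * back to logging the whole inode when it is set.)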
+	 */ +	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); +	trans->block_rsv = rsv; +	while (1) {  		ret = btrfs_truncate_inode_items(trans, root, inode,  						 inode->i_size,  						 BTRFS_EXTENT_DATA_KEY); -		if (ret != -EAGAIN) +		if (ret != -ENOSPC) { +			err = ret;  			break; +		} +		trans->block_rsv = &root->fs_info->trans_block_rsv;  		ret = btrfs_update_inode(trans, root, inode); -		BUG_ON(ret); +		if (ret) { +			err = ret; +			break; +		} -		nr = trans->blocks_used;  		btrfs_end_transaction(trans, root); -		trans = NULL; -		btrfs_btree_balance_dirty(root, nr); +		btrfs_btree_balance_dirty(root); + +		trans = btrfs_start_transaction(root, 2); +		if (IS_ERR(trans)) { +			ret = err = PTR_ERR(trans); +			trans = NULL; +			break; +		} + +		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, +					      rsv, min_size); +		BUG_ON(ret);	/* shouldn't happen */ +		trans->block_rsv = rsv;  	}  	if (ret == 0 && inode->i_nlink > 0) { +		trans->block_rsv = root->orphan_block_rsv;  		ret = btrfs_orphan_del(trans, inode); -		BUG_ON(ret); +		if (ret) +			err = ret;  	} -	ret = btrfs_update_inode(trans, root, inode); -	BUG_ON(ret); +	if (trans) { +		trans->block_rsv = &root->fs_info->trans_block_rsv; +		ret = btrfs_update_inode(trans, root, inode); +		if (ret && !err) +			err = ret; -	nr = trans->blocks_used; -	ret = btrfs_end_transaction_throttle(trans, root); -	BUG_ON(ret); -	btrfs_btree_balance_dirty(root, nr); +		ret = btrfs_end_transaction(trans, root); +		btrfs_btree_balance_dirty(root); +	} + +out: +	btrfs_free_block_rsv(root, rsv); + +	if (ret && !err) +		err = ret; + +	return err;  }  /* @@ -6240,40 +8032,35 @@ static void btrfs_truncate(struct inode *inode)   */  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,  			     struct btrfs_root *new_root, -			     u64 new_dirid, u64 alloc_hint) +			     struct btrfs_root *parent_root, +			     u64 new_dirid)  {  	struct inode *inode;  	int err;  	u64 index = 0; -	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, -				new_dirid, alloc_hint, S_IFDIR | 0700, &index); +	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, +				new_dirid, new_dirid, +				S_IFDIR | (~current_umask() & S_IRWXUGO), +				&index);  	if (IS_ERR(inode))  		return PTR_ERR(inode);  	inode->i_op = &btrfs_dir_inode_operations;  	inode->i_fop = &btrfs_dir_file_operations; -	inode->i_nlink = 1; +	set_nlink(inode, 1);  	btrfs_i_size_write(inode, 0); +	err = btrfs_subvol_inherit_props(trans, new_root, parent_root); +	if (err) +		btrfs_err(new_root->fs_info, +			  "error inheriting subvolume %llu properties: %d", +			  new_root->root_key.objectid, err); +  	err = btrfs_update_inode(trans, new_root, inode); -	BUG_ON(err);  	iput(inode); -	return 0; -} - -/* helper function for file defrag and space balancing.  
This - * forces readahead on a given range of bytes in an inode - */ -unsigned long btrfs_force_ra(struct address_space *mapping, -			      struct file_ra_state *ra, struct file *file, -			      pgoff_t offset, pgoff_t last_index) -{ -	pgoff_t req_size = last_index - offset + 1; - -	page_cache_sync_readahead(mapping, ra, file, offset, req_size); -	return offset + req_size; +	return err;  }  struct inode *btrfs_alloc_inode(struct super_block *sb) @@ -6286,35 +8073,38 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  		return NULL;  	ei->root = NULL; -	ei->space_info = NULL;  	ei->generation = 0; -	ei->sequence = 0;  	ei->last_trans = 0;  	ei->last_sub_trans = 0;  	ei->logged_trans = 0;  	ei->delalloc_bytes = 0; -	ei->reserved_bytes = 0;  	ei->disk_i_size = 0;  	ei->flags = 0; +	ei->csum_bytes = 0;  	ei->index_cnt = (u64)-1; +	ei->dir_index = 0;  	ei->last_unlink_trans = 0; +	ei->last_log_commit = 0; -	spin_lock_init(&ei->accounting_lock); -	atomic_set(&ei->outstanding_extents, 0); +	spin_lock_init(&ei->lock); +	ei->outstanding_extents = 0;  	ei->reserved_extents = 0; -	ei->ordered_data_close = 0; -	ei->orphan_meta_reserved = 0; -	ei->dummy_inode = 0; -	ei->force_compress = 0; +	ei->runtime_flags = 0; +	ei->force_compress = BTRFS_COMPRESS_NONE; + +	ei->delayed_node = NULL;  	inode = &ei->vfs_inode; -	extent_map_tree_init(&ei->extent_tree, GFP_NOFS); -	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS); -	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS); +	extent_map_tree_init(&ei->extent_tree); +	extent_io_tree_init(&ei->io_tree, &inode->i_data); +	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); +	ei->io_tree.track_uptodate = 1; +	ei->io_failure_tree.track_uptodate = 1; +	atomic_set(&ei->sync_writers, 0);  	mutex_init(&ei->log_mutex); +	mutex_init(&ei->delalloc_mutex);  	btrfs_ordered_inode_tree_init(&ei->ordered_tree); -	INIT_LIST_HEAD(&ei->i_orphan);  	INIT_LIST_HEAD(&ei->delalloc_inodes);  	INIT_LIST_HEAD(&ei->ordered_operations);  	RB_CLEAR_NODE(&ei->rb_node); @@ -6322,15 +8112,31 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	return inode;  } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode) +{ +	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); +	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} +#endif + +static void btrfs_i_callback(struct rcu_head *head) +{ +	struct inode *inode = container_of(head, struct inode, i_rcu); +	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} +  void btrfs_destroy_inode(struct inode *inode)  {  	struct btrfs_ordered_extent *ordered;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	WARN_ON(!list_empty(&inode->i_dentry)); +	WARN_ON(!hlist_empty(&inode->i_dentry));  	WARN_ON(inode->i_data.nrpages); -	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); +	WARN_ON(BTRFS_I(inode)->outstanding_extents);  	WARN_ON(BTRFS_I(inode)->reserved_extents); +	WARN_ON(BTRFS_I(inode)->delalloc_bytes); +	WARN_ON(BTRFS_I(inode)->csum_bytes);  	/*  	 * This can happen where we create an inode, but somebody else also @@ -6346,43 +8152,25 @@ void btrfs_destroy_inode(struct inode *inode)  	 */  	smp_mb();  	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { -		spin_lock(&root->fs_info->ordered_extent_lock); +		spin_lock(&root->fs_info->ordered_root_lock);  		list_del_init(&BTRFS_I(inode)->ordered_operations); -		spin_unlock(&root->fs_info->ordered_extent_lock); -	} - -	if (root == root->fs_info->tree_root) { -		struct btrfs_block_group_cache 
*block_group; - -		block_group = btrfs_lookup_block_group(root->fs_info, -						BTRFS_I(inode)->block_group); -		if (block_group && block_group->inode == inode) { -			spin_lock(&block_group->lock); -			block_group->inode = NULL; -			spin_unlock(&block_group->lock); -			btrfs_put_block_group(block_group); -		} else if (block_group) { -			btrfs_put_block_group(block_group); -		} +		spin_unlock(&root->fs_info->ordered_root_lock);  	} -	spin_lock(&root->orphan_lock); -	if (!list_empty(&BTRFS_I(inode)->i_orphan)) { -		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", -		       inode->i_ino); -		list_del_init(&BTRFS_I(inode)->i_orphan); +	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, +		     &BTRFS_I(inode)->runtime_flags)) { +		btrfs_info(root->fs_info, "inode %llu still on the orphan list", +			btrfs_ino(inode)); +		atomic_dec(&root->orphan_inodes);  	} -	spin_unlock(&root->orphan_lock);  	while (1) {  		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);  		if (!ordered)  			break;  		else { -			printk(KERN_ERR "btrfs found ordered " -			       "extent %llu %llu on inode cleanup\n", -			       (unsigned long long)ordered->file_offset, -			       (unsigned long long)ordered->len); +			btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", +				ordered->file_offset, ordered->len);  			btrfs_remove_ordered_extent(inode, ordered);  			btrfs_put_ordered_extent(ordered);  			btrfs_put_ordered_extent(ordered); @@ -6391,15 +8179,18 @@ void btrfs_destroy_inode(struct inode *inode)  	inode_tree_del(inode);  	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);  free: -	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +	call_rcu(&inode->i_rcu, btrfs_i_callback);  }  int btrfs_drop_inode(struct inode *inode)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; -	if (btrfs_root_refs(&root->root_item) == 0 && -	    root != root->fs_info->tree_root) +	if (root == NULL) +		return 1; + +	/* the snap/subvol tree is on deleting */ +	if (btrfs_root_refs(&root->root_item) == 0)  		return 1;  	else  		return generic_drop_inode(inode); @@ -6414,6 +8205,11 @@ static void init_once(void *foo)  void btrfs_destroy_cachep(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
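+	 * (btrfs_destroy_inode frees each inode via call_rcu, so an
+	 * unflushed callback could otherwise touch the cache after
+	 * kmem_cache_destroy.)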
+	 */ +	rcu_barrier();  	if (btrfs_inode_cachep)  		kmem_cache_destroy(btrfs_inode_cachep);  	if (btrfs_trans_handle_cachep) @@ -6422,34 +8218,51 @@ void btrfs_destroy_cachep(void)  		kmem_cache_destroy(btrfs_transaction_cachep);  	if (btrfs_path_cachep)  		kmem_cache_destroy(btrfs_path_cachep); +	if (btrfs_free_space_cachep) +		kmem_cache_destroy(btrfs_free_space_cachep); +	if (btrfs_delalloc_work_cachep) +		kmem_cache_destroy(btrfs_delalloc_work_cachep);  }  int btrfs_init_cachep(void)  { -	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", +	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",  			sizeof(struct btrfs_inode), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);  	if (!btrfs_inode_cachep)  		goto fail; -	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", +	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",  			sizeof(struct btrfs_trans_handle), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);  	if (!btrfs_trans_handle_cachep)  		goto fail; -	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", +	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",  			sizeof(struct btrfs_transaction), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);  	if (!btrfs_transaction_cachep)  		goto fail; -	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", +	btrfs_path_cachep = kmem_cache_create("btrfs_path",  			sizeof(struct btrfs_path), 0,  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);  	if (!btrfs_path_cachep)  		goto fail; +	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", +			sizeof(struct btrfs_free_space), 0, +			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); +	if (!btrfs_free_space_cachep) +		goto fail; + +	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", +			sizeof(struct btrfs_delalloc_work), 0, +			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, +			NULL); +	if (!btrfs_delalloc_work_cachep) +		goto fail; +  	return 0;  fail:  	btrfs_destroy_cachep(); @@ -6459,12 +8272,19 @@ fail:  static int btrfs_getattr(struct vfsmount *mnt,  			 struct dentry *dentry, struct kstat *stat)  { +	u64 delalloc_bytes;  	struct inode *inode = dentry->d_inode; +	u32 blocksize = inode->i_sb->s_blocksize; +  	generic_fillattr(inode, stat); -	stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; +	stat->dev = BTRFS_I(inode)->root->anon_dev;  	stat->blksize = PAGE_CACHE_SIZE; -	stat->blocks = (inode_get_bytes(inode) + -			BTRFS_I(inode)->delalloc_bytes) >> 9; + +	spin_lock(&BTRFS_I(inode)->lock); +	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; +	spin_unlock(&BTRFS_I(inode)->lock); +	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + +			ALIGN(delalloc_bytes, blocksize)) >> 9;  	return 0;  } @@ -6480,21 +8300,43 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	u64 index = 0;  	u64 root_objectid;  	int ret; +	u64 old_ino = btrfs_ino(old_inode); -	if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) +	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)  		return -EPERM;  	/* we only allow rename subvolume link between subvolumes */ -	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) +	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)  		return -EXDEV; -	if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || -	    (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) +	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || +	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))  		
return -ENOTEMPTY;
 
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
+
+	/* check for collisions, even if the name isn't there */
+	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
+			     new_dentry->d_name.name,
+			     new_dentry->d_name.len);
+
+	if (ret) {
+		if (ret == -EEXIST) {
+			/* we shouldn't get
+			 * -EEXIST without a new_inode */
+			if (WARN_ON(!new_inode)) {
+				return ret;
+			}
+		} else {
+			/* maybe -EOVERFLOW */
+			return ret;
+		}
+	}
+	ret = 0;
+
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large.  Start IO on it now so
@@ -6505,7 +8347,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		filemap_flush(old_inode->i_mapping);
 
 	/* close the racy window with snapshot create/destroy ioctl */
-	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
 		down_read(&root->fs_info->subvol_sem);
 	/*
 	 * We want to reserve the absolute worst case amount of items.  So if
@@ -6515,11 +8357,11 @@
 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
 	 * should cover the worst case number of items we'll modify.
 	 */
-	trans = btrfs_start_transaction(root, 20);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
-
-	btrfs_set_trans_block_group(trans, new_dir);
+	trans = btrfs_start_transaction(root, 11);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_notrans;
+	}
 
 	if (dest != root)
 		btrfs_record_root_in_trans(trans, dest);
@@ -6528,15 +8370,16 @@
 	if (ret)
 		goto out_fail;
 
-	if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+	BTRFS_I(old_inode)->dir_index = 0ULL;
+	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 		/* force full log commit if subvolume involved. */
-		root->fs_info->last_trans_log_full_commit = trans->transid;
+		btrfs_set_log_full_commit(root->fs_info, trans);
 	} else {
 		ret = btrfs_insert_inode_ref(trans, dest,
 					     new_dentry->d_name.name,
 					     new_dentry->d_name.len,
-					     old_inode->i_ino,
-					     new_dir->i_ino, index);
+					     old_ino,
+					     btrfs_ino(new_dir), index);
 		if (ret)
 			goto out_fail;
 		/*
@@ -6552,11 +8395,12 @@
 	 * make sure the inode gets flushed if it is replacing
 	 * something.
*/ -	if (new_inode && new_inode->i_size && -	    old_inode && S_ISREG(old_inode->i_mode)) { +	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))  		btrfs_add_ordered_operation(trans, root, old_inode); -	} +	inode_inc_iversion(old_dir); +	inode_inc_iversion(new_dir); +	inode_inc_iversion(old_inode);  	old_dir->i_ctime = old_dir->i_mtime = ctime;  	new_dir->i_ctime = new_dir->i_mtime = ctime;  	old_inode->i_ctime = ctime; @@ -6564,23 +8408,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	if (old_dentry->d_parent != new_dentry->d_parent)  		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); -	if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { +	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {  		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;  		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,  					old_dentry->d_name.name,  					old_dentry->d_name.len);  	} else { -		btrfs_inc_nlink(old_dentry->d_inode); -		ret = btrfs_unlink_inode(trans, root, old_dir, -					 old_dentry->d_inode, -					 old_dentry->d_name.name, -					 old_dentry->d_name.len); +		ret = __btrfs_unlink_inode(trans, root, old_dir, +					old_dentry->d_inode, +					old_dentry->d_name.name, +					old_dentry->d_name.len); +		if (!ret) +			ret = btrfs_update_inode(trans, root, old_inode); +	} +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_fail;  	} -	BUG_ON(ret);  	if (new_inode) { +		inode_inc_iversion(new_inode);  		new_inode->i_ctime = CURRENT_TIME; -		if (unlikely(new_inode->i_ino == +		if (unlikely(btrfs_ino(new_inode) ==  			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {  			root_objectid = BTRFS_I(new_inode)->location.objectid;  			ret = btrfs_unlink_subvol(trans, dest, new_dir, @@ -6594,66 +8443,168 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  						 new_dentry->d_name.name,  						 new_dentry->d_name.len);  		} -		BUG_ON(ret); -		if (new_inode->i_nlink == 0) { +		if (!ret && new_inode->i_nlink == 0)  			ret = btrfs_orphan_add(trans, new_dentry->d_inode); -			BUG_ON(ret); +		if (ret) { +			btrfs_abort_transaction(trans, root, ret); +			goto out_fail;  		}  	}  	ret = btrfs_add_link(trans, new_dir, old_inode,  			     new_dentry->d_name.name,  			     new_dentry->d_name.len, 0, index); -	BUG_ON(ret); +	if (ret) { +		btrfs_abort_transaction(trans, root, ret); +		goto out_fail; +	} + +	if (old_inode->i_nlink == 1) +		BTRFS_I(old_inode)->dir_index = index; -	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { -		btrfs_log_new_name(trans, old_inode, old_dir, -				   new_dentry->d_parent); +	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { +		struct dentry *parent = new_dentry->d_parent; +		btrfs_log_new_name(trans, old_inode, old_dir, parent);  		btrfs_end_log_trans(root);  	}  out_fail: -	btrfs_end_transaction_throttle(trans, root); - -	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) +	btrfs_end_transaction(trans, root); +out_notrans: +	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)  		up_read(&root->fs_info->subvol_sem);  	return ret;  } +static void btrfs_run_delalloc_work(struct btrfs_work *work) +{ +	struct btrfs_delalloc_work *delalloc_work; +	struct inode *inode; + +	delalloc_work = container_of(work, struct btrfs_delalloc_work, +				     work); +	inode = delalloc_work->inode; +	if (delalloc_work->wait) { +		btrfs_wait_ordered_range(inode, 0, (u64)-1); +	} else { +		filemap_flush(inode->i_mapping); +		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +			     &BTRFS_I(inode)->runtime_flags)) 
+			filemap_flush(inode->i_mapping); +	} + +	if (delalloc_work->delay_iput) +		btrfs_add_delayed_iput(inode); +	else +		iput(inode); +	complete(&delalloc_work->completion); +} + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, +						    int wait, int delay_iput) +{ +	struct btrfs_delalloc_work *work; + +	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); +	if (!work) +		return NULL; + +	init_completion(&work->completion); +	INIT_LIST_HEAD(&work->list); +	work->inode = inode; +	work->wait = wait; +	work->delay_iput = delay_iput; +	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + +	return work; +} + +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) +{ +	wait_for_completion(&work->completion); +	kmem_cache_free(btrfs_delalloc_work_cachep, work); +} +  /*   * some fairly slow code that needs optimization. This walks the list   * of all the inodes with pending delalloc and forces them to disk.   */ -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, +				   int nr)  { -	struct list_head *head = &root->fs_info->delalloc_inodes;  	struct btrfs_inode *binode;  	struct inode *inode; +	struct btrfs_delalloc_work *work, *next; +	struct list_head works; +	struct list_head splice; +	int ret = 0; -	if (root->fs_info->sb->s_flags & MS_RDONLY) -		return -EROFS; +	INIT_LIST_HEAD(&works); +	INIT_LIST_HEAD(&splice); -	spin_lock(&root->fs_info->delalloc_lock); -	while (!list_empty(head)) { -		binode = list_entry(head->next, struct btrfs_inode, +	mutex_lock(&root->delalloc_mutex); +	spin_lock(&root->delalloc_lock); +	list_splice_init(&root->delalloc_inodes, &splice); +	while (!list_empty(&splice)) { +		binode = list_entry(splice.next, struct btrfs_inode,  				    delalloc_inodes); + +		list_move_tail(&binode->delalloc_inodes, +			       &root->delalloc_inodes);  		inode = igrab(&binode->vfs_inode); -		if (!inode) -			list_del_init(&binode->delalloc_inodes); -		spin_unlock(&root->fs_info->delalloc_lock); -		if (inode) { -			filemap_flush(inode->i_mapping); +		if (!inode) { +			cond_resched_lock(&root->delalloc_lock); +			continue; +		} +		spin_unlock(&root->delalloc_lock); + +		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); +		if (unlikely(!work)) {  			if (delay_iput)  				btrfs_add_delayed_iput(inode);  			else  				iput(inode); +			ret = -ENOMEM; +			goto out;  		} +		list_add_tail(&work->list, &works); +		btrfs_queue_work(root->fs_info->flush_workers, +				 &work->work); +		ret++; +		if (nr != -1 && ret >= nr) +			goto out;  		cond_resched(); -		spin_lock(&root->fs_info->delalloc_lock); +		spin_lock(&root->delalloc_lock);  	} -	spin_unlock(&root->fs_info->delalloc_lock); +	spin_unlock(&root->delalloc_lock); -	/* the filemap_flush will queue IO into the worker threads, but +out: +	list_for_each_entry_safe(work, next, &works, list) { +		list_del_init(&work->list); +		btrfs_wait_and_free_delalloc_work(work); +	} + +	if (!list_empty_careful(&splice)) { +		spin_lock(&root->delalloc_lock); +		list_splice_tail(&splice, &root->delalloc_inodes); +		spin_unlock(&root->delalloc_lock); +	} +	mutex_unlock(&root->delalloc_mutex); +	return ret; +} + +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ +	int ret; + +	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) +		return -EROFS; + +	ret = __start_delalloc_inodes(root, delay_iput, -1); +	if (ret > 0) +		ret = 0; +	/* +	 * the filemap_flush will 
queue IO into the worker threads, but  	 * we have to make sure the IO is actually started and that  	 * ordered extents get created before we return  	 */ @@ -6665,59 +8616,63 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)  		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));  	}  	atomic_dec(&root->fs_info->async_submit_draining); -	return 0; +	return ret;  } -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, -				   int sync) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, +			       int nr)  { -	struct btrfs_inode *binode; -	struct inode *inode = NULL; +	struct btrfs_root *root; +	struct list_head splice; +	int ret; -	spin_lock(&root->fs_info->delalloc_lock); -	while (!list_empty(&root->fs_info->delalloc_inodes)) { -		binode = list_entry(root->fs_info->delalloc_inodes.next, -				    struct btrfs_inode, delalloc_inodes); -		inode = igrab(&binode->vfs_inode); -		if (inode) { -			list_move_tail(&binode->delalloc_inodes, -				       &root->fs_info->delalloc_inodes); -			break; -		} +	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) +		return -EROFS; -		list_del_init(&binode->delalloc_inodes); -		cond_resched_lock(&root->fs_info->delalloc_lock); -	} -	spin_unlock(&root->fs_info->delalloc_lock); +	INIT_LIST_HEAD(&splice); + +	mutex_lock(&fs_info->delalloc_root_mutex); +	spin_lock(&fs_info->delalloc_root_lock); +	list_splice_init(&fs_info->delalloc_roots, &splice); +	while (!list_empty(&splice) && nr) { +		root = list_first_entry(&splice, struct btrfs_root, +					delalloc_root); +		root = btrfs_grab_fs_root(root); +		BUG_ON(!root); +		list_move_tail(&root->delalloc_root, +			       &fs_info->delalloc_roots); +		spin_unlock(&fs_info->delalloc_root_lock); + +		ret = __start_delalloc_inodes(root, delay_iput, nr); +		btrfs_put_fs_root(root); +		if (ret < 0) +			goto out; -	if (inode) { -		if (sync) { -			filemap_write_and_wait(inode->i_mapping); -			/* -			 * We have to do this because compression doesn't -			 * actually set PG_writeback until it submits the pages -			 * for IO, which happens in an async thread, so we could -			 * race and not actually wait for any writeback pages -			 * because they've not been submitted yet.  Technically -			 * this could still be the case for the ordered stuff -			 * since the async thread may not have started to do its -			 * work yet.  If this becomes the case then we need to -			 * figure out a way to make sure that in writepage we -			 * wait for any async pages to be submitted before -			 * returning so that fdatawait does what its supposed to -			 * do. 
-			 */ -			btrfs_wait_ordered_range(inode, 0, (u64)-1); -		} else { -			filemap_flush(inode->i_mapping); +		if (nr != -1) { +			nr -= ret; +			WARN_ON(nr < 0);  		} -		if (delay_iput) -			btrfs_add_delayed_iput(inode); -		else -			iput(inode); -		return 1; +		spin_lock(&fs_info->delalloc_root_lock);  	} -	return 0; +	spin_unlock(&fs_info->delalloc_root_lock); + +	ret = 0; +	atomic_inc(&fs_info->async_submit_draining); +	while (atomic_read(&fs_info->nr_async_submits) || +	      atomic_read(&fs_info->async_delalloc_pages)) { +		wait_event(fs_info->async_submit_wait, +		   (atomic_read(&fs_info->nr_async_submits) == 0 && +		    atomic_read(&fs_info->async_delalloc_pages) == 0)); +	} +	atomic_dec(&fs_info->async_submit_draining); +out: +	if (!list_empty_careful(&splice)) { +		spin_lock(&fs_info->delalloc_root_lock); +		list_splice_tail(&splice, &fs_info->delalloc_roots); +		spin_unlock(&fs_info->delalloc_root_lock); +	} +	mutex_unlock(&fs_info->delalloc_root_mutex); +	return ret;  }  static int btrfs_symlink(struct inode *dir, struct dentry *dentry, @@ -6731,21 +8686,17 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,  	int err;  	int drop_inode = 0;  	u64 objectid; -	u64 index = 0 ; +	u64 index = 0;  	int name_len;  	int datasize;  	unsigned long ptr;  	struct btrfs_file_extent_item *ei;  	struct extent_buffer *leaf; -	unsigned long nr = 0; -	name_len = strlen(symname) + 1; +	name_len = strlen(symname);  	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))  		return -ENAMETOOLONG; -	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); -	if (err) -		return err;  	/*  	 * 2 items for inode item and ref  	 * 2 items for dir items @@ -6755,42 +8706,51 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,  	if (IS_ERR(trans))  		return PTR_ERR(trans); -	btrfs_set_trans_block_group(trans, dir); +	err = btrfs_find_free_ino(root, &objectid); +	if (err) +		goto out_unlock;  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, -				dentry->d_name.len, -				dentry->d_parent->d_inode->i_ino, objectid, -				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, -				&index); -	err = PTR_ERR(inode); -	if (IS_ERR(inode)) +				dentry->d_name.len, btrfs_ino(dir), objectid, +				S_IFLNK|S_IRWXUGO, &index); +	if (IS_ERR(inode)) { +		err = PTR_ERR(inode);  		goto out_unlock; +	} -	err = btrfs_init_inode_security(trans, inode, dir); +	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);  	if (err) {  		drop_inode = 1;  		goto out_unlock;  	} -	btrfs_set_trans_block_group(trans, inode); -	err = btrfs_add_nondir(trans, dentry, inode, 0, index); +	/* +	* If the active LSM wants to access the inode during +	* d_instantiate it needs these. Smack checks to see +	* if the filesystem supports xattrs by looking at the +	* ops vector. 
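+	* Setting i_op/i_fop here, before btrfs_add_nondir and the
+	* d_instantiate below, is what keeps those LSM checks working.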
+	*/ +	inode->i_fop = &btrfs_file_operations; +	inode->i_op = &btrfs_file_inode_operations; + +	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);  	if (err)  		drop_inode = 1;  	else {  		inode->i_mapping->a_ops = &btrfs_aops;  		inode->i_mapping->backing_dev_info = &root->fs_info->bdi; -		inode->i_fop = &btrfs_file_operations; -		inode->i_op = &btrfs_file_inode_operations;  		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;  	} -	btrfs_update_inode_block_group(trans, inode); -	btrfs_update_inode_block_group(trans, dir);  	if (drop_inode)  		goto out_unlock;  	path = btrfs_alloc_path(); -	BUG_ON(!path); -	key.objectid = inode->i_ino; +	if (!path) { +		err = -ENOMEM; +		drop_inode = 1; +		goto out_unlock; +	} +	key.objectid = btrfs_ino(inode);  	key.offset = 0;  	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);  	datasize = btrfs_file_extent_calc_inline_size(name_len); @@ -6798,6 +8758,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,  				      datasize);  	if (err) {  		drop_inode = 1; +		btrfs_free_path(path);  		goto out_unlock;  	}  	leaf = path->nodes[0]; @@ -6820,19 +8781,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,  	inode->i_mapping->a_ops = &btrfs_symlink_aops;  	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;  	inode_set_bytes(inode, name_len); -	btrfs_i_size_write(inode, name_len - 1); +	btrfs_i_size_write(inode, name_len);  	err = btrfs_update_inode(trans, root, inode);  	if (err)  		drop_inode = 1;  out_unlock: -	nr = trans->blocks_used; -	btrfs_end_transaction_throttle(trans, root); +	if (!err) +		d_instantiate(dentry, inode); +	btrfs_end_transaction(trans, root);  	if (drop_inode) {  		inode_dec_link_count(inode);  		iput(inode);  	} -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	return err;  } @@ -6841,9 +8803,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  				       loff_t actual_len, u64 *alloc_hint,  				       struct btrfs_trans_handle *trans)  { +	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +	struct extent_map *em;  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_key ins;  	u64 cur_offset = start; +	u64 i_size; +	u64 cur_bytes;  	int ret = 0;  	bool own_trans = true; @@ -6858,8 +8824,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  			}  		} -		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, -					   0, *alloc_hint, (u64)-1, &ins, 1); +		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); +		cur_bytes = max(cur_bytes, min_size); +		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, +					   *alloc_hint, &ins, 1, 0);  		if (ret) {  			if (own_trans)  				btrfs_end_transaction(trans, root); @@ -6871,29 +8839,73 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  						  ins.offset, ins.offset,  						  ins.offset, 0, 0, 0,  						  BTRFS_FILE_EXTENT_PREALLOC); -		BUG_ON(ret); +		if (ret) { +			btrfs_free_reserved_extent(root, ins.objectid, +						   ins.offset, 0); +			btrfs_abort_transaction(trans, root, ret); +			if (own_trans) +				btrfs_end_transaction(trans, root); +			break; +		}  		btrfs_drop_extent_cache(inode, cur_offset,  					cur_offset + ins.offset -1, 0); +		em = alloc_extent_map(); +		if (!em) { +			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +				&BTRFS_I(inode)->runtime_flags); +			goto next; +		} + +		em->start = cur_offset; +		em->orig_start = cur_offset; +		em->len = ins.offset; +		em->block_start = ins.objectid; +		
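/* not compressed: disk and ram lengths match the allocation */
+		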
em->block_len = ins.offset; +		em->orig_block_len = ins.offset; +		em->ram_bytes = ins.offset; +		em->bdev = root->fs_info->fs_devices->latest_bdev; +		set_bit(EXTENT_FLAG_PREALLOC, &em->flags); +		em->generation = trans->transid; + +		while (1) { +			write_lock(&em_tree->lock); +			ret = add_extent_mapping(em_tree, em, 1); +			write_unlock(&em_tree->lock); +			if (ret != -EEXIST) +				break; +			btrfs_drop_extent_cache(inode, cur_offset, +						cur_offset + ins.offset - 1, +						0); +		} +		free_extent_map(em); +next:  		num_bytes -= ins.offset;  		cur_offset += ins.offset;  		*alloc_hint = ins.objectid + ins.offset; +		inode_inc_iversion(inode);  		inode->i_ctime = CURRENT_TIME;  		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;  		if (!(mode & FALLOC_FL_KEEP_SIZE) &&  		    (actual_len > inode->i_size) &&  		    (cur_offset > inode->i_size)) {  			if (cur_offset > actual_len) -				i_size_write(inode, actual_len); +				i_size = actual_len;  			else -				i_size_write(inode, cur_offset); -			i_size_write(inode, cur_offset); -			btrfs_ordered_update_i_size(inode, cur_offset, NULL); +				i_size = cur_offset; +			i_size_write(inode, i_size); +			btrfs_ordered_update_i_size(inode, i_size, NULL);  		}  		ret = btrfs_update_inode(trans, root, inode); -		BUG_ON(ret); + +		if (ret) { +			btrfs_abort_transaction(trans, root, ret); +			if (own_trans) +				btrfs_end_transaction(trans, root); +			break; +		}  		if (own_trans)  			btrfs_end_transaction(trans, root); @@ -6919,118 +8931,84 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,  					   min_size, actual_len, alloc_hint, trans);  } -static long btrfs_fallocate(struct inode *inode, int mode, -			    loff_t offset, loff_t len) +static int btrfs_set_page_dirty(struct page *page)  { -	struct extent_state *cached_state = NULL; -	u64 cur_offset; -	u64 last_byte; -	u64 alloc_start; -	u64 alloc_end; -	u64 alloc_hint = 0; -	u64 locked_end; -	u64 mask = BTRFS_I(inode)->root->sectorsize - 1; -	struct extent_map *em; -	int ret; +	return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	umode_t mode = inode->i_mode; -	alloc_start = offset & ~mask; -	alloc_end =  (offset + len + mask) & ~mask; +	if (mask & MAY_WRITE && +	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { +		if (btrfs_root_readonly(root)) +			return -EROFS; +		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) +			return -EACCES; +	} +	return generic_permission(inode, mask); +} + +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ +	struct btrfs_trans_handle *trans; +	struct btrfs_root *root = BTRFS_I(dir)->root; +	struct inode *inode = NULL; +	u64 objectid; +	u64 index; +	int ret = 0;  	/* -	 * wait for ordered IO before we have any locks.  We'll loop again -	 * below with the locks held. 
+	 * 5 units required for adding orphan entry  	 */ -	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); +	trans = btrfs_start_transaction(root, 5); +	if (IS_ERR(trans)) +		return PTR_ERR(trans); -	mutex_lock(&inode->i_mutex); -	if (alloc_start > inode->i_size) { -		ret = btrfs_cont_expand(inode, alloc_start); -		if (ret) -			goto out; +	ret = btrfs_find_free_ino(root, &objectid); +	if (ret) +		goto out; + +	inode = btrfs_new_inode(trans, root, dir, NULL, 0, +				btrfs_ino(dir), objectid, mode, &index); +	if (IS_ERR(inode)) { +		ret = PTR_ERR(inode); +		inode = NULL; +		goto out;  	} -	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); +	ret = btrfs_init_inode_security(trans, inode, dir, NULL);  	if (ret)  		goto out; -	locked_end = alloc_end - 1; -	while (1) { -		struct btrfs_ordered_extent *ordered; +	ret = btrfs_update_inode(trans, root, inode); +	if (ret) +		goto out; -		/* the extent lock is ordered inside the running -		 * transaction -		 */ -		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, -				 locked_end, 0, &cached_state, GFP_NOFS); -		ordered = btrfs_lookup_first_ordered_extent(inode, -							    alloc_end - 1); -		if (ordered && -		    ordered->file_offset + ordered->len > alloc_start && -		    ordered->file_offset < alloc_end) { -			btrfs_put_ordered_extent(ordered); -			unlock_extent_cached(&BTRFS_I(inode)->io_tree, -					     alloc_start, locked_end, -					     &cached_state, GFP_NOFS); -			/* -			 * we can't wait on the range with the transaction -			 * running or with the extent lock held -			 */ -			btrfs_wait_ordered_range(inode, alloc_start, -						 alloc_end - alloc_start); -		} else { -			if (ordered) -				btrfs_put_ordered_extent(ordered); -			break; -		} -	} +	inode->i_fop = &btrfs_file_operations; +	inode->i_op = &btrfs_file_inode_operations; -	cur_offset = alloc_start; -	while (1) { -		em = btrfs_get_extent(inode, NULL, 0, cur_offset, -				      alloc_end - cur_offset, 0); -		BUG_ON(IS_ERR(em) || !em); -		last_byte = min(extent_map_end(em), alloc_end); -		last_byte = (last_byte + mask) & ~mask; -		if (em->block_start == EXTENT_MAP_HOLE || -		    (cur_offset >= inode->i_size && -		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { -			ret = btrfs_prealloc_file_range(inode, mode, cur_offset, -							last_byte - cur_offset, -							1 << inode->i_blkbits, -							offset + len, -							&alloc_hint); -			if (ret < 0) { -				free_extent_map(em); -				break; -			} -		} -		free_extent_map(em); +	inode->i_mapping->a_ops = &btrfs_aops; +	inode->i_mapping->backing_dev_info = &root->fs_info->bdi; +	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -		cur_offset = last_byte; -		if (cur_offset >= alloc_end) { -			ret = 0; -			break; -		} -	} -	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, -			     &cached_state, GFP_NOFS); +	ret = btrfs_orphan_add(trans, inode); +	if (ret) +		goto out; -	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); -out: -	mutex_unlock(&inode->i_mutex); -	return ret; -} +	d_tmpfile(dentry, inode); +	mark_inode_dirty(inode); -static int btrfs_set_page_dirty(struct page *page) -{ -	return __set_page_dirty_nobuffers(page); -} +out: +	btrfs_end_transaction(trans, root); +	if (ret) +		iput(inode); +	btrfs_balance_delayed_items(root); +	btrfs_btree_balance_dirty(root); -static int btrfs_permission(struct inode *inode, int mask) -{ -	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) -		return -EACCES; -	return generic_permission(inode, mask, 
btrfs_check_acl); +	return ret;  }  static const struct inode_operations btrfs_dir_inode_operations = { @@ -7050,16 +9028,23 @@ static const struct inode_operations btrfs_dir_inode_operations = {  	.listxattr	= btrfs_listxattr,  	.removexattr	= btrfs_removexattr,  	.permission	= btrfs_permission, +	.get_acl	= btrfs_get_acl, +	.set_acl	= btrfs_set_acl, +	.update_time	= btrfs_update_time, +	.tmpfile        = btrfs_tmpfile,  };  static const struct inode_operations btrfs_dir_ro_inode_operations = {  	.lookup		= btrfs_lookup,  	.permission	= btrfs_permission, +	.get_acl	= btrfs_get_acl, +	.set_acl	= btrfs_set_acl, +	.update_time	= btrfs_update_time,  };  static const struct file_operations btrfs_dir_file_operations = {  	.llseek		= generic_file_llseek,  	.read		= generic_read_dir, -	.readdir	= btrfs_real_readdir, +	.iterate	= btrfs_real_readdir,  	.unlocked_ioctl	= btrfs_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl	= btrfs_ioctl, @@ -7075,7 +9060,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {  	.readpage_end_io_hook = btrfs_readpage_end_io_hook,  	.writepage_end_io_hook = btrfs_writepage_end_io_hook,  	.writepage_start_hook = btrfs_writepage_start_hook, -	.readpage_io_failed_hook = btrfs_io_failed_hook,  	.set_bit_hook = btrfs_set_bit_hook,  	.clear_bit_hook = btrfs_clear_bit_hook,  	.merge_extent_hook = btrfs_merge_extent_hook, @@ -7099,7 +9083,6 @@ static const struct address_space_operations btrfs_aops = {  	.writepage	= btrfs_writepage,  	.writepages	= btrfs_writepages,  	.readpages	= btrfs_readpages, -	.sync_page	= block_sync_page,  	.direct_IO	= btrfs_direct_IO,  	.invalidatepage = btrfs_invalidatepage,  	.releasepage	= btrfs_releasepage, @@ -7115,7 +9098,6 @@ static const struct address_space_operations btrfs_symlink_aops = {  };  static const struct inode_operations btrfs_file_inode_operations = { -	.truncate	= btrfs_truncate,  	.getattr	= btrfs_getattr,  	.setattr	= btrfs_setattr,  	.setxattr	= btrfs_setxattr, @@ -7123,8 +9105,10 @@ static const struct inode_operations btrfs_file_inode_operations = {  	.listxattr      = btrfs_listxattr,  	.removexattr	= btrfs_removexattr,  	.permission	= btrfs_permission, -	.fallocate	= btrfs_fallocate,  	.fiemap		= btrfs_fiemap, +	.get_acl	= btrfs_get_acl, +	.set_acl	= btrfs_set_acl, +	.update_time	= btrfs_update_time,  };  static const struct inode_operations btrfs_special_inode_operations = {  	.getattr	= btrfs_getattr, @@ -7134,18 +9118,25 @@ static const struct inode_operations btrfs_special_inode_operations = {  	.getxattr	= btrfs_getxattr,  	.listxattr	= btrfs_listxattr,  	.removexattr	= btrfs_removexattr, +	.get_acl	= btrfs_get_acl, +	.set_acl	= btrfs_set_acl, +	.update_time	= btrfs_update_time,  };  static const struct inode_operations btrfs_symlink_inode_operations = {  	.readlink	= generic_readlink,  	.follow_link	= page_follow_link_light,  	.put_link	= page_put_link, +	.getattr	= btrfs_getattr, +	.setattr	= btrfs_setattr,  	.permission	= btrfs_permission,  	.setxattr	= btrfs_setxattr,  	.getxattr	= btrfs_getxattr,  	.listxattr	= btrfs_listxattr,  	.removexattr	= btrfs_removexattr, +	.update_time	= btrfs_update_time,  };  const struct dentry_operations btrfs_dentry_operations = {  	.d_delete	= btrfs_dentry_delete, +	.d_release	= btrfs_dentry_release,  };
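
A note on the __btrfs_prealloc_file_range() hunk above: instead of reserving the whole remaining range in one btrfs_reserve_extent() call, each pass of the loop is now clamped to 256MiB, while never asking for less than min_size. A minimal stand-alone sketch of that clamp, assuming plain min()/max() macros rather than the kernel's type-checked versions:

/*
 * Demo of the per-iteration clamp added to __btrfs_prealloc_file_range():
 * reserve at most 256MiB per pass, but at least min_size.  The macros
 * are demo stand-ins, not the kernel's min()/max().
 */
#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned long long num_bytes = 3ULL * 1024 * 1024 * 1024; /* 3GiB left */
	unsigned long long min_size = 256 * 1024;                 /* 256KiB */
	unsigned long long cur_bytes;

	cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
	cur_bytes = max(cur_bytes, min_size);
	printf("reserve %llu bytes this pass\n", cur_bytes); /* 256MiB */
	return 0;
}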
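The pinned extent map created in that same function is inserted with a retry loop: add_extent_mapping() returns -EEXIST while a stale cached mapping overlaps the range, so the patch drops the cached range and tries again until the insert succeeds or fails for a real reason. The snippet below models only that loop shape; both helpers are fakes standing in for add_extent_mapping() and btrfs_drop_extent_cache():

/*
 * Stand-alone sketch of the insert-retry pattern: on -EEXIST, evict
 * the overlapping cached mapping and retry; any other result ends the
 * loop.  The helpers are fakes, only the control flow mirrors the patch.
 */
#include <stdio.h>
#include <errno.h>

static int stale_entries = 2;	/* pretend two stale mappings overlap */

static int fake_add_extent_mapping(void)
{
	return stale_entries ? -EEXIST : 0;
}

static void fake_drop_extent_cache(void)
{
	stale_entries--;	/* evict one overlapping cached mapping */
}

int main(void)
{
	int ret;

	for (;;) {
		ret = fake_add_extent_mapping();
		if (ret != -EEXIST)
			break;	/* inserted, or failed for a real reason */
		fake_drop_extent_cache();
	}
	printf("insert finished: %d\n", ret);
	return 0;
}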
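btrfs_permission() now distinguishes two read-only cases for write access to regular files, directories and symlinks: a read-only subvolume root yields -EROFS, the per-inode BTRFS_INODE_READONLY flag yields -EACCES, and everything else falls through to generic_permission(). A user-space model of just that decision order, with invented flag values and an invented struct:

/*
 * Model of the btrfs_permission() change: flag values and the struct
 * are demo stand-ins; only the decision order follows the patch.
 */
#include <stdio.h>
#include <errno.h>
#include <sys/stat.h>

#define MAY_WRITE		0x2	/* stand-in for the kernel constant */
#define DEMO_INODE_READONLY	0x1	/* stand-in for BTRFS_INODE_READONLY */

struct demo_inode {
	mode_t	mode;
	int	flags;
	int	root_readonly;
};

static int demo_permission(const struct demo_inode *inode, int mask)
{
	mode_t mode = inode->mode;

	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (inode->root_readonly)
			return -EROFS;	/* whole subvolume is read-only */
		if (inode->flags & DEMO_INODE_READONLY)
			return -EACCES;	/* just this inode is read-only */
	}
	return 0;	/* fall through to the generic checks */
}

int main(void)
{
	struct demo_inode ro_file = { .mode = S_IFREG,
				      .flags = DEMO_INODE_READONLY };

	printf("write -> %d\n", demo_permission(&ro_file, MAY_WRITE)); /* -13 */
	printf("read  -> %d\n", demo_permission(&ro_file, 0));         /* 0 */
	return 0;
}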
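The new btrfs_tmpfile() is what wires btrfs up to O_TMPFILE: the inode gets an orphan item instead of a directory entry, so it is reclaimed automatically unless it is later linked in with linkat(). A small user-space exercise of the feature; the mount point path is an assumption, any writable directory on a btrfs mount will do:

/*
 * Exercises O_TMPFILE, which the .tmpfile inode op above implements
 * for btrfs.  "/mnt/btrfs" is an assumed mount point.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* unnamed file: no dentry, gone when the last fd is closed */
	int fd = open("/mnt/btrfs", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}
	if (write(fd, "scratch data\n", 13) != 13)
		perror("write");
	close(fd);	/* orphan cleanup removes the inode */
	return 0;
}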