Diffstat (limited to 'fs/btrfs/ordered-data.c')

 -rw-r--r--   fs/btrfs/ordered-data.c   757
 1 file changed, 518 insertions(+), 239 deletions(-)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f4621f6deca..7187b14faa6 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -24,6 +24,9 @@  #include "transaction.h"  #include "btrfs_inode.h"  #include "extent_io.h" +#include "disk-io.h" + +static struct kmem_cache *btrfs_ordered_extent_cache;  static u64 entry_end(struct btrfs_ordered_extent *entry)  { @@ -59,6 +62,14 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,  	return NULL;  } +static void ordered_data_tree_panic(struct inode *inode, int errno, +					       u64 offset) +{ +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " +		    "%llu", offset); +} +  /*   * look for a given offset in the tree, and if it can't be found return the   * first lesser offset @@ -141,7 +152,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,  					  u64 file_offset)  {  	struct rb_root *root = &tree->tree; -	struct rb_node *prev; +	struct rb_node *prev = NULL;  	struct rb_node *ret;  	struct btrfs_ordered_extent *entry; @@ -172,23 +183,29 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,   */  static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  				      u64 start, u64 len, u64 disk_len, -				      int type, int dio) +				      int type, int dio, int compress_type)  { +	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_ordered_inode_tree *tree;  	struct rb_node *node;  	struct btrfs_ordered_extent *entry;  	tree = &BTRFS_I(inode)->ordered_tree; -	entry = kzalloc(sizeof(*entry), GFP_NOFS); +	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);  	if (!entry)  		return -ENOMEM;  	entry->file_offset = file_offset;  	entry->start = start;  	entry->len = len; +	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && +	    !(type == BTRFS_ORDERED_NOCOW)) +		entry->csum_bytes_left = disk_len;  	entry->disk_len = disk_len;  	entry->bytes_left = len; -	entry->inode = inode; +	entry->inode = igrab(inode); +	entry->compress_type = compress_type; +	entry->truncated_len = (u64)-1;  	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)  		set_bit(type, &entry->flags); @@ -200,19 +217,32 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  	init_waitqueue_head(&entry->wait);  	INIT_LIST_HEAD(&entry->list);  	INIT_LIST_HEAD(&entry->root_extent_list); +	INIT_LIST_HEAD(&entry->work_list); +	init_completion(&entry->completion); +	INIT_LIST_HEAD(&entry->log_list); + +	trace_btrfs_ordered_extent_add(inode, entry); -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	node = tree_insert(&tree->tree, file_offset,  			   &entry->rb_node); -	BUG_ON(node); -	spin_unlock(&tree->lock); +	if (node) +		ordered_data_tree_panic(inode, -EEXIST, file_offset); +	spin_unlock_irq(&tree->lock); -	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); +	spin_lock(&root->ordered_extent_lock);  	list_add_tail(&entry->root_extent_list, -		      &BTRFS_I(inode)->root->fs_info->ordered_extents); -	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); +		      &root->ordered_extents); +	root->nr_ordered_extents++; +	if (root->nr_ordered_extents == 1) { +		spin_lock(&root->fs_info->ordered_root_lock); +		BUG_ON(!list_empty(&root->ordered_root)); +		list_add_tail(&root->ordered_root, +			      &root->fs_info->ordered_roots); +		spin_unlock(&root->fs_info->ordered_root_lock); 
+	} +	spin_unlock(&root->ordered_extent_lock); -	BUG_ON(node);  	return 0;  } @@ -220,14 +250,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  			     u64 start, u64 len, u64 disk_len, int type)  {  	return __btrfs_add_ordered_extent(inode, file_offset, start, len, -					  disk_len, type, 0); +					  disk_len, type, 0, +					  BTRFS_COMPRESS_NONE);  }  int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,  				 u64 start, u64 len, u64 disk_len, int type)  {  	return __btrfs_add_ordered_extent(inode, file_offset, start, len, -					  disk_len, type, 1); +					  disk_len, type, 1, +					  BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, +				      u64 start, u64 len, u64 disk_len, +				      int type, int compress_type) +{ +	return __btrfs_add_ordered_extent(inode, file_offset, start, len, +					  disk_len, type, 0, +					  compress_type);  }  /* @@ -235,17 +276,93 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,   * when an ordered extent is finished.  If the list covers more than one   * ordered extent, it is split across multiples.   */ -int btrfs_add_ordered_sum(struct inode *inode, -			  struct btrfs_ordered_extent *entry, -			  struct btrfs_ordered_sum *sum) +void btrfs_add_ordered_sum(struct inode *inode, +			   struct btrfs_ordered_extent *entry, +			   struct btrfs_ordered_sum *sum)  {  	struct btrfs_ordered_inode_tree *tree;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	list_add_tail(&sum->list, &entry->list); -	spin_unlock(&tree->lock); -	return 0; +	WARN_ON(entry->csum_bytes_left < sum->len); +	entry->csum_bytes_left -= sum->len; +	if (entry->csum_bytes_left == 0) +		wake_up(&entry->wait); +	spin_unlock_irq(&tree->lock); +} + +/* + * this is used to account for finished IO across a given range + * of the file.  The IO may span ordered extents.  If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + * + * file_offset is updated to one byte past the range that is recorded as + * complete.  This allows you to walk forward in the file. 
+ */ +int btrfs_dec_test_first_ordered_pending(struct inode *inode, +				   struct btrfs_ordered_extent **cached, +				   u64 *file_offset, u64 io_size, int uptodate) +{ +	struct btrfs_ordered_inode_tree *tree; +	struct rb_node *node; +	struct btrfs_ordered_extent *entry = NULL; +	int ret; +	unsigned long flags; +	u64 dec_end; +	u64 dec_start; +	u64 to_dec; + +	tree = &BTRFS_I(inode)->ordered_tree; +	spin_lock_irqsave(&tree->lock, flags); +	node = tree_search(tree, *file_offset); +	if (!node) { +		ret = 1; +		goto out; +	} + +	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +	if (!offset_in_entry(entry, *file_offset)) { +		ret = 1; +		goto out; +	} + +	dec_start = max(*file_offset, entry->file_offset); +	dec_end = min(*file_offset + io_size, entry->file_offset + +		      entry->len); +	*file_offset = dec_end; +	if (dec_start > dec_end) { +		btrfs_crit(BTRFS_I(inode)->root->fs_info, +			"bad ordering dec_start %llu end %llu", dec_start, dec_end); +	} +	to_dec = dec_end - dec_start; +	if (to_dec > entry->bytes_left) { +		btrfs_crit(BTRFS_I(inode)->root->fs_info, +			"bad ordered accounting left %llu size %llu", +			entry->bytes_left, to_dec); +	} +	entry->bytes_left -= to_dec; +	if (!uptodate) +		set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + +	if (entry->bytes_left == 0) { +		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); +		if (waitqueue_active(&entry->wait)) +			wake_up(&entry->wait); +	} else { +		ret = 1; +	} +out: +	if (!ret && cached && entry) { +		*cached = entry; +		atomic_inc(&entry->refs); +	} +	spin_unlock_irqrestore(&tree->lock, flags); +	return ret == 0;  }  /* @@ -259,15 +376,21 @@ int btrfs_add_ordered_sum(struct inode *inode,   */  int btrfs_dec_test_ordered_pending(struct inode *inode,  				   struct btrfs_ordered_extent **cached, -				   u64 file_offset, u64 io_size) +				   u64 file_offset, u64 io_size, int uptodate)  {  	struct btrfs_ordered_inode_tree *tree;  	struct rb_node *node;  	struct btrfs_ordered_extent *entry = NULL; +	unsigned long flags;  	int ret;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irqsave(&tree->lock, flags); +	if (cached && *cached) { +		entry = *cached; +		goto have_entry; +	} +  	node = tree_search(tree, file_offset);  	if (!node) {  		ret = 1; @@ -275,71 +398,178 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,  	}  	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +have_entry:  	if (!offset_in_entry(entry, file_offset)) {  		ret = 1;  		goto out;  	}  	if (io_size > entry->bytes_left) { -		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", -		       (unsigned long long)entry->bytes_left, -		       (unsigned long long)io_size); +		btrfs_crit(BTRFS_I(inode)->root->fs_info, +			   "bad ordered accounting left %llu size %llu", +		       entry->bytes_left, io_size);  	}  	entry->bytes_left -= io_size; -	if (entry->bytes_left == 0) +	if (!uptodate) +		set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + +	if (entry->bytes_left == 0) {  		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); -	else +		if (waitqueue_active(&entry->wait)) +			wake_up(&entry->wait); +	} else {  		ret = 1; +	}  out:  	if (!ret && cached && entry) {  		*cached = entry;  		atomic_inc(&entry->refs);  	} -	spin_unlock(&tree->lock); +	spin_unlock_irqrestore(&tree->lock, flags);  	return ret == 0;  } +/* Needs to either be called under a log transaction or the log_mutex */ +void btrfs_get_logged_extents(struct inode *inode, +			      struct list_head *logged_list) +{ +	struct 
btrfs_ordered_inode_tree *tree; +	struct btrfs_ordered_extent *ordered; +	struct rb_node *n; + +	tree = &BTRFS_I(inode)->ordered_tree; +	spin_lock_irq(&tree->lock); +	for (n = rb_first(&tree->tree); n; n = rb_next(n)) { +		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); +		if (!list_empty(&ordered->log_list)) +			continue; +		list_add_tail(&ordered->log_list, logged_list); +		atomic_inc(&ordered->refs); +	} +	spin_unlock_irq(&tree->lock); +} + +void btrfs_put_logged_extents(struct list_head *logged_list) +{ +	struct btrfs_ordered_extent *ordered; + +	while (!list_empty(logged_list)) { +		ordered = list_first_entry(logged_list, +					   struct btrfs_ordered_extent, +					   log_list); +		list_del_init(&ordered->log_list); +		btrfs_put_ordered_extent(ordered); +	} +} + +void btrfs_submit_logged_extents(struct list_head *logged_list, +				 struct btrfs_root *log) +{ +	int index = log->log_transid % 2; + +	spin_lock_irq(&log->log_extents_lock[index]); +	list_splice_tail(logged_list, &log->logged_list[index]); +	spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +{ +	struct btrfs_ordered_extent *ordered; +	int index = transid % 2; + +	spin_lock_irq(&log->log_extents_lock[index]); +	while (!list_empty(&log->logged_list[index])) { +		ordered = list_first_entry(&log->logged_list[index], +					   struct btrfs_ordered_extent, +					   log_list); +		list_del_init(&ordered->log_list); +		spin_unlock_irq(&log->log_extents_lock[index]); + +		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && +		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { +			struct inode *inode = ordered->inode; +			u64 start = ordered->file_offset; +			u64 end = ordered->file_offset + ordered->len - 1; + +			WARN_ON(!inode); +			filemap_fdatawrite_range(inode->i_mapping, start, end); +		} +		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, +						   &ordered->flags)); + +		btrfs_put_ordered_extent(ordered); +		spin_lock_irq(&log->log_extents_lock[index]); +	} +	spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) +{ +	struct btrfs_ordered_extent *ordered; +	int index = transid % 2; + +	spin_lock_irq(&log->log_extents_lock[index]); +	while (!list_empty(&log->logged_list[index])) { +		ordered = list_first_entry(&log->logged_list[index], +					   struct btrfs_ordered_extent, +					   log_list); +		list_del_init(&ordered->log_list); +		spin_unlock_irq(&log->log_extents_lock[index]); +		btrfs_put_ordered_extent(ordered); +		spin_lock_irq(&log->log_extents_lock[index]); +	} +	spin_unlock_irq(&log->log_extents_lock[index]); +} +  /*   * used to drop a reference on an ordered extent.  This will free   * the extent if the last reference is dropped   */ -int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)  {  	struct list_head *cur;  	struct btrfs_ordered_sum *sum; +	trace_btrfs_ordered_extent_put(entry->inode, entry); +  	if (atomic_dec_and_test(&entry->refs)) { +		if (entry->inode) +			btrfs_add_delayed_iput(entry->inode);  		while (!list_empty(&entry->list)) {  			cur = entry->list.next;  			sum = list_entry(cur, struct btrfs_ordered_sum, list);  			list_del(&sum->list);  			kfree(sum);  		} -		kfree(entry); +		kmem_cache_free(btrfs_ordered_extent_cache, entry);  	} -	return 0;  }  /*   * remove an ordered extent from the tree.  
No references are dropped - * and you must wake_up entry->wait.  You must hold the tree lock - * while you call this function. + * and waiters are woken up.   */ -static int __btrfs_remove_ordered_extent(struct inode *inode, -				struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, +				 struct btrfs_ordered_extent *entry)  {  	struct btrfs_ordered_inode_tree *tree;  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct rb_node *node;  	tree = &BTRFS_I(inode)->ordered_tree; +	spin_lock_irq(&tree->lock);  	node = &entry->rb_node;  	rb_erase(node, &tree->tree); -	tree->last = NULL; +	if (tree->last == node) +		tree->last = NULL;  	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); +	spin_unlock_irq(&tree->lock); -	spin_lock(&root->fs_info->ordered_extent_lock); +	spin_lock(&root->ordered_extent_lock);  	list_del_init(&entry->root_extent_list); +	root->nr_ordered_extents--; + +	trace_btrfs_ordered_extent_remove(inode, entry);  	/*  	 * we have no more ordered extents for this inode and @@ -348,86 +578,112 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,  	 */  	if (RB_EMPTY_ROOT(&tree->tree) &&  	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { +		spin_lock(&root->fs_info->ordered_root_lock);  		list_del_init(&BTRFS_I(inode)->ordered_operations); +		spin_unlock(&root->fs_info->ordered_root_lock);  	} -	spin_unlock(&root->fs_info->ordered_extent_lock); -	return 0; +	if (!root->nr_ordered_extents) { +		spin_lock(&root->fs_info->ordered_root_lock); +		BUG_ON(list_empty(&root->ordered_root)); +		list_del_init(&root->ordered_root); +		spin_unlock(&root->fs_info->ordered_root_lock); +	} +	spin_unlock(&root->ordered_extent_lock); +	wake_up(&entry->wait);  } -/* - * remove an ordered extent from the tree.  No references are dropped - * but any waiters are woken. - */ -int btrfs_remove_ordered_extent(struct inode *inode, -				struct btrfs_ordered_extent *entry) +static void btrfs_run_ordered_extent_work(struct btrfs_work *work)  { -	struct btrfs_ordered_inode_tree *tree; -	int ret; - -	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); -	ret = __btrfs_remove_ordered_extent(inode, entry); -	spin_unlock(&tree->lock); -	wake_up(&entry->wait); +	struct btrfs_ordered_extent *ordered; -	return ret; +	ordered = container_of(work, struct btrfs_ordered_extent, flush_work); +	btrfs_start_ordered_extent(ordered->inode, ordered, 1); +	complete(&ordered->completion);  }  /*   * wait for all the ordered extents in a root.  This is done when balancing   * space between drives.   
*/ -int btrfs_wait_ordered_extents(struct btrfs_root *root, -			       int nocow_only, int delay_iput) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)  { -	struct list_head splice; -	struct list_head *cur; -	struct btrfs_ordered_extent *ordered; -	struct inode *inode; +	struct list_head splice, works; +	struct btrfs_ordered_extent *ordered, *next; +	int count = 0;  	INIT_LIST_HEAD(&splice); +	INIT_LIST_HEAD(&works); + +	mutex_lock(&root->ordered_extent_mutex); +	spin_lock(&root->ordered_extent_lock); +	list_splice_init(&root->ordered_extents, &splice); +	while (!list_empty(&splice) && nr) { +		ordered = list_first_entry(&splice, struct btrfs_ordered_extent, +					   root_extent_list); +		list_move_tail(&ordered->root_extent_list, +			       &root->ordered_extents); +		atomic_inc(&ordered->refs); +		spin_unlock(&root->ordered_extent_lock); -	spin_lock(&root->fs_info->ordered_extent_lock); -	list_splice_init(&root->fs_info->ordered_extents, &splice); -	while (!list_empty(&splice)) { -		cur = splice.next; -		ordered = list_entry(cur, struct btrfs_ordered_extent, -				     root_extent_list); -		if (nocow_only && -		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && -		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { -			list_move(&ordered->root_extent_list, -				  &root->fs_info->ordered_extents); -			cond_resched_lock(&root->fs_info->ordered_extent_lock); -			continue; -		} +		btrfs_init_work(&ordered->flush_work, +				btrfs_run_ordered_extent_work, NULL, NULL); +		list_add_tail(&ordered->work_list, &works); +		btrfs_queue_work(root->fs_info->flush_workers, +				 &ordered->flush_work); -		list_del_init(&ordered->root_extent_list); -		atomic_inc(&ordered->refs); +		cond_resched(); +		spin_lock(&root->ordered_extent_lock); +		if (nr != -1) +			nr--; +		count++; +	} +	list_splice_tail(&splice, &root->ordered_extents); +	spin_unlock(&root->ordered_extent_lock); -		/* -		 * the inode may be getting freed (in sys_unlink path). 
-		 */ -		inode = igrab(ordered->inode); +	list_for_each_entry_safe(ordered, next, &works, work_list) { +		list_del_init(&ordered->work_list); +		wait_for_completion(&ordered->completion); +		btrfs_put_ordered_extent(ordered); +		cond_resched(); +	} +	mutex_unlock(&root->ordered_extent_mutex); -		spin_unlock(&root->fs_info->ordered_extent_lock); +	return count; +} -		if (inode) { -			btrfs_start_ordered_extent(inode, ordered, 1); -			btrfs_put_ordered_extent(ordered); -			if (delay_iput) -				btrfs_add_delayed_iput(inode); -			else -				iput(inode); -		} else { -			btrfs_put_ordered_extent(ordered); -		} +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) +{ +	struct btrfs_root *root; +	struct list_head splice; +	int done; + +	INIT_LIST_HEAD(&splice); -		spin_lock(&root->fs_info->ordered_extent_lock); +	mutex_lock(&fs_info->ordered_operations_mutex); +	spin_lock(&fs_info->ordered_root_lock); +	list_splice_init(&fs_info->ordered_roots, &splice); +	while (!list_empty(&splice) && nr) { +		root = list_first_entry(&splice, struct btrfs_root, +					ordered_root); +		root = btrfs_grab_fs_root(root); +		BUG_ON(!root); +		list_move_tail(&root->ordered_root, +			       &fs_info->ordered_roots); +		spin_unlock(&fs_info->ordered_root_lock); + +		done = btrfs_wait_ordered_extents(root, nr); +		btrfs_put_fs_root(root); + +		spin_lock(&fs_info->ordered_root_lock); +		if (nr != -1) { +			nr -= done; +			WARN_ON(nr < 0); +		}  	} -	spin_unlock(&root->fs_info->ordered_extent_lock); -	return 0; +	list_splice_tail(&splice, &fs_info->ordered_roots); +	spin_unlock(&fs_info->ordered_root_lock); +	mutex_unlock(&fs_info->ordered_operations_mutex);  }  /* @@ -440,23 +696,26 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root,   * extra check to make sure the ordered operation list really is empty   * before we return   */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, +				 struct btrfs_root *root, int wait)  {  	struct btrfs_inode *btrfs_inode;  	struct inode *inode; +	struct btrfs_transaction *cur_trans = trans->transaction;  	struct list_head splice; +	struct list_head works; +	struct btrfs_delalloc_work *work, *next; +	int ret = 0;  	INIT_LIST_HEAD(&splice); +	INIT_LIST_HEAD(&works); -	mutex_lock(&root->fs_info->ordered_operations_mutex); -	spin_lock(&root->fs_info->ordered_extent_lock); -again: -	list_splice_init(&root->fs_info->ordered_operations, &splice); - +	mutex_lock(&root->fs_info->ordered_extent_flush_mutex); +	spin_lock(&root->fs_info->ordered_root_lock); +	list_splice_init(&cur_trans->ordered_operations, &splice);  	while (!list_empty(&splice)) {  		btrfs_inode = list_entry(splice.next, struct btrfs_inode,  				   ordered_operations); -  		inode = &btrfs_inode->vfs_inode;  		list_del_init(&btrfs_inode->ordered_operations); @@ -465,31 +724,41 @@ again:  		 * the inode may be getting freed (in sys_unlink path).  		 
*/  		inode = igrab(inode); +		if (!inode) +			continue; -		if (!wait && inode) { +		if (!wait)  			list_add_tail(&BTRFS_I(inode)->ordered_operations, -			      &root->fs_info->ordered_operations); -		} -		spin_unlock(&root->fs_info->ordered_extent_lock); - -		if (inode) { -			if (wait) -				btrfs_wait_ordered_range(inode, 0, (u64)-1); -			else -				filemap_flush(inode->i_mapping); -			btrfs_add_delayed_iput(inode); +				      &cur_trans->ordered_operations); +		spin_unlock(&root->fs_info->ordered_root_lock); + +		work = btrfs_alloc_delalloc_work(inode, wait, 1); +		if (!work) { +			spin_lock(&root->fs_info->ordered_root_lock); +			if (list_empty(&BTRFS_I(inode)->ordered_operations)) +				list_add_tail(&btrfs_inode->ordered_operations, +					      &splice); +			list_splice_tail(&splice, +					 &cur_trans->ordered_operations); +			spin_unlock(&root->fs_info->ordered_root_lock); +			ret = -ENOMEM; +			goto out;  		} +		list_add_tail(&work->list, &works); +		btrfs_queue_work(root->fs_info->flush_workers, +				 &work->work);  		cond_resched(); -		spin_lock(&root->fs_info->ordered_extent_lock); +		spin_lock(&root->fs_info->ordered_root_lock);  	} -	if (wait && !list_empty(&root->fs_info->ordered_operations)) -		goto again; - -	spin_unlock(&root->fs_info->ordered_extent_lock); -	mutex_unlock(&root->fs_info->ordered_operations_mutex); - -	return 0; +	spin_unlock(&root->fs_info->ordered_root_lock); +out: +	list_for_each_entry_safe(work, next, &works, list) { +		list_del_init(&work->list); +		btrfs_wait_and_free_delalloc_work(work); +	} +	mutex_unlock(&root->fs_info->ordered_extent_flush_mutex); +	return ret;  }  /* @@ -506,10 +775,12 @@ void btrfs_start_ordered_extent(struct inode *inode,  	u64 start = entry->file_offset;  	u64 end = start + entry->len - 1; +	trace_btrfs_ordered_extent_start(inode, entry); +  	/*  	 * pages in the range can be dirty, clean or writeback.  We  	 * start IO on any dirty ones so the wait doesn't stall waiting -	 * for pdflush to find them +	 * for the flusher thread to find them  	 */  	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))  		filemap_fdatawrite_range(inode->i_mapping, start, end); @@ -524,10 +795,10 @@ void btrfs_start_ordered_extent(struct inode *inode,   */  int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)  { +	int ret = 0;  	u64 end;  	u64 orig_end;  	struct btrfs_ordered_extent *ordered; -	int found;  	if (start + len < start) {  		orig_end = INT_LIMIT(loff_t); @@ -536,22 +807,39 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)  		if (orig_end > INT_LIMIT(loff_t))  			orig_end = INT_LIMIT(loff_t);  	} -again: +  	/* start IO across the range first to instantiate any delalloc  	 * extents  	 */ -	filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - -	/* The compression code will leave pages locked but return from -	 * writepage without setting the page writeback.  Starting again -	 * with WB_SYNC_ALL will end up waiting for the IO to actually start. +	ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); +	if (ret) +		return ret; +	/* +	 * So with compression we will find and lock a dirty page and clear the +	 * first one as dirty, setup an async extent, and immediately return +	 * with the entire range locked but with nobody actually marked with +	 * writeback.  So we can't just filemap_write_and_wait_range() and +	 * expect it to work since it will just kick off a thread to do the +	 * actual work.  
So we need to call filemap_fdatawrite_range _again_ +	 * since it will wait on the page lock, which won't be unlocked until +	 * after the pages have been marked as writeback and so we're good to go +	 * from there.  We have to do this otherwise we'll miss the ordered +	 * extents and that results in badness.  Please Josef, do not think you +	 * know better and pull this out at some point in the future, it is +	 * right and you are wrong.  	 */ -	filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - -	filemap_fdatawait_range(inode->i_mapping, start, orig_end); +	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +		     &BTRFS_I(inode)->runtime_flags)) { +		ret = filemap_fdatawrite_range(inode->i_mapping, start, +					       orig_end); +		if (ret) +			return ret; +	} +	ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); +	if (ret) +		return ret;  	end = orig_end; -	found = 0;  	while (1) {  		ordered = btrfs_lookup_first_ordered_extent(inode, end);  		if (!ordered) @@ -560,24 +848,20 @@ again:  			btrfs_put_ordered_extent(ordered);  			break;  		} -		if (ordered->file_offset + ordered->len < start) { +		if (ordered->file_offset + ordered->len <= start) {  			btrfs_put_ordered_extent(ordered);  			break;  		} -		found++;  		btrfs_start_ordered_extent(inode, ordered, 1);  		end = ordered->file_offset; +		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) +			ret = -EIO;  		btrfs_put_ordered_extent(ordered); -		if (end == 0 || end == start) +		if (ret || end == 0 || end == start)  			break;  		end--;  	} -	if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, -			   EXTENT_DELALLOC, 0, NULL)) { -		schedule_timeout(1); -		goto again; -	} -	return 0; +	return ret;  }  /* @@ -592,7 +876,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,  	struct btrfs_ordered_extent *entry = NULL;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	node = tree_search(tree, file_offset);  	if (!node)  		goto out; @@ -603,7 +887,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,  	if (entry)  		atomic_inc(&entry->refs);  out: -	spin_unlock(&tree->lock); +	spin_unlock_irq(&tree->lock);  	return entry;  } @@ -619,7 +903,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,  	struct btrfs_ordered_extent *entry = NULL;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	node = tree_search(tree, file_offset);  	if (!node) {  		node = tree_search(tree, file_offset + len); @@ -644,7 +928,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,  out:  	if (entry)  		atomic_inc(&entry->refs); -	spin_unlock(&tree->lock); +	spin_unlock_irq(&tree->lock);  	return entry;  } @@ -660,7 +944,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)  	struct btrfs_ordered_extent *entry = NULL;  	tree = &BTRFS_I(inode)->ordered_tree; -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	node = tree_search(tree, file_offset);  	if (!node)  		goto out; @@ -668,7 +952,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)  	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);  	atomic_inc(&entry->refs);  out: -	spin_unlock(&tree->lock); +	spin_unlock_irq(&tree->lock);  	return entry;  } @@ -680,22 +964,24 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,  				struct btrfs_ordered_extent *ordered)  {  	struct 
btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; -	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	u64 disk_i_size;  	u64 new_i_size; -	u64 i_size_test;  	u64 i_size = i_size_read(inode);  	struct rb_node *node;  	struct rb_node *prev = NULL;  	struct btrfs_ordered_extent *test;  	int ret = 1; -	if (ordered) +	spin_lock_irq(&tree->lock); +	if (ordered) {  		offset = entry_end(ordered); -	else +		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) +			offset = min(offset, +				     ordered->file_offset + +				     ordered->truncated_len); +	} else {  		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - -	spin_lock(&tree->lock); +	}  	disk_i_size = BTRFS_I(inode)->disk_i_size;  	/* truncate file */ @@ -709,18 +995,17 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,  	 * if the disk i_size is already at the inode->i_size, or  	 * this ordered extent is inside the disk i_size, we're done  	 */ -	if (disk_i_size == i_size || offset <= disk_i_size) { +	if (disk_i_size == i_size)  		goto out; -	}  	/* -	 * we can't update the disk_isize if there are delalloc bytes -	 * between disk_i_size and  this ordered extent +	 * We still need to update disk_i_size if outstanding_isize is greater +	 * than disk_i_size.  	 */ -	if (test_range_bit(io_tree, disk_i_size, offset - 1, -			   EXTENT_DELALLOC, 0, NULL)) { +	if (offset <= disk_i_size && +	    (!ordered || ordered->outstanding_isize <= disk_i_size))  		goto out; -	} +  	/*  	 * walk backward from this ordered extent to disk_i_size.  	 * if we find an ordered extent then we can't update disk i_size @@ -741,69 +1026,53 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,  		}  		node = prev;  	} -	while (node) { +	for (; node; node = rb_prev(node)) {  		test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + +		/* We treat this entry as if it doesnt exist */ +		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) +			continue;  		if (test->file_offset + test->len <= disk_i_size)  			break;  		if (test->file_offset >= i_size)  			break; -		if (test->file_offset >= disk_i_size) +		if (entry_end(test) > disk_i_size) { +			/* +			 * we don't update disk_i_size now, so record this +			 * undealt i_size. Or we will not know the real +			 * i_size. +			 */ +			if (test->outstanding_isize < offset) +				test->outstanding_isize = offset; +			if (ordered && +			    ordered->outstanding_isize > +			    test->outstanding_isize) +				test->outstanding_isize = +						ordered->outstanding_isize;  			goto out; -		node = rb_prev(node); +		}  	}  	new_i_size = min_t(u64, offset, i_size);  	/* -	 * at this point, we know we can safely update i_size to at least -	 * the offset from this ordered extent.  But, we need to -	 * walk forward and see if ios from higher up in the file have -	 * finished. -	 */ -	if (ordered) { -		node = rb_next(&ordered->rb_node); -	} else { -		if (prev) -			node = rb_next(prev); -		else -			node = rb_first(&tree->tree); -	} -	i_size_test = 0; -	if (node) { -		/* -		 * do we have an area where IO might have finished -		 * between our ordered extent and the next one. -		 */ -		test = rb_entry(node, struct btrfs_ordered_extent, rb_node); -		if (test->file_offset > offset) -			i_size_test = test->file_offset; -	} else { -		i_size_test = i_size; -	} - -	/* -	 * i_size_test is the end of a region after this ordered -	 * extent where there are no ordered extents.  
As long as there -	 * are no delalloc bytes in this area, it is safe to update -	 * disk_i_size to the end of the region. +	 * Some ordered extents may completed before the current one, and +	 * we hold the real i_size in ->outstanding_isize.  	 */ -	if (i_size_test > offset && -	    !test_range_bit(io_tree, offset, i_size_test - 1, -			    EXTENT_DELALLOC, 0, NULL)) { -		new_i_size = min_t(u64, i_size_test, i_size); -	} +	if (ordered && ordered->outstanding_isize > new_i_size) +		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);  	BTRFS_I(inode)->disk_i_size = new_i_size;  	ret = 0;  out:  	/* -	 * we need to remove the ordered extent with the tree lock held -	 * so that other people calling this function don't find our fully -	 * processed ordered entry and skip updating the i_size +	 * We need to do this because we can't remove ordered extents until +	 * after the i_disk_size has been updated and then the inode has been +	 * updated to reflect the change, so we need to tell anybody who finds +	 * this ordered extent that we've already done all the real work, we +	 * just haven't completed all the other work.  	 */  	if (ordered) -		__btrfs_remove_ordered_extent(inode, ordered); -	spin_unlock(&tree->lock); -	if (ordered) -		wake_up(&ordered->wait); +		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); +	spin_unlock_irq(&tree->lock);  	return ret;  } @@ -813,39 +1082,42 @@ out:   * be reclaimed before their checksum is actually put into the btree   */  int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, -			   u32 *sum) +			   u32 *sum, int len)  {  	struct btrfs_ordered_sum *ordered_sum; -	struct btrfs_sector_sum *sector_sums;  	struct btrfs_ordered_extent *ordered;  	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;  	unsigned long num_sectors;  	unsigned long i;  	u32 sectorsize = BTRFS_I(inode)->root->sectorsize; -	int ret = 1; +	int index = 0;  	ordered = btrfs_lookup_ordered_extent(inode, offset);  	if (!ordered) -		return 1; +		return 0; -	spin_lock(&tree->lock); +	spin_lock_irq(&tree->lock);  	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { -		if (disk_bytenr >= ordered_sum->bytenr) { -			num_sectors = ordered_sum->len / sectorsize; -			sector_sums = ordered_sum->sums; -			for (i = 0; i < num_sectors; i++) { -				if (sector_sums[i].bytenr == disk_bytenr) { -					*sum = sector_sums[i].sum; -					ret = 0; -					goto out; -				} -			} +		if (disk_bytenr >= ordered_sum->bytenr && +		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { +			i = (disk_bytenr - ordered_sum->bytenr) >> +			    inode->i_sb->s_blocksize_bits; +			num_sectors = ordered_sum->len >> +				      inode->i_sb->s_blocksize_bits; +			num_sectors = min_t(int, len - index, num_sectors - i); +			memcpy(sum + index, ordered_sum->sums + i, +			       num_sectors); + +			index += (int)num_sectors; +			if (index == len) +				goto out; +			disk_bytenr += num_sectors * sectorsize;  		}  	}  out: -	spin_unlock(&tree->lock); +	spin_unlock_irq(&tree->lock);  	btrfs_put_ordered_extent(ordered); -	return ret; +	return index;  } @@ -861,10 +1133,10 @@ out:   * If trans is not null, we'll do a friendly check for a transaction that   * is already flushing things and force the IO down ourselves.   
*/ -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, -				struct btrfs_root *root, -				struct inode *inode) +void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, +				 struct btrfs_root *root, struct inode *inode)  { +	struct btrfs_transaction *cur_trans = trans->transaction;  	u64 last_mod;  	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); @@ -873,24 +1145,31 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,  	 * if this file hasn't been changed since the last transaction  	 * commit, we can safely return without doing anything  	 */ -	if (last_mod < root->fs_info->last_trans_committed) -		return 0; +	if (last_mod <= root->fs_info->last_trans_committed) +		return; -	/* -	 * the transaction is already committing.  Just start the IO and -	 * don't bother with all of this list nonsense -	 */ -	if (trans && root->fs_info->running_transaction->blocked) { -		btrfs_wait_ordered_range(inode, 0, (u64)-1); -		return 0; -	} - -	spin_lock(&root->fs_info->ordered_extent_lock); +	spin_lock(&root->fs_info->ordered_root_lock);  	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {  		list_add_tail(&BTRFS_I(inode)->ordered_operations, -			      &root->fs_info->ordered_operations); +			      &cur_trans->ordered_operations);  	} -	spin_unlock(&root->fs_info->ordered_extent_lock); +	spin_unlock(&root->fs_info->ordered_root_lock); +} + +int __init ordered_data_init(void) +{ +	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", +				     sizeof(struct btrfs_ordered_extent), 0, +				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, +				     NULL); +	if (!btrfs_ordered_extent_cache) +		return -ENOMEM;  	return 0;  } + +void ordered_data_exit(void) +{ +	if (btrfs_ordered_extent_cache) +		kmem_cache_destroy(btrfs_ordered_extent_cache); +}  | 
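For context, a minimal caller-side sketch of the reworked ordered-extent API touched by this patch. The helper below is hypothetical and not part of the diff; it only illustrates the reference dance that the patch relies on: lookups take a reference on the entry, btrfs_start_ordered_extent() can block until the extent completes, and the final btrfs_put_ordered_extent() returns the entry to the new btrfs_ordered_extent_cache.

/*
 * Hypothetical caller, for illustration only (not part of this patch).
 */
static void example_wait_one_ordered(struct inode *inode, u64 file_offset)
{
	struct btrfs_ordered_extent *ordered;

	/* takes a reference on the ordered extent covering file_offset, if any */
	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
	if (!ordered)
		return;

	/* wait == 1: kick off writeback if needed and block until complete */
	btrfs_start_ordered_extent(inode, ordered, 1);

	/* last put frees the entry back into btrfs_ordered_extent_cache */
	btrfs_put_ordered_extent(ordered);
}

Range-level waiters would instead call btrfs_wait_ordered_range(), which after this patch propagates writeback errors (including -EIO when BTRFS_ORDERED_IOERR was set on an extent in the range) instead of always returning 0.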
