author     Miao Xie <miaox@cn.fujitsu.com>       2013-02-08 07:01:08 +0000
committer  Josef Bacik <jbacik@fusionio.com>     2013-02-20 12:59:47 -0500
commit     2e60a51e62185cce48758e596ae7cb2da673b58f (patch)
tree       bdbbac16110a3eeda8732c3ffb38a440204e2831 /fs/btrfs/inode.c
parent     0934856d4697e63c14056375e26e3bd6e8ebd34b (diff)
Btrfs: serialize unlocked dio reads with truncate
Currently we can do unlocked dio reads, but the following race is possible:

 dio_read_task                      truncate_task
                                    ->btrfs_setattr()
 ->btrfs_direct_IO
   ->__blockdev_direct_IO
     ->btrfs_get_block
                                      ->btrfs_truncate()
                                      #alloc truncated blocks
                                      #to other inode
   ->submit_io()
  #INFORMATION LEAK

In order to avoid this problem, we must serialize unlocked dio reads with
truncate. There are two approaches:
- use the extent lock to protect the extent range that we truncate
- use inode_dio_wait() to make sure the truncating task waits for the
  read DIO to finish

If we take the first approach, we will run into the endless-truncation
problem caused by unlocked read DIO once unlocked write DIO is implemented,
because we will still need to invoke inode_dio_wait() to avoid the race
between write DIO and truncation. At that point we would have to introduce
btrfs_inode_{block,resume}_unlocked_dio() again, which means implementing
this patch again, so I choose the second approach to fix the problem.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--   fs/btrfs/inode.c   |   23
1 file changed, 21 insertions(+), 2 deletions(-)
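
Note: the btrfs_inode_{block,resume}_unlocked_dio() helpers used in the first
hunk below are added to fs/btrfs/btrfs_inode.h by this patch, but that file is
not shown here because the view is limited to fs/btrfs/inode.c. A minimal
sketch of what those helpers amount to, assuming they simply set and clear
BTRFS_INODE_READDIO_NEED_LOCK with barriers that pair with the read side:

static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
{
	/* force new readers to fall back to locked DIO */
	set_bit(BTRFS_INODE_READDIO_NEED_LOCK,
		&BTRFS_I(inode)->runtime_flags);
	/* pairs with smp_mb__after_atomic_inc() in btrfs_direct_IO() */
	smp_mb();
}

static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
{
	smp_mb__before_clear_bit();
	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
		  &BTRFS_I(inode)->runtime_flags);
}

With this pairing, either the reader increments i_dio_count and then sees the
bit set (and falls back to DIO_LOCKING), or the truncating task sets the bit
and then sees the elevated i_dio_count, so inode_dio_wait() blocks until the
in-flight read DIO completes.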
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d11f38d8696..c6ee8f1063f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3888,6 +3888,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
+
+ /* Disable nonlocked read DIO to avoid the endless truncate */
+ btrfs_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ btrfs_inode_resume_unlocked_dio(inode);
+
ret = btrfs_truncate(inode);
if (ret && inode->i_nlink)
btrfs_orphan_del(NULL, inode);
@@ -6670,6 +6676,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
size_t count = 0;
+ int flags = 0;
+ bool wakeup = false;
ssize_t ret;
if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
@@ -6681,13 +6689,22 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
return ret;
+ } else {
+ atomic_inc(&inode->i_dio_count);
+ smp_mb__after_atomic_inc();
+ if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags))) {
+ inode_dio_done(inode);
+ flags = DIO_LOCKING | DIO_SKIP_HOLES;
+ } else {
+ wakeup = true;
+ }
}
ret = __blockdev_direct_IO(rw, iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, 0);
-
+ btrfs_submit_direct, flags);
if (rw & WRITE) {
if (ret < 0 && ret != -EIOCBQUEUED)
btrfs_delalloc_release_space(inode, count);
@@ -6700,6 +6717,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
}
btrfs_delalloc_release_metadata(inode, 0);
}
+ if (wakeup)
+ inode_dio_done(inode);
return ret;
}