From 3c64a1aba7cfcb04f79e76f859b3d66660275d59 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 13 May 2013 13:53:35 +0000 Subject: Btrfs: cleanup: don't check the same thing twice btrfs_read_fs_root_no_name() already checks if btrfs_root_refs() is zero and returns ENOENT in this case. There is no need to do it again in six places. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b3e359bc8e6..185af15ad9e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -308,10 +308,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, ret = PTR_ERR(inode_root); goto cleanup; } - if (btrfs_root_refs(&inode_root->root_item) == 0) { - ret = -ENOENT; - goto cleanup; - } key.objectid = defrag->ino; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); -- cgit v1.2.3-18-g5258 From a71754fc68f740b7ed46bb83123c63fbbc130611 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 17 Jun 2013 17:14:39 -0400 Subject: Btrfs: move btrfs_truncate_page to btrfs_cont_expand instead of btrfs_truncate This has plagued us forever and I'm so over working around it. When we truncate down to a non-page aligned offset we will call btrfs_truncate_page to zero out the end of the page and write it back to disk, this will keep us from exposing stale data if we truncate back up from that point. The problem with this is it requires data space to do this, and people don't really expect to get ENOSPC from truncate() for these sort of things. This also tends to bite the orphan cleanup stuff too which keeps people from mounting. To get around this we can just move this into btrfs_cont_expand() to make sure if we are truncating up from a non-page size aligned i_size we will zero out the rest of this page so that we don't expose stale data. This will give ENOSPC if you try to truncate() up or if you try to write past the end of isize, which is much more reasonable. This fixes xfstests generic/083 failing to mount because of the orphan cleanup failing. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 185af15ad9e..5ffde560368 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2173,12 +2173,6 @@ static long btrfs_fallocate(struct file *file, int mode, goto out_reserve_fail; } - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - mutex_lock(&inode->i_mutex); ret = inode_newsize_ok(inode, alloc_end); if (ret) @@ -2189,8 +2183,23 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start); if (ret) goto out; + } else { + /* + * If we are fallocating from the end of the file onward we + * need to zero out the end of the page if i_size lands in the + * middle of a page. + */ + ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); + if (ret) + goto out; } + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; -- cgit v1.2.3-18-g5258 From 7ee9e4405f264e9eda808aa5ca4522746a1af9c1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Jun 2013 16:37:03 -0400 Subject: Btrfs: check if we can nocow if we don't have data space We always just try and reserve data space when we write, but if we are out of space but have prealloc'ed extents we should still successfully write. This patch will try and see if we can write to prealloc'ed space and if we can go ahead and allow the write to continue. With this patch we now pass xfstests generic/274. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 114 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5ffde560368..2d70849cec9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1312,6 +1312,56 @@ fail: } +static noinline int check_can_nocow(struct inode *inode, loff_t pos, + size_t *write_bytes) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered; + u64 lockstart, lockend; + u64 num_bytes; + int ret; + + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; + + while (1) { + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) { + break; + } + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + return PTR_ERR(trans); + } + + num_bytes = lockend - lockstart + 1; + ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL, + NULL); + btrfs_end_transaction(trans, root); + if (ret <= 0) { + ret = 0; + } else { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, + NULL, GFP_NOFS); + *write_bytes = min_t(size_t, *write_bytes, num_bytes); + } + + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + + return ret; +} + static noinline ssize_t __btrfs_buffered_write(struct file *file, struct iov_iter *i, loff_t pos) @@ -1319,10 +1369,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; + u64 release_bytes = 0; unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; + bool only_release_metadata = false; bool force_page_uptodate = false; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / @@ -1343,6 +1395,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, offset); size_t num_pages = (write_bytes + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1357,11 +1410,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, break; } - ret = btrfs_delalloc_reserve_space(inode, - num_pages << PAGE_CACHE_SHIFT); + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = btrfs_check_data_free_space(inode, reserve_bytes); + if (ret == -ENOSPC && + (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC))) { + ret = check_can_nocow(inode, pos, &write_bytes); + if (ret > 0) { + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = 0; + } else { + ret = -ENOSPC; + } + } + if (ret) break; + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); + if (ret) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, + reserve_bytes); + break; + } + + release_bytes = reserve_bytes; + /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the @@ -1370,11 +1453,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, ret = prepare_pages(root, file, pages, num_pages, pos, first_index, write_bytes, force_page_uptodate); - if (ret) { - btrfs_delalloc_release_space(inode, - num_pages << PAGE_CACHE_SHIFT); + if (ret) break; - } copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, i); @@ -1404,30 +1484,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * managed to copy. */ if (num_pages > dirty_pages) { + release_bytes = (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT; if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - btrfs_delalloc_release_space(inode, - (num_pages - dirty_pages) << - PAGE_CACHE_SHIFT); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, + release_bytes); + else + btrfs_delalloc_release_space(inode, + release_bytes); } + release_bytes = dirty_pages << PAGE_CACHE_SHIFT; if (copied > 0) { ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); if (ret) { - btrfs_delalloc_release_space(inode, - dirty_pages << PAGE_CACHE_SHIFT); btrfs_drop_pages(pages, num_pages); break; } } + release_bytes = 0; btrfs_drop_pages(pages, num_pages); + if (only_release_metadata && copied > 0) { + u64 lockstart = round_down(pos, root->sectorsize); + u64 lockend = lockstart + + (dirty_pages << PAGE_CACHE_SHIFT) - 1; + + set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_NORESERVE, NULL, + NULL, GFP_NOFS); + only_release_metadata = false; + } + cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1440,6 +1536,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, kfree(pages); + if (release_bytes) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, release_bytes); + else + btrfs_delalloc_release_space(inode, release_bytes); + } + return num_written ? num_written : ret; } -- cgit v1.2.3-18-g5258 From 46a1c2c7ae53de2a5676754b54a73c591a3951d2 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 25 Jun 2013 12:02:13 +0800 Subject: vfs: export lseek_execute() to modules For those file systems(btrfs/ext4/ocfs2/tmpfs) that support SEEK_DATA/SEEK_HOLE functions, we end up handling the similar matter in lseek_execute() to update the current file offset to the desired offset if it is valid, ceph also does the simliar things at ceph_llseek(). To reduce the duplications, this patch make lseek_execute() public accessible so that we can call it directly from the underlying file systems. Thanks Dave Chinner for this suggestion. [AV: call it vfs_setpos(), don't bring the removed 'inode' argument back] v2->v1: - Add kernel-doc comments for lseek_execute() - Call lseek_execute() in ceph->llseek() Signed-off-by: Jie Liu Cc: Dave Chinner Cc: Al Viro Cc: Andi Kleen Cc: Andrew Morton Cc: Christoph Hellwig Cc: Chris Mason Cc: Josef Bacik Cc: Ben Myers Cc: Ted Tso Cc: Hugh Dickins Cc: Mark Fasheh Cc: Joel Becker Cc: Sage Weil Signed-off-by: Al Viro --- fs/btrfs/file.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4205ba752d4..89da56a58b6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2425,20 +2425,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) } } - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { - offset = -EINVAL; - goto out; - } - if (offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; - goto out; - } - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: mutex_unlock(&inode->i_mutex); return offset; -- cgit v1.2.3-18-g5258 From ee20a98314e52a6675e94d1a07ca205ffdf09a72 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Jul 2013 10:34:59 -0400 Subject: Btrfs: allow splitting of hole em's when dropping extent cache I noticed while running multi-threaded fsync tests that sometimes fsck would complain about an improper gap. This happens because we fail to add a hole extent to the file, which was happening when we'd split a hole EM because btrfs_drop_extent_cache was just discarding the whole em instead of splitting it. So this patch fixes this by allowing us to split a hole em properly, which means that added holes actually get logged properly and we no longer see this fsck error. Thankfully we're tolerant of these sort of problems so a user would not see any adverse effects of this bug, other than fsck complaining. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 62 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a005fe2c072..8e686a427ce 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -596,20 +596,29 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, if (no_splits) goto next; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - em->start < start) { + if (em->start < start) { split->start = em->start; split->len = start - em->start; - split->orig_start = em->orig_start; - split->block_start = em->block_start; - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->ram_bytes = em->ram_bytes; - split->orig_block_len = max(split->block_len, - em->orig_block_len); + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + split->generation = gen; split->bdev = em->bdev; split->flags = flags; @@ -620,8 +629,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = split2; split2 = NULL; } - if (em->block_start < EXTENT_MAP_LAST_BYTE && - testend && em->start + em->len > start + len) { + if (testend && em->start + em->len > start + len) { u64 diff = start + len - em->start; split->start = start + len; @@ -630,18 +638,28 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; - split->orig_block_len = max(em->block_len, + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, em->orig_block_len); - split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + + diff; + split->orig_start = em->orig_start; + } } else { - split->block_len = split->len; - split->block_start = em->block_start + diff; - split->orig_start = em->orig_start; + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; } ret = add_extent_mapping(em_tree, split, modified); -- cgit v1.2.3-18-g5258