From d88d46c6e06cb47cd3b951287ccaf414e96560d0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 10 Jun 2013 12:59:04 +0000 Subject: Btrfs: free csums when we're done scrubbing an extent A user reported scrub taking up an unreasonable amount of ram as it ran. This is because we lookup the csums for the extent we're scrubbing but don't free it up until after we're done with the scrub, which means we can take up a whole lot of ram. This patch fixes this by dropping the csums once we're done with the extent we've scrubbed. The user reported this to fix their problem. Thanks, Reported-and-tested-by: Remco Hosman Signed-off-by: Josef Bacik --- fs/btrfs/scrub.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 79bd479317c..cb308a3a930 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2505,6 +2505,7 @@ again: if (ret) goto out; + scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { logical += increment; -- cgit v1.2.3-18-g5258 From f51a4a1826ff810eb9c00cadff8978b028c40756 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Jun 2013 10:36:09 +0800 Subject: Btrfs: remove btrfs_sector_sum structure Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s -> 54MB/s). test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/scrub.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cb308a3a930..63144e4ca9e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, u8 *csum) { struct btrfs_ordered_sum *sum = NULL; - int ret = 0; - unsigned long i; + unsigned long index; unsigned long num_sectors; while (!list_empty(&sctx->csum_list)) { @@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, if (!sum) return 0; + index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; num_sectors = sum->len / sctx->sectorsize; - for (i = 0; i < num_sectors; ++i) { - if (sum->sums[i].bytenr == logical) { - memcpy(csum, &sum->sums[i].sum, sctx->csum_size); - ret = 1; - break; - } - } - if (ret && i == num_sectors - 1) { + memcpy(csum, sum->sums + index, sctx->csum_size); + if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); } - return ret; + return 1; } /* scrub extent tries to collect up to 64 kB for each bio */ -- cgit v1.2.3-18-g5258 From 26b258919006fc2d76a50b8247d7dea73207b583 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 27 Jun 2013 18:50:58 +0800 Subject: Btrfs: fix oops when recovering the file data by scrub function We get oops while running btrfs replace start test, ------------[ cut here ]------------ kernel BUG at mm/filemap.c:608! [SNIP] Call Trace: [] copy_nocow_pages_for_inode+0x217/0x3f0 [btrfs] [] ? scrub_print_warning_inode+0x230/0x230 [btrfs] [] ? scrub_print_warning_inode+0x230/0x230 [btrfs] [] iterate_extent_inodes+0x1ae/0x300 [btrfs] [] iterate_inodes_from_logical+0x92/0xb0 [btrfs] [] ? scrub_print_warning_inode+0x230/0x230 [btrfs] [] copy_nocow_pages_worker+0x97/0x150 [btrfs] [] worker_loop+0x134/0x540 [btrfs] [] ? __schedule+0x3ca/0x7f0 [] ? btrfs_queue_worker+0x300/0x300 [btrfs] [] kthread+0xc0/0xd0 [] ? flush_kthread_worker+0x80/0x80 [] ret_from_fork+0x7c/0xb0 [] ? flush_kthread_worker+0x80/0x80 [SNIP] RIP [] unlock_page+0x35/0x40 RSP ---[ end trace 421e79ad0dd72c7d ]--- it is because we forgot to lock the page again after we read data to the page. Fix it. Signed-off-by: Lin Feng Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 63144e4ca9e..c1647f8c1cd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3258,7 +3258,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) ret = ret_sub; goto next_page; } - wait_on_page_locked(page); + lock_page(page); if (!PageUptodate(page)) { ret = -EIO; goto next_page; -- cgit v1.2.3-18-g5258 From 826aa0a82c5b9d2c8016c02b552e8f82f5b1e660 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 27 Jun 2013 18:50:59 +0800 Subject: Btrfs: cleanup the code of copy_nocow_pages_for_inode() - It make no sense that we continue to do something after the error happened, just go back with this patch. - remove some check of copy_nocow_pages_for_inode(), such as page check after write, inode check in the end of the function, because we are sure they exist. - remove the unnecessary goto in the return value check of the write Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/scrub.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c1647f8c1cd..186ea82b75f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3199,16 +3199,18 @@ out: static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) { - unsigned long index; struct scrub_copy_nocow_ctx *nocow_ctx = ctx; - int ret = 0; + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; struct btrfs_key key; - struct inode *inode = NULL; + struct inode *inode; + struct page *page; struct btrfs_root *local_root; u64 physical_for_dev_replace; u64 len; - struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + unsigned long index; int srcu_index; + int ret; + int err; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; @@ -3230,19 +3232,17 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); + ret = 0; physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; len = nocow_ctx->len; while (len >= PAGE_CACHE_SIZE) { - struct page *page = NULL; - int ret_sub; - index = offset >> PAGE_CACHE_SHIFT; page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { pr_err("find_or_create_page() failed\n"); ret = -ENOMEM; - goto next_page; + goto out; } if (PageUptodate(page)) { @@ -3250,12 +3250,12 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) goto next_page; } else { ClearPageError(page); - ret_sub = extent_read_full_page(&BTRFS_I(inode)-> + err = extent_read_full_page(&BTRFS_I(inode)-> io_tree, page, btrfs_get_extent, nocow_ctx->mirror_num); - if (ret_sub) { - ret = ret_sub; + if (err) { + ret = err; goto next_page; } lock_page(page); @@ -3264,25 +3264,23 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) goto next_page; } } - ret_sub = write_page_nocow(nocow_ctx->sctx, - physical_for_dev_replace, page); - if (ret_sub) { - ret = ret_sub; - goto next_page; - } - + err = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (err) + ret = err; next_page: - if (page) { - unlock_page(page); - put_page(page); - } + unlock_page(page); + page_cache_release(page); + + if (ret) + break; + offset += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE; } - - if (inode) - iput(inode); +out: + iput(inode); return ret; } -- cgit v1.2.3-18-g5258 From edd1400be9f983f521c7397740d810fa210ee52f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 27 Jun 2013 18:51:00 +0800 Subject: Btrfs: fix several potential problems in copy_nocow_pages_for_inode - It makes no sense that we deal with a inode in the dead tree. - fix the race between dio and page copy by waiting the dio completion - avoid the page copy vs truncate/punch hole - check if the page is in the page cache or not Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/scrub.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 186ea82b75f..4ba2a69a60a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3224,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) return PTR_ERR(local_root); } + if (btrfs_root_refs(&local_root->root_item) == 0) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + return -ENOENT; + } + key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; @@ -3232,12 +3237,16 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); + /* Avoid truncate/dio/punch hole.. */ + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + ret = 0; physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; len = nocow_ctx->len; while (len >= PAGE_CACHE_SIZE) { index = offset >> PAGE_CACHE_SHIFT; - +again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { pr_err("find_or_create_page() failed\n"); @@ -3258,7 +3267,18 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) ret = err; goto next_page; } + lock_page(page); + /* + * If the page has been remove from the page cache, + * the data on it is meaningless, because it may be + * old one, the new data may be written into the new + * page in the page cache. + */ + if (page->mapping != inode->i_mapping) { + page_cache_release(page); + goto again; + } if (!PageUptodate(page)) { ret = -EIO; goto next_page; @@ -3280,6 +3300,7 @@ next_page: len -= PAGE_CACHE_SIZE; } out: + mutex_unlock(&inode->i_mutex); iput(inode); return ret; } -- cgit v1.2.3-18-g5258 From 115930cb2d444a684975cf2325759cb48ebf80cc Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 4 Jul 2013 16:14:23 +0200 Subject: Btrfs: fix wrong write offset when replacing a device Miao Xie reported the following issue: The filesystem was corrupted after we did a device replace. Steps to reproduce: # mkfs.btrfs -f -m single -d raid10 .. # mount # btrfs replace start -rfB 1 # umount # btrfsck The reason for the issue is that we changed the write offset by mistake, introduced by commit 625f1c8dc. We read the data from the source device at first, and then write the data into the corresponding place of the new device. In order to implement the "-r" option, the source location is remapped using btrfs_map_block(). The read takes place on the mapped location, and the write needs to take place on the unmapped location. Currently the write is using the mapped location, and this commit changes it back by undoing the change to the write address that the aforementioned commit added by mistake. Reported-by: Miao Xie Cc: # 3.10+ Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4ba2a69a60a..64a157becbe 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2495,7 +2495,7 @@ again: ret = scrub_extent(sctx, extent_logical, extent_len, extent_physical, extent_dev, flags, generation, extent_mirror_num, - extent_physical); + extent_logical - logical + physical); if (ret) goto out; -- cgit v1.2.3-18-g5258