diff options
Diffstat (limited to 'fs/btrfs/scrub.c')
| -rw-r--r-- | fs/btrfs/scrub.c | 816 |
1 files changed, 504 insertions, 312 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f245c..b6d198f5181 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -28,6 +28,7 @@ #include "dev-replace.h" #include "check-integrity.h" #include "rcu-string.h" +#include "raid56.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -157,12 +158,20 @@ struct scrub_fixup_nodatasum { int mirror_num; }; +struct scrub_nocow_inode { + u64 inum; + u64 offset; + u64 root; + struct list_head list; +}; + struct scrub_copy_nocow_ctx { struct scrub_ctx *sctx; u64 logical; u64 len; int mirror_num; u64 physical_for_dev_replace; + struct list_head inodes; struct btrfs_work work; }; @@ -199,7 +208,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, int is_metadata, int have_csum, const u8 *csum, u64 generation, u16 csum_size); -static void scrub_complete_bio_end_io(struct bio *bio, int err); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write); @@ -244,10 +252,12 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); static int write_page_nocow(struct scrub_ctx *sctx, u64 physical_for_dev_replace, struct page *page); static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, - void *ctx); + struct scrub_copy_nocow_ctx *ctx); static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace); static void copy_nocow_pages_worker(struct btrfs_work *work); +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -261,6 +271,29 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx) wake_up(&sctx->list_wait); } +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } +} + +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + wake_up(&fs_info->scrub_pause_wait); +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -282,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) atomic_inc(&fs_info->scrubs_running); atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); + + /* + * check if @scrubs_running=@scrubs_paused condition + * inside wait_event() is not an atomic operation. + * which means we may inc/dec @scrub_running/paused + * at any time. Let's wake up @scrub_pause_wait as + * much as we can to let commit transaction blocked less. + */ + wake_up(&fs_info->scrub_pause_wait); + atomic_inc(&sctx->workers_pending); } @@ -385,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - sbio->work.func = scrub_bio_end_io_worker; + btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, + NULL, NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -472,7 +516,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " "length %llu, links %u (path: %s)\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -484,7 +528,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: - printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " "resolving failed with ret=%d\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -541,14 +585,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { - ret = tree_backref_for_extent(&ptr, eb, ei, item_size, - &ref_root, &ref_level); + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); printk_in_rcu(KERN_WARNING - "btrfs: %s at logical %llu on dev %s, " + "BTRFS: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, rcu_str_deref(dev->name), @@ -557,7 +601,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); } while (ret != 1); + btrfs_release_path(path); } else { + btrfs_release_path(path); swarn.path = path; swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, @@ -580,20 +626,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) int corrected = 0; struct btrfs_key key; struct inode *inode = NULL; + struct btrfs_fs_info *fs_info; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); - if (IS_ERR(local_root)) + + fs_info = fixup->root->fs_info; + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; - inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -606,7 +661,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) } if (PageUptodate(page)) { - struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -664,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) out: if (page) put_page(page); - if (inode) - iput(inode); + + iput(inode); if (ret < 0) return ret; @@ -687,13 +741,11 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) struct scrub_fixup_nodatasum *fixup; struct scrub_ctx *sctx; struct btrfs_trans_handle *trans = NULL; - struct btrfs_fs_info *fs_info; struct btrfs_path *path; int uncorrectable = 0; fixup = container_of(work, struct scrub_fixup_nodatasum, work); sctx = fixup->sctx; - fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { @@ -742,10 +794,9 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", - (unsigned long long)fixup->logical, - rcu_str_deref(fixup->dev->name)); + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "unable to fixup (nodatasum) error at logical %llu on dev %s\n", + fixup->logical, rcu_str_deref(fixup->dev->name)); } btrfs_free_path(path); @@ -921,8 +972,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sctx->readonly && !sctx->is_dev_replace) - goto did_not_correct_error; + if (sctx->readonly) { + ASSERT(!sctx->is_dev_replace); + goto out; + } if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; @@ -946,9 +999,10 @@ nodatasum_case: fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; scrub_pending_trans_workers_inc(sctx); - fixup_nodatasum->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, - &fixup_nodatasum->work); + btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, + NULL, NULL); + btrfs_queue_work(fs_info->scrub_workers, + &fixup_nodatasum->work); goto out; } @@ -1143,9 +1197,8 @@ corrected_error: sctx->stat.corrected_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: fixed up error at logical %llu on dev %s\n", - (unsigned long long)logical, - rcu_str_deref(dev->name)); + "BTRFS: fixed up error at logical %llu on dev %s\n", + logical, rcu_str_deref(dev->name)); } } else { did_not_correct_error: @@ -1153,9 +1206,8 @@ did_not_correct_error: sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", - (unsigned long long)logical, - rcu_str_deref(dev->name)); + "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", + logical, rcu_str_deref(dev->name)); } out: @@ -1277,7 +1329,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; struct scrub_page *page = sblock->pagev[page_num]; - DECLARE_COMPLETION_ONSTACK(complete); if (page->dev->bdev == NULL) { page->io_error = 1; @@ -1286,26 +1337,19 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!page->page); - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { page->io_error = 1; sblock->no_io_error_seen = 0; continue; } bio->bi_bdev = page->dev->bdev; - bio->bi_sector = page->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; + bio->bi_iter.bi_sector = page->physical >> 9; bio_add_page(bio, page->page, PAGE_SIZE, 0); - btrfsic_submit_bio(READ, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - - page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (btrfsic_submit_bio_wait(READ, bio)) sblock->no_io_error_seen = 0; + bio_put(bio); } @@ -1326,7 +1370,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, int page_num; u8 calculated_csum[BTRFS_CSUM_SIZE]; u32 crc = ~(u32)0; - struct btrfs_root *root = fs_info->extent_root; void *mapped_buffer; WARN_ON(!sblock->pagev[0]->page); @@ -1336,12 +1379,12 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, mapped_buffer = kmap_atomic(sblock->pagev[0]->page); h = (struct btrfs_header *)mapped_buffer; - if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) || + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; - } else if (generation != le64_to_cpu(h->generation)) { + } else if (generation != btrfs_stack_header_generation(h)) { sblock->header_error = 1; sblock->generation_error = 1; } @@ -1355,12 +1398,11 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, for (page_num = 0;;) { if (page_num == 0 && is_metadata) - crc = btrfs_csum_data(root, + crc = btrfs_csum_data( ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, crc, PAGE_SIZE - BTRFS_CSUM_SIZE); else - crc = btrfs_csum_data(root, mapped_buffer, crc, - PAGE_SIZE); + crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE); kunmap_atomic(mapped_buffer); page_num++; @@ -1376,11 +1418,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, sblock->checksum_error = 1; } -static void scrub_complete_bio_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write) @@ -1415,32 +1452,27 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, sblock_bad->checksum_error || page_bad->io_error) { struct bio *bio; int ret; - DECLARE_COMPLETION_ONSTACK(complete); if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING - "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); + printk_ratelimited(KERN_WARNING "BTRFS: " + "scrub_repair_page_from_good_copy(bdev == NULL) " + "is unexpected!\n"); return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; bio->bi_bdev = page_bad->dev->bdev; - bio->bi_sector = page_bad->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; + bio->bi_iter.bi_sector = page_bad->physical >> 9; ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { bio_put(bio); return -EIO; } - btrfsic_submit_bio(WRITE, bio); - /* this will also unplug the queue */ - wait_for_completion(&complete); - if (!bio_flagged(bio, BIO_UPTODATE)) { + if (btrfsic_submit_bio_wait(WRITE, bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( @@ -1514,7 +1546,7 @@ again: sbio->dev = wr_ctx->tgtdev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); if (!bio) { mutex_unlock(&wr_ctx->wr_lock); return -ENOMEM; @@ -1525,7 +1557,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1584,8 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - sbio->work.func = scrub_wr_bio_end_io_worker; - btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); + btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) @@ -1647,7 +1679,6 @@ static int scrub_checksum_data(struct scrub_block *sblock) void *buffer; u32 crc = ~(u32)0; int fail = 0; - struct btrfs_root *root = sctx->dev_root; u64 len; int index; @@ -1664,7 +1695,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); - crc = btrfs_csum_data(root, buffer, crc, l); + crc = btrfs_csum_data(buffer, crc, l); kunmap_atomic(buffer); len -= l; if (len == 0) @@ -1713,10 +1744,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * b) the page is already kmapped */ - if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) ++fail; - if (sblock->pagev[0]->generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1734,7 +1765,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(root, p, crc, l); + crc = btrfs_csum_data(p, crc, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1779,10 +1810,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) s = (struct btrfs_super_block *)mapped_buffer; memcpy(on_disk_csum, s->csum, sctx->csum_size); - if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) ++fail_cor; - if (sblock->pagev[0]->generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0]->generation != btrfs_super_generation(s)) ++fail_gen; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1795,7 +1826,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(root, p, crc, l); + crc = btrfs_csum_data(p, crc, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1883,7 +1914,7 @@ static void scrub_submit(struct scrub_ctx *sctx) * This case is handled correctly (but _very_ slowly). */ printk_ratelimited(KERN_WARNING - "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); + "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); bio_endio(sbio->bio, -EIO); } else { btrfsic_submit_bio(READ, sbio->bio); @@ -1923,7 +1954,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -1932,7 +1963,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -2054,7 +2085,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); + btrfs_queue_work(fs_info->scrub_workers, &sbio->work); } static void scrub_bio_end_io_worker(struct btrfs_work *work) @@ -2119,8 +2150,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, u8 *csum) { struct btrfs_ordered_sum *sum = NULL; - int ret = 0; - unsigned long i; + unsigned long index; unsigned long num_sectors; while (!list_empty(&sctx->csum_list)) { @@ -2139,19 +2169,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, if (!sum) return 0; + index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; num_sectors = sum->len / sctx->sectorsize; - for (i = 0; i < num_sectors; ++i) { - if (sum->sums[i].bytenr == logical) { - memcpy(csum, &sum->sums[i].sum, sctx->csum_size); - ret = 1; - break; - } - } - if (ret && i == num_sectors - 1) { + memcpy(csum, sum->sums + index, sctx->csum_size); + if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); } - return ret; + return 1; } /* scrub extent tries to collect up to 64 kB for each bio */ @@ -2211,6 +2236,47 @@ behind_scrub_pages: return 0; } +/* + * Given a physical address, this will calculate it's + * logical offset. if this is a parity stripe, it will return + * the most left data stripe's logical offset. + * + * return 0 if it is a data stripe, 1 means parity stripe. + */ +static int get_raid56_logic_offset(u64 physical, int num, + struct map_lookup *map, u64 *offset) +{ + int i; + int j = 0; + u64 stripe_nr; + u64 last_offset; + int stripe_index; + int rot; + + last_offset = (physical - map->stripes[num].physical) * + nr_data_stripes(map); + *offset = last_offset; + for (i = 0; i < nr_data_stripes(map); i++) { + *offset = last_offset + i * map->stripe_len; + + stripe_nr = *offset; + do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, nr_data_stripes(map)); + + /* Work out the disk rotation on this stripe-set */ + rot = do_div(stripe_nr, map->num_stripes); + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; + if (stripe_index == num) + return 0; + if (stripe_index < num) + j++; + } + *offset = last_offset + j * map->stripe_len; + return 1; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -2226,12 +2292,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 flags; int ret; int slot; - int i; u64 nstripes; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; + u64 logic_end; + u64 physical_end; u64 generation; int mirror_num; struct reada_control *reada1; @@ -2245,8 +2312,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_len; struct btrfs_device *extent_dev; int extent_mirror_num; + int stop_loop = 0; nstripes = length; + physical = map->stripes[num].physical; offset = 0; do_div(nstripes, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { @@ -2264,6 +2333,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical, num, map, &offset); + increment = map->stripe_len * nr_data_stripes(map); + mirror_num = 1; } else { increment = map->stripe_len; mirror_num = 1; @@ -2287,19 +2361,26 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * to not hold off transaction commits */ logical = base + offset; - + physical_end = physical + nstripes * map->stripe_len; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical_end, num, + map, &logic_end); + logic_end += base; + } else { + logic_end = logical + increment * nstripes; + } wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ key_start.objectid = logical; key_start.type = BTRFS_EXTENT_ITEM_KEY; key_start.offset = (u64)0; - key_end.objectid = base + offset + nstripes * increment; - key_end.type = BTRFS_EXTENT_ITEM_KEY; - key_end.offset = (u64)0; + key_end.objectid = logic_end; + key_end.type = BTRFS_METADATA_ITEM_KEY; + key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key_start, &key_end); key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -2307,7 +2388,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = base + offset + nstripes * increment; + key_end.offset = logic_end; reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); if (!IS_ERR(reada1)) @@ -2315,16 +2396,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!IS_ERR(reada2)) btrfs_reada_wait(reada2); - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during @@ -2335,10 +2406,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* * now find all extents for each stripe and scrub them */ - logical = base + offset; - physical = map->stripes[num].physical; ret = 0; - for (i = 0; i < nstripes; ++i) { + while (physical < physical_end) { + /* for raid56, we skip parity stripe */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + ret = get_raid56_logic_offset(physical, num, + map, &logical); + logical += base; + if (ret) + goto skip; + } /* * canceled? */ @@ -2360,36 +2438,22 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); } - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; + if (ret > 0) { - ret = btrfs_previous_item(root, path, 0, - BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_previous_extent_item(root, path, 0); if (ret < 0) goto out; if (ret > 0) { @@ -2403,7 +2467,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } } + stop_loop = 0; while (1) { + u64 bytes; + l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { @@ -2413,19 +2480,30 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (ret < 0) goto out; + stop_loop = 1; break; } btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid + key.offset <= logical) - goto next; + if (key.type == BTRFS_METADATA_ITEM_KEY) + bytes = root->leafsize; + else + bytes = key.offset; - if (key.objectid >= logical + map->stripe_len) - break; + if (key.objectid + bytes <= logical) + goto next; - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) goto next; + if (key.objectid >= logical + map->stripe_len) { + /* out of this device extent */ + if (key.objectid >= logic_end) + stop_loop = 1; + break; + } + extent = btrfs_item_ptr(l, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(l, extent); @@ -2433,30 +2511,31 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (key.objectid < logical && (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { - printk(KERN_ERR - "btrfs scrub: tree block %llu spanning " - "stripes, ignored. logical=%llu\n", - (unsigned long long)key.objectid, - (unsigned long long)logical); + btrfs_err(fs_info, + "scrub: tree block %llu spanning " + "stripes, ignored. logical=%llu", + key.objectid, logical); goto next; } +again: + extent_logical = key.objectid; + extent_len = bytes; + /* * trim extent to this stripe */ - if (key.objectid < logical) { - key.offset -= logical - key.objectid; - key.objectid = logical; + if (extent_logical < logical) { + extent_len -= logical - extent_logical; + extent_logical = logical; } - if (key.objectid + key.offset > + if (extent_logical + extent_len > logical + map->stripe_len) { - key.offset = logical + map->stripe_len - - key.objectid; + extent_len = logical + map->stripe_len - + extent_logical; } - extent_logical = key.objectid; - extent_physical = key.objectid - logical + physical; - extent_len = key.offset; + extent_physical = extent_logical - logical + physical; extent_dev = scrub_dev; extent_mirror_num = mirror_num; if (is_dev_replace) @@ -2464,22 +2543,66 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, extent_len, &extent_physical, &extent_dev, &extent_mirror_num); + + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + ret = scrub_extent(sctx, extent_logical, extent_len, extent_physical, extent_dev, flags, generation, extent_mirror_num, - key.objectid - logical + physical); + extent_logical - logical + physical); if (ret) goto out; + scrub_free_csums(sctx); + if (extent_logical + extent_len < + key.objectid + bytes) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + /* + * loop until we find next data stripe + * or we have finished all stripes. + */ + do { + physical += map->stripe_len; + ret = get_raid56_logic_offset( + physical, num, + map, &logical); + logical += base; + } while (physical < physical_end && ret); + } else { + physical += map->stripe_len; + logical += increment; + } + if (logical < key.objectid + bytes) { + cond_resched(); + goto again; + } + + if (physical >= physical_end) { + stop_loop = 1; + break; + } + } next: path->slots[0]++; } btrfs_release_path(path); +skip: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); - sctx->stat.last_physical = physical; + if (stop_loop) + sctx->stat.last_physical = map->stripes[num].physical + + length; + else + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); + if (stop_loop) + break; } out: /* push queued extents */ @@ -2602,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); - if (found_key.offset + length <= start) { - key.offset = found_key.offset + length; - btrfs_release_path(path); - continue; - } + if (found_key.offset + length <= start) + goto skip; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -2617,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * the chunk from going away while we scrub it */ cache = btrfs_lookup_block_group(fs_info, chunk_offset); - if (!cache) { - ret = -ENOENT; - break; - } + + /* some chunks are removed but not committed to disk yet, + * continue scrubbing */ + if (!cache) + goto skip; + dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; @@ -2646,25 +2768,24 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); + + /* + * must be called before we decrease @scrub_paused. + * make sure we don't block transaction commit while + * we are waiting pending workers finished. + */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } + __scrub_blocked_if_needed(fs_info); atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); - dev_replace->cursor_left = dev_replace->cursor_right; - dev_replace->item_needs_writeback = 1; btrfs_put_block_group(cache); if (ret) break; @@ -2678,6 +2799,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; +skip: key.offset = found_key.offset + length; btrfs_release_path(path); } @@ -2700,7 +2824,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, int ret; struct btrfs_root *root = sctx->dev_root; - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EIO; gen = root->fs_info->last_trans_committed; @@ -2728,52 +2852,49 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { int ret = 0; + int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; - mutex_lock(&fs_info->scrub_lock); if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) - btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, - &fs_info->generic_worker); + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + 1, 4); else - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_workers.idle_thresh = 4; - ret = btrfs_start_workers(&fs_info->scrub_workers); - if (ret) + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + max_active, 4); + if (!fs_info->scrub_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_wr_completion_workers, - "scrubwrc", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_wr_completion_workers.idle_thresh = 2; - ret = btrfs_start_workers( - &fs_info->scrub_wr_completion_workers); - if (ret) + } + fs_info->scrub_wr_completion_workers = + btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + max_active, 2); + if (!fs_info->scrub_wr_completion_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, - &fs_info->generic_worker); - ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); - if (ret) + } + fs_info->scrub_nocow_workers = + btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + if (!fs_info->scrub_nocow_workers) { + ret = -ENOMEM; goto out; + } } ++fs_info->scrub_workers_refcnt; out: - mutex_unlock(&fs_info->scrub_lock); - return ret; } static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { - mutex_lock(&fs_info->scrub_lock); if (--fs_info->scrub_workers_refcnt == 0) { - btrfs_stop_workers(&fs_info->scrub_workers); - btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); - btrfs_stop_workers(&fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); - mutex_unlock(&fs_info->scrub_lock); } int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, @@ -2791,8 +2912,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * check some assumptions */ if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize == leafsize (%d == %d) fails", fs_info->chunk_root->nodesize, fs_info->chunk_root->leafsize); return -EINVAL; @@ -2804,18 +2925,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * the way scrub is implemented. Do not handle this * situation at all because it won't ever happen. */ - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); return -EINVAL; } if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - printk(KERN_ERR - "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", - fs_info->chunk_root->sectorsize, - (unsigned long long)PAGE_SIZE); + btrfs_err(fs_info, + "scrub: size assumption sectorsize != PAGE_SIZE " + "(%d != %lu) fails", + fs_info->chunk_root->sectorsize, PAGE_SIZE); return -EINVAL; } @@ -2827,7 +2948,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * would exhaust the array bounds of pagev member in * struct scrub_block */ - pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", + btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize " + "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", fs_info->chunk_root->nodesize, SCRUB_MAX_PAGES_PER_BLOCK, fs_info->chunk_root->sectorsize, @@ -2835,23 +2957,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - ret = scrub_workers_get(fs_info, is_dev_replace); - if (ret) - return ret; mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (dev->missing && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); return -ENODEV; } - mutex_lock(&fs_info->scrub_lock); + mutex_lock(&fs_info->scrub_lock); if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); return -EIO; } @@ -2862,10 +2979,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, btrfs_dev_replace_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); return -EINPROGRESS; } btrfs_dev_replace_unlock(&fs_info->dev_replace); + + ret = scrub_workers_get(fs_info, is_dev_replace); + if (ret) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return ret; + } + sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); @@ -2875,15 +2999,24 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } sctx->readonly = readonly; dev->scrub_device = sctx; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* + * checking @scrub_pause_req here, we can avoid + * race between committing transaction and scrubbing. + */ + __scrub_blocked_if_needed(fs_info); atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (!is_dev_replace) { - down_read(&fs_info->scrub_super_lock); + /* + * by holding device list mutex, we can + * kick off writing super in log tree sync. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); ret = scrub_supers(sctx, dev); - up_read(&fs_info->scrub_super_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); } if (!ret) @@ -2901,10 +3034,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->scrub_lock); dev->scrub_device = NULL; + scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); scrub_free_ctx(sctx); - scrub_workers_put(fs_info); return ret; } @@ -2934,16 +3067,6 @@ void btrfs_scrub_continue(struct btrfs_root *root) wake_up(&fs_info->scrub_pause_wait); } -void btrfs_scrub_pause_super(struct btrfs_root *root) -{ - down_write(&root->fs_info->scrub_super_lock); -} - -void btrfs_scrub_continue_super(struct btrfs_root *root) -{ - up_write(&root->fs_info->scrub_super_lock); -} - int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); @@ -2988,28 +3111,6 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_device *dev; - int ret; - - /* - * we have to hold the device_list_mutex here so the device - * does not go away in cancel_dev. FIXME: find a better solution - */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!dev) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -ENODEV; - } - ret = btrfs_scrub_cancel_dev(fs_info, dev); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - - return ret; -} - int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress) { @@ -3102,13 +3203,31 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->len = len; nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - nocow_ctx->work.func = copy_nocow_pages_worker; - btrfs_queue_worker(&fs_info->scrub_nocow_workers, - &nocow_ctx->work); + btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); + INIT_LIST_HEAD(&nocow_ctx->inodes); + btrfs_queue_work(fs_info->scrub_nocow_workers, + &nocow_ctx->work); + + return 0; +} + +static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct scrub_copy_nocow_ctx *nocow_ctx = ctx; + struct scrub_nocow_inode *nocow_inode; + nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS); + if (!nocow_inode) + return -ENOMEM; + nocow_inode->inum = inum; + nocow_inode->offset = offset; + nocow_inode->root = root; + list_add_tail(&nocow_inode->list, &nocow_ctx->inodes); return 0; } +#define COPY_COMPLETE 1 + static void copy_nocow_pages_worker(struct btrfs_work *work) { struct scrub_copy_nocow_ctx *nocow_ctx = @@ -3144,19 +3263,43 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) } ret = iterate_inodes_from_logical(logical, fs_info, path, - copy_nocow_pages_for_inode, - nocow_ctx); + record_inode_for_nocow, nocow_ctx); if (ret != 0 && ret != -ENOENT) { - pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", - (unsigned long long)logical, - (unsigned long long)physical_for_dev_replace, - (unsigned long long)len, - (unsigned long long)mirror_num, ret); + btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, " + "phys %llu, len %llu, mir %u, ret %d", + logical, physical_for_dev_replace, len, mirror_num, + ret); not_written = 1; goto out; } + btrfs_end_transaction(trans, root); + trans = NULL; + while (!list_empty(&nocow_ctx->inodes)) { + struct scrub_nocow_inode *entry; + entry = list_first_entry(&nocow_ctx->inodes, + struct scrub_nocow_inode, + list); + list_del_init(&entry->list); + ret = copy_nocow_pages_for_inode(entry->inum, entry->offset, + entry->root, nocow_ctx); + kfree(entry); + if (ret == COPY_COMPLETE) { + ret = 0; + break; + } else if (ret) { + break; + } + } out: + while (!list_empty(&nocow_ctx->inodes)) { + struct scrub_nocow_inode *entry; + entry = list_first_entry(&nocow_ctx->inodes, + struct scrub_nocow_inode, + list); + list_del_init(&entry->list); + kfree(entry); + } if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, root); if (not_written) @@ -3169,45 +3312,85 @@ out: scrub_pending_trans_workers_dec(sctx); } -static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + struct scrub_copy_nocow_ctx *nocow_ctx) { - unsigned long index; - struct scrub_copy_nocow_ctx *nocow_ctx = ctx; - int ret = 0; + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; struct btrfs_key key; - struct inode *inode = NULL; + struct inode *inode; + struct page *page; struct btrfs_root *local_root; + struct btrfs_ordered_extent *ordered; + struct extent_map *em; + struct extent_state *cached_state = NULL; + struct extent_io_tree *io_tree; u64 physical_for_dev_replace; - u64 len; - struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + u64 len = nocow_ctx->len; + u64 lockstart = offset, lockend = offset + len - 1; + unsigned long index; + int srcu_index; + int ret = 0; + int err = 0; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + local_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(local_root)) + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); + /* Avoid truncate/dio/punch hole.. */ + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; - len = nocow_ctx->len; - while (len >= PAGE_CACHE_SIZE) { - struct page *page = NULL; - int ret_sub; + io_tree = &BTRFS_I(inode)->io_tree; - index = offset >> PAGE_CACHE_SHIFT; + lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lockstart, len); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock; + } + /* + * This extent does not actually cover the logical extent anymore, + * move on to the next inode. + */ + if (em->block_start > nocow_ctx->logical || + em->block_start + em->block_len < nocow_ctx->logical + len) { + free_extent_map(em); + goto out_unlock; + } + free_extent_map(em); + + while (len >= PAGE_CACHE_SIZE) { + index = offset >> PAGE_CACHE_SHIFT; +again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { - pr_err("find_or_create_page() failed\n"); + btrfs_err(fs_info, "find_or_create_page() failed"); ret = -ENOMEM; - goto next_page; + goto out; } if (PageUptodate(page)) { @@ -3215,39 +3398,53 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) goto next_page; } else { ClearPageError(page); - ret_sub = extent_read_full_page(&BTRFS_I(inode)-> - io_tree, - page, btrfs_get_extent, - nocow_ctx->mirror_num); - if (ret_sub) { - ret = ret_sub; + err = extent_read_full_page_nolock(io_tree, page, + btrfs_get_extent, + nocow_ctx->mirror_num); + if (err) { + ret = err; goto next_page; } - wait_on_page_locked(page); + + lock_page(page); + /* + * If the page has been remove from the page cache, + * the data on it is meaningless, because it may be + * old one, the new data may be written into the new + * page in the page cache. + */ + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } if (!PageUptodate(page)) { ret = -EIO; goto next_page; } } - ret_sub = write_page_nocow(nocow_ctx->sctx, - physical_for_dev_replace, page); - if (ret_sub) { - ret = ret_sub; - goto next_page; - } - + err = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (err) + ret = err; next_page: - if (page) { - unlock_page(page); - put_page(page); - } + unlock_page(page); + page_cache_release(page); + + if (ret) + break; + offset += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE; } - - if (inode) - iput(inode); + ret = COPY_COMPLETE; +out_unlock: + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, + GFP_NOFS); +out: + mutex_unlock(&inode->i_mutex); + iput(inode); return ret; } @@ -3257,27 +3454,24 @@ static int write_page_nocow(struct scrub_ctx *sctx, struct bio *bio; struct btrfs_device *dev; int ret; - DECLARE_COMPLETION_ONSTACK(compl); dev = sctx->wr_ctx.tgtdev; if (!dev) return -EIO; if (!dev->bdev) { printk_ratelimited(KERN_WARNING - "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); return -ENOMEM; } - bio->bi_private = &compl; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_size = 0; - bio->bi_sector = physical_for_dev_replace >> 9; + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); if (ret != PAGE_CACHE_SIZE) { @@ -3286,10 +3480,8 @@ leave_with_eio: btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } - btrfsic_submit_bio(WRITE_SYNC, bio); - wait_for_completion(&compl); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) goto leave_with_eio; bio_put(bio); |
