diff options
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r-- | fs/btrfs/scrub.c | 1836 |
1 files changed, 1333 insertions, 503 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 27892f67e69..bdbb94f245c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 STRATO. All rights reserved. + * Copyright (C) 2011, 2012 STRATO. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -25,6 +25,7 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" +#include "dev-replace.h" #include "check-integrity.h" #include "rcu-string.h" @@ -42,10 +43,23 @@ */ struct scrub_block; -struct scrub_dev; +struct scrub_ctx; -#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ -#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ +/* + * the following three values only influence the performance. + * The last one configures the number of parallel and outstanding I/O + * operations. The first two values configure an upper limit for the number + * of (dynamically allocated) pages that are added to a bio. + */ +#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ +#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ + +/* + * the following value times PAGE_SIZE needs to be large enough to match the + * largest node/leaf/sector size that shall be supported. + * Values larger than BTRFS_STRIPE_LEN are not supported. + */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { @@ -56,6 +70,8 @@ struct scrub_page { u64 generation; u64 logical; u64 physical; + u64 physical_for_dev_replace; + atomic_t ref_count; struct { unsigned int mirror_num:8; unsigned int have_csum:1; @@ -66,23 +82,28 @@ struct scrub_page { struct scrub_bio { int index; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; + struct btrfs_device *dev; struct bio *bio; int err; u64 logical; u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; +#else + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; +#endif int page_count; int next_free; struct btrfs_work work; }; struct scrub_block { - struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; atomic_t ref_count; /* free mem on transition to zero */ - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct { unsigned int header_error:1; unsigned int checksum_error:1; @@ -91,23 +112,35 @@ struct scrub_block { }; }; -struct scrub_dev { - struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; - struct btrfs_device *dev; +struct scrub_wr_ctx { + struct scrub_bio *wr_curr_bio; + struct btrfs_device *tgtdev; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct mutex wr_lock; +}; + +struct scrub_ctx { + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; + struct btrfs_root *dev_root; int first_free; int curr; - atomic_t in_flight; - atomic_t fixup_cnt; + atomic_t bios_in_flight; + atomic_t workers_pending; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ + int pages_per_rd_bio; u32 sectorsize; u32 nodesize; u32 leafsize; + + int is_dev_replace; + struct scrub_wr_ctx wr_ctx; + /* * statistics */ @@ -116,13 +149,23 @@ struct scrub_dev { }; struct scrub_fixup_nodatasum { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; + struct btrfs_device *dev; u64 logical; struct btrfs_root *root; struct btrfs_work work; int mirror_num; }; +struct scrub_copy_nocow_ctx { + struct scrub_ctx *sctx; + u64 logical; + u64 len; + int mirror_num; + u64 physical_for_dev_replace; + struct btrfs_work work; +}; + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -137,15 +180,20 @@ struct scrub_warning { }; +static void scrub_pending_bio_inc(struct scrub_ctx *sctx); +static void scrub_pending_bio_dec(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_dev *sdev, - struct btrfs_mapping_tree *map_tree, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, + struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, u64 length, u64 logical, - struct scrub_block *sblock); -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size); + struct scrub_block *sblocks_for_recheck); +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size); static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, @@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write); +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num); static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); -static int scrub_add_page_to_bio(struct scrub_dev *sdev, - struct scrub_page *spage); -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force); +static void scrub_page_get(struct scrub_page *spage); +static void scrub_page_put(struct scrub_page *spage); +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio, int err); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace); +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); +static void scrub_wr_submit(struct scrub_ctx *sctx); +static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page); +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + void *ctx); +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace); +static void copy_nocow_pages_worker(struct btrfs_work *work); + + +static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +{ + atomic_inc(&sctx->bios_in_flight); +} + +static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +{ + atomic_dec(&sctx->bios_in_flight); + wake_up(&sctx->list_wait); +} + +/* + * used for workers that require transaction commits (i.e., for the + * NOCOW case) + */ +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * increment scrubs_running to prevent cancel requests from + * completing as long as a worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. + */ + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sctx->workers_pending); +} +/* used for workers that require transaction commits */ +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; -static void scrub_free_csums(struct scrub_dev *sdev) + /* + * see scrub_pending_trans_workers_inc() why we're pretending + * to be paused in the scrub counters + */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sctx->workers_pending); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sctx->list_wait); +} + +static void scrub_free_csums(struct scrub_ctx *sctx) { - while (!list_empty(&sdev->csum_list)) { + while (!list_empty(&sctx->csum_list)) { struct btrfs_ordered_sum *sum; - sum = list_first_entry(&sdev->csum_list, + sum = list_first_entry(&sctx->csum_list, struct btrfs_ordered_sum, list); list_del(&sum->list); kfree(sum); } } -static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) { int i; - if (!sdev) + if (!sctx) return; + scrub_free_wr_ctx(&sctx->wr_ctx); + /* this can happen when scrub is cancelled */ - if (sdev->curr != -1) { - struct scrub_bio *sbio = sdev->bios[sdev->curr]; + if (sctx->curr != -1) { + struct scrub_bio *sbio = sctx->bios[sctx->curr]; for (i = 0; i < sbio->page_count; i++) { - BUG_ON(!sbio->pagev[i]); - BUG_ON(!sbio->pagev[i]->page); + WARN_ON(!sbio->pagev[i]->page); scrub_block_put(sbio->pagev[i]->sblock); } bio_put(sbio->bio); } - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { - struct scrub_bio *sbio = sdev->bios[i]; + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { + struct scrub_bio *sbio = sctx->bios[i]; if (!sbio) break; kfree(sbio); } - scrub_free_csums(sdev); - kfree(sdev); + scrub_free_csums(sctx); + kfree(sctx); } static noinline_for_stack -struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - int pages_per_bio; + int pages_per_rd_bio; + int ret; - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, - bio_get_nr_vecs(dev->bdev)); - sdev = kzalloc(sizeof(*sdev), GFP_NOFS); - if (!sdev) + /* + * the setting of pages_per_rd_bio is correct for scrub but might + * be wrong for the dev_replace code where we might read from + * different devices in the initial huge bios. However, that + * code is able to correctly handle the case when adding a page + * to a bio fails. + */ + if (dev->bdev) + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, + bio_get_nr_vecs(dev->bdev)); + else + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; + sctx = kzalloc(sizeof(*sctx), GFP_NOFS); + if (!sctx) goto nomem; - sdev->dev = dev; - sdev->pages_per_bio = pages_per_bio; - sdev->curr = -1; - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { + sctx->is_dev_replace = is_dev_replace; + sctx->pages_per_rd_bio = pages_per_rd_bio; + sctx->curr = -1; + sctx->dev_root = dev->dev_root; + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); if (!sbio) goto nomem; - sdev->bios[i] = sbio; + sctx->bios[i] = sbio; sbio->index = i; - sbio->sdev = sdev; + sbio->sctx = sctx; sbio->page_count = 0; sbio->work.func = scrub_bio_end_io_worker; - if (i != SCRUB_BIOS_PER_DEV-1) - sdev->bios[i]->next_free = i + 1; + if (i != SCRUB_BIOS_PER_SCTX - 1) + sctx->bios[i]->next_free = i + 1; else - sdev->bios[i]->next_free = -1; - } - sdev->first_free = 0; - sdev->nodesize = dev->dev_root->nodesize; - sdev->leafsize = dev->dev_root->leafsize; - sdev->sectorsize = dev->dev_root->sectorsize; - atomic_set(&sdev->in_flight, 0); - atomic_set(&sdev->fixup_cnt, 0); - atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); - INIT_LIST_HEAD(&sdev->csum_list); - - spin_lock_init(&sdev->list_lock); - spin_lock_init(&sdev->stat_lock); - init_waitqueue_head(&sdev->list_wait); - return sdev; + sctx->bios[i]->next_free = -1; + } + sctx->first_free = 0; + sctx->nodesize = dev->dev_root->nodesize; + sctx->leafsize = dev->dev_root->leafsize; + sctx->sectorsize = dev->dev_root->sectorsize; + atomic_set(&sctx->bios_in_flight, 0); + atomic_set(&sctx->workers_pending, 0); + atomic_set(&sctx->cancel_req, 0); + sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); + INIT_LIST_HEAD(&sctx->csum_list); + + spin_lock_init(&sctx->list_lock); + spin_lock_init(&sctx->stat_lock); + init_waitqueue_head(&sctx->list_wait); + + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, + fs_info->dev_replace.tgtdev, is_dev_replace); + if (ret) { + scrub_free_ctx(sctx); + return ERR_PTR(ret); + } + return sctx; nomem: - scrub_free_dev(sdev); + scrub_free_ctx(sctx); return ERR_PTR(-ENOMEM); } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, + void *warn_ctx) { u64 isize; u32 nlink; @@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) int i; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; - struct scrub_warning *swarn = ctx; + struct scrub_warning *swarn = warn_ctx; struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; @@ -345,8 +496,8 @@ err: static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sblock->sdev->dev; - struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_device *dev; + struct btrfs_fs_info *fs_info; struct btrfs_path *path; struct btrfs_key found_key; struct extent_buffer *eb; @@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) const int bufsize = 4096; int ret; + WARN_ON(sblock->page_count < 1); + dev = sblock->pagev[0]->dev; + fs_info = sblock->sctx->dev_root->fs_info; + path = btrfs_alloc_path(); swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - BUG_ON(sblock->page_count < 1); - swarn.sector = (sblock->pagev[0].physical) >> 9; - swarn.logical = sblock->pagev[0].logical; + swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; - swarn.dev = dev; + swarn.dev = NULL; swarn.msg_bufsize = bufsize; swarn.scratch_bufsize = bufsize; @@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) } while (ret != 1); } else { swarn.path = path; + swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, scrub_print_warning_inode, &swarn); @@ -416,11 +571,11 @@ out: kfree(swarn.msg_buf); } -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) { struct page *page = NULL; unsigned long index; - struct scrub_fixup_nodatasum *fixup = ctx; + struct scrub_fixup_nodatasum *fixup = fixup_ctx; int ret; int corrected = 0; struct btrfs_key key; @@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) } if (PageUptodate(page)) { - struct btrfs_mapping_tree *map_tree; + struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) ret = -EIO; goto out; } - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fs_info = BTRFS_I(inode)->root->fs_info; + ret = repair_io_failure(fs_info, offset, PAGE_SIZE, fixup->logical, page, fixup->mirror_num); unlock_page(page); @@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) { int ret; struct scrub_fixup_nodatasum *fixup; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct btrfs_trans_handle *trans = NULL; struct btrfs_fs_info *fs_info; struct btrfs_path *path; int uncorrectable = 0; fixup = container_of(work, struct scrub_fixup_nodatasum, work); - sdev = fixup->sdev; + sctx = fixup->sctx; fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.malloc_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.malloc_errors; + spin_unlock(&sctx->stat_lock); uncorrectable = 1; goto out; } @@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) } WARN_ON(ret != 1); - spin_lock(&sdev->stat_lock); - ++sdev->stat.corrected_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.corrected_errors; + spin_unlock(&sctx->stat_lock); out: if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, fixup->root); if (uncorrectable) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.uncorrectable_errors; - spin_unlock(&sdev->stat_lock); - + spin_lock(&sctx->stat_lock); + ++sctx->stat.uncorrectable_errors; + spin_unlock(&sctx->stat_lock); + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info->dev_replace. + num_uncorrectable_read_errors); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(fixup->dev->name)); } btrfs_free_path(path); kfree(fixup); - /* see caller why we're pretending to be paused in the scrub counters */ - mutex_lock(&fs_info->scrub_lock); - atomic_dec(&fs_info->scrubs_running); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_dec(&sdev->fixup_cnt); - wake_up(&fs_info->scrub_pause_wait); - wake_up(&sdev->list_wait); + scrub_pending_trans_workers_dec(sctx); } /* @@ -614,7 +764,8 @@ out: */ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { - struct scrub_dev *sdev = sblock_to_check->sdev; + struct scrub_ctx *sctx = sblock_to_check->sctx; + struct btrfs_device *dev; struct btrfs_fs_info *fs_info; u64 length; u64 logical; @@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); - fs_info = sdev->dev->dev_root->fs_info; + fs_info = sctx->dev_root->fs_info; + if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + return 0; + } length = sblock_to_check->page_count * PAGE_SIZE; - logical = sblock_to_check->pagev[0].logical; - generation = sblock_to_check->pagev[0].generation; - BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0].flags & + logical = sblock_to_check->pagev[0]->logical; + generation = sblock_to_check->pagev[0]->generation; + BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0].have_csum; - csum = sblock_to_check->pagev[0].csum; + have_csum = sblock_to_check->pagev[0]->have_csum; + csum = sblock_to_check->pagev[0]->csum; + dev = sblock_to_check->pagev[0]->dev; + + if (sctx->is_dev_replace && !is_metadata && !have_csum) { + sblocks_for_recheck = NULL; + goto nodatasum_case; + } /* * read all mirrors one after the other. This includes to @@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sizeof(*sblocks_for_recheck), GFP_NOFS); if (!sblocks_for_recheck) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, - BTRFS_DEV_STAT_READ_ERRS); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, + ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, logical, sblocks_for_recheck); if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, - BTRFS_DEV_STAT_READ_ERRS); + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); sblock_bad = sblocks_for_recheck + failed_mirror_index; /* build and submit the bios for the failed mirror, check checksums */ - ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sdev->csum_size); - if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, - BTRFS_DEV_STAT_READ_ERRS); - goto out; - } + scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sctx->csum_size); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { @@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * different bio (usually one of the two latter cases is * the cause) */ - spin_lock(&sdev->stat_lock); - sdev->stat.unverified_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.unverified_errors++; + spin_unlock(&sctx->stat_lock); + if (sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock_bad); goto out; } if (!sblock_bad->no_io_error_seen) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sdev->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.csum_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.csum_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.verify_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.verify_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_GENERATION_ERRS); else - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sdev->readonly) + if (sctx->readonly && !sctx->is_dev_replace) goto did_not_correct_error; if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; +nodatasum_case: + WARN_ON(sctx->is_dev_replace); + /* * !is_metadata and !have_csum, this means that the data * might not be COW'ed, that it might be modified @@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); if (!fixup_nodatasum) goto did_not_correct_error; - fixup_nodatasum->sdev = sdev; + fixup_nodatasum->sctx = sctx; + fixup_nodatasum->dev = dev; fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; - /* - * increment scrubs_running to prevent cancel requests from - * completing as long as a fixup worker is running. we must also - * increment scrubs_paused to prevent deadlocking on pause - * requests used for transactions commits (as the worker uses a - * transaction context). it is safe to regard the fixup worker - * as paused for all matters practical. effectively, we only - * avoid cancellation requests from completing. - */ - mutex_lock(&fs_info->scrub_lock); - atomic_inc(&fs_info->scrubs_running); - atomic_inc(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_inc(&sdev->fixup_cnt); + scrub_pending_trans_workers_inc(sctx); fixup_nodatasum->work.func = scrub_fixup_nodatasum; btrfs_queue_worker(&fs_info->scrub_workers, &fixup_nodatasum->work); @@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* * now build and submit the bios for the other mirrors, check - * checksums - */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - if (mirror_index == failed_mirror_index) - continue; - - /* build and submit the bios, check checksums */ - ret = scrub_recheck_block(fs_info, - sblocks_for_recheck + mirror_index, - is_metadata, have_csum, csum, - generation, sdev->csum_size); - if (ret) - goto did_not_correct_error; - } - - /* - * first try to pick the mirror which is completely without I/O + * checksums. + * First try to pick the mirror which is completely without I/O * errors and also does not have a checksum error. * If one is found, and if a checksum is present, the full block * that is known to contain an error is rewritten. Afterwards @@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) mirror_index < BTRFS_MAX_MIRRORS && sblocks_for_recheck[mirror_index].page_count > 0; mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; + struct scrub_block *sblock_other; + + if (mirror_index == failed_mirror_index) + continue; + sblock_other = sblocks_for_recheck + mirror_index; + + /* build and submit the bios, check checksums */ + scrub_recheck_block(fs_info, sblock_other, is_metadata, + have_csum, csum, generation, + sctx->csum_size); if (!sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { - int force_write = is_metadata || have_csum; - - ret = scrub_repair_block_from_good_copy(sblock_bad, - sblock_other, - force_write); + if (sctx->is_dev_replace) { + scrub_write_block_to_dev_replace(sblock_other); + } else { + int force_write = is_metadata || have_csum; + + ret = scrub_repair_block_from_good_copy( + sblock_bad, sblock_other, + force_write); + } if (0 == ret) goto corrected_error; } } /* - * in case of I/O errors in the area that is supposed to be + * for dev_replace, pick good pages and write to the target device. + */ + if (sctx->is_dev_replace) { + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { + int sub_success; + + sub_success = 0; + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = + sblocks_for_recheck + mirror_index; + struct scrub_page *page_other = + sblock_other->pagev[page_num]; + + if (!page_other->io_error) { + ret = scrub_write_page_to_dev_replace( + sblock_other, page_num); + if (ret == 0) { + /* succeeded for this page */ + sub_success = 1; + break; + } else { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + } + } + } + + if (!sub_success) { + /* + * did not find a mirror to fetch the page + * from. scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + success = 0; + ret = scrub_write_page_to_dev_replace( + sblock_bad, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info-> + dev_replace.num_write_errors); + } + } + + goto out; + } + + /* + * for regular scrub, repair those pages that are errored. + * In case of I/O errors in the area that is supposed to be * repaired, continue by picking good copies of those pages. * Select the good pages from mirrors to rewrite bad pages from * the area to fix. Afterwards verify the checksum of the block @@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) success = 1; for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; if (!page_bad->io_error) continue; @@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) mirror_index++) { struct scrub_block *sblock_other = sblocks_for_recheck + mirror_index; - struct scrub_page *page_other = sblock_other->pagev + - page_num; + struct scrub_page *page_other = sblock_other->pagev[ + page_num]; if (!page_other->io_error) { ret = scrub_repair_page_from_good_copy( @@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * is verified, but most likely the data comes out * of the page cache. */ - ret = scrub_recheck_block(fs_info, sblock_bad, - is_metadata, have_csum, csum, - generation, sdev->csum_size); - if (!ret && !sblock_bad->header_error && + scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sctx->csum_size); + if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) goto corrected_error; @@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto did_not_correct_error; } else { corrected_error: - spin_lock(&sdev->stat_lock); - sdev->stat.corrected_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.corrected_errors++; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(dev->name)); } } else { did_not_correct_error: - spin_lock(&sdev->stat_lock); - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(dev->name)); } out: @@ -966,11 +1166,11 @@ out: mirror_index; int page_index; - for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; - page_index++) - if (sblock->pagev[page_index].page) - __free_page( - sblock->pagev[page_index].page); + for (page_index = 0; page_index < sblock->page_count; + page_index++) { + sblock->pagev[page_index]->sblock = NULL; + scrub_page_put(sblock->pagev[page_index]); + } } kfree(sblocks_for_recheck); } @@ -978,8 +1178,9 @@ out: return 0; } -static int scrub_setup_recheck_block(struct scrub_dev *sdev, - struct btrfs_mapping_tree *map_tree, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, + struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) { @@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, int ret; /* - * note: the three members sdev, ref_count and outstanding_pages + * note: the two members ref_count and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ @@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, * with a length of PAGE_SIZE, each retu |