Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r--  fs/btrfs/scrub.c  444
1 file changed, 255 insertions(+), 189 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a18e0e23f6a..b6d198f5181 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -208,7 +208,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 					 int is_metadata, int have_csum,
 					 const u8 *csum, u64 generation,
 					 u16 csum_size);
-static void scrub_complete_bio_end_io(struct bio *bio, int err);
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 					     struct scrub_block *sblock_good,
 					     int force_write);
@@ -257,6 +256,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 			    int mirror_num, u64 physical_for_dev_replace);
 static void copy_nocow_pages_worker(struct btrfs_work *work);
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -270,6 +271,29 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 	wake_up(&sctx->list_wait);
 }
 
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+	while (atomic_read(&fs_info->scrub_pause_req)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+		   atomic_read(&fs_info->scrub_pause_req) == 0);
+		mutex_lock(&fs_info->scrub_lock);
+	}
+}
+
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+	atomic_inc(&fs_info->scrubs_paused);
+	wake_up(&fs_info->scrub_pause_wait);
+
+	mutex_lock(&fs_info->scrub_lock);
+	__scrub_blocked_if_needed(fs_info);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+
+	wake_up(&fs_info->scrub_pause_wait);
+}
+
 /*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
@@ -291,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 	atomic_inc(&fs_info->scrubs_running);
 	atomic_inc(&fs_info->scrubs_paused);
 	mutex_unlock(&fs_info->scrub_lock);
+
+	/*
+	 * Checking the @scrubs_running == @scrubs_paused condition
+	 * inside wait_event() is not an atomic operation, which
+	 * means we may inc/dec @scrubs_running/@scrubs_paused at
+	 * any time. Wake up @scrub_pause_wait as often as we can
+	 * so that transaction commit is blocked for less time.
+	 */
+	wake_up(&fs_info->scrub_pause_wait);
+
 	atomic_inc(&sctx->workers_pending);
 }
@@ -394,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 		sbio->index = i;
 		sbio->sctx = sctx;
 		sbio->page_count = 0;
-		sbio->work.func = scrub_bio_end_io_worker;
+		btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
+				NULL, NULL);
 
 		if (i != SCRUB_BIOS_PER_SCTX - 1)
 			sctx->bios[i]->next_free = i + 1;
@@ -481,7 +516,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 	 * hold all of the paths here
 	 */
 	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
-		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
+		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
 			"length %llu, links %u (path: %s)\n", swarn->errstr,
 			swarn->logical, rcu_str_deref(swarn->dev->name),
@@ -493,7 +528,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 	return 0;
 
err:
-	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
+	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, rcu_str_deref(swarn->dev->name),
@@ -553,10 +588,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		do {
-			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
-							&ref_root, &ref_level);
+			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
+						      item_size, &ref_root,
+						      &ref_level);
 			printk_in_rcu(KERN_WARNING
-				"btrfs: %s at logical %llu on dev %s, "
+				"BTRFS: %s at logical %llu on dev %s, "
				"sector %llu: metadata %s (level %d) in tree "
				"%llu\n", errstr, swarn.logical,
				rcu_str_deref(dev->name),
@@ -682,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 out:
 	if (page)
 		put_page(page);
-	if (inode)
-		iput(inode);
+
+	iput(inode);
 
 	if (ret < 0)
 		return ret;
@@ -705,13 +741,11 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 	struct scrub_fixup_nodatasum *fixup;
 	struct scrub_ctx *sctx;
 	struct btrfs_trans_handle *trans = NULL;
-	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	int uncorrectable = 0;
 
 	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
 	sctx = fixup->sctx;
-	fs_info = fixup->root->fs_info;
 
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -760,8 +794,8 @@ out:
 		btrfs_dev_replace_stats_inc(
 			&sctx->dev_root->fs_info->dev_replace.
 			num_uncorrectable_read_errors);
-		printk_ratelimited_in_rcu(KERN_ERR
-			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+		printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
+		    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 			fixup->logical, rcu_str_deref(fixup->dev->name));
 	}
@@ -938,8 +972,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
-	if (sctx->readonly && !sctx->is_dev_replace)
-		goto did_not_correct_error;
+	if (sctx->readonly) {
+		ASSERT(!sctx->is_dev_replace);
+		goto out;
+	}
 
 	if (!is_metadata && !have_csum) {
 		struct scrub_fixup_nodatasum *fixup_nodatasum;
@@ -963,9 +999,10 @@ nodatasum_case:
 		fixup_nodatasum->root = fs_info->extent_root;
 		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
 		scrub_pending_trans_workers_inc(sctx);
-		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
-		btrfs_queue_worker(&fs_info->scrub_workers,
-				   &fixup_nodatasum->work);
+		btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
+				NULL, NULL);
+		btrfs_queue_work(fs_info->scrub_workers,
+				 &fixup_nodatasum->work);
 		goto out;
 	}
@@ -1160,7 +1197,7 @@ corrected_error:
 			sctx->stat.corrected_errors++;
 			spin_unlock(&sctx->stat_lock);
 			printk_ratelimited_in_rcu(KERN_ERR
-				"btrfs: fixed up error at logical %llu on dev %s\n",
+				"BTRFS: fixed up error at logical %llu on dev %s\n",
 				logical, rcu_str_deref(dev->name));
 		}
 	} else {
@@ -1169,7 +1206,7 @@ did_not_correct_error:
 		sctx->stat.uncorrectable_errors++;
 		spin_unlock(&sctx->stat_lock);
 		printk_ratelimited_in_rcu(KERN_ERR
-			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
+			"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
 			logical, rcu_str_deref(dev->name));
 	}
@@ -1292,7 +1329,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
 		struct scrub_page *page = sblock->pagev[page_num];
-		DECLARE_COMPLETION_ONSTACK(complete);
 
 		if (page->dev->bdev == NULL) {
 			page->io_error = 1;
@@ -1308,19 +1344,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			continue;
 		}
 		bio->bi_bdev = page->dev->bdev;
-		bio->bi_sector = page->physical >> 9;
-		bio->bi_end_io = scrub_complete_bio_end_io;
-		bio->bi_private = &complete;
+		bio->bi_iter.bi_sector = page->physical >> 9;
 
 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
-		btrfsic_submit_bio(READ, bio);
-
-		/* this will also unplug the queue */
-		wait_for_completion(&complete);
-
-		page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
-		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		if (btrfsic_submit_bio_wait(READ, bio))
 			sblock->no_io_error_seen = 0;
+
 		bio_put(bio);
 	}
@@ -1389,11 +1418,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		sblock->checksum_error = 1;
 }
 
-static void scrub_complete_bio_end_io(struct bio *bio, int err)
-{
-	complete((struct completion *)bio->bi_private);
-}
-
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 					     struct scrub_block *sblock_good,
 					     int force_write)
@@ -1428,11 +1452,11 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 	    sblock_bad->checksum_error || page_bad->io_error) {
 		struct bio *bio;
 		int ret;
-		DECLARE_COMPLETION_ONSTACK(complete);
 
 		if (!page_bad->dev->bdev) {
-			printk_ratelimited(KERN_WARNING
-				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+			printk_ratelimited(KERN_WARNING "BTRFS: "
+				"scrub_repair_page_from_good_copy(bdev == NULL) "
+				"is unexpected!\n");
 			return -EIO;
 		}
@@ -1440,20 +1464,15 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		if (!bio)
 			return -EIO;
 		bio->bi_bdev = page_bad->dev->bdev;
-		bio->bi_sector = page_bad->physical >> 9;
-		bio->bi_end_io = scrub_complete_bio_end_io;
-		bio->bi_private = &complete;
+		bio->bi_iter.bi_sector = page_bad->physical >> 9;
 		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
 		if (PAGE_SIZE != ret) {
 			bio_put(bio);
 			return -EIO;
 		}
 
-		btrfsic_submit_bio(WRITE, bio);
-		/* this will also unplug the queue */
-		wait_for_completion(&complete);
-		if (!bio_flagged(bio, BIO_UPTODATE)) {
+		if (btrfsic_submit_bio_wait(WRITE, bio)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
 			btrfs_dev_replace_stats_inc(
@@ -1538,7 +1557,7 @@ again:
 		bio->bi_private = sbio;
 		bio->bi_end_io = scrub_wr_bio_end_io;
 		bio->bi_bdev = sbio->dev->bdev;
-		bio->bi_sector = sbio->physical >> 9;
+		bio->bi_iter.bi_sector = sbio->physical >> 9;
 		sbio->err = 0;
 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 		   spage->physical_for_dev_replace ||
@@ -1597,8 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
 	sbio->err = err;
 	sbio->bio = bio;
 
-	sbio->work.func = scrub_wr_bio_end_io_worker;
-	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+	btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
 }
 
 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -1895,7 +1914,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
 		 * This case is handled correctly (but _very_ slowly).
 		 */
 		printk_ratelimited(KERN_WARNING
-			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
+			"BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
 		bio_endio(sbio->bio, -EIO);
 	} else {
 		btrfsic_submit_bio(READ, sbio->bio);
@@ -1944,7 +1963,7 @@ again:
 		bio->bi_private = sbio;
 		bio->bi_end_io = scrub_bio_end_io;
 		bio->bi_bdev = sbio->dev->bdev;
-		bio->bi_sector = sbio->physical >> 9;
+		bio->bi_iter.bi_sector = sbio->physical >> 9;
 		sbio->err = 0;
 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 		   spage->physical ||
@@ -2066,7 +2085,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
 	sbio->err = err;
 	sbio->bio = bio;
 
-	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
 }
 
 static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2217,6 +2236,47 @@ behind_scrub_pages:
 	return 0;
 }
 
+/*
+ * Given a physical address, calculate its logical offset.
+ * If this is a parity stripe, return the leftmost data
+ * stripe's logical offset instead.
+ *
+ * Returns 0 for a data stripe, 1 for a parity stripe.
+ */
+static int get_raid56_logic_offset(u64 physical, int num,
+				   struct map_lookup *map, u64 *offset)
+{
+	int i;
+	int j = 0;
+	u64 stripe_nr;
+	u64 last_offset;
+	int stripe_index;
+	int rot;
+
+	last_offset = (physical - map->stripes[num].physical) *
+		      nr_data_stripes(map);
+	*offset = last_offset;
+	for (i = 0; i < nr_data_stripes(map); i++) {
+		*offset = last_offset + i * map->stripe_len;
+
+		stripe_nr = *offset;
+		do_div(stripe_nr, map->stripe_len);
+		do_div(stripe_nr, nr_data_stripes(map));
+
+		/* Work out the disk rotation on this stripe-set */
+		rot = do_div(stripe_nr, map->num_stripes);
+		/* calculate which stripe this data locates */
+		rot += i;
+		stripe_index = rot % map->num_stripes;
+		if (stripe_index == num)
+			return 0;
+		if (stripe_index < num)
+			j++;
+	}
+	*offset = last_offset + j * map->stripe_len;
+	return 1;
+}
+
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   struct map_lookup *map,
 					   struct btrfs_device *scrub_dev,
@@ -2238,6 +2298,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	u64 physical;
 	u64 logical;
 	u64 logic_end;
+	u64 physical_end;
 	u64 generation;
 	int mirror_num;
 	struct reada_control *reada1;
@@ -2251,16 +2312,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	u64 extent_len;
 	struct btrfs_device *extent_dev;
 	int extent_mirror_num;
-	int stop_loop;
-
-	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
-			 BTRFS_BLOCK_GROUP_RAID6)) {
-		if (num >= nr_data_stripes(map)) {
-			return 0;
-		}
-	}
+	int stop_loop = 0;
 
 	nstripes = length;
+	physical = map->stripes[num].physical;
 	offset = 0;
 	do_div(nstripes, map->stripe_len);
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
@@ -2278,6 +2333,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		increment = map->stripe_len;
 		mirror_num = num % map->num_stripes + 1;
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+		get_raid56_logic_offset(physical, num, map, &offset);
+		increment = map->stripe_len * nr_data_stripes(map);
+		mirror_num = 1;
 	} else {
 		increment = map->stripe_len;
 		mirror_num = 1;
@@ -2301,17 +2361,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	 * to not hold off transaction commits
 	 */
 	logical = base + offset;
-
+	physical_end = physical + nstripes * map->stripe_len;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		get_raid56_logic_offset(physical_end, num,
+					map, &logic_end);
+		logic_end += base;
+	} else {
+		logic_end = logical + increment * nstripes;
+	}
 	wait_event(sctx->list_wait,
 		   atomic_read(&sctx->bios_in_flight) == 0);
-	atomic_inc(&fs_info->scrubs_paused);
-	wake_up(&fs_info->scrub_pause_wait);
+	scrub_blocked_if_needed(fs_info);
 
 	/* FIXME it might be better to start readahead at commit root */
 	key_start.objectid = logical;
 	key_start.type = BTRFS_EXTENT_ITEM_KEY;
 	key_start.offset = (u64)0;
-	key_end.objectid = base + offset + nstripes * increment;
+	key_end.objectid = logic_end;
 	key_end.type = BTRFS_METADATA_ITEM_KEY;
 	key_end.offset = (u64)-1;
 	reada1 = btrfs_reada_add(root, &key_start, &key_end);
@@ -2321,7 +2388,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	key_start.offset = logical;
 	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 	key_end.type = BTRFS_EXTENT_CSUM_KEY;
-	key_end.offset = base + offset + nstripes * increment;
+	key_end.offset = logic_end;
 	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
 
 	if (!IS_ERR(reada1))
@@ -2329,16 +2396,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	if (!IS_ERR(reada2))
 		btrfs_reada_wait(reada2);
 
-	mutex_lock(&fs_info->scrub_lock);
-	while (atomic_read(&fs_info->scrub_pause_req)) {
-		mutex_unlock(&fs_info->scrub_lock);
-		wait_event(fs_info->scrub_pause_wait,
-		   atomic_read(&fs_info->scrub_pause_req) == 0);
-		mutex_lock(&fs_info->scrub_lock);
-	}
-	atomic_dec(&fs_info->scrubs_paused);
-	mutex_unlock(&fs_info->scrub_lock);
-	wake_up(&fs_info->scrub_pause_wait);
 
 	/*
	 * collect all data csums for the stripe to avoid seeking during
@@ -2349,11 +2406,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	/*
 	 * now find all extents for each stripe and scrub them
 	 */
-	logical = base + offset;
-	physical = map->stripes[num].physical;
-	logic_end = logical + increment * nstripes;
 	ret = 0;
-	while (logical < logic_end) {
+	while (physical < physical_end) {
+		/* for raid56, we skip parity stripe */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+			ret = get_raid56_logic_offset(physical, num,
+					map, &logical);
+			logical += base;
+			if (ret)
+				goto skip;
+		}
 		/*
 		 * canceled?
 		 */
@@ -2375,22 +2438,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			wait_event(sctx->list_wait,
 				   atomic_read(&sctx->bios_in_flight) == 0);
 			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
-			atomic_inc(&fs_info->scrubs_paused);
-			wake_up(&fs_info->scrub_pause_wait);
-			mutex_lock(&fs_info->scrub_lock);
-			while (atomic_read(&fs_info->scrub_pause_req)) {
-				mutex_unlock(&fs_info->scrub_lock);
-				wait_event(fs_info->scrub_pause_wait,
-				   atomic_read(&fs_info->scrub_pause_req) == 0);
-				mutex_lock(&fs_info->scrub_lock);
-			}
-			atomic_dec(&fs_info->scrubs_paused);
-			mutex_unlock(&fs_info->scrub_lock);
-			wake_up(&fs_info->scrub_pause_wait);
+			scrub_blocked_if_needed(fs_info);
 		}
 
+		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+			key.type = BTRFS_METADATA_ITEM_KEY;
+		else
+			key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.objectid = logical;
-		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = (u64)-1;
 
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2398,8 +2453,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			goto out;
 
 		if (ret > 0) {
-			ret = btrfs_previous_item(root, path, 0,
-						  BTRFS_EXTENT_ITEM_KEY);
+			ret = btrfs_previous_extent_item(root, path, 0);
 			if (ret < 0)
 				goto out;
 			if (ret > 0) {
@@ -2457,9 +2511,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			if (key.objectid < logical &&
 			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
-				printk(KERN_ERR
-				       "btrfs scrub: tree block %llu spanning "
-				       "stripes, ignored. logical=%llu\n",
+				btrfs_err(fs_info,
+					   "scrub: tree block %llu spanning "
+					   "stripes, ignored. logical=%llu",
 				       key.objectid, logical);
 				goto next;
 			}
@@ -2506,15 +2560,29 @@ again:
 			scrub_free_csums(sctx);
 			if (extent_logical + extent_len <
 			    key.objectid + bytes) {
-				logical += increment;
-				physical += map->stripe_len;
-
+				if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+					BTRFS_BLOCK_GROUP_RAID6)) {
+					/*
+					 * loop until we find the next data stripe
+					 * or we have finished all stripes.
+					 */
+					do {
+						physical += map->stripe_len;
+						ret = get_raid56_logic_offset(
+								physical, num,
+								map, &logical);
+						logical += base;
+					} while (physical < physical_end && ret);
+				} else {
+					physical += map->stripe_len;
+					logical += increment;
+				}
 				if (logical < key.objectid + bytes) {
 					cond_resched();
 					goto again;
 				}
 
-				if (logical >= logic_end) {
+				if (physical >= physical_end) {
 					stop_loop = 1;
 					break;
 				}
@@ -2523,6 +2591,7 @@ next:
 			path->slots[0]++;
 		}
 		btrfs_release_path(path);
+skip:
 		logical += increment;
 		physical += map->stripe_len;
 		spin_lock(&sctx->stat_lock);
@@ -2656,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
 		length = btrfs_dev_extent_length(l, dev_extent);
 
-		if (found_key.offset + length <= start) {
-			key.offset = found_key.offset + length;
-			btrfs_release_path(path);
-			continue;
-		}
+		if (found_key.offset + length <= start)
+			goto skip;
 
 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2671,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		 * the chunk from going away while we scrub it
 		 */
 		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
-		if (!cache) {
-			ret = -ENOENT;
-			break;
-		}
+
+		/* some chunks are removed but not committed to disk yet,
+		 * continue scrubbing */
+		if (!cache)
+			goto skip;
+
 		dev_replace->cursor_right = found_key.offset + length;
 		dev_replace->cursor_left = found_key.offset;
 		dev_replace->item_needs_writeback = 1;
@@ -2700,25 +2768,24 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		wait_event(sctx->list_wait,
 			   atomic_read(&sctx->bios_in_flight) == 0);
-		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 		atomic_inc(&fs_info->scrubs_paused);
 		wake_up(&fs_info->scrub_pause_wait);
+
+		/*
+		 * This must be done before we decrease @scrubs_paused;
+		 * otherwise we could block transaction commit while
+		 * waiting for the pending workers to finish.
+		 */
 		wait_event(sctx->list_wait,
 			   atomic_read(&sctx->workers_pending) == 0);
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 
 		mutex_lock(&fs_info->scrub_lock);
-		while (atomic_read(&fs_info->scrub_pause_req)) {
-			mutex_unlock(&fs_info->scrub_lock);
-			wait_event(fs_info->scrub_pause_wait,
-			   atomic_read(&fs_info->scrub_pause_req) == 0);
-			mutex_lock(&fs_info->scrub_lock);
-		}
+		__scrub_blocked_if_needed(fs_info);
 		atomic_dec(&fs_info->scrubs_paused);
 		mutex_unlock(&fs_info->scrub_lock);
 		wake_up(&fs_info->scrub_pause_wait);
 
-		dev_replace->cursor_left = dev_replace->cursor_right;
-		dev_replace->item_needs_writeback = 1;
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
@@ -2732,6 +2799,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			break;
 		}
 
+		dev_replace->cursor_left = dev_replace->cursor_right;
+		dev_replace->item_needs_writeback = 1;
+skip:
 		key.offset = found_key.offset + length;
 		btrfs_release_path(path);
 	}
@@ -2782,52 +2852,49 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 						int is_dev_replace)
 {
 	int ret = 0;
+	int flags = WQ_FREEZABLE | WQ_UNBOUND;
+	int max_active = fs_info->thread_pool_size;
 
-	mutex_lock(&fs_info->scrub_lock);
 	if (fs_info->scrub_workers_refcnt == 0) {
 		if (is_dev_replace)
-			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
-					&fs_info->generic_worker);
+			fs_info->scrub_workers =
+				btrfs_alloc_workqueue("btrfs-scrub", flags,
+						      1, 4);
 		else
-			btrfs_init_workers(&fs_info->scrub_workers, "scrub",
-					fs_info->thread_pool_size,
-					&fs_info->generic_worker);
-		fs_info->scrub_workers.idle_thresh = 4;
-		ret = btrfs_start_workers(&fs_info->scrub_workers);
-		if (ret)
+			fs_info->scrub_workers =
+				btrfs_alloc_workqueue("btrfs-scrub", flags,
						      max_active, 4);
+		if (!fs_info->scrub_workers) {
+			ret = -ENOMEM;
 			goto out;
-		btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
-				   "scrubwrc",
-				   fs_info->thread_pool_size,
-				   &fs_info->generic_worker);
-		fs_info->scrub_wr_completion_workers.idle_thresh = 2;
-		ret = btrfs_start_workers(
-				&fs_info->scrub_wr_completion_workers);
-		if (ret)
+		}
+		fs_info->scrub_wr_completion_workers =
+			btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+					      max_active, 2);
+		if (!fs_info->scrub_wr_completion_workers) {
+			ret = -ENOMEM;
 			goto out;
-		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
-				   &fs_info->generic_worker);
-		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
-		if (ret)
+		}
+		fs_info->scrub_nocow_workers =
+			btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+		if (!fs_info->scrub_nocow_workers) {
+			ret = -ENOMEM;
 			goto out;
+		}
 	}
 	++fs_info->scrub_workers_refcnt;
out:
-	mutex_unlock(&fs_info->scrub_lock);
-
 	return ret;
 }
 
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
-	mutex_lock(&fs_info->scrub_lock);
 	if (--fs_info->scrub_workers_refcnt == 0) {
-		btrfs_stop_workers(&fs_info->scrub_workers);
-		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
-		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
-	mutex_unlock(&fs_info->scrub_lock);
 }
 
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -2845,8 +2912,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	 * check some assumptions
 	 */
 	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
-		printk(KERN_ERR
-		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
+		btrfs_err(fs_info,
+			   "scrub: size assumption nodesize == leafsize (%d == %d) fails",
 		       fs_info->chunk_root->nodesize,
 		       fs_info->chunk_root->leafsize);
 		return -EINVAL;
 	}
@@ -2858,16 +2925,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		 * the way scrub is implemented. Do not handle this
 		 * situation at all because it won't ever happen.
 		 */
-		printk(KERN_ERR
-		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
+		btrfs_err(fs_info,
+			   "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
 		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
 		return -EINVAL;
 	}
 
 	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
 		/* not supported for data w/o checksums */
-		printk(KERN_ERR
-		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n",
+		btrfs_err(fs_info,
+			   "scrub: size assumption sectorsize != PAGE_SIZE "
+			   "(%d != %lu) fails",
 		       fs_info->chunk_root->sectorsize, PAGE_SIZE);
 		return -EINVAL;
 	}
@@ -2880,7 +2948,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		 * would exhaust the array bounds of pagev member in
 		 * struct scrub_block
 		 */
-		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
+		btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
+			   "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
 		       fs_info->chunk_root->nodesize,
 		       SCRUB_MAX_PAGES_PER_BLOCK,
 		       fs_info->chunk_root->sectorsize,
@@ -2888,23 +2957,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return -EINVAL;
 	}
 
-	ret = scrub_workers_get(fs_info, is_dev_replace);
-	if (ret)
-		return ret;
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
 	if (!dev || (dev->missing && !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
 		return -ENODEV;
 	}
-	mutex_lock(&fs_info->scrub_lock);
 
+	mutex_lock(&fs_info->scrub_lock);
 	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
 		return -EIO;
 	}
 
@@ -2915,10 +2979,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		btrfs_dev_replace_unlock(&fs_info->dev_replace);
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
 		return -EINPROGRESS;
 	}
 	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+	ret = scrub_workers_get(fs_info, is_dev_replace);
+	if (ret) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		return ret;
+	}
+
 	sctx = scrub_setup_ctx(dev, is_dev_replace);
 	if (IS_ERR(sctx)) {
 		mutex_unlock(&fs_info->scrub_lock);
@@ -2928,15 +2999,24 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	}
 	sctx->readonly = readonly;
 	dev->scrub_device = sctx;
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
+	/*
+	 * Checking @scrub_pause_req here avoids racing between
+	 * a transaction commit and the scrub being started.
+	 */
+	__scrub_blocked_if_needed(fs_info);
 	atomic_inc(&fs_info->scrubs_running);
 	mutex_unlock(&fs_info->scrub_lock);
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
 	if (!is_dev_replace) {
-		down_read(&fs_info->scrub_super_lock);
+		/*
+		 * Holding the device list mutex keeps us from racing
+		 * with the superblock writes done in log tree sync.
+		 */
+		mutex_lock(&fs_info->fs_devices->device_list_mutex);
 		ret = scrub_supers(sctx, dev);
-		up_read(&fs_info->scrub_super_lock);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 	}
 
 	if (!ret)
@@ -2954,10 +3034,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_device = NULL;
+	scrub_workers_put(fs_info);
 	mutex_unlock(&fs_info->scrub_lock);
 
 	scrub_free_ctx(sctx);
-	scrub_workers_put(fs_info);
 
 	return ret;
 }
@@ -2987,16 +3067,6 @@ void btrfs_scrub_continue(struct btrfs_root *root)
 	wake_up(&fs_info->scrub_pause_wait);
 }
 
-void btrfs_scrub_pause_super(struct btrfs_root *root)
-{
-	down_write(&root->fs_info->scrub_super_lock);
-}
-
-void btrfs_scrub_continue_super(struct btrfs_root *root)
-{
-	up_write(&root->fs_info->scrub_super_lock);
-}
-
 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
 {
 	mutex_lock(&fs_info->scrub_lock);
@@ -3133,10 +3203,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 	nocow_ctx->len = len;
 	nocow_ctx->mirror_num = mirror_num;
 	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
-	nocow_ctx->work.func = copy_nocow_pages_worker;
+	btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
 	INIT_LIST_HEAD(&nocow_ctx->inodes);
-	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
-			   &nocow_ctx->work);
+	btrfs_queue_work(fs_info->scrub_nocow_workers,
+			 &nocow_ctx->work);
 
 	return 0;
 }
@@ -3195,7 +3265,8 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
 	ret = iterate_inodes_from_logical(logical, fs_info, path,
 					  record_inode_for_nocow, nocow_ctx);
 	if (ret != 0 && ret != -ENOENT) {
-		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
+		btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
+			"phys %llu, len %llu, mir %u, ret %d",
 			logical, physical_for_dev_replace, len, mirror_num,
 			ret);
 		not_written = 1;
@@ -3317,7 +3388,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
again:
 		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 		if (!page) {
-			pr_err("find_or_create_page() failed\n");
+			btrfs_err(fs_info, "find_or_create_page() failed");
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -3383,14 +3454,13 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 	struct bio *bio;
 	struct btrfs_device *dev;
 	int ret;
-	DECLARE_COMPLETION_ONSTACK(compl);
 
 	dev = sctx->wr_ctx.tgtdev;
 	if (!dev)
 		return -EIO;
 	if (!dev->bdev) {
 		printk_ratelimited(KERN_WARNING
-			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+			"BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
 		return -EIO;
 	}
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
@@ -3400,10 +3470,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 		spin_unlock(&sctx->stat_lock);
 		return -ENOMEM;
 	}
-	bio->bi_private = &compl;
-	bio->bi_end_io = scrub_complete_bio_end_io;
-	bio->bi_size = 0;
-	bio->bi_sector = physical_for_dev_replace >> 9;
+	bio->bi_iter.bi_size = 0;
+	bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
 	bio->bi_bdev = dev->bdev;
 	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
 	if (ret != PAGE_CACHE_SIZE) {
@@ -3412,10 +3480,8 @@ leave_with_eio:
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
 		return -EIO;
 	}
-	btrfsic_submit_bio(WRITE_SYNC, bio);
-	wait_for_completion(&compl);
-
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
 		goto leave_with_eio;
 
 	bio_put(bio);
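
Note: a minimal userspace sketch of the physical-to-logical mapping that the new get_raid56_logic_offset() performs may make the patch easier to follow. This is not kernel code: struct map_lookup is collapsed into three assumed constants (a hypothetical 3-disk RAID5 chunk with 64K stripes), the chunk is taken to start at physical offset 0 on every disk, and do_div() is replaced by plain 64-bit division.

/*
 * Userspace sketch of get_raid56_logic_offset(): map a chunk-relative
 * physical offset on one disk to a logical offset, detecting parity.
 */
#include <stdio.h>
#include <stdint.h>

#define STRIPE_LEN	(64 * 1024)	/* assumed stripe size        */
#define NUM_STRIPES	3		/* disks in the chunk         */
#define DATA_STRIPES	2		/* RAID5: N - 1 carry data    */

/* Returns 0 (data) with the logical offset of this stripe, or
 * 1 (parity) with the offset used by scrub to skip past it. */
static int raid56_logic_offset(uint64_t physical, int disk, uint64_t *offset)
{
	uint64_t last = physical * DATA_STRIPES;	/* chunk-relative */
	int i, j = 0;

	*offset = last;
	for (i = 0; i < DATA_STRIPES; i++) {
		uint64_t stripe_nr;
		int rot, stripe_index;

		*offset = last + (uint64_t)i * STRIPE_LEN;
		/* which stripe-set this offset falls into */
		stripe_nr = *offset / STRIPE_LEN / DATA_STRIPES;
		/* rotate the layout by one disk per stripe-set */
		rot = (int)(stripe_nr % NUM_STRIPES);
		stripe_index = (rot + i) % NUM_STRIPES;
		if (stripe_index == disk)
			return 0;			/* data stripe */
		if (stripe_index < disk)
			j++;
	}
	*offset = last + (uint64_t)j * STRIPE_LEN;
	return 1;					/* parity stripe */
}

int main(void)
{
	uint64_t off, phys;
	int disk;

	/* walk the first two stripe-sets on each disk */
	for (disk = 0; disk < NUM_STRIPES; disk++)
		for (phys = 0; phys < 2 * STRIPE_LEN; phys += STRIPE_LEN) {
			int parity = raid56_logic_offset(phys, disk, &off);
			printf("disk %d phys %7llu -> %s logical %llu\n",
			       disk, (unsigned long long)phys,
			       parity ? "parity," : "data,  ",
			       (unsigned long long)off);
		}
	return 0;
}

Run as-is, the output shows the parity slot rotating across the disks from one stripe-set to the next, which is exactly why scrub_stripe() has to recompute the logical offset per stripe (and skip parity) instead of advancing by a fixed increment.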

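The pause handshake that scrub_blocked_if_needed()/__scrub_blocked_if_needed() factor out can also be illustrated with a userspace analogue. The sketch below is hypothetical: pthreads stand in for the kernel's wait queue and atomics, one thread plays the scrub loop and one the transaction commit, and the names merely mirror the btrfs_fs_info fields.

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for fs_info->scrub_lock, ->scrub_pause_wait and counters. */
static pthread_mutex_t scrub_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t scrub_pause_wait = PTHREAD_COND_INITIALIZER;
static int scrub_pause_req;	/* set by the committer */
static int scrubs_paused;	/* set by the scrubber  */
static int scrub_done;

/* Like __scrub_blocked_if_needed(): scrub_lock held on entry and exit. */
static void blocked_if_needed_locked(void)
{
	while (scrub_pause_req)
		pthread_cond_wait(&scrub_pause_wait, &scrub_lock);
}

/* Like scrub_blocked_if_needed(): called from the scrub loop. */
static void blocked_if_needed(void)
{
	pthread_mutex_lock(&scrub_lock);
	scrubs_paused++;			/* advertise "parked"... */
	pthread_cond_broadcast(&scrub_pause_wait);	/* ...and say so */
	blocked_if_needed_locked();		/* block while requested */
	scrubs_paused--;
	pthread_mutex_unlock(&scrub_lock);
	pthread_cond_broadcast(&scrub_pause_wait);
}

/* The committer side: park the scrubber, do critical work, release it. */
static void pause_scrub_for_commit(void)
{
	pthread_mutex_lock(&scrub_lock);
	scrub_pause_req = 1;
	while (!scrubs_paused)		/* wait until the scrubber parks */
		pthread_cond_wait(&scrub_pause_wait, &scrub_lock);
	/* ... the "transaction commit" would run here ... */
	scrub_pause_req = 0;
	pthread_cond_broadcast(&scrub_pause_wait);
	pthread_mutex_unlock(&scrub_lock);
}

static void *scrub_loop(void *arg)
{
	for (;;) {
		pthread_mutex_lock(&scrub_lock);
		int done = scrub_done;
		pthread_mutex_unlock(&scrub_lock);
		if (done)
			break;
		blocked_if_needed();	/* one call per "stripe" */
	}
	return NULL;
}

int main(void)
{
	pthread_t scrubber;

	pthread_create(&scrubber, NULL, scrub_loop, NULL);
	pause_scrub_for_commit();
	printf("commit ran while the scrubber was parked\n");

	pthread_mutex_lock(&scrub_lock);
	scrub_done = 1;
	pthread_mutex_unlock(&scrub_lock);
	pthread_join(scrubber, NULL);
	return 0;
}

The ordering in blocked_if_needed() mirrors the kernel helper: the scrubber advertises itself as paused and wakes any waiter before it blocks, so a committer waiting for the scrubber to park can never miss the notification; the committer, in turn, re-checks its predicate under the lock, so no wakeup is lost in the other direction either.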