diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/backref.c | 2 | ||||
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 4 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 17 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 8 | ||||
-rw-r--r-- | fs/btrfs/delayed-inode.c | 58 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 189 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 295 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 60 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 2 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 82 | ||||
-rw-r--r-- | fs/btrfs/inode-map.c | 28 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 92 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 17 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 2 | ||||
-rw-r--r-- | fs/btrfs/scrub.c | 71 | ||||
-rw-r--r-- | fs/btrfs/super.c | 93 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 12 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 7 | ||||
-rw-r--r-- | fs/btrfs/volumes.h | 6 |
19 files changed, 678 insertions, 367 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8855aad3929..22c64fff1bd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { - ipath->fspath->val[i] = (u64)fspath; + ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5a5d325a393..634608d2a6d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -147,14 +147,12 @@ struct btrfs_inode { * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. */ unsigned ordered_data_close:1; unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; unsigned in_defrag:1; + unsigned delalloc_meta_reserved:1; /* * always compress this one file diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0fe615e4ea3..dede441bdee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + /* ensure we can see the force_cow */ + smp_rmb(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during commiting the transaction, + * after we've finished coping src root, we must COW the shared + * block to ensure the metadata consistency. + */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !root->force_cow) return 0; return 1; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b9ba59ff929..50634abef9b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -848,7 +848,8 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, }; enum btrfs_disk_cache_state { @@ -1271,6 +1272,8 @@ struct btrfs_root { * for stat. It may be used for more later */ dev_t anon_dev; + + int force_cow; }; struct btrfs_ioctl_defrag_range_args { @@ -2366,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved); +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 3a1b939c9ae..5b163572e0c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *inode, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; + int release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -652,12 +654,65 @@ static int btrfs_delayed_inode_reserve_metadata( if (!ret) node->bytes_reserved = num_bytes; return ret; + } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->delalloc_meta_reserved) { + BTRFS_I(inode)->delalloc_meta_reserved = 0; + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + WARN_ON(1); + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; } +migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ if (!ret) node->bytes_reserved = num_bytes; + if (release) + btrfs_block_rsv_release(root, src_rsv, num_bytes); + return ret; } @@ -1708,7 +1763,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); if (ret) goto release_node; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 102c176fc29..632f8f3cc9d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -620,7 +620,7 @@ out: static int btree_io_failed_hook(struct bio *failed_bio, struct page *page, u64 start, u64 end, - u64 mirror_num, struct extent_state *state) + int mirror_num, struct extent_state *state) { struct extent_io_tree *tree; unsigned long len; @@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = NULL; - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - struct btrfs_super_block *disk_super; + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!extent_root || !tree_root || !tree_root->fs_info || - !chunk_root || !dev_root || !csum_root) { + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); @@ -2465,21 +2460,20 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_close_devices(fs_info->fs_devices); free_fs_info(fs_info); return ERR_PTR(err); recovery_tree_root: - if (!btrfs_test_opt(tree_root, RECOVERY)) goto fail_tree_roots; @@ -2579,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2640,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2672,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9879bd47463..2ad813674d7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_root *root, int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + caching_ctl->work.func = caching_thread; + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); /* * We can't do the read from on-disk cache during a commit since we need @@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, if (trans && (!trans->transaction->in_commit) && (root && root != root->fs_info->tree_root) && btrfs_test_opt(root, SPACE_CACHE)) { - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - return 0; - } - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); - ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); if (ret == 1) { + cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; } else { - cache->cached = BTRFS_CACHE_NO; + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } } spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); if (ret == 1) { + put_caching_control(caching_ctl); free_excluded_extents(fs_info->extent_root, cache); return 0; } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); } - if (load_cache_only) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); - caching_ctl->work.func = caching_thread; - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); + if (load_cache_only) { + put_caching_control(caching_ctl); return 0; } - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); down_write(&fs_info->extent_commit_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); @@ -3797,16 +3838,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +static inline int __block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int flush) { int ret; if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3815,22 +3856,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 1); +} + int btrfs_block_rsv_add_noflush(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) { - int ret; - - if (num_bytes == 0) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 1); - return 0; - } - - return ret; + return __block_rsv_add(root, block_rsv, num_bytes, 0); } int btrfs_block_rsv_check(struct btrfs_root *root, @@ -3851,9 +3888,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) +static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3872,7 +3909,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -3881,6 +3918,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); +} + +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); +} + int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes) @@ -4064,23 +4115,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, */ static unsigned drop_outstanding_extent(struct inode *inode) { + unsigned drop_inode_space = 0; unsigned dropped_extents = 0; BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; + if (BTRFS_I(inode)->outstanding_extents == 0 && + BTRFS_I(inode)->delalloc_meta_reserved) { + drop_inode_space = 1; + BTRFS_I(inode)->delalloc_meta_reserved = 0; + } + /* * If we have more or the same amount of outsanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - return 0; + return drop_inode_space; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; - return dropped_extents; + return dropped_extents + drop_inode_space; } /** @@ -4166,9 +4224,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) nr_extents = BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; BTRFS_I(inode)->reserved_extents += nr_extents; + } - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!BTRFS_I(inode)->delalloc_meta_reserved) { + nr_extents++; + BTRFS_I(inode)->delalloc_meta_reserved = 1; } + + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); @@ -5040,11 +5107,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group_cache *used_block_group; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; struct btrfs_space_info *space_info; - int last_ptr_loop = 0; int loop = 0; int index = 0; int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? @@ -5106,6 +5173,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); + used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -5143,6 +5211,7 @@ search: u64 offset; int cached; + used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -5166,13 +5235,15 @@ search: } have_block_group: - if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { u64 free_percent; + found_uncached_bg = true; ret = cache_block_group(block_group, trans, orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) - goto have_block_group; + goto alloc; free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; @@ -5194,7 +5265,6 @@ have_block_group: orig_root, 0); BUG_ON(ret); } - found_uncached_bg = true; /* * If loop is set for cached only, try the next block @@ -5204,94 +5274,80 @@ have_block_group: goto loop; } - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) - found_uncached_bg = true; - +alloc: if (unlikely(block_group->ro)) goto loop; spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < - num_bytes + empty_size) { + num_bytes + empty_cluster + empty_size) { spin_unlock(&block_group->free_space_ctl->tree_lock); goto loop; } spin_unlock(&block_group->free_space_ctl->tree_lock); /* - * Ok we want to try and use the cluster allocator, so lets look - * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will - * have tried the cluster allocator plenty of times at this - * point and not have found anything, so we are likely way too - * fragmented for the clustering stuff to find anything, so lets - * just skip it and let the allocator find whatever block it can - * find + * Ok we want to try and use the cluster allocator, so + * lets look there */ - if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + if (last_ptr) { /* * the refill lock keeps out other * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); - if (last_ptr->block_group && - (last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data))) { - offset = 0; + used_block_group = last_ptr->block_group; + if (used_block_group != block_group && + (!used_block_group || + used_block_group->ro || + !block_group_bits(used_block_group, data))) { + used_block_group = block_group; goto refill_cluster; } - offset = btrfs_alloc_from_cluster(block_group, last_ptr, - num_bytes, search_start); + if (used_block_group != block_group) + btrfs_get_block_group(used_block_group); + + offset = btrfs_alloc_from_cluster(used_block_group, + last_ptr, num_bytes, used_block_group->key.objectid); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); goto checks; } - spin_lock(&last_ptr->lock); - /* - * whoops, this cluster doesn't actually point to - * this block group. Get a ref on the block - * group is does point to and try again - */ - if (!last_ptr_loop && last_ptr->block_group && - last_ptr->block_group != block_group && - index <= - get_block_group_index(last_ptr->block_group)) { - - btrfs_put_block_group(block_group); - block_group = last_ptr->block_group; - btrfs_get_block_group(block_group); - spin_unlock(&last_ptr->lock); - spin_unlock(&last_ptr->refill_lock); - - last_ptr_loop = 1; - search_start = block_group->key.objectid; - /* - * we know this block group is properly - * in the list because - * btrfs_remove_block_group, drops the - * cluster before it removes the block - * group from the list - */ - goto have_block_group; + WARN_ON(last_ptr->block_group != used_block_group); + if (used_block_group != block_group) { + btrfs_put_block_group(used_block_group); + used_block_group = block_group; } - spin_unlock(&last_ptr->lock); refill_cluster: + BUG_ON(used_block_group != block_group); + /* If we are on LOOP_NO_EMPTY_SIZE, we can't + * set up a new clusters, so lets just skip it + * and let the allocator find whatever block + * it can find. If we reach this point, we + * will have tried the cluster allocator + * plenty of times and not have found + * anything, so we are likely way too + * fragmented for the clustering stuff to find + * anything. */ + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* * this cluster didn't work out, free it and * start over */ btrfs_return_cluster_to_free_space(NULL, last_ptr); - last_ptr_loop = 0; - /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, - offset, num_bytes, + search_start, num_bytes, empty_cluster + empty_size); if (ret == 0) { /* @@ -5327,6 +5383,7 @@ refill_cluster: goto loop; } +unclustered_alloc: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* @@ -5353,14 +5410,14 @@ checks: search_start = stripe_align(root, offset); /* move on to the next group */ if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } /* move on to the next group */ if (search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + used_block_group->key.objectid + used_block_group->key.offset) { + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5368,14 +5425,14 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, + ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5384,15 +5441,19 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1f87c4d0e7a..49f3c9dc09f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -935,8 +935,10 @@ again: node = tree_search(tree, start); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; BUG_ON(err == -EEXIST); @@ -992,8 +994,10 @@ hit_next: */ if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = split_state(tree, state, prealloc, start); BUG_ON(err == -EEXIST); prealloc = NULL; @@ -1024,8 +1028,10 @@ hit_next: this_end = last_start - 1; prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } /* * Avoid to free 'prealloc' if it can be merged with @@ -1051,8 +1057,10 @@ hit_next: */ if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); @@ -2285,16 +2293,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err) clean_io_failure(start, page); } if (!uptodate) { - u64 failed_mirror; - failed_mirror = (u64)bio->bi_bdev; - if (tree->ops && tree->ops->readpage_io_failed_hook) - ret = tree->ops->readpage_io_failed_hook( - bio, page, start, end, - failed_mirror, state); - else - ret = bio_readpage_error(bio, page, start, end, - failed_mirror, NULL); + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; + /* + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, not + * in the !uptodate case). In that case it returns 0 and + * we just go on with the next page in our bio. If it + * can't handle the error it will return -EIO and we + * remain responsible for that page. + */ + ret = bio_readpage_error(bio, page, start, end, + failed_mirror, NULL); if (ret == 0) { +error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -2302,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) uncache_state(&cached); continue; } + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, state); + if (ret == 0) + goto error_handled; + } } if (uptodate) { @@ -3366,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + /* * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size @@ -3413,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent_skip_holes(inode, off, last_for_get_extent, + em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); if (!em) goto out; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index feb9be0e23b..7604c300132 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,7 +70,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, u64 failed_mirror, + u64 start, u64 end, int failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 7a15fcfb3e1..ec23d43d0c3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, } } + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + return 0; } @@ -537,6 +542,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } e = io_ctl->cur; entry->offset = le64_to_cpu(e->offset); @@ -550,10 +562,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, io_ctl_unmap_page(io_ctl); - if (io_ctl->index >= io_ctl->num_pages) - return 0; - - return io_ctl_check_crc(io_ctl, io_ctl->index); + return 0; } static int io_ctl_read_bitmap(struct io_ctl *io_ctl, @@ -561,9 +570,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl, { int ret; - if (io_ctl->cur && io_ctl->cur != io_ctl->orig) - io_ctl_unmap_page(io_ctl); - ret = io_ctl_check_crc(io_ctl, io_ctl->index); if (ret) return ret; @@ -699,6 +705,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, num_entries--; } + io_ctl_unmap_page(&io_ctl); + /* * We add the bitmaps at the end of the entries in order that * the bitmap entries are added to the cache. @@ -1462,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; + INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; @@ -1841,7 +1850,13 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - WARN_ON(1); + /* the tree logging code might be calling us before we + * have fully loaded the free space rbtree for this + * block group. So it is possible the entry won't + * be in the rbtree yet at all. The caching code + * will make sure not to put it in the rbtree if + * the logging code has pinned it. + */ goto out_lock; } } @@ -2305,6 +2320,7 @@ again: if (!found) { start = i; + cluster->max_size = 0; found = true; } @@ -2448,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; - struct rb_node *node; int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; /* - * First check our cached list of bitmaps and see if there is an entry - * here that will work. + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < min_bytes) continue; @@ -2468,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } /* - * If we do have entries on our list and we are here then we didn't find - * anything, so go ahead and get the next entry after the last entry in - * this list and start the search from there. + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. */ - if (!list_empty(bitmaps)) { - entry = list_entry(bitmaps->prev, struct btrfs_free_space, - list); - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - goto search; - } - - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); - if (!entry) - return -ENOSPC; - -search: - node = &entry->offset_index; - do { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - if (!entry->bitmap) - continue; - if (entry->bytes < min_bytes) - continue; - ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); - } while (ret && node); - - return ret; + return -ENOSPC; } /* @@ -2517,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct list_head bitmaps; struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); u64 min_bytes; int ret; @@ -2557,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes, min_bytes); if (ret) diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 53dcbdf446c..f8962a957d6 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_path *path; struct inode *inode; + struct btrfs_block_rsv *rsv; + u64 num_bytes; u64 alloc_hint = 0; int ret; int prealloc; @@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (!path) return -ENOMEM; + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->trans_block_rsv; + + num_bytes = trans->bytes_reserved; + /* + * 1 item for inode item insertion if need + * 3 items for inode item update (in the worst case) + * 1 item for free space object + * 3 items for pre-allocation + */ + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, + trans->bytes_reserved); + if (ret) + goto out; again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); - goto out; + goto out_release; } if (IS_ERR(inode)) { @@ -434,7 +451,7 @@ again: ret = create_free_ino_inode(root, trans, path); if (ret) - goto out; + goto out_release; goto again; } @@ -477,11 +494,14 @@ again: } btrfs_free_reserved_data_space(inode, prealloc); + ret = btrfs_write_out_ino_cache(root, trans, path); out_put: iput(inode); +out_release: + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: - if (ret == 0) - ret = btrfs_write_out_ino_cache(root, trans, path); + trans->block_rsv = rsv; + trans->bytes_reserved = num_bytes; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 966ddcc4c63..2c984f7d4c2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -1741,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } goto out; @@ -1791,7 +1793,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } ret = 0; @@ -2199,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret) goto out; } + /* release the path since we're done with it */ + btrfs_release_path(path); + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) @@ -2426,7 +2431,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, /* * copy everything in the in-memory inode into the btree. */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_inode_item *inode_item; @@ -2434,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; - /* - * If the inode is a free space inode, we can deadlock during commit - * if we put it into the delayed code. - * - * The data relocation inode should also be directly updated - * without delay - */ - if (!btrfs_is_free_space_inode(root, inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_delayed_update_inode(trans, root, inode); - if (!ret) - btrfs_set_inode_last_trans(trans, inode); - return ret; - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2477,6 +2467,43 @@ failed: } /* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} + +/* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory @@ -3463,7 +3490,7 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_refill(root, rsv, min_size); + ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); /* * Try and steal from the global reserve since we will @@ -5632,7 +5659,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode_fallback(trans, root, inode); goto out; } @@ -5670,7 +5697,7 @@ again: add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode(trans, root, inode); + btrfs_update_inode_fallback(trans, root, inode); ret = 0; out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, @@ -6529,14 +6556,16 @@ end_trans: ret = btrfs_orphan_del(NULL, inode); } - trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; + if (trans) { + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; - nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + } out: btrfs_free_block_rsv(root, rsv); @@ -6605,6 +6634,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; ei->in_defrag = 0; + ei->delalloc_meta_reserved = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; @@ -6764,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; + u32 blocksize = inode->i_sb->s_blocksize; + generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (inode_get_bytes(inode) + - BTRFS_I(inode)->delalloc_bytes) >> 9; + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; return 0; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4a34c472f12..72d461656f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1216,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", + printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", + printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_unlock; @@ -1267,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "new size for %s is %llu\n", + printk(KERN_INFO "btrfs: new size for %s is %llu\n", device->name, (unsigned long long)new_size); if (new_size > old_size) { @@ -1278,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); - } else { + } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); } @@ -2930,11 +2930,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) goto out; for (i = 0; i < ipath->fspath->elem_cnt; ++i) { - rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -3017,7 +3019,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, if (ret < 0) goto out; - ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); if (ret) ret = -EFAULT; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 24d654ce7a0..dff29d5e151 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, list_add_tail(&new_edge->list[UPPER], &new_node->lower); } + } else { + list_add_tail(&new_node->lower, &cache->leaves); } rb_node = tree_insert(&cache->rb_root, new_node->bytenr, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ed11d3866af..c27bcb67f33 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) btrfs_release_path(swarn->path); ipath = init_ipath(4096, local_root, swarn->path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto err; + } ret = paths_from_inode(inum, ipath); if (ret < 0) @@ -272,7 +277,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) swarn->logical, swarn->dev->name, (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, - (char *)ipath->fspath->val[i]); + (char *)(unsigned long)ipath->fspath->val[i]); free_ipath(ipath); return 0; @@ -944,50 +949,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; - struct bio *bio; - int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - - bio = bio_alloc(GFP_NOFS, sbio->count); - if (!bio) - goto nomem; - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; - - for (i = 0; i < sbio->count; ++i) { - struct page *page; - int ret; - - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); - goto nomem; - } - } - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, bio); + submit_bio(READ, sbio->bio); return 0; - -nomem: - scrub_free_bio(bio); - - return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -995,6 +968,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, u8 *csum, int force) { struct scrub_bio *sbio; + struct page *page; + int ret; again: /* @@ -1015,12 +990,22 @@ again: } sbio = sdev->bios[sdev->curr]; if (sbio->count == 0) { + struct bio *bio; + sbio->physical = physical; sbio->logical = logical; + bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); + if (!bio) + return -ENOMEM; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + sbio->bio = bio; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - int ret; - ret = scrub_submit(sdev); if (ret) return ret; @@ -1030,6 +1015,20 @@ again: sbio->spag[sbio->count].generation = gen; sbio->spag[sbio->count].have_csum = 0; sbio->spag[sbio->count].mirror_num = mirror_num; + + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; + } + if (csum) { sbio->spag[sbio->count].have_csum = 1; memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 57080dffdfc..e28ad4baf48 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -197,7 +197,7 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, - {Opt_no_space_cache, "no_space_cache"}, + {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -448,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, token = match_token(p, tokens, args); switch (token) { case Opt_subvol: + kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); break; case Opt_subvolid: @@ -710,7 +711,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); else - seq_puts(seq, ",no_space_cache"); + seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) @@ -824,13 +825,9 @@ static char *setup_root_args(char *args) static struct dentry *mount_subvol(const char *subvol_name, int flags, const char *device_name, char *data) { - struct super_block *s; struct dentry *root; struct vfsmount *mnt; - struct mnt_namespace *ns_private; char *newargs; - struct path path; - int error; newargs = setup_root_args(data); if (!newargs) @@ -841,39 +838,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, if (IS_ERR(mnt)) return ERR_CAST(mnt); - ns_private = create_mnt_ns(mnt); - if (IS_ERR(ns_private)) { - mntput(mnt); - return ERR_CAST(ns_private); - } - - /* - * This will trigger the automount of the subvol so we can just - * drop the mnt we have here and return the dentry that we - * found. - */ - error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name, - LOOKUP_FOLLOW, &path); - put_mnt_ns(ns_private); - if (error) - return ERR_PTR(error); + root = mount_subtree(mnt, subvol_name); - if (!is_subvolume_inode(path.dentry->d_inode)) { - path_put(&path); - mntput(mnt); - error = -EINVAL; + if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + struct super_block *s = root->d_sb; + dput(root); + root = ERR_PTR(-EINVAL); + deactivate_locked_super(s); printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", subvol_name); - return ERR_PTR(-EINVAL); } - /* Get a ref to the sb and the dentry we found and return it */ - s = path.mnt->mnt_sb; - atomic_inc(&s->s_active); - root = dget(path.dentry); - path_put(&path); - down_write(&s->s_umount); - return root; } @@ -890,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_root *tree_root = NULL; struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; @@ -904,8 +878,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, &subvol_rootid, &fs_devices); - if (error) + if (error) { + kfree(subvol_name); return ERR_PTR(error); + } if (subvol_name) { root = mount_subvol(subvol_name, flags, device_name, data); @@ -917,15 +893,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) return ERR_PTR(error); - error = btrfs_open_devices(fs_devices, mode, fs_type); - if (error) - return ERR_PTR(error); - - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } - /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need @@ -933,24 +900,36 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * then open_ctree will properly initialize everything later. */ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!fs_info || !tree_root) { + if (!fs_info) + return ERR_PTR(-ENOMEM); + + fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info->tree_root) { error = -ENOMEM; - goto error_close_devices; + goto error_fs_info; } - fs_info->tree_root = tree_root; + fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; - tree_root->fs_info = fs_info; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; + goto error_fs_info; + } + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_fs_info; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; goto error_close_devices; } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, + fs_info->tree_root); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; @@ -959,12 +938,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { deactivate_locked_super(s); - return ERR_PTR(-EBUSY); + error = -EBUSY; + goto error_close_devices; } btrfs_close_devices(fs_devices); free_fs_info(fs_info); - kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -991,8 +970,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); +error_fs_info: free_fs_info(fs_info); - kfree(tree_root); return ERR_PTR(error); } @@ -1078,7 +1057,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int i = 0, nr_devices; int ret; - nr_devices = fs_info->fs_devices->rw_devices; + nr_devices = fs_info->fs_devices->open_devices; BUG_ON(!nr_devices); devices_info = kmalloc(sizeof(*devices_info) * nr_devices, @@ -1100,8 +1079,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) else min_stripe_size = BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - if (!device->in_fs_metadata) + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->in_fs_metadata || !device->bdev) continue; avail_space = device->total_bytes - device->bytes_used; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 960835eaf4d..81376d94cd3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); + /* see comments in should_cow_block() */ + root->force_cow = 0; + smp_wmb(); + if (root->commit_root != root->node) { mutex_lock(&root->fs_commit_mutex); switch_commit_root(root); @@ -882,8 +886,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, + to_reserve); if (ret) { pending->error = ret; goto fail; @@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); + /* see comments in should_cow_block() */ + root->force_cow = 1; + smp_wmb(); + btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ key.offset = trans->transid; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f8e2943101a..0a8c8f8304b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -999,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; - +again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, @@ -1012,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); + key = found_key; + btrfs_release_path(path); + goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], @@ -1608,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab5b1c49f35..78f2d4d4f37 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -100,6 +100,12 @@ struct btrfs_device { struct reada_zone *reada_curr_zone; struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; + + /* for sending down flush barriers */ + struct bio *flush_bio; + struct completion flush_wait; + int nobarriers; + }; struct btrfs_fs_devices { |