author    | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-26 14:48:55 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-26 14:48:55 -0700
commit    | e2aed8dfa50bb061747eeb14e6af099554a03b76 (patch)
tree      | 900c96a2dfe7195e56ec3c1f027418029d0a8444 /fs/btrfs/extent-tree.c
parent    | 476525004ac7e2f990b6956efcd44d0780c2ab4c (diff)
parent    | b24baf6917a376420d535548e1f88744028bcf24 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull large btrfs update from Chris Mason:
"This pull request is very large, and the two main features in here
have been under testing/devel for quite a while.
We have subvolume quotas from the Strato developers. This enables
full tracking of how many blocks are allocated to each subvolume (and
all snapshots) and you can set limits on a per-subvolume basis. You
can also create quota groups and toss multiple subvolumes into a big
group. It's everything you need to be a web hosting company and give
each user their own subvolume.
The userland side of the quotas is being refreshed; they'll send out
details on where to grab it soon.
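As a concrete illustration of the quota interface, here is a hedged C sketch
that enables quota tracking and caps a subvolume's referenced space through
the new qgroup ioctls. BTRFS_IOC_QUOTA_CTL and BTRFS_IOC_QGROUP_LIMIT are the
interface merged here; the <btrfs/ioctl.h> header path, the qgroupid-0
shorthand for "the subvolume this fd lives in", and the 1 GiB figure are
assumptions for illustration only:

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <btrfs/ioctl.h>    /* assumed header location for the ioctl ABI */

    int main(int argc, char **argv)
    {
        struct btrfs_ioctl_quota_ctl_args ctl;
        struct btrfs_ioctl_qgroup_limit_args lim;
        int fs_fd, subvol_fd;

        if (argc != 3) {
            fprintf(stderr, "usage: %s <fs-root> <subvolume>\n", argv[0]);
            return 1;
        }
        fs_fd = open(argv[1], O_RDONLY);
        subvol_fd = open(argv[2], O_RDONLY);
        if (fs_fd < 0 || subvol_fd < 0)
            return 1;

        /* turn on quota tracking for the whole filesystem */
        memset(&ctl, 0, sizeof(ctl));
        ctl.cmd = BTRFS_QUOTA_CTL_ENABLE;
        if (ioctl(fs_fd, BTRFS_IOC_QUOTA_CTL, &ctl) < 0)
            perror("quota enable");

        /* cap referenced bytes; qgroupid 0 is assumed to mean the
         * subvolume that subvol_fd refers to */
        memset(&lim, 0, sizeof(lim));
        lim.lim.flags = BTRFS_QGROUP_LIMIT_MAX_RFER;
        lim.lim.max_rfer = 1ULL << 30;    /* 1 GiB */
        if (ioctl(subvol_fd, BTRFS_IOC_QGROUP_LIMIT, &lim) < 0)
            perror("qgroup limit");

        close(subvol_fd);
        close(fs_fd);
        return 0;
    }

Both ioctls require CAP_SYS_ADMIN; grouping several subvolumes under one
limit would go through the qgroup create/assign ioctls instead.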
Next is the kernel side of btrfs send/receive from Alexander Block.
This leverages the same infrastructure as the quota code to figure out
relationships between blocks and their owners. It can then compute
the difference between two snapshots and send the diffs in a neutral
format to userland.
The basic model:
create a snapshot
send that snapshot as the initial backup
make changes
create a second snapshot
send the incremental as a backup
delete the first snapshot
(use the second snapshot for the next incremental)
The receive portion is all in userland, and in the 'next' branch of my
btrfs-progs repo.
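To make the send model concrete, here is a hedged C sketch of a minimal full
send driven by the new BTRFS_IOC_SEND ioctl (introduced in the commit list
below). The struct usage follows the merged interface; the header path is an
assumption, and a real incremental send would also fill in parent_root with
the root id of the previous snapshot:

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <btrfs/ioctl.h>    /* assumed header location for the ioctl ABI */

    int main(int argc, char **argv)
    {
        struct btrfs_ioctl_send_args args;
        int snap_fd, out_fd;

        if (argc != 3) {
            fprintf(stderr, "usage: %s <ro-snapshot> <outfile>\n", argv[0]);
            return 1;
        }
        /* the fd handed to the ioctl is the snapshot's root directory */
        snap_fd = open(argv[1], O_RDONLY);
        out_fd = open(argv[2], O_CREAT | O_WRONLY | O_TRUNC, 0644);
        if (snap_fd < 0 || out_fd < 0)
            return 1;

        memset(&args, 0, sizeof(args));
        args.send_fd = out_fd;     /* the neutral-format stream goes here */
        args.parent_root = 0;      /* 0 = full send; set for incrementals */

        if (ioctl(snap_fd, BTRFS_IOC_SEND, &args) < 0) {
            perror("BTRFS_IOC_SEND");
            return 1;
        }
        close(out_fd);
        close(snap_fd);
        return 0;
    }

The resulting stream is what the userland receive code in btrfs-progs
replays on the target filesystem.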
There's still some work to do in terms of optimizing the send side
from kernel to userland. The really important part is figuring out
how two snapshots are different, and this is where we are
concentrating right now. The initial send of a dataset is a little
slower than tar, but the incremental sends are dramatically faster
than what rsync can do.
On top of all of that, we have a nice queue of fixes, cleanups and
optimizations."
Fix up trivial modify/del conflict in fs/btrfs/ioctl.c
Also fix up semantic conflict in fs/btrfs/send.c: the interface to
dentry_open() changed in commit 765927b2d508 ("switch dentry_open() to
struct path, make it grab references itself"), and since it now grabs
whatever references it needs, we should no longer do the mntget() on the
mnt (and we need to dput() the dentry reference we took).
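For readers following along, a schematic kernel-side fragment of the old and
new calling conventions (the helper and variable names are hypothetical, not
the actual send.c hunk):

    #include <linux/fs.h>
    #include <linux/fcntl.h>
    #include <linux/path.h>
    #include <linux/dcache.h>
    #include <linux/mount.h>
    #include <linux/cred.h>

    /* illustrative only: caller holds a dentry reference it took via dget() */
    static struct file *open_send_root(struct vfsmount *mnt,
                                       struct dentry *dentry)
    {
        struct path path;
        struct file *filp;

        /*
         * Before 765927b2d508 the call consumed both references:
         *     filp = dentry_open(dentry, mntget(mnt), O_RDONLY,
         *                        current_cred());
         *
         * Now dentry_open() takes a struct path and grabs its own
         * references, so there is no mntget() here ...
         */
        path.mnt = mnt;
        path.dentry = dentry;
        filp = dentry_open(&path, O_RDONLY, current_cred());

        /* ... and the dentry reference we took must be dropped by us. */
        dput(dentry);
        return filp;
    }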
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (65 commits)
Btrfs: uninit variable fixes in send/receive
Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive
Btrfs: add btrfs_compare_trees function
Btrfs: introduce subvol uuids and times
Btrfs: make iref_to_path non static
Btrfs: add a barrier before a waitqueue_active check
Btrfs: call the ordered free operation without any locks held
Btrfs: Check INCOMPAT flags on remount and add helper function
Btrfs: add helper for tree enumeration
btrfs: allow cross-subvolume file clone
Btrfs: improve multi-thread buffer read
Btrfs: make btrfs's allocation smoothly with preallocation
Btrfs: lock the transition from dirty to writeback for an eb
Btrfs: fix potential race in extent buffer freeing
Btrfs: don't return true in releasepage unless we actually freed the eb
Btrfs: suppress printk() if all device I/O stats are zero
Btrfs: remove unwanted printk() for btrfs device I/O stats
Btrfs: rewrite BTRFS_SETGET_FUNCS
Btrfs: zero unused bytes in inode item
Btrfs: kill free_space pointer from inode structure
...
Conflicts:
fs/btrfs/ioctl.c
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 358
1 file changed, 240 insertions, 118 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6e1d36702ff..4e1b153b7c4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,6 +34,8 @@
 #include "locking.h"
 #include "free-space-cache.h"
 
+#undef SCRAMBLE_DELAYED_REFS
+
 /*
  * control flags for do_chunk_alloc's force field
  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
@@ -2217,6 +2219,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	struct btrfs_delayed_extent_op *extent_op;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	int count = 0;
 	int must_insert_reserved = 0;
@@ -2255,7 +2258,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		ref = select_delayed_ref(locked_ref);
 
 		if (ref && ref->seq &&
-		    btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
 			/*
 			 * there are still refs with lower seq numbers in the
 			 * process of being added. Don't run this ref yet.
@@ -2337,7 +2340,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		}
 next:
-		do_chunk_alloc(trans, root->fs_info->extent_root,
+		do_chunk_alloc(trans, fs_info->extent_root,
 			       2 * 1024 * 1024,
 			       btrfs_get_alloc_profile(root, 0),
 			       CHUNK_ALLOC_NO_FORCE);
@@ -2347,21 +2350,99 @@ next:
 	return count;
 }
 
-static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
+			       struct btrfs_delayed_ref_root *delayed_refs,
 			       unsigned long num_refs,
 			       struct list_head *first_seq)
 {
 	spin_unlock(&delayed_refs->lock);
 	pr_debug("waiting for more refs (num %ld, first %p)\n",
 		 num_refs, first_seq);
-	wait_event(delayed_refs->seq_wait,
+	wait_event(fs_info->tree_mod_seq_wait,
 		   num_refs != delayed_refs->num_entries ||
-		   delayed_refs->seq_head.next != first_seq);
+		   fs_info->tree_mod_seq_list.next != first_seq);
 	pr_debug("done waiting for more refs (num %ld, first %p)\n",
-		 delayed_refs->num_entries, delayed_refs->seq_head.next);
+		 delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
 	spin_lock(&delayed_refs->lock);
 }
 
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_delayed_ref_node *entry;
+	int alt = 1;
+	u64 middle;
+	u64 first = 0, last = 0;
+
+	n = rb_first(root);
+	if (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		first = entry->bytenr;
+	}
+	n = rb_last(root);
+	if (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		last = entry->bytenr;
+	}
+	n = root->rb_node;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		middle = entry->bytenr;
+
+		if (alt)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+
+		alt = 1 - alt;
+	}
+	return middle;
+}
+#endif
+
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info)
+{
+	struct qgroup_update *qgroup_update;
+	int ret = 0;
+
+	if (list_empty(&trans->qgroup_ref_list) !=
+	    !trans->delayed_ref_elem.seq) {
+		/* list without seq or seq without list */
+		printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+			list_empty(&trans->qgroup_ref_list) ? "" : " not",
+			trans->delayed_ref_elem.seq);
+		BUG();
+	}
+
+	if (!trans->delayed_ref_elem.seq)
+		return 0;
+
+	while (!list_empty(&trans->qgroup_ref_list)) {
+		qgroup_update = list_first_entry(&trans->qgroup_ref_list,
+						 struct qgroup_update, list);
+		list_del(&qgroup_update->list);
+		if (!ret)
+			ret = btrfs_qgroup_account_ref(
+					trans, fs_info, qgroup_update->node,
+					qgroup_update->extent_op);
+		kfree(qgroup_update);
+	}
+
+	btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+	return ret;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far. count can be
@@ -2398,11 +2479,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
 		       CHUNK_ALLOC_NO_FORCE);
 
+	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
 	delayed_refs = &trans->transaction->delayed_refs;
 	INIT_LIST_HEAD(&cluster);
 again:
 	consider_waiting = 0;
 	spin_lock(&delayed_refs->lock);
+
+#ifdef SCRAMBLE_DELAYED_REFS
+	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+
 	if (count == 0) {
 		count = delayed_refs->num_entries * 2;
 		run_most = 1;
@@ -2437,7 +2525,7 @@ again:
 				num_refs = delayed_refs->num_entries;
 				first_seq = root->fs_info->tree_mod_seq_list.next;
 			} else {
-				wait_for_more_refs(delayed_refs,
+				wait_for_more_refs(root->fs_info, delayed_refs,
 						   num_refs, first_seq);
 				/*
 				 * after waiting, things have changed. we
@@ -2502,6 +2590,7 @@ again:
 	}
 out:
 	spin_unlock(&delayed_refs->lock);
+	assert_qgroups_uptodate(trans);
 	return 0;
 }
 
@@ -2581,8 +2670,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 
 	node = rb_prev(node);
 	if (node) {
+		int seq = ref->seq;
+
 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		if (ref->bytenr == bytenr)
+		if (ref->bytenr == bytenr && ref->seq == seq)
 			goto out_unlock;
 	}
 
@@ -2903,8 +2994,13 @@ again:
 	}
 
 	spin_lock(&block_group->lock);
-	if (block_group->cached != BTRFS_CACHE_FINISHED) {
-		/* We're not cached, don't bother trying to write stuff out */
+	if (block_group->cached != BTRFS_CACHE_FINISHED ||
+	    !btrfs_test_opt(root, SPACE_CACHE)) {
+		/*
+		 * don't bother trying to write stuff out _if_
+		 * a) we're not cached,
+		 * b) we're with nospace_cache mount option.
+		 */
 		dcs = BTRFS_DC_WRITTEN;
 		spin_unlock(&block_group->lock);
 		goto out_put;
@@ -3134,6 +3230,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	init_waitqueue_head(&found->wait);
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		info->data_sinfo = found;
 	return 0;
 }
 
@@ -3263,12 +3361,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 	return get_alloc_profile(root, flags);
 }
 
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
-	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-						       BTRFS_BLOCK_GROUP_DATA);
-}
-
 /*
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
@@ -3277,6 +3369,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 used;
 	int ret = 0, committed = 0, alloc_chunk = 1;
@@ -3289,7 +3382,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 		committed = 1;
 	}
 
-	data_sinfo = BTRFS_I(inode)->space_info;
+	data_sinfo = fs_info->data_sinfo;
 	if (!data_sinfo)
 		goto alloc;
@@ -3330,10 +3423,9 @@ alloc:
 			goto commit_trans;
 		}
 
-		if (!data_sinfo) {
-			btrfs_set_inode_space_info(root, inode);
-			data_sinfo = BTRFS_I(inode)->space_info;
-		}
+		if (!data_sinfo)
+			data_sinfo = fs_info->data_sinfo;
+
 		goto again;
 	}
@@ -3380,7 +3472,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	/* make sure bytes are sectorsize aligned */
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
-	data_sinfo = BTRFS_I(inode)->space_info;
+	data_sinfo = root->fs_info->data_sinfo;
 	spin_lock(&data_sinfo->lock);
 	data_sinfo->bytes_may_use -= bytes;
 	trace_btrfs_space_reservation(root->fs_info, "space_info",
@@ -3586,89 +3678,58 @@ out:
 /*
  * shrink metadata reservation for delalloc
  */
-static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
-			   bool wait_ordered)
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+			    bool wait_ordered)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_space_info *space_info;
 	struct btrfs_trans_handle *trans;
-	u64 reserved;
+	u64 delalloc_bytes;
 	u64 max_reclaim;
-	u64 reclaimed = 0;
 	long time_left;
 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 	int loops = 0;
-	unsigned long progress;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 
 	smp_mb();
-	reserved = space_info->bytes_may_use;
-	progress = space_info->reservation_progress;
-
-	if (reserved == 0)
-		return 0;
-
-	smp_mb();
-	if (root->fs_info->delalloc_bytes == 0) {
+	delalloc_bytes = root->fs_info->delalloc_bytes;
+	if (delalloc_bytes == 0) {
 		if (trans)
-			return 0;
+			return;
 		btrfs_wait_ordered_extents(root, 0, 0);
-		return 0;
+		return;
 	}
 
-	max_reclaim = min(reserved, to_reclaim);
-	nr_pages = max_t(unsigned long, nr_pages,
-			 max_reclaim >> PAGE_CACHE_SHIFT);
-	while (loops < 1024) {
-		/* have the flusher threads jump in and do some IO */
-		smp_mb();
-		nr_pages = min_t(unsigned long, nr_pages,
-		       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
+	while (delalloc_bytes && loops < 3) {
+		max_reclaim = min(delalloc_bytes, to_reclaim);
+		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
-						WB_REASON_FS_FREE_SPACE);
+					       WB_REASON_FS_FREE_SPACE);
 
 		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_may_use)
-			reclaimed += reserved - space_info->bytes_may_use;
-		reserved = space_info->bytes_may_use;
+		if (space_info->bytes_used + space_info->bytes_reserved +
+		    space_info->bytes_pinned + space_info->bytes_readonly +
+		    space_info->bytes_may_use + orig <=
+		    space_info->total_bytes) {
+			spin_unlock(&space_info->lock);
+			break;
+		}
 		spin_unlock(&space_info->lock);
 
 		loops++;
-
-		if (reserved == 0 || reclaimed >= max_reclaim)
-			break;
-
-		if (trans && trans->transaction->blocked)
-			return -EAGAIN;
-
 		if (wait_ordered && !trans) {
 			btrfs_wait_ordered_extents(root, 0, 0);
 		} else {
-			time_left = schedule_timeout_interruptible(1);
-
-			/* We were interrupted, exit */
+			time_left = schedule_timeout_killable(1);
 			if (time_left)
 				break;
 		}
-
-		/* we've kicked the IO a few times, if anything has been freed,
-		 * exit.  There is no sense in looping here for a long time
-		 * when we really need to commit the transaction, or there are
-		 * just too many writers without enough free space
-		 */
-
-		if (loops > 3) {
-			smp_mb();
-			if (progress != space_info->reservation_progress)
-				break;
-		}
-
+		smp_mb();
+		delalloc_bytes = root->fs_info->delalloc_bytes;
 	}
-
-	return reclaimed >= to_reclaim;
 }
 
 /**
@@ -3728,6 +3789,58 @@ commit:
 	return btrfs_commit_transaction(trans, root);
 }
 
+enum flush_state {
+	FLUSH_DELALLOC		=	1,
+	FLUSH_DELALLOC_WAIT	=	2,
+	FLUSH_DELAYED_ITEMS_NR	=	3,
+	FLUSH_DELAYED_ITEMS	=	4,
+	COMMIT_TRANS		=	5,
+};
+
+static int flush_space(struct btrfs_root *root,
+		       struct btrfs_space_info *space_info, u64 num_bytes,
+		       u64 orig_bytes, int state)
+{
+	struct btrfs_trans_handle *trans;
+	int nr;
+	int ret = 0;
+
+	switch (state) {
+	case FLUSH_DELALLOC:
+	case FLUSH_DELALLOC_WAIT:
+		shrink_delalloc(root, num_bytes, orig_bytes,
+				state == FLUSH_DELALLOC_WAIT);
+		break;
+	case FLUSH_DELAYED_ITEMS_NR:
+	case FLUSH_DELAYED_ITEMS:
+		if (state == FLUSH_DELAYED_ITEMS_NR) {
+			u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+			nr = (int)div64_u64(num_bytes, bytes);
+			if (!nr)
+				nr = 1;
+			nr *= 2;
+		} else {
+			nr = -1;
+		}
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		ret = btrfs_run_delayed_items_nr(trans, root, nr);
+		btrfs_end_transaction(trans, root);
+		break;
+	case COMMIT_TRANS:
+		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+		break;
+	default:
+		ret = -ENOSPC;
+		break;
+	}
+
+	return ret;
+}
 /**
  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
  * @root - the root we're allocating for
@@ -3749,11 +3862,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 	struct btrfs_space_info *space_info = block_rsv->space_info;
 	u64 used;
 	u64 num_bytes = orig_bytes;
-	int retries = 0;
+	int flush_state = FLUSH_DELALLOC;
 	int ret = 0;
-	bool committed = false;
 	bool flushing = false;
-	bool wait_ordered = false;
+	bool committed = false;
 
 again:
 	ret = 0;
@@ -3812,9 +3924,8 @@ again:
 		 * amount plus the amount of bytes that we need for this
 		 * reservation.
 		 */
-		wait_ordered = true;
 		num_bytes = used - space_info->total_bytes +
-			(orig_bytes * (retries + 1));
+			(orig_bytes * 2);
 	}
 
 	if (ret) {
@@ -3867,8 +3978,6 @@ again:
 			trace_btrfs_space_reservation(root->fs_info,
 				"space_info", space_info->flags, orig_bytes, 1);
 			ret = 0;
-		} else {
-			wait_ordered = true;
 		}
 	}
 
@@ -3887,36 +3996,13 @@ again:
 	if (!ret || !flush)
 		goto out;
 
-	/*
-	 * We do synchronous shrinking since we don't actually unreserve
-	 * metadata until after the IO is completed.
-	 */
-	ret = shrink_delalloc(root, num_bytes, wait_ordered);
-	if (ret < 0)
-		goto out;
-
-	ret = 0;
-
-	/*
-	 * So if we were overcommitted it's possible that somebody else flushed
-	 * out enough space and we simply didn't have enough space to reclaim,
-	 * so go back around and try again.
-	 */
-	if (retries < 2) {
-		wait_ordered = true;
-		retries++;
+	ret = flush_space(root, space_info, num_bytes, orig_bytes,
+			  flush_state);
+	flush_state++;
+	if (!ret)
 		goto again;
-	}
-
-	ret = -ENOSPC;
-	if (committed)
-		goto out;
-
-	ret = may_commit_transaction(root, space_info, orig_bytes, 0);
-	if (!ret) {
-		committed = true;
+	else if (flush_state <= COMMIT_TRANS)
 		goto again;
-	}
 
 out:
 	if (flushing) {
@@ -3934,7 +4020,10 @@ static struct btrfs_block_rsv *get_block_rsv(
 {
 	struct btrfs_block_rsv *block_rsv = NULL;
 
-	if (root->ref_cows || root == root->fs_info->csum_root)
+	if (root->ref_cows)
+		block_rsv = trans->block_rsv;
+
+	if (root == root->fs_info->csum_root && trans->adding_csums)
 		block_rsv = trans->block_rsv;
 
 	if (!block_rsv)
@@ -4286,6 +4375,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
+	if (!trans->block_rsv)
+		return;
+
 	if (!trans->bytes_reserved)
 		return;
 
@@ -4444,7 +4536,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	int ret;
 
 	/* Need to be holding the i_mutex here if we aren't free space cache */
-	if (btrfs_is_free_space_inode(root, inode))
+	if (btrfs_is_free_space_inode(inode))
 		flush = 0;
 
 	if (flush && btrfs_transaction_in_commit(root->fs_info))
@@ -4476,6 +4568,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
+	if (root->fs_info->quota_enabled) {
+		ret = btrfs_qgroup_reserve(root, num_bytes +
+					   nr_extents * root->leafsize);
+		if (ret)
+			return ret;
+	}
+
 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 	if (ret) {
 		u64 to_free = 0;
@@ -4554,6 +4653,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
 				      btrfs_ino(inode), to_free, 0);
 
+	if (root->fs_info->quota_enabled) {
+		btrfs_qgroup_free(root, num_bytes +
+					dropped * root->leafsize);
+	}
+
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
 				to_free);
 }
@@ -5190,8 +5294,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	rb_erase(&head->node.rb_node, &delayed_refs->root);
 
 	delayed_refs->num_entries--;
-	if (waitqueue_active(&delayed_refs->seq_wait))
-		wake_up(&delayed_refs->seq_wait);
+	smp_mb();
+	if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
+		wake_up(&root->fs_info->tree_mod_seq_wait);
 
 	/*
	 * we don't take a ref on the node because we're removing it from the
@@ -5748,7 +5853,11 @@ loop:
 			ret = do_chunk_alloc(trans, root, num_bytes +
 					     2 * 1024 * 1024, data,
 					     CHUNK_ALLOC_LIMITED);
-			if (ret < 0) {
+			/*
+			 * Do not bail out on ENOSPC since we
+			 * can do more things.
+			 */
+			if (ret < 0 && ret != -ENOSPC) {
 				btrfs_abort_transaction(trans,
 							root, ret);
 				goto out;
@@ -5816,13 +5925,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 again:
 	list_for_each_entry(cache, &info->block_groups[index], list) {
 		spin_lock(&cache->lock);
-		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
-		       "%llu pinned %llu reserved\n",
+		printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
 		       (unsigned long long)cache->key.objectid,
 		       (unsigned long long)cache->key.offset,
 		       (unsigned long long)btrfs_block_group_used(&cache->item),
 		       (unsigned long long)cache->pinned,
-		       (unsigned long long)cache->reserved);
+		       (unsigned long long)cache->reserved,
+		       cache->ro ? "[readonly]" : "");
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
@@ -7610,8 +7719,21 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		INIT_LIST_HEAD(&cache->list);
 		INIT_LIST_HEAD(&cache->cluster_list);
 
-		if (need_clear)
+		if (need_clear) {
+			/*
+			 * When we mount with old space cache, we need to
+			 * set BTRFS_DC_CLEAR and set dirty flag.
+			 *
+			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+			 *    truncate the old free space cache inode and
+			 *    setup a new one.
+			 * b) Setting 'dirty flag' makes sure that we flush
+			 *    the new space cache info onto disk.
+			 */
 			cache->disk_cache_state = BTRFS_DC_CLEAR;
+			if (btrfs_test_opt(root, SPACE_CACHE))
+				cache->dirty = 1;
+		}
 
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),