diff options
Diffstat (limited to 'fs/btrfs/extent-tree.c')
| -rw-r--r-- | fs/btrfs/extent-tree.c | 4214 |
1 files changed, 2808 insertions, 1406 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 283af7a676a..813537f362f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -24,15 +24,20 @@ #include <linux/kthread.h> #include <linux/slab.h> #include <linux/ratelimit.h> -#include "compat.h" +#include <linux/percpu_counter.h> #include "hash.h" -#include "ctree.h" +#include "tree-log.h" #include "disk-io.h" #include "print-tree.h" -#include "transaction.h" #include "volumes.h" +#include "raid56.h" #include "locking.h" #include "free-space-cache.h" +#include "math.h" +#include "sysfs.h" +#include "qgroup.h" + +#undef SCRAMBLE_DELAYED_REFS /* * control flags for do_chunk_alloc's force field @@ -69,15 +74,15 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op); + struct btrfs_delayed_extent_op *extra_op, + int no_quota); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -90,22 +95,29 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins); + int level, struct btrfs_key *ins, + int no_quota); static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force); + struct btrfs_root *extent_root, u64 flags, + int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); + u64 num_bytes, int reserve, + int delalloc); +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) { smp_mb(); - return cache->cached == BTRFS_CACHE_FINISHED; + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; } static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) @@ -159,6 +171,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, rb_link_node(&block_group->cache_node, parent, p); rb_insert_color(&block_group->cache_node, &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->key.objectid) + info->first_logical_byte = block_group->key.objectid; + spin_unlock(&info->block_group_cache_lock); return 0; @@ -200,8 +216,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, break; } } - if (ret) + if (ret) { btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) + info->first_logical_byte = ret->key.objectid; + } spin_unlock(&info->block_group_cache_lock); return ret; @@ -245,7 +264,8 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); + if (ret) + return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -253,13 +273,35 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); + if (ret) + return ret; while (nr--) { - cache->bytes_super += stripe_len; - ret = add_excluded_extent(root, logical[nr], - stripe_len); - BUG_ON(ret); + u64 start, len; + + if (logical[nr] > cache->key.objectid + + cache->key.offset) + continue; + + if (logical[nr] + stripe_len <= cache->key.objectid) + continue; + + start = logical[nr]; + if (start < cache->key.objectid) { + start = cache->key.objectid; + len = (logical[nr] + stripe_len) - start; + } else { + len = min_t(u64, stripe_len, + cache->key.objectid + + cache->key.offset - start); + } + + cache->bytes_super += len; + ret = add_excluded_extent(root, start, len); + if (ret) { + kfree(logical); + return ret; + } } kfree(logical); @@ -310,7 +352,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, while (start < end) { ret = find_first_extent_bit(info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE); + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL); if (ret) break; @@ -321,7 +364,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ start = extent_end + 1; } else { break; @@ -332,7 +375,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, size = end - start; total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ } return total_added; @@ -350,7 +393,7 @@ static noinline void caching_thread(struct btrfs_work *work) u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = 0; + int ret = -ENOMEM; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; @@ -379,8 +422,9 @@ static noinline void caching_thread(struct btrfs_work *work) again: mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->extent_commit_sem); + down_read(&fs_info->commit_root_sem); +next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto err; @@ -402,19 +446,35 @@ again: break; if (need_resched() || - btrfs_next_leaf(extent_root, path)) { + rwsem_is_contended(&fs_info->commit_root_sem)) { caching_ctl->progress = last; btrfs_release_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); goto again; } + + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto err; + if (ret) + break; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); continue; } + if (key.objectid < last) { + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + + caching_ctl->progress = last; + btrfs_release_path(path); + goto next; + } + if (key.objectid < block_group->key.objectid) { path->slots[0]++; continue; @@ -424,11 +484,16 @@ again: block_group->key.offset) break; - if (key.type == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { total_found += add_new_free_space(block_group, fs_info, last, key.objectid); - last = key.objectid + key.offset; + if (key.type == BTRFS_METADATA_ITEM_KEY) + last = key.objectid + + fs_info->tree_root->leafsize; + else + last = key.objectid + key.offset; if (total_found > (1024 * 1024 * 2)) { total_found = 0; @@ -451,12 +516,18 @@ again: err: btrfs_free_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); free_excluded_extents(extent_root, block_group); mutex_unlock(&caching_ctl->mutex); out: + if (ret) { + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_ERROR; + spin_unlock(&block_group->lock); + } wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); @@ -464,8 +535,6 @@ out: } static int cache_block_group(struct btrfs_block_group_cache *cache, - struct btrfs_trans_handle *trans, - struct btrfs_root *root, int load_cache_only) { DEFINE_WAIT(wait); @@ -474,7 +543,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, int ret = 0; caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); + if (!caching_ctl) + return -ENOMEM; INIT_LIST_HEAD(&caching_ctl->list); mutex_init(&caching_ctl->mutex); @@ -482,7 +552,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; atomic_set(&caching_ctl->count, 1); - caching_ctl->work.func = caching_thread; + btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -522,15 +592,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - /* - * We can't do the read from on-disk cache during a commit since we need - * to have the normal tree locking. Also if we are currently trying to - * allocate blocks for the tree root we can't do the fast caching since - * we likely hold important locks. - */ - if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root) && - btrfs_test_opt(root, SPACE_CACHE)) { + if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); @@ -574,14 +636,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, return 0; } - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); btrfs_get_block_group(cache); - btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); return ret; } @@ -647,73 +709,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - do_div(num, 100); - return num; -} - -u64 btrfs_find_block_group(struct btrfs_root *root, - u64 search_start, u64 search_hint, int owner) -{ - struct btrfs_block_group_cache *cache; - u64 used; - u64 last = max(search_hint, search_start); - u64 group_start = 0; - int full_search = 0; - int factor = 9; - int wrapped = 0; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - if (!cache) - break; - - spin_lock(&cache->lock); - last = cache->key.objectid + cache->key.offset; - used = btrfs_block_group_used(&cache->item); - - if ((full_search || !cache->ro) && - block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { - if (used + cache->pinned + cache->reserved < - div_factor(cache->key.offset, factor)) { - group_start = cache->key.objectid; - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - goto found; - } - } - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - cond_resched(); - } - if (!wrapped) { - last = search_start; - wrapped = 1; - goto again; - } - if (!full_search && factor < 10) { - last = search_start; - full_search = 1; - factor = 10; - goto again; - } -found: - return group_start; -} - /* simple helper to search for an existing extent at a given offset */ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) { @@ -727,15 +722,21 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) key.objectid = start; key.offset = len; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 0, 0); + if (ret > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == start && + key.type == BTRFS_METADATA_ITEM_KEY) + ret = 0; + } btrfs_free_path(path); return ret; } /* - * helper function to lookup reference count and flags of extent. + * helper function to lookup reference count and flags of a tree block. * * the head node for delayed ref is used to store the sum of all the * reference count modifications queued up in the rbtree. the head @@ -745,7 +746,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags) { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; @@ -758,23 +759,57 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 extent_flags; int ret; + /* + * If we don't have skinny metadata, don't bother doing anything + * different + */ + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { + offset = root->leafsize; + metadata = 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; if (!trans) { path->skip_locking = 1; path->search_commit_root = 1; } + +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + again: ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out_free; + if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == root->leafsize) + ret = 0; + } + if (ret) { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = root->leafsize; + btrfs_release_path(path); + goto again; + } + } + if (ret == 0) { leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -823,14 +858,16 @@ again: mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - goto again; + goto search_again; } + spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; else BUG_ON(num_refs == 0); num_refs += head->node.ref_mod; + spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); @@ -982,7 +1019,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; - BUG_ON(ret > 0); + BUG_ON(ret > 0); /* Corruption */ leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, @@ -1008,9 +1045,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, new_size + extra_size, 1); if (ret < 0) return ret; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ - ret = btrfs_extend_item(trans, root, path, new_size); + btrfs_extend_item(root, path, new_size); leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1040,11 +1077,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -1237,7 +1274,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop) + int refs_to_drop, int *last_ref) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -1273,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); + *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1462,6 +1500,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int want; int ret; int err = 0; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1473,12 +1513,54 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, path->keep_locks = 1; } else extra_size = -1; + + /* + * Owner is our parent level, so we can just add one to get the level + * for the block we are interested in. + */ + if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner; + } + +again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); if (ret < 0) { err = ret; goto out; } - BUG_ON(ret); + + /* + * We may be a newly converted file system which still has the old fat + * extent entries for metadata, so try and see if we have one of those. + */ + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + if (ret) { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + goto again; + } + } + + if (ret && !insert) { + err = -ENOENT; + goto out; + } else if (WARN_ON(ret)) { + err = -EIO; + goto out; + } leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -1506,11 +1588,9 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); } err = -ENOENT; @@ -1592,13 +1672,12 @@ out: * helper to add new inline back ref */ static noinline_for_stack -int setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) +void setup_inline_extent_backref(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1608,7 +1687,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, u64 refs; int size; int type; - int ret; leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1617,7 +1695,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - ret = btrfs_extend_item(trans, root, path, size); + btrfs_extend_item(root, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1652,7 +1730,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } btrfs_mark_buffer_dirty(leaf); - return 0; } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1687,12 +1764,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, * helper to update/remove inline back ref */ static noinline_for_stack -int update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) +void update_inline_extent_backref(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op, + int *last_ref) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1703,7 +1780,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, u32 item_size; int size; int type; - int ret; u64 refs; leaf = path->nodes[0]; @@ -1737,6 +1813,7 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { + *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1745,10 +1822,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - ret = btrfs_truncate_item(trans, root, path, item_size, 1); + btrfs_truncate_item(root, path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); - return 0; } static noinline_for_stack @@ -1768,13 +1844,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, root_objectid, owner, offset, 1); if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - ret = update_inline_extent_backref(trans, root, path, iref, - refs_to_add, extent_op); + update_inline_extent_backref(root, path, iref, + refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { - ret = setup_inline_extent_backref(trans, root, path, iref, - parent, root_objectid, - owner, offset, refs_to_add, - extent_op); + setup_inline_extent_backref(root, path, iref, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + ret = 0; } return ret; } @@ -1802,17 +1878,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) + int refs_to_drop, int is_data, int *last_ref) { - int ret; + int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { - ret = update_inline_extent_backref(trans, root, path, iref, - -refs_to_drop, NULL); + update_inline_extent_backref(root, path, iref, + -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, + last_ref); } else { + *last_ref = 1; ret = btrfs_del_item(trans, root, path); } return ret; @@ -1833,8 +1911,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); + /* Error condition is -ENOMEM */ if (!ret) { struct btrfs_bio_stripe *stripe = bbio->stripes; int i; @@ -1850,7 +1929,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, if (!ret) discarded_bytes += stripe->length; else if (ret != -EOPNOTSUPP) - break; + break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ /* * Just in case we get back EOPNOTSUPP for some reason, @@ -1866,13 +1945,17 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, *actual_bytes = discarded_bytes; + if (ret == -EOPNOTSUPP) + ret = 0; return ret; } +/* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow) + u64 root_objectid, u64 owner, u64 offset, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1884,12 +1967,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } return ret; } @@ -1899,37 +1982,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, + int no_quota, struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_extent_item *item; + struct btrfs_key key; u64 refs; int ret; - int err = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) + no_quota = 1; + path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, num_bytes, parent, + ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, + bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if (ret == 0) + if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) goto out; + /* + * Ok we were able to insert an inline extent and it appears to be a new + * reference, deal with the qgroup accounting. + */ + if (!ret && !no_quota) { + ASSERT(root->fs_info->quota_enabled); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) + type = BTRFS_QGROUP_OPER_ADD_SHARED; + btrfs_release_path(path); - if (ret != -EAGAIN) { - err = ret; + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); goto out; } + /* + * Ok we had -EAGAIN which means we didn't have space to insert and + * inline extent ref, so just update the reference count and add a + * normal backref. + */ leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); + if (refs) + type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -1937,17 +2047,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + if (ret) + goto out; + } + path->reada = 1; path->leave_spinning = 1; - /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); out: btrfs_free_path(path); - return err; + return ret; } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, @@ -1968,16 +2085,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); + trace_run_delayed_data_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - if (extent_op) { - BUG_ON(extent_op->update_key); + if (extent_op) flags |= extent_op->flags_to_set; - } ret = alloc_reserved_file_extent(trans, root, parent, ref_root, flags, ref->objectid, ref->offset, @@ -1987,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + node->no_quota, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + extent_op, node->no_quota); } else { BUG(); } @@ -2030,15 +2146,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; + int metadata = !extent_op->is_data; + + if (trans->aborted) + return 0; + + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + metadata = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = node->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; + if (metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = extent_op->level; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + } + +again: path->reada = 1; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, @@ -2048,8 +2178,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, goto out; } if (ret > 0) { - err = -EIO; - goto out; + if (metadata) { + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == node->bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == node->num_bytes) + ret = 0; + } + if (ret > 0) { + btrfs_release_path(path); + metadata = 0; + + key.objectid = node->bytenr; + key.offset = node->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } + } else { + err = -EIO; + goto out; + } } leaf = path->nodes[0]; @@ -2087,34 +2238,44 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_key ins; u64 parent = 0; u64 ref_root = 0; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); + trace_run_delayed_tree_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; + + ins.objectid = node->bytenr; + if (skinny_metadata) { + ins.offset = ref->level; + ins.type = BTRFS_METADATA_ITEM_KEY; + } else { + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + } BUG_ON(node->ref_mod != 1); if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - BUG_ON(!extent_op || !extent_op->update_flags || - !extent_op->update_key); + BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, root, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins); + ref->level, &ins, + node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, node->no_quota, + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, extent_op, + node->no_quota); } else { BUG(); } @@ -2128,7 +2289,15 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op, int insert_reserved) { - int ret; + int ret = 0; + + if (trans->aborted) { + if (insert_reserved) + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); + return 0; + } + if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; /* @@ -2139,6 +2308,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); + trace_run_delayed_ref_head(node, head, node->action); + if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, node->num_bytes, 1); @@ -2146,11 +2317,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); - BUG_ON(ret); } } - mutex_unlock(&head->mutex); - return 0; + return ret; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -2170,59 +2339,62 @@ static noinline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: + struct btrfs_delayed_ref_node *ref, *last = NULL;; + /* * select delayed ref of type BTRFS_ADD_DELAYED_REF first. * this prevents ref count from going down to zero when * there still are pending delayed ref. */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; + node = rb_first(&head->ref_root); + while (node) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (ref->action == action) + if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - node = rb_prev(node); - } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; + else if (last == NULL) + last = ref; + node = rb_next(node); } - return NULL; + return last; } -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + unsigned long nr) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; + struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); int ret; - int count = 0; + unsigned long count = 0; + unsigned long actual_count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; while (1) { if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) + if (count >= nr) break; - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); + spin_lock(&delayed_refs->lock); + locked_ref = btrfs_select_ref_head(trans); + if (!locked_ref) { + spin_unlock(&delayed_refs->lock); + break; + } /* grab the lock that says we are going to process * all the refs for this head */ ret = btrfs_delayed_ref_lock(trans, locked_ref); - + spin_unlock(&delayed_refs->lock); /* * we may have dropped the spin lock to get the head * mutex lock, and that might have given someone else @@ -2237,24 +2409,33 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + */ + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); + + /* * locked_ref is the head node, so we have to go one * node back for any delayed ref updates */ ref = select_delayed_ref(locked_ref); if (ref && ref->seq && - btrfs_check_delayed_seq(delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. - */ - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - locked_ref = NULL; + btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { + spin_unlock(&locked_ref->lock); + btrfs_delayed_ref_unlock(locked_ref); + spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + locked_ref = NULL; cond_resched(); - spin_lock(&delayed_refs->lock); + count++; continue; } @@ -2269,6 +2450,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, locked_ref->extent_op = NULL; if (!ref) { + + /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, * so that any accounting fixes can happen @@ -2276,69 +2459,308 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref = &locked_ref->node; if (extent_op && must_insert_reserved) { - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); extent_op = NULL; } if (extent_op) { - spin_unlock(&delayed_refs->lock); - + spin_unlock(&locked_ref->lock); ret = run_delayed_extent_op(trans, root, ref, extent_op); - BUG_ON(ret); - kfree(extent_op); - - goto next; + btrfs_free_delayed_extent_op(extent_op); + + if (ret) { + /* + * Need to reset must_insert_reserved if + * there was an error so the abort stuff + * can cleanup the reserved space + * properly. + */ + if (must_insert_reserved) + locked_ref->must_insert_reserved = 1; + locked_ref->processing = 0; + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + btrfs_delayed_ref_unlock(locked_ref); + return ret; + } + continue; } - list_del_init(&locked_ref->cluster); - locked_ref = NULL; + /* + * Need to drop our head ref lock and re-aqcuire the + * delayed ref lock and then re-check to make sure + * nobody got added. + */ + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&locked_ref->lock); + if (rb_first(&locked_ref->ref_root) || + locked_ref->extent_op) { + spin_unlock(&locked_ref->lock); + spin_unlock(&delayed_refs->lock); + continue; + } + ref->in_tree = 0; + delayed_refs->num_heads--; + rb_erase(&locked_ref->href_node, + &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + } else { + actual_count++; + ref->in_tree = 0; + rb_erase(&ref->rb_node, &locked_ref->ref_root); } + atomic_dec(&delayed_refs->num_entries); - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; - /* - * we modified num_entries, but as we're currently running - * delayed refs, skip - * wake_up(&delayed_refs->seq_wait); - * here. - */ - spin_unlock(&delayed_refs->lock); + if (!btrfs_delayed_ref_is_head(ref)) { + /* + * when we play the delayed ref, also correct the + * ref_mod on head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->node.ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->node.ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); + } + } + spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); - BUG_ON(ret); + btrfs_free_delayed_extent_op(extent_op); + if (ret) { + locked_ref->processing = 0; + btrfs_delayed_ref_unlock(locked_ref); + btrfs_put_delayed_ref(ref); + btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); + return ret; + } + + /* + * If this node is a head, that means all the refs in this head + * have been dealt with, and we will pick the next head to deal + * with, so we must unlock the head and drop it from the cluster + * list before we release it. + */ + if (btrfs_delayed_ref_is_head(ref)) { + btrfs_delayed_ref_unlock(locked_ref); + locked_ref = NULL; + } btrfs_put_delayed_ref(ref); - kfree(extent_op); count++; -next: - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, - btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); cond_resched(); + } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. + */ spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + avg = div64_u64(avg, 4); + fs_info->avg_delayed_ref_runtime = avg; + spin_unlock(&delayed_refs->lock); } - return count; + return 0; } +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int alt = 1; + u64 middle; + u64 first = 0, last = 0; -static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, - unsigned long num_refs) + n = rb_first(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + first = entry->bytenr; + } + n = rb_last(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + last = entry->bytenr; + } + n = root->rb_node; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + + middle = entry->bytenr; + + if (alt) + n = n->rb_left; + else + n = n->rb_right; + + alt = 1 - alt; + } + return middle; +} +#endif + +static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) +{ + u64 num_bytes; + + num_bytes = heads * (sizeof(struct btrfs_extent_item) + + sizeof(struct btrfs_extent_inline_ref)); + if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + num_bytes += heads * sizeof(struct btrfs_tree_block_info); + + /* + * We don't ever fill up leaves all the way so multiply by 2 just to be + * closer to what we're really going to want to ouse. + */ + return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct list_head *first_seq = delayed_refs->seq_head.next; + struct btrfs_block_rsv *global_rsv; + u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; + u64 num_bytes; + int ret = 0; - spin_unlock(&delayed_refs->lock); - pr_debug("waiting for more refs (num %ld, first %p)\n", - num_refs, first_seq); - wait_event(delayed_refs->seq_wait, - num_refs != delayed_refs->num_entries || - delayed_refs->seq_head.next != first_seq); - pr_debug("done waiting for more refs (num %ld, first %p)\n", - delayed_refs->num_entries, delayed_refs->seq_head.next); - spin_lock(&delayed_refs->lock); + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + num_heads = heads_to_leaves(root, num_heads); + if (num_heads > 1) + num_bytes += (num_heads - 1) * root->leafsize; + num_bytes <<= 1; + global_rsv = &root->fs_info->global_block_rsv; + + /* + * If we can't allocate any more chunks lets make sure we have _lots_ of + * wiggle room since running delayed refs can create more delayed refs. + */ + if (global_rsv->space_info->full) + num_bytes <<= 1; + + spin_lock(&global_rsv->lock); + if (global_rsv->reserved <= num_bytes) + ret = 1; + spin_unlock(&global_rsv->lock); + return ret; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + u64 val; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + +struct async_delayed_refs { + struct btrfs_root *root; + int count; + int error; + int sync; + struct completion wait; + struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ + struct async_delayed_refs *async; + struct btrfs_trans_handle *trans; + int ret; + + async = container_of(work, struct async_delayed_refs, work); + + trans = btrfs_join_transaction(async->root); + if (IS_ERR(trans)) { + async->error = PTR_ERR(trans); + goto done; + } + + /* + * trans->sync means that when we call end_transaciton, we won't + * wait on delayed refs + */ + trans->sync = true; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); + if (ret) + async->error = ret; + + ret = btrfs_end_transaction(trans, async->root); + if (ret && !async->error) + async->error = ret; +done: + if (async->sync) + complete(&async->wait); + else + kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait) +{ + struct async_delayed_refs *async; + int ret; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->root = root->fs_info->tree_root; + async->count = count; + async->error = 0; + if (wait) + async->sync = 1; + else + async->sync = 0; + init_completion(&async->wait); + + btrfs_init_work(&async->work, delayed_ref_async_start, + NULL, NULL); + + btrfs_queue_work(root->fs_info->extent_workers, &async->work); + + if (wait) { + wait_for_completion(&async->wait); + ret = async->error; + kfree(async); + return ret; + } + return 0; } /* @@ -2347,104 +2769,62 @@ static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, * 0, which means to process everything in the tree at the start * of the run (but not newly added entries), or it can be some target * number you'd like to process. + * + * Returns 0 on success or if called with an aborted transaction + * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct list_head cluster; + struct btrfs_delayed_ref_head *head; int ret; - u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - unsigned long num_refs = 0; - int consider_waiting; + + /* We'll clean this up in btrfs_cleanup_transaction */ + if (trans->aborted) + return 0; if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); - delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); -again: - consider_waiting = 0; - spin_lock(&delayed_refs->lock); if (count == 0) { - count = delayed_refs->num_entries * 2; + count = atomic_read(&delayed_refs->num_entries) * 2; run_most = 1; } - while (1) { - if (!(run_all || run_most) && - delayed_refs->num_heads_ready < 64) - break; - - /* - * go find something we can process in the rbtree. We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - delayed_start = delayed_refs->run_delayed_start; - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - if (delayed_start >= delayed_refs->run_delayed_start) { - if (consider_waiting == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked) and if the number of - * refs doesn't change, we avoid busy waiting. - */ - consider_waiting = 1; - num_refs = delayed_refs->num_entries; - } else { - wait_for_more_refs(delayed_refs, num_refs); - /* - * after waiting, things have changed. we - * dropped the lock and someone else might have - * run some refs, built new clusters and so on. - * therefore, we restart staleness detection. - */ - consider_waiting = 0; - } - } - - ret = run_clustered_refs(trans, root, &cluster); - BUG_ON(ret < 0); - - count -= min_t(unsigned long, ret, count); - if (count == 0) - break; - - if (ret || delayed_refs->run_delayed_start == 0) { - /* refs were run, let's reset staleness detection */ - consider_waiting = 0; - } +again: +#ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif + ret = __btrfs_run_delayed_refs(trans, root, count); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + return ret; } if (run_all) { - node = rb_first(&delayed_refs->root); - if (!node) + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + spin_lock(&delayed_refs->lock); + node = rb_first(&delayed_refs->href_root); + if (!node) { + spin_unlock(&delayed_refs->lock); goto out; + } count = (unsigned long)-1; while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_is_head(&head->node)) { + struct btrfs_delayed_ref_node *ref; - head = btrfs_delayed_node_to_head(ref); + ref = &head->node; atomic_inc(&ref->refs); spin_unlock(&delayed_refs->lock); @@ -2458,27 +2838,32 @@ again: btrfs_put_delayed_ref(ref); cond_resched(); goto again; + } else { + WARN_ON(1); } node = rb_next(node); } spin_unlock(&delayed_refs->lock); - schedule_timeout(1); + cond_resched(); goto again; } out: - spin_unlock(&delayed_refs->lock); + ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); + if (ret) + return ret; + assert_qgroups_uptodate(trans); return 0; } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data) + int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; int ret; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) return -ENOMEM; @@ -2486,11 +2871,12 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->update_key = 0; extent_op->is_data = is_data ? 1 : 0; + extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, num_bytes, extent_op); if (ret) - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); return ret; } @@ -2506,12 +2892,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct rb_node *node; int ret = 0; - ret = -ENOENT; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; + if (!head) { + spin_unlock(&delayed_refs->lock); + return 0; + } if (!mutex_trylock(&head->mutex)) { atomic_inc(&head->node.refs); @@ -2528,38 +2915,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return -EAGAIN; } + spin_unlock(&delayed_refs->lock); - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; + spin_lock(&head->lock); + node = rb_first(&head->ref_root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } - data_ref = btrfs_delayed_node_to_data_ref(ref); + data_ref = btrfs_delayed_node_to_data_ref(ref); - node = rb_prev(node); - if (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr) - goto out_unlock; + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: + spin_unlock(&head->lock); mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); return ret; } @@ -2584,7 +2968,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = -ENOENT; if (path->slots[0] == 0) @@ -2673,7 +3057,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int for_cow) + int full_backref, int inc, int no_quota) { u64 bytenr; u64 num_bytes; @@ -2688,11 +3072,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, int); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!root->ref_cows && level == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; if (inc) @@ -2723,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, for_cow); + key.offset, no_quota); if (ret) goto fail; } else { @@ -2731,27 +3119,26 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, - for_cow); + no_quota); if (ret) goto fail; } } return 0; fail: - BUG(); return ret; } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -2767,7 +3154,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); if (ret < 0) goto fail; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -2775,8 +3162,10 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); fail: - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); return ret; + } return 0; } @@ -2860,32 +3249,41 @@ again: WARN_ON(ret); if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, path, - inode); + ret = btrfs_check_trunc_cache_free_space(root, + &root->fs_info->global_block_rsv); + if (ret) + goto out_put; + + ret = btrfs_truncate_free_space_cache(root, trans, inode); if (ret) goto out_put; } spin_lock(&block_group->lock); - if (block_group->cached != BTRFS_CACHE_FINISHED) { - /* We're not cached, don't bother trying to write stuff out */ + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(root, SPACE_CACHE) || + block_group->delalloc_bytes) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option. + */ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); goto out_put; } spin_unlock(&block_group->lock); - num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); + /* + * Try to preallocate enough space based on how big the block group is. + * Keep in mind this has to include any pinned space which could end up + * taking up quite a bit since it's not folded into the other space + * cache. + */ + num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); if (!num_pages) num_pages = 1; - /* - * Just to make absolutely sure we have enough space, we're going to - * preallocate 12 pages worth of space for each block group. In - * practice we ought to use at most 8, but we need extra space so we can - * add our header and have a terminator between the extents and the - * bitmaps. - */ num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; @@ -2949,7 +3347,8 @@ again: if (last == 0) { err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(err); + if (err) /* File system offline */ + goto out; } cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -2976,8 +3375,9 @@ again: last = cache->key.objectid + cache->key.offset; err = write_one_cache_group(trans, root, path, cache); - BUG_ON(err); btrfs_put_block_group(cache); + if (err) /* File system offline */ + goto out; } while (1) { @@ -2989,7 +3389,8 @@ again: if (last == 0) { err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(err); + if (err) /* File system offline */ + goto out; } cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -3014,20 +3415,21 @@ again: continue; } - btrfs_write_out_cache(root, trans, cache, path); + err = btrfs_write_out_cache(root, trans, cache, path); /* * If we didn't have an error then the cache state is still * NEED_WRITE, so we can set it to WRITTEN. */ - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) cache->disk_cache_state = BTRFS_DC_WRITTEN; last = cache->key.objectid + cache->key.offset; btrfs_put_block_group(cache); } +out: btrfs_free_path(path); - return 0; + return err; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -3043,6 +3445,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, struct btrfs_space_info **space_info) @@ -3050,6 +3469,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, struct btrfs_space_info *found; int i; int factor; + int ret; if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -3073,6 +3493,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; + ret = percpu_counter_init(&found->total_bytes_pinned, 0); + if (ret) { + kfree(found); + return ret; + } + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); @@ -3091,25 +3517,64 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + + ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(found->flags)); + if (ret) { + kfree(found); + return ret; + } + *space_info = found; list_add_rcu(&found->list, &info->space_info); - return 0; + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = found; + + return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +/* + * returns target flags in extended format or 0 if restripe for this + * chunk_type is not in progress + * + * should be called with either volume_mutex or balance_lock held + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + return target; } /* @@ -3119,7 +3584,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * progress (either running or paused) picks the target profile (if it's * already available), otherwise falls back to plain reducing. */ -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { /* * we add in the count of missing devices because we want @@ -3128,72 +3593,68 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) */ u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + u64 target; + u64 tmp; - /* pick restriper's target profile if it's available */ + /* + * see if restripe for this chunk_type is in progress, if so + * try to reduce to the target profile + */ spin_lock(&root->fs_info->balance_lock); - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && - (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->data.target)) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && - (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->sys.target)) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && - (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->meta.target)) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - if (tgt) { + target = get_restripe_target(root->fs_info, flags); + if (target) { + /* pick target profile only if it's already available */ + if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { spin_unlock(&root->fs_info->balance_lock); - flags = tgt; - goto out; + return extended_to_chunk(target); } } spin_unlock(&root->fs_info->balance_lock); + /* First, mask out the RAID levels which aren't possible */ if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5); + if (num_devices < 3) + flags &= ~BTRFS_BLOCK_GROUP_RAID6; if (num_devices < 4) flags &= ~BTRFS_BLOCK_GROUP_RAID10; - if ((flags & BTRFS_BLOCK_GROUP_DUP) && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) { - flags &= ~BTRFS_BLOCK_GROUP_DUP; - } + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); + flags &= ~tmp; - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && - (flags & BTRFS_BLOCK_GROUP_RAID10)) { - flags &= ~BTRFS_BLOCK_GROUP_RAID1; - } + if (tmp & BTRFS_BLOCK_GROUP_RAID6) + tmp = BTRFS_BLOCK_GROUP_RAID6; + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) + tmp = BTRFS_BLOCK_GROUP_RAID5; + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) + tmp = BTRFS_BLOCK_GROUP_RAID10; + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) + tmp = BTRFS_BLOCK_GROUP_RAID1; + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) + tmp = BTRFS_BLOCK_GROUP_RAID0; - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && - ((flags & BTRFS_BLOCK_GROUP_RAID1) | - (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) { - flags &= ~BTRFS_BLOCK_GROUP_RAID0; - } - -out: - /* extended -> chunk profile */ - flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return flags; + return extended_to_chunk(flags | tmp); } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits; + unsigned seq; + u64 flags; + + do { + flags = orig_flags; + seq = read_seqbegin(&root->fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); return btrfs_reduce_alloc_profile(root, flags); } @@ -3201,6 +3662,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; + u64 ret; if (data) flags = BTRFS_BLOCK_GROUP_DATA; @@ -3209,13 +3671,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) else flags = BTRFS_BLOCK_GROUP_METADATA; - return get_alloc_profile(root, flags); -} - -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) -{ - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - BTRFS_BLOCK_GROUP_DATA); + ret = get_alloc_profile(root, flags); + return ret; } /* @@ -3226,19 +3683,19 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 used; int ret = 0, committed = 0, alloc_chunk = 1; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { - alloc_chunk = 0; + if (btrfs_is_free_space_inode(inode)) { committed = 1; + ASSERT(current->journal_info); } - data_sinfo = BTRFS_I(inode)->space_info; + data_sinfo = fs_info->data_sinfo; if (!data_sinfo) goto alloc; @@ -3263,12 +3720,21 @@ again: spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = do_chunk_alloc(trans, root->fs_info->extent_root, - bytes + 2 * 1024 * 1024, alloc_target, CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); @@ -3279,18 +3745,18 @@ alloc: goto commit_trans; } - if (!data_sinfo) { - btrfs_set_inode_space_info(root, inode); - data_sinfo = BTRFS_I(inode)->space_info; - } + if (!data_sinfo) + data_sinfo = fs_info->data_sinfo; + goto again; } /* - * If we have less pinned bytes than we want to allocate then - * don't bother committing the transaction, it won't help us. + * If we don't have enough pinned space to deal with this + * allocation don't bother committing the transaction. */ - if (data_sinfo->bytes_pinned < bytes) + if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, + bytes) < 0) committed = 1; spin_unlock(&data_sinfo->lock); @@ -3299,6 +3765,7 @@ commit_trans: if (!committed && !atomic_read(&root->fs_info->open_ioctl_trans)) { committed = 1; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3308,11 +3775,14 @@ commit_trans: goto again; } + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 1); + data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3327,13 +3797,14 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) struct btrfs_space_info *data_sinfo; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); - data_sinfo = BTRFS_I(inode)->space_info; + data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); + WARN_ON(data_sinfo->bytes_may_use < bytes); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 0); + data_sinfo->flags, bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3350,9 +3821,13 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global->size << 1); +} + static int should_alloc_chunk(struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 alloc_bytes, - int force) + struct btrfs_space_info *sinfo, int force) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; @@ -3367,7 +3842,8 @@ static int should_alloc_chunk(struct btrfs_root *root, * and purposes it's used space. Don't worry about locking the * global_rsv, it doesn't change except when the transaction commits. */ - num_allocated += global_rsv->size; + if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) + num_allocated += calc_global_rsv_need_space(global_rsv); /* * in limited mode, we want to have some free space up to @@ -3381,48 +3857,92 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - - /* 256MB or 2% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - /* system chunks need a much small threshold */ - if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) - thresh = 32 * 1024 * 1024; - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) + if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) return 0; return 1; } +static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +{ + u64 num_dev; + + if (type & (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) + num_dev = root->fs_info->fs_devices->rw_devices; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + num_dev = 2; + else + num_dev = 1; /* DUP or single */ + + /* metadata for updaing devices and chunk tree */ + return btrfs_calc_trans_metadata_size(root, num_dev + 1); +} + +static void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + struct btrfs_space_info *info; + u64 left; + u64 thresh; + + info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly; + spin_unlock(&info->lock); + + thresh = get_system_chunk_thresh(root, type); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", + left, thresh, type); + dump_space_info(info, 0, 0); + } + + if (left < thresh) { + u64 flags; + + flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); + btrfs_alloc_chunk(trans, root, flags); + } +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force) + struct btrfs_root *extent_root, u64 flags, int force) { struct btrfs_space_info *space_info; struct btrfs_fs_info *fs_info = extent_root->fs_info; int wait_for_alloc = 0; int ret = 0; - BUG_ON(!profile_is_valid(flags, 0)); + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, 0, 0, &space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } - BUG_ON(!space_info); + BUG_ON(!space_info); /* Logic error */ again: spin_lock(&space_info->lock); if (force < space_info->force_alloc) force = space_info->force_alloc; if (space_info->full) { + if (should_alloc_chunk(extent_root, space_info, force)) + ret = -ENOSPC; + else + ret = 0; spin_unlock(&space_info->lock); - return 0; + return ret; } - if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { + if (!should_alloc_chunk(extent_root, space_info, force)) { spin_unlock(&space_info->lock); return 0; } else if (space_info->chunk_alloc) { @@ -3447,6 +3967,8 @@ again: goto again; } + trans->allocating_chunk = true; + /* * If we have mixed data/metadata chunks we want to make sure we keep * allocating mixed chunks instead of individual chunks. @@ -3466,110 +3988,203 @@ again: force_metadata_allocation(fs_info); } + /* + * Check if we have enough space in SYSTEM chunk because we may need + * to update devices. + */ + check_system_chunk(trans, extent_root, flags); + ret = btrfs_alloc_chunk(trans, extent_root, flags); - if (ret < 0 && ret != -ENOSPC) - goto out; + trans->allocating_chunk = false; spin_lock(&space_info->lock); + if (ret < 0 && ret != -ENOSPC) + goto out; if (ret) space_info->full = 1; else ret = 1; space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); -out: - mutex_unlock(&extent_root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } +static int can_overcommit(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 space_size; + u64 avail; + u64 used; + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly; + + /* + * We only want to allow over committing if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. + */ + spin_lock(&global_rsv->lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(&global_rsv->lock); + if (used + space_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + spin_unlock(&root->fs_info->free_chunk_lock); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + avail >>= 3; + else + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} + +static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, + unsigned long nr_pages, int nr_items) +{ + struct super_block *sb = root->fs_info->sb; + + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); + if (!current->journal_info) + btrfs_wait_ordered_roots(root->fs_info, nr_items); + } +} + +static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) +{ + u64 bytes; + int nr; + + bytes = btrfs_calc_trans_metadata_size(root, 1); + nr = (int)div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM (256 * 1024) + /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; - u64 reserved; + u64 delalloc_bytes; u64 max_reclaim; - u64 reclaimed = 0; long time_left; - unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; - int loops = 0; - unsigned long progress; + unsigned long nr_pages; + int loops; + int items; + enum btrfs_reserve_flush_enum flush; + + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(root, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; - smp_mb(); - reserved = space_info->bytes_may_use; - progress = space_info->reservation_progress; - - if (reserved == 0) - return 0; - - smp_mb(); - if (root->fs_info->delalloc_bytes == 0) { + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); + if (delalloc_bytes == 0) { if (trans) - return 0; - btrfs_wait_ordered_extents(root, 0, 0); - return 0; + return; + if (wait_ordered) + btrfs_wait_ordered_roots(root->fs_info, items); + return; } - max_reclaim = min(reserved, to_reclaim); - nr_pages = max_t(unsigned long, nr_pages, - max_reclaim >> PAGE_CACHE_SHIFT); - while (loops < 1024) { - /* have the flusher threads jump in and do some IO */ - smp_mb(); - nr_pages = min_t(unsigned long, nr_pages, - root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, - WB_REASON_FS_FREE_SPACE); + loops = 0; + while (delalloc_bytes && loops < 3) { + max_reclaim = min(delalloc_bytes, to_reclaim); + nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; + btrfs_writeback_inodes_sb_nr(root, nr_pages, items); + /* + * We need to wait for the async pages to actually start before + * we do anything. + */ + max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); + if (!max_reclaim) + goto skip_async; + if (max_reclaim <= nr_pages) + max_reclaim = 0; + else + max_reclaim -= nr_pages; + + wait_event(root->fs_info->async_submit_wait, + atomic_read(&root->fs_info->async_delalloc_pages) <= + (int)max_reclaim); +skip_async: + if (!trans) + flush = BTRFS_RESERVE_FLUSH_ALL; + else + flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (reserved > space_info->bytes_may_use) - reclaimed += reserved - space_info->bytes_may_use; - reserved = space_info->bytes_may_use; + if (can_overcommit(root, space_info, orig, flush)) { + spin_unlock(&space_info->lock); + break; + } spin_unlock(&space_info->lock); loops++; - - if (reserved == 0 || reclaimed >= max_reclaim) - break; - - if (trans && trans->transaction->blocked) - return -EAGAIN; - if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_roots(root->fs_info, items); } else { - time_left = schedule_timeout_interruptible(1); - - /* We were interrupted, exit */ + time_left = schedule_timeout_killable(1); if (time_left) break; } - - /* we've kicked the IO a few times, if anything has been freed, - * exit. There is no sense in looping here for a long time - * when we really need to commit the transaction, or there are - * just too many writers without enough free space - */ - - if (loops > 3) { - smp_mb(); - if (progress != space_info->reservation_progress) - break; - } - + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); } - - return reclaimed >= to_reclaim; } /** @@ -3597,12 +4212,9 @@ static int may_commit_transaction(struct btrfs_root *root, goto commit; /* See if there is enough pinned space to make this reservation */ - spin_lock(&space_info->lock); - if (space_info->bytes_pinned >= bytes) { - spin_unlock(&space_info->lock); + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes) >= 0) goto commit; - } - spin_unlock(&space_info->lock); /* * See if there is some space in the delayed insertion reservation for @@ -3612,7 +4224,8 @@ static int may_commit_transaction(struct btrfs_root *root, return -ENOSPC; spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size < bytes) { + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes - delayed_rsv->size) >= 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } @@ -3626,12 +4239,171 @@ commit: return btrfs_commit_transaction(trans, root); } +enum flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELALLOC = 3, + FLUSH_DELALLOC_WAIT = 4, + ALLOC_CHUNK = 5, + COMMIT_TRANS = 6, +}; + +static int flush_space(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 num_bytes, + u64 orig_bytes, int state) +{ + struct btrfs_trans_handle *trans; + int nr; + int ret = 0; + + switch (state) { + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(root, num_bytes) * 2; + else + nr = -1; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_run_delayed_items_nr(trans, root, nr); + btrfs_end_transaction(trans, root); + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(root, num_bytes * 2, orig_bytes, + state == FLUSH_DELALLOC_WAIT); + break; + case ALLOC_CHUNK: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret == -ENOSPC) + ret = 0; + break; + case COMMIT_TRANS: + ret = may_commit_transaction(root, space_info, orig_bytes, 0); + break; + default: + ret = -ENOSPC; + break; + } + + return ret; +} + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, + struct btrfs_space_info *space_info) +{ + u64 used; + u64 expected; + u64 to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, + 16 * 1024 * 1024); + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) { + to_reclaim = 0; + goto out; + } + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (can_overcommit(root, space_info, 1024 * 1024, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); +out: + spin_unlock(&space_info->lock); + + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, u64 used) +{ + return (used >= div_factor_fine(space_info->total_bytes, 98) && + !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info) +{ + u64 used; + + spin_lock(&space_info->lock); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (need_do_async_reclaim(space_info, fs_info, used)) { + spin_unlock(&space_info->lock); + return 1; + } + spin_unlock(&space_info->lock); + + return 0; +} + +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) + return; + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + flush_state++; + if (!btrfs_need_do_async_reclaim(space_info, fs_info)) + return; + } while (flush_state <= COMMIT_TRANS); + + if (btrfs_need_do_async_reclaim(space_info, fs_info)) + queue_work(system_unbound_wq, work); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want - * @flush - wether or not we can flush to make our reservation + * @flush - whether or not we can flush to make our reservation * * This will reserve orgi_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -3642,25 +4414,25 @@ commit: */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *space_info = block_rsv->space_info; u64 used; u64 num_bytes = orig_bytes; - int retries = 0; + int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; - bool committed = false; bool flushing = false; - bool wait_ordered = false; again: ret = 0; spin_lock(&space_info->lock); /* - * We only want to wait if somebody other than us is flushing and we are - * actually alloed to flush. + * We only want to wait if somebody other than us is flushing and we + * are actually allowed to flush all things. */ - while (flush && !flushing && space_info->flush) { + while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && + space_info->flush) { spin_unlock(&space_info->lock); /* * If we have a trans handle we can't wait because the flusher @@ -3670,9 +4442,8 @@ again: */ if (current->journal_info) return -EAGAIN; - ret = wait_event_interruptible(space_info->wait, - !space_info->flush); - /* Must have been interrupted, return */ + ret = wait_event_killable(space_info->wait, !space_info->flush); + /* Must have been killed, return */ if (ret) return -EINTR; @@ -3695,9 +4466,7 @@ again: if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); + "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else { /* @@ -3713,115 +4482,77 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ - wait_ordered = true; num_bytes = used - space_info->total_bytes + - (orig_bytes * (retries + 1)); + (orig_bytes * 2); } - if (ret) { - u64 profile = btrfs_get_alloc_profile(root, 0); - u64 avail; - - /* - * If we have a lot of space that's pinned, don't bother doing - * the overcommit dance yet and just commit the transaction. - */ - avail = (space_info->total_bytes - space_info->bytes_used) * 8; - do_div(avail, 10); - if (space_info->bytes_pinned >= avail && flush && !committed) { - space_info->flush = 1; - flushing = true; - spin_unlock(&space_info->lock); - ret = may_commit_transaction(root, space_info, - orig_bytes, 1); - if (ret) - goto out; - committed = true; - goto again; - } - - spin_lock(&root->fs_info->free_chunk_lock); - avail = root->fs_info->free_chunk_space; - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. - */ - if (profile & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - avail >>= 1; - - /* - * If we aren't flushing don't let us overcommit too much, say - * 1/8th of the space. If we can flush, let it overcommit up to - * 1/2 of the space. - */ - if (flush) - avail >>= 3; - else - avail >>= 1; - spin_unlock(&root->fs_info->free_chunk_lock); - - if (used + num_bytes < space_info->total_bytes + avail) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); - ret = 0; - } else { - wait_ordered = true; - } + if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; } /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else * stealing it from us. + * + * We make the other tasks wait for the flush only when we can flush + * all things. */ - if (ret && flush) { + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + if (need_do_async_reclaim(space_info, root->fs_info, used) && + !work_busy(&root->fs_info->async_reclaim_work)) + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); } - spin_unlock(&space_info->lock); - if (!ret || !flush) - goto out; - - /* - * We do synchronous shrinking since we don't actually unreserve - * metadata until after the IO is completed. - */ - ret = shrink_delalloc(root, num_bytes, wait_ordered); - if (ret < 0) + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) goto out; - ret = 0; + ret = flush_space(root, space_info, num_bytes, orig_bytes, + flush_state); + flush_state++; /* - * So if we were overcommitted it's possible that somebody else flushed - * out enough space and we simply didn't have enough space to reclaim, - * so go back around and try again. + * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock + * would happen. So skip delalloc flush. */ - if (retries < 2) { - wait_ordered = true; - retries++; - goto again; - } + if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT)) + flush_state = ALLOC_CHUNK; - ret = -ENOSPC; - if (committed) - goto out; - - ret = may_commit_transaction(root, space_info, orig_bytes, 0); - if (!ret) { - committed = true; + if (!ret) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + flush_state < COMMIT_TRANS) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_ALL && + flush_state <= COMMIT_TRANS) goto again; - } out: + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + struct btrfs_block_rsv *global_rsv = + &root->fs_info->global_block_rsv; + + if (block_rsv != global_rsv && + !block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } + if (ret == -ENOSPC) + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + space_info->flags, orig_bytes, 1); if (flushing) { spin_lock(&space_info->lock); space_info->flush = 0; @@ -3831,12 +4562,19 @@ out: return ret; } -static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) { struct btrfs_block_rsv *block_rsv = NULL; - if (root->ref_cows || root == root->fs_info->csum_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + block_rsv = trans->block_rsv; + + if (root == root->fs_info->csum_root && trans->adding_csums) + block_rsv = trans->block_rsv; + + if (root == root->fs_info->uuid_root) block_rsv = trans->block_rsv; if (!block_rsv) @@ -3875,6 +4613,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); + return -ENOSPC; + } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + block_rsv_add_bytes(dest, num_bytes, 1); + return 0; +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -3913,9 +4676,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)space_info, - num_bytes, 0); - space_info->reservation_progress++; + space_info->flags, num_bytes, 0); spin_unlock(&space_info->lock); } } @@ -3934,13 +4695,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, return 0; } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); + rsv->type = type; } -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type) { struct btrfs_block_rsv *block_rsv; struct btrfs_fs_info *fs_info = root->fs_info; @@ -3949,7 +4712,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) if (!block_rsv) return NULL; - btrfs_init_block_rsv(block_rsv); + btrfs_init_block_rsv(block_rsv, type); block_rsv->space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); return block_rsv; @@ -3958,13 +4721,15 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { + if (!rsv) + return; btrfs_block_rsv_release(root, rsv, (u64)-1); kfree(rsv); } -static inline int __block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int flush) +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) { int ret; @@ -3980,20 +4745,6 @@ static inline int __block_rsv_add(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 1); -} - -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 0); -} - int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor) { @@ -4012,9 +4763,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return ret; } -static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int flush) +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -4042,20 +4793,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); -} - -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); -} - int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes) @@ -4068,7 +4805,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root, u64 num_bytes) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - if (global_rsv->full || global_rsv == block_rsv || + if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, @@ -4118,10 +4855,10 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = calc_global_metadata_size(fs_info); - spin_lock(&block_rsv->lock); spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); - block_rsv->size = num_bytes; + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -4132,21 +4869,20 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 1); + sinfo->flags, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 0); - sinfo->reservation_progress++; + sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } - spin_unlock(&sinfo->lock); spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); } static void init_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -4167,6 +4903,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; update_global_block_rsv(fs_info); @@ -4189,15 +4927,19 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + if (!trans->block_rsv) + return; + if (!trans->bytes_reserved) return; - trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, - trans->bytes_reserved, 0); + trace_btrfs_space_reservation(root->fs_info, "transaction", + trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } +/* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -4225,19 +4967,66 @@ void btrfs_orphan_release_metadata(struct inode *inode) btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int items, + u64 *qgroup_reserved, + bool use_global_rsv) { - struct btrfs_root *root = pending->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); - struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; - /* - * two for root back/forward refs, two for directory entries - * and one for root of the snapshot. - */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); - dst_rsv->space_info = src_rsv->space_info; - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + u64 num_bytes; + int ret; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + + if (root->fs_info->quota_enabled) { + /* One for parent inode, two for dir entries */ + num_bytes = 3 * root->leafsize; + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret) + return ret; + } else { + num_bytes = 0; + } + + *qgroup_reserved = num_bytes; + + num_bytes = btrfs_calc_trans_metadata_size(root, items); + rsv->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + + if (ret) { + if (*qgroup_reserved) + btrfs_qgroup_free(root, *qgroup_reserved); + } + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved) +{ + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); } /** @@ -4258,10 +5047,9 @@ static unsigned drop_outstanding_extent(struct inode *inode) BTRFS_I(inode)->outstanding_extents--; if (BTRFS_I(inode)->outstanding_extents == 0 && - BTRFS_I(inode)->delalloc_meta_reserved) { + test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) drop_inode_space = 1; - BTRFS_I(inode)->delalloc_meta_reserved = 0; - } /* * If we have more or the same amount of outsanding extents than we have @@ -4343,17 +5131,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) u64 csum_bytes; unsigned nr_extents = 0; int extra_reserve = 0; - int flush = 1; - int ret; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; + int ret = 0; + bool delalloc_lock = true; + u64 to_free = 0; + unsigned dropped; - /* Need to be holding the i_mutex here if we aren't free space cache */ - if (btrfs_is_free_space_inode(root, inode)) - flush = 0; + /* If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + */ + if (btrfs_is_free_space_inode(inode)) { + flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } - if (flush && btrfs_transaction_in_commit(root->fs_info)) + if (flush != BTRFS_RESERVE_NO_FLUSH && + btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4368,7 +5168,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) * Add an item to reserve for updating the inode when we complete the * delalloc io. */ - if (!BTRFS_I(inode)->delalloc_meta_reserved) { + if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { nr_extents++; extra_reserve = 1; } @@ -4378,55 +5179,102 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); - if (ret) { - u64 to_free = 0; - unsigned dropped; - - spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. - */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) - calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(root, dropped); + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, num_bytes + + nr_extents * root->leafsize); + if (ret) + goto out_fail; + } - if (to_free) { - btrfs_block_rsv_release(root, block_rsv, to_free); - trace_btrfs_space_reservation(root->fs_info, - "delalloc", - btrfs_ino(inode), - to_free, 0); - } - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + if (unlikely(ret)) { + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, num_bytes + + nr_extents * root->leafsize); + goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); if (extra_reserve) { - BTRFS_I(inode)->delalloc_meta_reserved = 1; + set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); nr_extents--; } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) - trace_btrfs_space_reservation(root->fs_info,"delalloc", + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; + +out_fail: + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); + /* + * If the inodes csum_bytes is the same as the original + * csum_bytes then we know we haven't raced with any free()ers + * so we can just reduce our inodes csum bytes and carry on. + */ + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { + calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; + } + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + if (to_free) { + btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); + } + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + return ret; } /** @@ -4448,13 +5296,19 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + if (num_bytes) + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + dropped * root->leafsize); + } + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } @@ -4510,8 +5364,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; @@ -4522,19 +5375,19 @@ static int update_block_group(struct btrfs_trans_handle *trans, int factor; /* block accounting for super block */ - spin_lock(&info->delalloc_lock); + spin_lock(&info->delalloc_root_lock); old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; btrfs_set_super_bytes_used(info->super_copy, old_val); - spin_unlock(&info->delalloc_lock); + spin_unlock(&info->delalloc_root_lock); while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) - return -1; + return -ENOENT; if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -4548,7 +5401,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, trans, NULL, 1); + cache_block_group(cache, 1); byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -4598,6 +5451,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) struct btrfs_block_group_cache *cache; u64 bytenr; + spin_lock(&root->fs_info->block_group_cache_lock); + bytenr = root->fs_info->first_logical_byte; + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (bytenr < (u64)-1) + return bytenr; + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); if (!cache) return 0; @@ -4625,6 +5485,8 @@ static int pin_down_extent(struct btrfs_root *root, set_extent_dirty(root->fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + if (reserved) + trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); return 0; } @@ -4637,7 +5499,7 @@ int btrfs_pin_extent(struct btrfs_root *root, struct btrfs_block_group_cache *cache; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ pin_down_extent(root, cache, bytenr, num_bytes, reserved); @@ -4648,14 +5510,15 @@ int btrfs_pin_extent(struct btrfs_root *root, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { struct btrfs_block_group_cache *cache; + int ret; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); + if (!cache) + return -EINVAL; /* * pull in the free space cache (if any) so that our pin @@ -4663,13 +5526,87 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, * to one because the slow code to read in the free extents does check * the pinned extents. */ - cache_block_group(cache, trans, root, 1); + cache_block_group(cache, 1); pin_down_extent(root, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ - btrfs_remove_free_space(cache, bytenr, num_bytes); + ret = btrfs_remove_free_space(cache, bytenr, num_bytes); btrfs_put_block_group(cache); + return ret; +} + +static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + + block_group = btrfs_lookup_block_group(root->fs_info, start); + if (!block_group) + return -EINVAL; + + cache_block_group(block_group, 0); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + /* Logic error */ + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + if (ret) + goto out_lock; + + num_bytes = (start + num_bytes) - + caching_ctl->progress; + start = caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + } +out_lock: + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + btrfs_put_block_group(block_group); + return ret; +} + +int btrfs_exclude_logged_extents(struct btrfs_root *log, + struct extent_buffer *eb) +{ + struct btrfs_file_extent_item *item; + struct btrfs_key key; + int found_type; + int i; + + if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) + return 0; + + for (i = 0; i < btrfs_header_nritems(eb); i++) { + btrfs_item_key_to_cpu(eb, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + if (found_type == BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + continue; + key.objectid = btrfs_file_extent_disk_bytenr(eb, item); + key.offset = btrfs_file_extent_disk_num_bytes(eb, item); + __exclude_logged_extent(log, key.objectid, key.offset); + } + return 0; } @@ -4678,6 +5615,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @reserve: One of the reservation enums + * @delalloc: The blocks are allocated for the delalloc write * * This is called by the allocator when it reserves space, or by somebody who is * freeing space that was never actually used on disk. For example if you @@ -4696,10 +5634,11 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, * succeeds. */ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) + u64 num_bytes, int reserve, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; + spin_lock(&space_info->lock); spin_lock(&cache->lock); if (reserve != RESERVE_FREE) { @@ -4710,25 +5649,29 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { trace_btrfs_space_reservation(cache->fs_info, - "space_info", - (u64)space_info, - num_bytes, 0); + "space_info", space_info->flags, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } + + if (delalloc) + cache->delalloc_bytes += num_bytes; } } else { if (cache->ro) space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; } -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4736,7 +5679,7 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { @@ -4755,25 +5698,28 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, else fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); update_global_block_rsv(fs_info); - return 0; } static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 len; + bool readonly; while (start <= end) { + readonly = false; if (!cache || start >= cache->key.objectid + cache->key.offset) { if (cache) btrfs_put_block_group(cache); cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ } len = cache->key.objectid + cache->key.offset - start; @@ -4785,15 +5731,31 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) } start += len; + space_info = cache->space_info; - spin_lock(&cache->space_info->lock); + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - cache->space_info->bytes_pinned -= len; - if (cache->ro) - cache->space_info->bytes_readonly += len; + space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); + if (cache->ro) { + space_info->bytes_readonly += len; + readonly = true; + } spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + if (!readonly && global_rsv->space_info == space_info) { + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + len = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += len; + space_info->bytes_may_use += len; + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + } + spin_unlock(&global_rsv->lock); + } + spin_unlock(&space_info->lock); } if (cache) @@ -4810,6 +5772,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; + if (trans->aborted) + return 0; + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else @@ -4817,7 +5782,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) break; @@ -4833,12 +5798,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return 0; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + BUG_ON(!space_info); /* Logic bug */ + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int no_quota) { struct btrfs_key key; struct btrfs_path *path; @@ -4854,6 +5841,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int num_to_del = 1; u32 item_size; u64 refs; + int last_ref = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + if (!info->quota_enabled || !is_fstree(root_objectid)) + no_quota = 1; path = btrfs_alloc_path(); if (!path) @@ -4865,6 +5859,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; BUG_ON(!is_data && refs_to_drop != 1); + if (is_data) + skinny_metadata = 0; + ret = lookup_extent_backref(trans, extent_root, path, &iref, bytenr, num_bytes, parent, root_objectid, owner_objectid, @@ -4881,6 +5878,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.offset == owner_objectid) { + found_extent = 1; + break; + } if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -4894,8 +5896,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, - is_data); - BUG_ON(ret); + is_data, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); path->leave_spinning = 1; @@ -4903,29 +5908,61 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; + if (!is_data && skinny_metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner_objectid; + } + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + if (ret > 0 && skinny_metadata && path->slots[0]) { + /* + * Couldn't find our skinny metadata item, + * see if we have ye olde extent item. + */ + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + } + if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, bytenr); if (ret > 0) btrfs_print_leaf(extent_root, path->nodes[0]); } - BUG_ON(ret); + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } extent_slot = path->slots[0]; } - } else { + } else if (WARN_ON(ret == -ENOENT)) { btrfs_print_leaf(extent_root, path->nodes[0]); - WARN_ON(1); - printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu owner %llu offset %llu\n", - (unsigned long long)bytenr, - (unsigned long long)parent, - (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); + btrfs_err(info, + "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", + bytenr, parent, root_objectid, owner_objectid, + owner_offset); + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } else { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } leaf = path->nodes[0]; @@ -4935,7 +5972,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(found_extent || extent_slot != path->slots[0]); ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); path->leave_spinning = 1; @@ -4947,12 +5987,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } - BUG_ON(ret); + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + extent_slot = path->slots[0]; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -4961,7 +6004,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && + key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); bi = (struct btrfs_tree_block_info *)(ei + 1); @@ -4969,10 +6013,17 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs < refs_to_drop); + if (refs < refs_to_drop) { + btrfs_err(info, "trying to drop %d refs but we only have %Lu " + "for bytenr %Lu", refs_to_drop, refs, bytenr); + ret = -EINVAL; + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } refs -= refs_to_drop; if (refs > 0) { + type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -4988,9 +6039,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, - is_data); - BUG_ON(ret); + is_data, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } + add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, + root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -5004,23 +6060,44 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); - BUG_ON(ret); - } else { - invalidate_mapping_pages(info->btree_inode->i_mapping, - bytenr >> PAGE_CACHE_SHIFT, - (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } - ret = update_block_group(trans, root, bytenr, num_bytes, 0); - BUG_ON(ret); + ret = update_block_group(root, bytenr, num_bytes, 0); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } + btrfs_release_path(path); + + /* Deal with the quota accounting */ + if (!ret && last_ref && !no_quota) { + int mod_seq = 0; + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && + type == BTRFS_QGROUP_OPER_SUB_SHARED) + mod_seq = 1; + + ret = btrfs_qgroup_record_ref(trans, info, root_objectid, + bytenr, num_bytes, type, + mod_seq); + } +out: btrfs_free_path(path); return ret; } @@ -5036,30 +6113,22 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); if (!head) - goto out; - - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + goto out_delayed_unlock; - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) + spin_lock(&head->lock); + if (rb_first(&head->ref_root)) goto out; if (head->extent_op) { if (!head->must_insert_reserved) goto out; - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); head->extent_op = NULL; } @@ -5075,21 +6144,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * ahead and process it. */ head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); + rb_erase(&head->href_node, &delayed_refs->href_root); - delayed_refs->num_entries--; - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + atomic_dec(&delayed_refs->num_entries); /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. */ delayed_refs->num_heads--; - if (list_empty(&head->cluster)) + if (head->processing == 0) delayed_refs->num_heads_ready--; - - list_del_init(&head->cluster); + head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); BUG_ON(head->extent_op); @@ -5100,6 +6167,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return ret; out: + spin_unlock(&head->lock); + +out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } @@ -5107,9 +6177,10 @@ out: void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow) + u64 parent, int last_ref) { struct btrfs_block_group_cache *cache = NULL; + int pin = 1; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -5117,8 +6188,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); + BTRFS_DROP_DELAYED_REF, NULL, 0); + BUG_ON(ret); /* -ENOMEM */ } if (!last_ref) @@ -5141,9 +6212,16 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + trace_btrfs_reserved_extent_free(root, buf->start, buf->len); + pin = 0; } out: + if (pin) + add_pinned_bytes(root->fs_info, buf->len, + btrfs_header_level(buf), + root->root_key.objectid); + /* * Deleting the buffer, clear the corrupt flag since it doesn't matter * anymore. @@ -5152,13 +6230,20 @@ out: btrfs_put_block_group(cache); } +/* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow) + u64 owner, u64 offset, int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); + /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. @@ -5172,23 +6257,22 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); + BTRFS_DROP_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, BTRFS_DROP_DELAYED_REF, - NULL, for_cow); - BUG_ON(ret); + NULL, no_quota); } return ret; } -static u64 stripe_align(struct btrfs_root *root, u64 val) +static u64 stripe_align(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 val, u64 num_bytes) { - u64 mask = ((u64)root->stripesize - 1); - u64 ret = (val + mask) & ~mask; + u64 ret = ALIGN(val, root->stripesize); return ret; } @@ -5202,111 +6286,198 @@ static u64 stripe_align(struct btrfs_root *root, u64 val) * for our min num_bytes. Another option is to have it go ahead * and look in the rbtree for a free extent of a given size, but this * is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group. */ -static noinline int +static noinline void wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return; wait_event(caching_ctl->wait, block_group_cache_done(cache) || (cache->free_space_ctl->free_space >= num_bytes)); put_caching_control(caching_ctl); - return 0; } static noinline int wait_block_group_cache_done(struct btrfs_block_group_cache *cache) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); + int ret = 0; caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; wait_event(caching_ctl->wait, block_group_cache_done(cache)); - + if (cache->cached == BTRFS_CACHE_ERROR) + ret = -EIO; put_caching_control(caching_ctl); - return 0; + return ret; } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +int __get_raid_index(u64 flags) { - int index; - if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) - index = 0; - else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) - index = 1; - else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) - index = 2; - else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) - index = 3; - else - index = 4; - return index; + if (flags & BTRFS_BLOCK_GROUP_RAID10) + return BTRFS_RAID_RAID10; + else if (flags & BTRFS_BLOCK_GROUP_RAID1) + return BTRFS_RAID_RAID1; + else if (flags & BTRFS_BLOCK_GROUP_DUP) + return BTRFS_RAID_DUP; + else if (flags & BTRFS_BLOCK_GROUP_RAID0) + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; + + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ +} + +int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + return __get_raid_index(cache->flags); +} + +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = "raid10", + [BTRFS_RAID_RAID1] = "raid1", + [BTRFS_RAID_DUP] = "dup", + [BTRFS_RAID_RAID0] = "raid0", + [BTRFS_RAID_SINGLE] = "single", + [BTRFS_RAID_RAID5] = "raid5", + [BTRFS_RAID_RAID6] = "raid6", +}; + +static const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_type_names[type]; } enum btrfs_loop_type { - LOOP_FIND_IDEAL = 0, - LOOP_CACHING_NOWAIT = 1, - LOOP_CACHING_WAIT = 2, - LOOP_ALLOC_CHUNK = 3, - LOOP_NO_EMPTY_SIZE = 4, + LOOP_CACHING_NOWAIT = 0, + LOOP_CACHING_WAIT = 1, + LOOP_ALLOC_CHUNK = 2, + LOOP_NO_EMPTY_SIZE = 3, }; +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + down_read(&cache->data_rwsem); +} + +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + btrfs_get_block_group(cache); + if (delalloc) + down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + int delalloc) +{ + struct btrfs_block_group_cache *used_bg; + bool locked = false; +again: + spin_lock(&cluster->refill_lock); + if (locked) { + if (used_bg == cluster->block_group) + return used_bg; + + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } + + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) + return used_bg; + + btrfs_get_block_group(used_bg); + + if (!delalloc) + return used_bg; + + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; + + spin_unlock(&cluster->refill_lock); + down_read(&used_bg->data_rwsem); + locked = true; + goto again; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + up_read(&cache->data_rwsem); + btrfs_put_block_group(cache); +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: - * ins->objectid == block start + * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY - * ins->offset == number of blocks + * ins->offset == the size of the hole. * Any available blocks before search_start are skipped. + * + * If there is no suitable free space, we will record the max size of + * the free space extent currently. */ -static noinline int find_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *orig_root, +static noinline int find_free_extent(struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, - u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, - u64 data) + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - struct btrfs_block_group_cache *used_block_group; + u64 search_start = 0; + u64 max_extent_size = 0; int empty_cluster = 2 * 1024 * 1024; - int allowed_chunk_alloc = 0; - int done_chunk_alloc = 0; struct btrfs_space_info *space_info; int loop = 0; - int index = 0; - int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? + int index = __get_raid_index(flags); + int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; - bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; - u64 ideal_cache_percent = 0; - u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(orig_root, num_bytes, empty_size, data); + trace_find_free_extent(orig_root, num_bytes, empty_size, flags); - space_info = __find_space_info(root->fs_info, data); + space_info = __find_space_info(root->fs_info, flags); if (!space_info) { - printk(KERN_ERR "No space info for %llu\n", data); + btrfs_err(root->fs_info, "No space info for %llu", flags); return -ENOSPC; } @@ -5317,16 +6488,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (btrfs_mixed_space_info(space_info)) use_cluster = false; - if (orig_root->ref_cows || empty_size) - allowed_chunk_alloc = 1; - - if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { + if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && + if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && btrfs_test_opt(root, SSD)) { last_ptr = &root->fs_info->data_alloc_cluster; } @@ -5345,10 +6513,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { -ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); - used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -5356,9 +6522,8 @@ ideal_cache: * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ - if (block_group && block_group_bits(block_group, data) && - (block_group->cached != BTRFS_CACHE_NO || - search_start == ideal_cache_offset)) { + if (block_group && block_group_bits(block_group, flags) && + block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -5372,6 +6537,7 @@ ideal_cache: up_read(&space_info->groups_sem); } else { index = get_block_group_index(block_group); + btrfs_lock_block_group(block_group, delalloc); goto have_block_group; } } else if (block_group) { @@ -5386,8 +6552,7 @@ search: u64 offset; int cached; - used_block_group = block_group; - btrfs_get_block_group(block_group); + btrfs_grab_block_group(block_group, delalloc); search_start = block_group->key.objectid; /* @@ -5395,9 +6560,11 @@ search: * raid types, but we want to make sure we only allocate * for the proper type. */ - if (!block_group_bits(block_group, data)) { + if (!block_group_bits(block_group, flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10; /* @@ -5405,51 +6572,20 @@ search: * doesn't provide them, bail. This does allow us to * fill raid0 from raid1. */ - if ((data & extra) && !(block_group->flags & extra)) + if ((flags & extra) && !(block_group->flags & extra)) goto loop; } have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - u64 free_percent; - - found_uncached_bg = true; - ret = cache_block_group(block_group, trans, - orig_root, 1); - if (block_group->cached == BTRFS_CACHE_FINISHED) - goto alloc; - - free_percent = btrfs_block_group_used(&block_group->item); - free_percent *= 100; - free_percent = div64_u64(free_percent, - block_group->key.offset); - free_percent = 100 - free_percent; - if (free_percent > ideal_cache_percent && - likely(!block_group->ro)) { - ideal_cache_offset = block_group->key.objectid; - ideal_cache_percent = free_percent; - } - - /* - * The caching workers are limited to 2 threads, so we - * can queue as much work as we care to. - */ - if (loop > LOOP_FIND_IDEAL) { - ret = cache_block_group(block_group, trans, - orig_root, 0); - BUG_ON(ret); - } - - /* - * If loop is set for cached only, try the next block - * group. - */ - if (loop == LOOP_FIND_IDEAL) - goto loop; + ret = cache_block_group(block_group, 0); + BUG_ON(ret < 0); + ret = 0; } -alloc: + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + goto loop; if (unlikely(block_group->ro)) goto loop; @@ -5458,40 +6594,44 @@ alloc: * lets look there */ if (last_ptr) { + struct btrfs_block_group_cache *used_block_group; + unsigned long aligned_cluster; /* * the refill lock keeps out other * people trying to start a new cluster */ - spin_lock(&last_ptr->refill_lock); - used_block_group = last_ptr->block_group; - if (used_block_group != block_group && - (!used_block_group || - used_block_group->ro || - !block_group_bits(used_block_group, data))) { - used_block_group = block_group; + used_block_group = btrfs_lock_cluster(block_group, + last_ptr, + delalloc); + if (!used_block_group) goto refill_cluster; - } - if (used_block_group != block_group) - btrfs_get_block_group(used_block_group); + if (used_block_group != block_group && + (used_block_group->ro || + !block_group_bits(used_block_group, flags))) + goto release_cluster; offset = btrfs_alloc_from_cluster(used_block_group, - last_ptr, num_bytes, used_block_group->key.objectid); + last_ptr, + num_bytes, + used_block_group->key.objectid, + &max_extent_size); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, num_bytes); + used_block_group, + search_start, num_bytes); + if (used_block_group != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = used_block_group; + } goto checks; } WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) { - btrfs_put_block_group(used_block_group); - used_block_group = block_group; - } -refill_cluster: - BUG_ON(used_block_group != block_group); +release_cluster: /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -5508,8 +6648,10 @@ refill_cluster: * succeeding in the unclustered * allocation. */ if (loop >= LOOP_NO_EMPTY_SIZE && - last_ptr->block_group != block_group) { + used_block_group != block_group) { spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(used_block_group, + delalloc); goto unclustered_alloc; } @@ -5519,24 +6661,34 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (used_block_group != block_group) + btrfs_release_block_group(used_block_group, + delalloc); +refill_cluster: if (loop >= LOOP_NO_EMPTY_SIZE) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; } + aligned_cluster = max_t(unsigned long, + empty_cluster + empty_size, + block_group->full_stripe_len); + /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, root, - block_group, last_ptr, - search_start, num_bytes, - empty_cluster + empty_size); + ret = btrfs_find_space_cluster(root, block_group, + last_ptr, search_start, + num_bytes, + aligned_cluster); if (ret == 0) { /* * now pull our allocation out of this * cluster */ offset = btrfs_alloc_from_cluster(block_group, - last_ptr, num_bytes, - search_start); + last_ptr, + num_bytes, + search_start, + &max_extent_size); if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); @@ -5571,13 +6723,18 @@ unclustered_alloc: if (cached && block_group->free_space_ctl->free_space < num_bytes + empty_cluster + empty_size) { + if (block_group->free_space_ctl->free_space > + max_extent_size) + max_extent_size = + block_group->free_space_ctl->free_space; spin_unlock(&block_group->free_space_ctl->tree_lock); goto loop; } spin_unlock(&block_group->free_space_ctl->tree_lock); offset = btrfs_find_space_for_alloc(block_group, search_start, - num_bytes, empty_size); + num_bytes, empty_size, + &max_extent_size); /* * If we didn't find a chunk, and we haven't failed on this * block group before, and this block group is in the middle of @@ -5599,29 +6756,25 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, offset); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(used_block_group, offset, num_bytes); - goto loop; - } + search_start = stripe_align(root, block_group, + offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > - used_block_group->key.objectid + used_block_group->key.offset) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, + btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, - alloc_type); + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } @@ -5631,21 +6784,13 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, - search_start - offset); - BUG_ON(offset > search_start); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); } up_read(&space_info->groups_sem); @@ -5655,9 +6800,7 @@ loop: if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; - /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for - * for them to make caching progress. Also - * determine the best possible bg to cache + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching @@ -5667,65 +6810,37 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { - found_uncached_bg = false; - loop++; - if (!ideal_cache_percent) - goto search; - - /* - * 1 of the following 2 things have happened so far - * - * 1) We found an ideal block group for caching that - * is mostly full and will cache quickly, so we might - * as well wait for it. - * - * 2) We searched for cached only and we didn't find - * anything, and we didn't start any caching kthreads - * either, so chances are we will loop through and - * start a couple caching kthreads, and then come back - * around and just wait for them. This will be slower - * because we will have 2 caching kthreads reading at - * the same time when we could have just started one - * and waited for it to get far enough to give us an - * allocation, so go ahead and go to the wait caching - * loop. - */ - loop = LOOP_CACHING_WAIT; - search_start = ideal_cache_offset; - ideal_cache_percent = 0; - goto ideal_cache; - } else if (loop == LOOP_FIND_IDEAL) { - /* - * Didn't find a uncached bg, wait on anything we find - * next. - */ - loop = LOOP_CACHING_WAIT; - goto search; - } - loop++; - if (loop == LOOP_ALLOC_CHUNK) { - if (allowed_chunk_alloc) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, - CHUNK_ALLOC_LIMITED); - allowed_chunk_alloc = 0; - if (ret == 1) - done_chunk_alloc = 1; - } else if (!done_chunk_alloc && - space_info->force_alloc == - CHUNK_ALLOC_NO_FORCE) { - space_info->force_alloc = CHUNK_ALLOC_LIMITED; + struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; } - /* - * We didn't allocate a chunk, go ahead and drop the - * empty size and loop again. - */ - if (!done_chunk_alloc) - loop = LOOP_NO_EMPTY_SIZE; + ret = do_chunk_alloc(trans, root, flags, + CHUNK_ALLOC_FORCE); + /* + * Do not bail out on ENOSPC since we + * can do more things. + */ + if (ret < 0 && ret != -ENOSPC) + btrfs_abort_transaction(trans, + root, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans, root); + if (ret) + goto out; } if (loop == LOOP_NO_EMPTY_SIZE) { @@ -5739,7 +6854,9 @@ loop: } else if (ins->objectid) { ret = 0; } - +out: + if (ret == -ENOSPC) + ins->offset = max_extent_size; return ret; } @@ -5750,20 +6867,16 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", - (unsigned long long)info->flags, - (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved - - info->bytes_readonly), + printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", + info->flags, + info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", - (unsigned long long)info->total_bytes, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_reserved, - (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_readonly); + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); spin_unlock(&info->lock); if (!dump_block_groups) @@ -5773,13 +6886,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used " - "%llu pinned %llu reserved\n", - (unsigned long long)cache->key.objectid, - (unsigned long long)cache->key.offset, - (unsigned long long)btrfs_block_group_used(&cache->item), - (unsigned long long)cache->pinned, - (unsigned long long)cache->reserved); + printk(KERN_INFO "BTRFS: " + "block group %llu has %llu bytes, " + "%llu used %llu pinned %llu reserved %s\n", + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned, + cache->reserved, cache->ro ? "[readonly]" : ""); btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } @@ -5788,69 +6900,54 @@ again: up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) + struct btrfs_key *ins, int is_data, int delalloc) { bool final_tried = false; + u64 flags; int ret; - u64 search_start = 0; - data = btrfs_get_alloc_profile(root, data); + flags = btrfs_get_alloc_profile(root, is_data); again: - /* - * the only place that sets empty_size is btrfs_realloc_node, which - * is not called recursively on allocations - */ - if (empty_size || root->ref_cows) - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, data, - CHUNK_ALLOC_NO_FORCE); - WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, - ins, data); + ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, + flags, delalloc); if (ret == -ENOSPC) { - if (!final_tried) { - num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); + if (!final_tried && ins->offset) { + num_bytes = min(num_bytes >> 1, ins->offset); + num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); - do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, CHUNK_ALLOC_FORCE); if (num_bytes == min_alloc_size) final_tried = true; goto again; } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; - sinfo = __find_space_info(root->fs_info, data); - printk(KERN_ERR "btrfs allocation failed flags %llu, " - "wanted %llu\n", (unsigned long long)data, - (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes, 1); + sinfo = __find_space_info(root->fs_info, flags); + btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", + flags, num_bytes); + if (sinfo) + dump_space_info(sinfo, num_bytes, 1); } } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); - return ret; } static int __btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len, int pin) + u64 start, u64 len, + int pin, int delalloc) { struct btrfs_block_group_cache *cache; int ret = 0; cache = btrfs_lookup_block_group(root->fs_info, start); if (!cache) { - printk(KERN_ERR "Unable to find block group for %llu\n", - (unsigned long long)start); + btrfs_err(root->fs_info, "Unable to find block group for %llu", + start); return -ENOSPC; } @@ -5861,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, pin_down_extent(root, cache, start, len, 1); else { btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); } btrfs_put_block_group(cache); @@ -5871,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, } int btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len) + u64 start, u64 len, int delalloc) { - return __btrfs_free_reserved_extent(root, start, len, 0); + return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); } int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len) { - return __btrfs_free_reserved_extent(root, start, len, 1); + return __btrfs_free_reserved_extent(root, start, len, 1, 0); } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -5911,7 +7008,10 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -5940,13 +7040,20 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + /* Always set parent to 0 here since its exclusive anyway. */ + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, ins->offset, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + + ret = update_block_group(root, ins->objectid, ins->offset, 1); + if (ret) { /* -ENOENT, logic error */ + btrfs_err(fs_info, "update block group failed for %llu %llu", + ins->objectid, ins->offset); BUG(); } + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); return ret; } @@ -5954,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) + int level, struct btrfs_key *ins, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -5963,16 +7071,30 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; - u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); + u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 num_bytes = ins->offset; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + if (!skinny_metadata) + size += sizeof(*block_info); path = btrfs_alloc_path(); - if (!path) + if (!path) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->leafsize); return -ENOMEM; + } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->leafsize); + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -5981,12 +7103,17 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_generation(leaf, extent_item, trans->transid); btrfs_set_extent_flags(leaf, extent_item, flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); - block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - btrfs_set_tree_block_key(leaf, block_info, key); - btrfs_set_tree_block_level(leaf, block_info, level); + if (skinny_metadata) { + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + num_bytes = root->leafsize; + } else { + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + } - iref = (struct btrfs_extent_inline_ref *)(block_info + 1); if (parent > 0) { BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, @@ -6001,13 +7128,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, num_bytes, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + } + + ret = update_block_group(root, ins->objectid, root->leafsize, 1); + if (ret) { /* -ENOENT, logic error */ + btrfs_err(fs_info, "update block group failed for %llu %llu", + ins->objectid, ins->offset); BUG(); } + + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); return ret; } @@ -6039,58 +7175,33 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; - struct btrfs_caching_control *caching_ctl; - u64 start = ins->objectid; - u64 num_bytes = ins->offset; - - block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, trans, NULL, 0); - caching_ctl = get_caching_control(block_group); - if (!caching_ctl) { - BUG_ON(!block_group_cache_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); - } else { - mutex_lock(&caching_ctl->mutex); - - if (start >= caching_ctl->progress) { - ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - BUG_ON(ret); - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - BUG_ON(ret); - - start = caching_ctl->progress; - num_bytes = ins->objectid + ins->offset - - caching_ctl->progress; - ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); - } - - mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); + /* + * Mixed block groups will exclude before processing the log so we only + * need to do the exlude dance if this fs isn't mixed. + */ + if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(root, ins->objectid, ins->offset); + if (ret) + return ret; } + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + if (!block_group) + return -EINVAL; + ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT); - BUG_ON(ret); - btrfs_put_block_group(block_group); + RESERVE_ALLOC_NO_ACCOUNT, 0); + BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); + btrfs_put_block_group(block_group); return ret; } -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - int level) +static struct extent_buffer * +btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u32 blocksize, int level) { struct extent_buffer *buf; @@ -6101,6 +7212,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); @@ -6132,48 +7244,51 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; int ret; + bool global_updated = false; block_rsv = get_block_rsv(trans, root); - if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); - /* - * If we couldn't reserve metadata bytes try and use some from - * the global reserve. - */ - if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - return ERR_PTR(ret); - } else if (ret) { - return ERR_PTR(ret); - } + if (unlikely(block_rsv->size == 0)) + goto try_reserve; +again: + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) return block_rsv; + + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + update_global_block_rsv(root->fs_info); + goto again; } - ret = block_rsv_use_bytes(block_rsv, blocksize); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "BTRFS: block rsv returned %d\n", ret); + } +try_reserve: + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; - if (ret) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - /*DEFAULT_RATELIMIT_BURST*/ 2); - if (__ratelimit(&_rs)) { - printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); - WARN_ON(1); - } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); - if (!ret) { - return block_rsv; - } else if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve if its space type is the same as the global + * reservation. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; } - - return ERR_PTR(-ENOSPC); + return ERR_PTR(ret); } static void unuse_block_rsv(struct btrfs_fs_info *fs_info, @@ -6194,21 +7309,31 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow) + u64 hint, u64 empty_size) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; u64 flags = 0; int ret; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); - +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + blocksize, level); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; + } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); - ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, - empty_size, hint, (u64)-1, &ins, 0); + ret = btrfs_reserve_extent(root, blocksize, blocksize, + empty_size, hint, &ins, 0, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -6216,7 +7341,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); - BUG_ON(IS_ERR(buf)); + BUG_ON(IS_ERR(buf)); /* -ENOMEM */ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -6227,23 +7352,27 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); + extent_op = btrfs_alloc_delayed_extent_op(); + BUG_ON(!extent_op); /* -ENOMEM */ if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; - extent_op->update_key = 1; + if (skinny_metadata) + extent_op->update_key = 0; + else + extent_op->update_key = 1; extent_op->update_flags = 1; extent_op->is_data = 0; + extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, for_cow); - BUG_ON(ret); + extent_op, 0); + BUG_ON(ret); /* -ENOMEM */ } return buf; } @@ -6311,9 +7440,12 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* We don't lock the tree block, it's OK to be racy here */ - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &refs, &flags); - BUG_ON(ret); + ret = btrfs_lookup_extent_info(trans, root, bytenr, + wc->level - 1, 1, &refs, + &flags); + /* We don't care about errors in readahead. */ + if (ret < 0) + continue; BUG_ON(refs == 0); if (wc->stage == DROP_REFERENCE) { @@ -6347,7 +7479,7 @@ reada: } /* - * hepler to process tree block while walking down the tree. + * helper to process tree block while walking down the tree. * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. @@ -6377,10 +7509,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, - eb->start, eb->len, + eb->start, level, 1, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + BUG_ON(ret == -ENOMEM); + if (ret) + return ret; BUG_ON(wc->refs[level] == 0); } @@ -6399,12 +7533,13 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, - eb->len, flag, 0); - BUG_ON(ret); + eb->len, flag, + btrfs_header_level(eb), 0); + BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -6420,7 +7555,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* - * hepler to process tree block pointer. + * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks * reference count of the block pointed to. if the block @@ -6468,16 +7603,25 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!next) return -ENOMEM; + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, + level - 1); reada = 1; } btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, &wc->refs[level - 1], &wc->flags[level - 1]); - BUG_ON(ret); - BUG_ON(wc->refs[level - 1] == 0); + if (ret < 0) { + btrfs_tree_unlock(next); + return ret; + } + + if (unlikely(wc->refs[level - 1] == 0)) { + btrfs_err(root->fs_info, "Missing references."); + BUG(); + } *lookup_info = 0; if (wc->stage == DROP_REFERENCE) { @@ -6505,7 +7649,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, goto skip; } - if (!btrfs_buffer_uptodate(next, generation)) { + if (!btrfs_buffer_uptodate(next, generation, 0)) { btrfs_tree_unlock(next); free_extent_buffer(next); next = NULL; @@ -6516,8 +7660,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, blocksize, generation); - if (!next) + if (!next || !extent_buffer_uptodate(next)) { + free_extent_buffer(next); return -EIO; + } btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -6545,7 +7691,7 @@ skip: ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); free_extent_buffer(next); @@ -6554,7 +7700,7 @@ skip: } /* - * hepler to process tree block while walking up the tree. + * helper to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops * reference count on the block. @@ -6600,13 +7746,18 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, - eb->start, eb->len, + eb->start, level, 1, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + return ret; + } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return 1; } } @@ -6623,7 +7774,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && @@ -6649,7 +7800,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -6731,8 +7882,10 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * reference count by one. if update_ref is true, this function * also make sure backrefs for the shared block and all lower level * blocks are properly updated. + * + * If called with for_reloc == 0, may exit early with -EAGAIN */ -void btrfs_drop_snapshot(struct btrfs_root *root, +int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, int for_reloc) { @@ -6745,6 +7898,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, int err = 0; int ret; int level; + bool root_dropped = false; path = btrfs_alloc_path(); if (!path) { @@ -6760,7 +7914,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root, } trans = btrfs_start_transaction(tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } if (block_rsv) trans->block_rsv = block_rsv; @@ -6785,7 +7942,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, path->lowest_level = 0; if (ret < 0) { err = ret; - goto out_free; + goto out_end_trans; } WARN_ON(ret > 0); @@ -6799,19 +7956,23 @@ void btrfs_drop_snapshot(struct btrfs_root *root, while (1) { btrfs_tree_lock(path->nodes[level]); btrfs_set_lock_blocking(path->nodes[level]); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, path->nodes[level]->start, - path->nodes[level]->len, - &wc->refs[level], + level, 1, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + err = ret; + goto out_end_trans; + } BUG_ON(wc->refs[level] == 0); if (level == root_item->drop_level) break; btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; WARN_ON(wc->refs[level] != 1); level--; } @@ -6826,6 +7987,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { + ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { err = ret; @@ -6852,30 +8014,51 @@ void btrfs_drop_snapshot(struct btrfs_root *root, } BUG_ON(wc->level == 0); - if (btrfs_should_end_transaction(trans, tree_root)) { + if (btrfs_should_end_transaction(trans, tree_root) || + (!for_reloc && btrfs_need_cleaner_sleep(root))) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } btrfs_end_transaction_throttle(trans, tree_root); + if (!for_reloc && btrfs_need_cleaner_sleep(root)) { + pr_debug("BTRFS: drop snapshot early exit\n"); + err = -EAGAIN; + goto out_free; + } + trans = btrfs_start_transaction(tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } if (block_rsv) trans->block_rsv = block_rsv; } } btrfs_release_path(path); - BUG_ON(err); + if (err) + goto out_end_trans; ret = btrfs_del_root(trans, tree_root, &root->root_key); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + goto out_end_trans; + } if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_find_last_root(tree_root, root->root_key.objectid, - NULL, NULL); - BUG_ON(ret < 0); - if (ret > 0) { + ret = btrfs_find_root(tree_root, &root->root_key, path, + NULL, NULL); + if (ret < 0) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } else if (ret > 0) { /* if we fail to delete the orphan item this time * around, it'll get picked up the next time. * @@ -6886,21 +8069,32 @@ void btrfs_drop_snapshot(struct btrfs_root *root, } } - if (root->in_radix) { - btrfs_free_fs_root(tree_root->fs_info, root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + btrfs_drop_and_free_fs_root(tree_root->fs_info, root); } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); - kfree(root); + btrfs_put_fs_root(root); } -out_free: + root_dropped = true; +out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); +out_free: kfree(wc); btrfs_free_path(path); out: - if (err) + /* + * So if we need to stop dropping the snapshot for whatever reason we + * need to make sure to add it back to the dead root list so that we + * keep trying to do the work later. This also cleans up roots if we + * don't have it in the radix (like when we recover after a power fail + * or unmount) so we don't leak memory. + */ + if (!for_reloc && root_dropped == false) + btrfs_add_dead_root(root); + if (err && err != -EAGAIN) btrfs_std_error(root->fs_info, err); - return; + return err; } /* @@ -6977,31 +8171,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { u64 num_devices; - u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - /* pick restriper's target profile and return */ - if (flags & BTRFS_BLOCK_GROUP_DATA && - bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && - bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if (flags & BTRFS_BLOCK_GROUP_METADATA && - bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } + u64 stripped; - if (tgt) { - /* extended -> chunk profile */ - tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return tgt; - } - } + /* + * if restripe for this chunk_type is on pick target profile and + * return, otherwise do the usual balance + */ + stripped = get_restripe_target(root->fs_info, flags); + if (stripped) + return extended_to_chunk(stripped); /* * we add in the count of missing devices because we want @@ -7011,6 +8189,10 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; stripped = flags & ~stripped; @@ -7023,7 +8205,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (flags & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) return stripped | BTRFS_BLOCK_GROUP_DUP; - return flags; } else { /* they already had raid on here, just return */ if (flags & stripped) @@ -7036,9 +8217,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (flags & BTRFS_BLOCK_GROUP_DUP) return stripped | BTRFS_BLOCK_GROUP_RAID1; - /* turn single device chunks into raid0 */ - return stripped | BTRFS_BLOCK_GROUP_RAID0; + /* this is drive concat, leave it alone */ } + return flags; } @@ -7097,18 +8278,22 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) - do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, - CHUNK_ALLOC_FORCE); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + } ret = set_block_group_ro(cache, 0); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + ret = do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; @@ -7122,7 +8307,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 type) { u64 alloc_flags = get_alloc_profile(root, type); - return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + return do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); } @@ -7172,7 +8357,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) spin_lock(&sinfo->lock); - for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) if (!list_empty(&sinfo->block_groups[i])) free_bytes += __btrfs_get_ro_block_group_free_space( &sinfo->block_groups[i]); @@ -7182,7 +8367,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -int btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; @@ -7198,7 +8383,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root, cache->ro = 0; spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); - return 0; } /* @@ -7213,9 +8397,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; + struct btrfs_trans_handle *trans; u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; + u64 target; int index; int full = 0; int ret = 0; @@ -7256,13 +8442,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) /* * ok we don't have enough space, but maybe we have free space on our * devices to allocate new chunks for relocation, so loop through our - * alloc devices and guess if we have enough space. However, if we - * were marked as full, then we know there aren't enough chunks, and we - * can just return. + * alloc devices and guess if we have enough space. if this block + * group is going to be restriped, run checks against the target + * profile instead of the current one. */ ret = -1; - if (full) - goto out; /* * index: @@ -7272,21 +8456,41 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * 3: raid0 * 4: single */ - index = get_block_group_index(block_group); - if (index == 0) { + target = get_restripe_target(root->fs_info, block_group->flags); + if (target) { + index = __get_raid_index(extended_to_chunk(target)); + } else { + /* + * this is just a balance, so if we were marked as full + * we know there is no space for a new chunk + */ + if (full) + goto out; + + index = get_block_group_index(block_group); + } + + if (index == BTRFS_RAID_RAID10) { dev_min = 4; /* Divide by 2 */ min_free >>= 1; - } else if (index == 1) { + } else if (index == BTRFS_RAID_RAID1) { dev_min = 2; - } else if (index == 2) { + } else if (index == BTRFS_RAID_DUP) { /* Multiply by 2 */ min_free <<= 1; - } else if (index == 3) { + } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; do_div(min_free, dev_min); } + /* We need to do this so that we can look at pending chunks */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 dev_offset; @@ -7295,8 +8499,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * check to make sure we can actually find a chunk with enough * space to fit our block group in. */ - if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(device, min_free, + if (device->total_bytes > device->bytes_used + min_free && + !device->is_tgtdev_for_dev_replace) { + ret = find_free_dev_extent(trans, device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -7308,6 +8513,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) } } mutex_unlock(&root->fs_info->chunk_mutex); + btrfs_end_transaction(trans, root); out: btrfs_put_block_group(block_group); return ret; @@ -7390,14 +8596,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - down_write(&info->extent_commit_sem); + down_write(&info->commit_root_sem); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); put_caching_control(caching_ctl); } - up_write(&info->extent_commit_sem); + up_write(&info->commit_root_sem); spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { @@ -7418,7 +8624,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) * We haven't cached this block group, which means we could * possibly have excluded extents on this block group. */ - if (block_group->cached == BTRFS_CACHE_NO) + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) free_excluded_extents(info->extent_root, block_group); btrfs_remove_free_space_cache(block_group); @@ -7438,18 +8645,31 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) release_global_block_rsv(info); - while(!list_empty(&info->space_info)) { + while (!list_empty(&info->space_info)) { + int i; + space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0) { - WARN_ON(1); - dump_space_info(space_info, 0, 0); + if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { + if (WARN_ON(space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0)) { + dump_space_info(space_info, 0, 0); + } } list_del(&space_info->list); - kfree(space_info); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); } return 0; } @@ -7458,10 +8678,71 @@ static void __link_block_group(struct btrfs_space_info *space_info, struct btrfs_block_group_cache *cache) { int index = get_block_group_index(cache); + bool first = false; down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) + first = true; list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); + + if (first) { + struct raid_kobject *rkobj; + int ret; + + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) + goto out_err; + rkobj->raid_type = index; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); + if (ret) { + kobject_put(&rkobj->kobj); + goto out_err; + } + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + + return; +out_err: + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); +} + +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + start); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + btrfs_init_free_space_ctl(cache); + + return cache; } int btrfs_read_block_groups(struct btrfs_root *root) @@ -7499,48 +8780,56 @@ int btrfs_read_block_groups(struct btrfs_root *root) break; if (ret != 0) goto error; + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + + cache = btrfs_create_block_group_cache(root, found_key.objectid, + found_key.offset); if (!cache) { ret = -ENOMEM; goto error; } - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - ret = -ENOMEM; - goto error; - } - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - cache->fs_info = info; - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - if (need_clear) + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ cache->disk_cache_state = BTRFS_DC_CLEAR; + if (btrfs_test_opt(root, SPACE_CACHE)) + cache->dirty = 1; + } read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); - memcpy(&cache->key, &found_key, sizeof(found_key)); + cache->flags = btrfs_block_group_flags(&cache->item); key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(path); - cache->flags = btrfs_block_group_flags(&cache->item); - cache->sectorsize = root->sectorsize; - - btrfs_init_free_space_ctl(cache); /* * We need to exclude the super stripes now so that the space * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. */ - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + btrfs_put_block_group(cache); + goto error; + } /* * check for two cases, either we are full, and therefore @@ -7563,10 +8852,26 @@ int btrfs_read_block_groups(struct btrfs_root *root) free_excluded_extents(root, cache); } + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + goto error; + } + ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), &space_info); - BUG_ON(ret); + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + btrfs_put_block_group(cache); + goto error; + } + cache->space_info = space_info; spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -7574,9 +8879,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); - set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) set_block_group_ro(cache, 1); @@ -7586,15 +8888,21 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (!(get_alloc_profile(root, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_DUP))) continue; /* * avoid allocating from un-mirrored block group if there are * mirrored block groups. */ - list_for_each_entry(cache, &space_info->block_groups[3], list) + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) set_block_group_ro(cache, 1); - list_for_each_entry(cache, &space_info->block_groups[4], list) + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) set_block_group_ro(cache, 1); } @@ -7605,6 +8913,38 @@ error: return ret; } +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, + new_bg_list) { + list_del_init(&block_group->new_bg_list); + + if (ret) + continue; + + spin_lock(&block_group->lock); + memcpy(&item, &block_group->item, sizeof(item)); + memcpy(&key, &block_group->key, sizeof(key)); + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, extent_root, &key, &item, + sizeof(item)); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); + ret = btrfs_finish_chunk_alloc(trans, extent_root, + key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); + } +} + int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, @@ -7616,48 +8956,53 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) return -ENOMEM; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return -ENOMEM; - } - - cache->key.objectid = chunk_offset; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - cache->fs_info = root->fs_info; - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - - btrfs_init_free_space_ctl(cache); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); - cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + btrfs_put_block_group(cache); + return ret; + } add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); free_excluded_extents(root, cache); + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); - BUG_ON(ret); + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + return ret; + } update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); @@ -7666,12 +9011,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); - - ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, - sizeof(cache->item)); - BUG_ON(ret); + list_add_tail(&cache->new_bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); @@ -7680,18 +9020,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, @@ -7703,6 +9042,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_key key; struct inode *inode; + struct kobject *kobj = NULL; int ret; int index; int factor; @@ -7752,7 +9092,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); @@ -7787,6 +9130,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + + if (root->fs_info->first_logical_byte == block_group->key.objectid) + root->fs_info->first_logical_byte = (u64)-1; spin_unlock(&root->fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); @@ -7795,9 +9141,16 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) + if (list_empty(&block_group->space_info->block_groups[index])) { + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; clear_avail_alloc_bits(root->fs_info, block_group->flags); + } up_write(&block_group->space_info->groups_sem); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); @@ -7886,9 +9239,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) u64 start; u64 end; u64 trimmed = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret = 0; - cache = btrfs_lookup_block_group(fs_info, range->start); + /* + * try to trim all FS space, our block group may start from non-zero. + */ + if (range->len == total_bytes) + cache = btrfs_lookup_first_block_group(fs_info, range->start); + else + cache = btrfs_lookup_block_group(fs_info, range->start); while (cache) { if (cache->key.objectid >= (range->start + range->len)) { @@ -7902,9 +9262,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { - ret = cache_block_group(cache, NULL, root, 0); - if (!ret) - wait_block_group_cache_done(cache); + ret = cache_block_group(cache, 0); + if (ret) { + btrfs_put_block_group(cache); + break; + } + ret = wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } } ret = btrfs_trim_block_group(cache, &group_trimmed, @@ -7925,3 +9292,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) range->len = trimmed; return ret; } + +/* + * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), + * they are used to prevent the some tasks writing data into the page cache + * by nocow before the subvolume is snapshoted, but flush the data into + * the disk after the snapshot creation. + */ +void btrfs_end_nocow_write(struct btrfs_root *root) +{ + percpu_counter_dec(&root->subv_writers->counter); + /* + * Make sure counter is updated before we wake up + * waiters. + */ + smp_mb(); + if (waitqueue_active(&root->subv_writers->wait)) + wake_up(&root->subv_writers->wait); +} + +int btrfs_start_nocow_write(struct btrfs_root *root) +{ + if (unlikely(atomic_read(&root->will_be_snapshoted))) + return 0; + + percpu_counter_inc(&root->subv_writers->counter); + /* + * Make sure counter is updated before we check for snapshot creation. + */ + smp_mb(); + if (unlikely(atomic_read(&root->will_be_snapshoted))) { + btrfs_end_nocow_write(root); + return 0; + } + return 1; +} |
