From db804f23a72bada58f083dfad6a65d019ddb3bd4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 10 Jan 2012 16:41:01 +0800 Subject: Btrfs: add pinned extents to on-disk free space cache correctly I got this while running xfstests: [24256.836098] block group 317849600 has an wrong amount of free space [24256.836100] btrfs: failed to load free space cache for block group 317849600 We should clamp the extent returned by find_first_extent_bit(), so the start of the extent won't smaller than the start of the block group. Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ec23d43d0c3..01840ef95a3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -838,7 +838,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; - u64 start, end, len; + u64 start, extent_start, extent_end, len; int entries = 0; int bitmaps = 0; int ret; @@ -857,25 +857,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_cluster, block_group_list); - /* - * We shouldn't have switched the pinned extents yet so this is the - * right one - */ - unpin = root->fs_info->pinned_extents; - /* Lock all pages first so we can lock the extent safely. */ io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); - /* - * When searching for pinned extents, we need to start at our start - * offset. - */ - if (block_group) - start = block_group->key.objectid; - node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); @@ -918,9 +905,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * We want to add any pinned extents to our free space cache * so we don't leak the space */ + + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + + if (block_group) + start = block_group->key.objectid; + while (block_group && (start < block_group->key.objectid + block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, + ret = find_first_extent_bit(unpin, start, + &extent_start, &extent_end, EXTENT_DIRTY); if (ret) { ret = 0; @@ -928,20 +926,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + + if (extent_start >= block_group->key.objectid + block_group->key.offset) break; - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); + extent_start = max(extent_start, start); + extent_end = min(block_group->key.objectid + + block_group->key.offset, extent_end + 1); + len = extent_end - extent_start; entries++; - ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); if (ret) goto out_nospc; - start = end + 1; + start = extent_end; } /* Write out the bitmaps */ -- cgit v1.2.3-18-g5258 From a1ee5a45818acc7f9c13e560827cf3e8735ac919 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Jan 2012 14:27:42 +0800 Subject: Btrfs: avoid possible NULL deref in io_ctl_drop_pages() If we run into some failure path in io_ctl_prepare_pages(), io_ctl->pages[] array may have some NULL pointers. Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 01840ef95a3..4e55af333e1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) io_ctl_unmap_page(io_ctl); for (i = 0; i < io_ctl->num_pages; i++) { - ClearPageChecked(io_ctl->pages[i]); - unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); + if (io_ctl->pages[i]) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } } } -- cgit v1.2.3-18-g5258 From 706efc6630c2722602541a6a2fc5900a4e38456a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Jan 2012 14:36:28 +0800 Subject: Btrfs: check the return value of io_ctl_init() It can return -ENOMEM. Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4e55af333e1..e4eb222147c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -637,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return ret; + ret = readahead_cache(inode); if (ret) goto out; @@ -851,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (!i_size_read(inode)) return -1; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return -1; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) -- cgit v1.2.3-18-g5258 From f062abf089ff705e09bbaa6fa1e2fd7688a0f2ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Dec 2011 13:36:45 +0800 Subject: Btrfs: remove BUG_ON()s in btrfs_ioctl_setflags() We can recover from errors and return -errno to user space. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c04f02c7d5b..9619fb03a5d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) struct btrfs_trans_handle *trans; unsigned int flags, oldflags; int ret; + u64 ip_oldflags; + unsigned int i_oldflags; if (btrfs_root_readonly(root)) return -EROFS; @@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) mutex_lock(&inode->i_mutex); + ip_oldflags = ip->flags; + i_oldflags = inode->i_flags; + flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -250,18 +255,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop; + } btrfs_update_iflags(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); btrfs_end_transaction(trans, root); + out_drop: + if (ret) { + ip->flags = ip_oldflags; + inode->i_flags = i_oldflags; + } mnt_drop_write(file->f_path.mnt); - - ret = 0; out_unlock: mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.2.3-18-g5258 From 4da6f1a332f6c16b6594c7892f13c31459b9b1c8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Dec 2011 13:39:50 +0800 Subject: Btrfs: reserve metadata space in btrfs_ioctl_setflags() Check and reserve space for btrfs_update_inode(). Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9619fb03a5d..fe8a60c865e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -254,7 +254,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_join_transaction(root); + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_drop; -- cgit v1.2.3-18-g5258 From 125ccb0ae6806dbec31abf4a85448971df3b4e39 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Dec 2011 15:07:24 +0800 Subject: Btrfs: don't pass a trans handle unnecessarily in volumes.c Some functions never use the transaction handle passed to them. Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/volumes.c | 18 +++++++----------- fs/btrfs/volumes.h | 3 +-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8603ee4e3df..5b53479ce07 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7084,7 +7084,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(NULL, device, min_free, + ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..73f673c8d8d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -829,7 +829,6 @@ out: /* * find_free_dev_extent - find free space in the specified device - * @trans: transaction handler * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @start: store the start of the free space. @@ -848,8 +847,7 @@ out: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -893,7 +891,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { @@ -1469,8 +1467,7 @@ error_undo: /* * does all the dirty work required for changing file system's UUID. */ -static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_root *root) { struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; @@ -1695,7 +1692,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(trans, root); + ret = btrfs_prepare_sprout(root); BUG_ON(ret); } @@ -2323,8 +2320,7 @@ done: return ret; } -static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int btrfs_add_system_chunk(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { @@ -2496,7 +2492,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(trans, device, + ret = find_free_dev_extent(device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -2687,7 +2683,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(ret); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + ret = btrfs_add_system_chunk(chunk_root, &key, chunk, item_size); BUG_ON(ret); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d4f37..c1701ec9d49 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -230,7 +230,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_root *dev_root); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); #endif -- cgit v1.2.3-18-g5258 From de11cc12df17337979e0929d2831887432f236ca Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Dec 2011 12:55:47 +0800 Subject: Btrfs: don't pre-allocate btrfs bio We pre-allocate a btrfs bio with fixed size, and then may re-allocate memory if we find stripes are bigger than the fixed size. But this pre-allocation is not necessary. Also we don't have to calcuate the stripe number twice. Signed-off-by: Li Zefan --- fs/btrfs/volumes.c | 67 +++++++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 73f673c8d8d..540fdd25fb5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2897,26 +2897,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; - int stripes_allocated = 8; - int stripes_required = 1; int stripe_index; int i; + int ret = 0; int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; - if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) - stripes_allocated = 1; -again: - if (bbio_ret) { - bbio = kzalloc(btrfs_bio_size(stripes_allocated), - GFP_NOFS); - if (!bbio) - return -ENOMEM; - - atomic_set(&bbio->error, 0); - } - read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); @@ -2935,32 +2922,6 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our btrfs_bio struct is too small, back off and try again */ - if (rw & REQ_WRITE) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP)) { - stripes_required = map->num_stripes; - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripes_required = map->sub_stripes; - max_errors = 1; - } - } - if (rw & REQ_DISCARD) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID10)) { - stripes_required = map->num_stripes; - } - } - if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && - stripes_allocated < stripes_required) { - stripes_allocated = map->num_stripes; - free_extent_map(em); - kfree(bbio); - goto again; - } stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -3055,6 +3016,13 @@ again: } BUG_ON(stripe_index >= map->num_stripes); + bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + atomic_set(&bbio->error, 0); + if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = @@ -3151,15 +3119,22 @@ again: stripe_index++; } } - if (bbio_ret) { - *bbio_ret = bbio; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } } + + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; out: free_extent_map(em); - return 0; + return ret; } int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, -- cgit v1.2.3-18-g5258 From ec9ef7a13be4dcce964c8503e8999087945e5b9e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Dec 2011 14:06:42 +0800 Subject: Btrfs: simplfy calculation of stripe length for discard operation For btrfs raid, while discarding a range of space, we'll need to know the start offset and length to discard for each device, and it's done in btrfs_map_block(). However the calculation is a bit complex for raid0 and raid10, so I reimplement it based on a fact that: dev1 dev2 dev3 (raid0) ----------------------------------- s0 s3 s6 s1 s4 s7 s2 s5 Each device has (total_stripes / nr_dev) stripes, or plus one. Signed-off-by: Li Zefan --- fs/btrfs/volumes.c | 95 ++++++++++++++++++------------------------------------ 1 file changed, 31 insertions(+), 64 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 540fdd25fb5..563ef650850 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3024,80 +3024,47 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, atomic_set(&bbio->error, 0); if (rw & REQ_DISCARD) { + int factor = 0; + int sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + + if (map->type & + (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + stripes_per_dev = div_u64_rem(stripe_nr_end - + stripe_nr_orig, + factor, + &remaining_stripes); + } + for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; bbio->stripes[i].dev = map->stripes[stripe_index].dev; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - u64 stripes; - u32 last_stripe = 0; - int j; - - div_u64_rem(stripe_nr_end - 1, - map->num_stripes, - &last_stripe); - - for (j = 0; j < map->num_stripes; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - map->num_stripes, &test); - if (test == stripe_index) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, map->num_stripes); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i == 0) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + if (i < sub_stripes) bbio->stripes[i].length -= stripe_offset; - stripe_offset = 0; - } - if (stripe_index == last_stripe) - bbio->stripes[i].length -= - stripe_end_offset; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u64 stripes; - int j; - int factor = map->num_stripes / - map->sub_stripes; - u32 last_stripe = 0; - - div_u64_rem(stripe_nr_end - 1, - factor, &last_stripe); - last_stripe *= map->sub_stripes; - - for (j = 0; j < factor; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - factor, &test); - - if (test == - stripe_index / map->sub_stripes) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, factor); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i < map->sub_stripes) { - bbio->stripes[i].length -= - stripe_offset; - if (i == map->sub_stripes - 1) - stripe_offset = 0; - } - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - map->sub_stripes - 1)) { + if ((i / sub_stripes + 1) % + sub_stripes == remaining_stripes) bbio->stripes[i].length -= stripe_end_offset; - } + if (i == sub_stripes - 1) + stripe_offset = 0; } else bbio->stripes[i].length = *length; -- cgit v1.2.3-18-g5258 From 7fe1e641502616220437079258506196bc4d8cbf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Dec 2011 14:47:27 +0800 Subject: Btrfs: rewrite btrfs_trim_block_group() There are various bugs in block group trimming: - It may trim from offset smaller than user-specified offset. - It may trim beyond user-specified range. - It may leak free space for extents smaller than specified minlen. - It may truncate the last trimmed extent thus leak free space. - With mixed extents+bitmaps, some extents may not be trimmed. - With mixed extents+bitmaps, some bitmaps may not be trimmed (even none will be trimmed). Even for those trimmed, not all the free space in the bitmaps will be trimmed. I rewrite btrfs_trim_block_group() and break it into two functions. One is to trim extents only, and the other is to trim bitmaps only. Before patching: # fstrim -v /mnt/ /mnt/: 1496465408 bytes were trimmed After patching: # fstrim -v /mnt/ /mnt/: 2193768448 bytes were trimmed And this matches the total free space: # btrfs fi df /mnt Data: total=3.58GB, used=1.79GB System, DUP: total=8.00MB, used=4.00KB System: total=4.00MB, used=0.00 Metadata, DUP: total=205.12MB, used=97.14MB Metadata: total=8.00MB, used=0.00 Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 235 +++++++++++++++++++++++++++++++------------- 1 file changed, 164 insertions(+), 71 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e4eb222147c..b3cbb8939fa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2594,17 +2594,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) cluster->block_group = NULL; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) +static int do_trimming(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 bytes, + u64 reserved_start, u64 reserved_bytes) { - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry = NULL; + struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 bytes = 0; - u64 actually_trimmed; - int ret = 0; + int ret; + int update = 0; + u64 trimmed = 0; - *trimmed = 0; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += reserved_bytes; + space_info->bytes_reserved += reserved_bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, bytes, &trimmed); + if (!ret) + *total_trimmed += trimmed; + + btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += reserved_bytes; + block_group->reserved -= reserved_bytes; + space_info->bytes_reserved -= reserved_bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } + + return ret; +} + +static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = 0; + u64 extent_start; + u64 extent_bytes; + u64 bytes; while (start < end) { spin_lock(&ctl->tree_lock); @@ -2615,81 +2655,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, } entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) - entry = tree_search_offset(ctl, - offset_to_bitmap(ctl, start), - 1, 1); - - if (!entry || entry->offset >= end) { + if (!entry) { spin_unlock(&ctl->tree_lock); break; } - if (entry->bitmap) { - ret = search_bitmap(ctl, entry, &start, &bytes); - if (!ret) { - if (start >= end) { - spin_unlock(&ctl->tree_lock); - break; - } - bytes = min(bytes, end - start); - bitmap_clear_bits(ctl, entry, start, bytes); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - } else { - start = entry->offset + BITS_PER_BITMAP * - block_group->sectorsize; + /* skip bitmaps */ + while (entry->bitmap) { + node = rb_next(&entry->offset_index); + if (!node) { spin_unlock(&ctl->tree_lock); - ret = 0; - continue; + goto out; } - } else { - start = entry->offset; - bytes = min(entry->bytes, end - start); - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + } + + if (entry->offset >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + + extent_start = entry->offset; + extent_bytes = entry->bytes; + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; } + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + spin_unlock(&ctl->tree_lock); - if (bytes >= minlen) { - struct btrfs_space_info *space_info; - int update = 0; - - space_info = block_group->space_info; - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (!block_group->ro) { - block_group->reserved += bytes; - space_info->bytes_reserved += bytes; - update = 1; - } - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, - bytes, - &actually_trimmed); - - btrfs_add_free_space(block_group, start, bytes); - if (update) { - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += bytes; - block_group->reserved -= bytes; - space_info->bytes_reserved -= bytes; - spin_unlock(&space_info->lock); - spin_unlock(&block_group->lock); - } + ret = do_trimming(block_group, total_trimmed, start, bytes, + extent_start, extent_bytes); + if (ret) + break; +next: + start += bytes; - if (ret) - break; - *trimmed += actually_trimmed; + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } +out: + return ret; +} + +static int trim_bitmaps(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = 0; + int ret2; + u64 bytes; + u64 offset = offset_to_bitmap(ctl, start); + + while (offset < end) { + bool next_bitmap = false; + + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + break; + } + + entry = tree_search_offset(ctl, offset, 1, 0); + if (!entry) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = minlen; + ret2 = search_bitmap(ctl, entry, &start, &bytes); + if (ret2 || start >= end) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = min(bytes, end - start); + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; + } + + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + + spin_unlock(&ctl->tree_lock); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + start, bytes); + if (ret) + break; +next: + if (next_bitmap) { + offset += BITS_PER_BITMAP * ctl->unit; + } else { + start += bytes; + if (start >= offset + BITS_PER_BITMAP * ctl->unit) + offset += BITS_PER_BITMAP * ctl->unit; } - start += bytes; - bytes = 0; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -2702,6 +2779,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, return ret; } +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + return ret; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); + + return ret; +} + /* * Find the left-most item in the cache tree, and then return the * smallest inode number in the item. -- cgit v1.2.3-18-g5258 From c7c144db531fda414e532adac56e965ce332e2a5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 10:39:22 +0800 Subject: Btrfs: update global block_rsv when creating a new block group A bug was triggered while using seed device: # mkfs.btrfs /dev/loop1 # btrfstune -S 1 /dev/loop1 # mount -o /dev/loop1 /mnt # btrfs dev add /dev/loop2 /mnt btrfs: block rsv returned -28 ------------[ cut here ]------------ WARNING: at fs/btrfs/extent-tree.c:5969 btrfs_alloc_free_block+0x166/0x396 [btrfs]() ... Call Trace: ... [] btrfs_cow_block+0x101/0x147 [btrfs] [] btrfs_search_slot+0x1b8/0x55f [btrfs] [] btrfs_insert_empty_items+0x42/0x7f [btrfs] [] btrfs_insert_item+0x40/0x7e [btrfs] [] btrfs_make_block_group+0x243/0x2aa [btrfs] [] __btrfs_alloc_chunk+0x672/0x70e [btrfs] [] init_first_rw_device+0x77/0x13c [btrfs] [] btrfs_init_new_device+0x664/0x9fd [btrfs] [] btrfs_ioctl+0x694/0xdbe [btrfs] [] do_vfs_ioctl+0x496/0x4cc [] sys_ioctl+0x33/0x4f [] sysenter_do_call+0x12/0x38 ---[ end trace 906adac595facc7d ]--- Since seed device is readonly, there's no usable space in the filesystem. Afterwards we add a sprout device to it, and the kernel creates a METADATA block group and a SYSTEM block group where comes free space we can reserve, but we still get revervation failure because the global block_rsv hasn't been updated accordingly. Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b53479ce07..bf30f670cda 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7446,6 +7446,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; -- cgit v1.2.3-18-g5258 From b367e47fb3a70f5d24ebd6faf7d42436d485fb2d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 11:38:24 +0800 Subject: Btrfs: fix possible deadlock when opening a seed device The correct lock order is uuid_mutex -> volume_mutex -> chunk_mutex, but when we mount a filesystem which has backing seed devices, we have this lock chain: open_ctree() lock(chunk_mutex); read_chunk_tree(); read_one_dev(); open_seed_devices(); lock(uuid_mutex); and then we hit a lockdep splat. Signed-off-by: Li Zefan --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/volumes.c | 9 +++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d5551e58..858ab347413 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2270,9 +2270,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); - mutex_unlock(&fs_info->chunk_mutex); if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 563ef650850..fbb493b28d5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3506,7 +3506,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) struct btrfs_fs_devices *fs_devices; int ret; - mutex_lock(&uuid_mutex); + BUG_ON(!mutex_is_locked(&uuid_mutex)); fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { @@ -3544,7 +3544,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: - mutex_unlock(&uuid_mutex); return ret; } @@ -3687,6 +3686,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) if (!path) return -ENOMEM; + mutex_lock(&uuid_mutex); + lock_chunks(root); + /* first we search for all of the device items, and then we * read in all of the chunk items. This way we can create chunk * mappings that reference all of the devices that are afound @@ -3737,6 +3739,9 @@ again: } ret = 0; error: + unlock_chunks(root); + mutex_unlock(&uuid_mutex); + btrfs_free_path(path); return ret; } -- cgit v1.2.3-18-g5258