From 1bec1aed1e7e632b3cc43b6807c2b4dcd1572e28 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Wed, 22 Jul 2009 09:59:00 -0400 Subject: Btrfs: fix definition of struct btrfs_extent_inline_ref use __le64 instead of u64 in on-disk structure definition. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ctree.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a404ecc53eb..da0763135bf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -483,7 +483,7 @@ struct btrfs_shared_data_ref { struct btrfs_extent_inline_ref { u8 type; - u64 offset; + __le64 offset; } __attribute__ ((__packed__)); /* old style backrefs item */ -- cgit v1.2.3-18-g5258 From 963030817060e4f109be1993b9ae8f81dbf5e11a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 13 Jul 2009 21:29:25 -0400 Subject: Btrfs: use hybrid extents+bitmap rb tree for free space Currently btrfs has a problem where it can use a ridiculous amount of RAM simply tracking free space. As free space gets fragmented, we end up with thousands of entries on an rb-tree per block group, which usually spans 1 gig of area. Since we currently don't ever flush free space cache back to disk this gets to be a bit unwieldy on large fs's with lots of fragmentation. This patch solves this problem by using PAGE_SIZE bitmaps for parts of the free space cache. Initially we calculate a threshold of extent entries we can handle, which is however many extent entries we can cram into 16k of RAM. The maximum amount of RAM that should ever be used to track 1 gigabyte of disk space will be 32k of RAM, which scales much better than we did before. Once we pass the extent threshold, we start adding bitmaps and using those instead for tracking the free space. This patch also makes it so that any free space that's less than 4 * sectorsize gets put into a bitmap.
This is nice since we try and allocate out of the front of a block group, so if the front of a block group is heavily fragmented and then has a huge chunk of free space at the end, we go ahead and add the fragmented areas to bitmaps and use a normal extent entry to track the big chunk at the back of the block group. I've also taken the opportunity to revamp how we search for free space. Previously we indexed free space via an offset indexed rb tree and a bytes indexed rb tree. I've dropped the bytes indexed rb tree and use only the offset indexed rb tree. This cuts the number of tree operations we were doing previously down by half, and gives us a little bit of a better allocation pattern since we will always start from a specific offset and search forward from there, instead of searching for the size we need and try and get it as close as possible to the offset we want. I've given this a healthy amount of testing pre-new format stuff, as well as post-new format stuff. I've booted up my fedora box which is installed on btrfs with this patch and ran with it for a few days without issues. I've not seen any performance regressions in any of my tests. Since the last patch Yan Zheng fixed a problem where we could have overlapping entries, so updating their offset inline would cause problems. 
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/ctree.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index da0763135bf..0cbf3491bb7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -709,6 +709,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* if this cluster simply points at a bitmap in the block group */ + bool points_to_bitmap; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -726,6 +729,10 @@ struct btrfs_block_group_cache { u64 pinned; u64 reserved; u64 flags; + u64 sectorsize; + int extents_thresh; + int free_extents; + int total_bitmaps; int cached; int ro; int dirty; @@ -734,7 +741,6 @@ struct btrfs_block_group_cache { /* free space cache stuff */ spinlock_t tree_lock; - struct rb_root free_space_bytes; struct rb_root free_space_offset; /* block group cache stuff */ -- cgit v1.2.3-18-g5258 From 817d52f8dba26d0295c26035531c30ce5f1e3c3e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 13 Jul 2009 21:29:25 -0400 Subject: Btrfs: async block group caching This patch moves the caching of the block group off to a kthread in order to allow people to allocate sooner. Instead of blocking up behind the caching mutex, we kick off the caching kthread, and then attempt to make an allocation. If we cannot, we wait on the block group's caching waitqueue; the caching kthread wakes the waiting threads up every time it finds 2 meg worth of space, and then again when it's finished caching. This is how I tested the speedup from this: mkfs the disk; mount the disk; fill the disk up with fs_mark; unmount the disk; mount the disk; time touch /mnt/foo. Without my changes this took 11 seconds on my box; with these changes it now takes 1 second.
Another change that's been put in place is we lock the super mirrors in the pinned extent map in order to keep us from adding that stuff as free space when caching the block group. This doesn't really change anything else as far as the pinned extent map is concerned, since for actual pinned extents we use EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock those extents to keep from leaking memory. I've also added a check where when we are reading block groups from disk, if the amount of space used == the size of the block group, we go ahead and mark the block group as cached. This drastically reduces the amount of time it takes to cache the block groups. Using the same test as above, except doing a dd to a file and then unmounting, it used to take 33 seconds to umount, now it takes 3 seconds. This version uses the commit_root in the caching kthread, and then keeps track of how many async caching threads are running at any given time so if one of the async threads is still running as we cross transactions we can wait until it's finished before handling the pinned extents.
Thank you, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/ctree.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0cbf3491bb7..42b03c4ee49 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -691,6 +691,7 @@ struct btrfs_space_info { struct list_head block_groups; spinlock_t lock; struct rw_semaphore groups_sem; + atomic_t caching_threads; }; /* @@ -721,11 +722,17 @@ struct btrfs_free_cluster { struct list_head block_group_list; }; +enum btrfs_caching_type { + BTRFS_CACHE_NO = 0, + BTRFS_CACHE_STARTED = 1, + BTRFS_CACHE_FINISHED = 2, +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; spinlock_t lock; - struct mutex cache_mutex; u64 pinned; u64 reserved; u64 flags; @@ -733,15 +740,19 @@ struct btrfs_block_group_cache { int extents_thresh; int free_extents; int total_bitmaps; - int cached; int ro; int dirty; + /* cache tracking stuff */ + wait_queue_head_t caching_q; + int cached; + struct btrfs_space_info *space_info; /* free space cache stuff */ spinlock_t tree_lock; struct rb_root free_space_offset; + u64 free_space; /* block group cache stuff */ struct rb_node cache_node; @@ -834,6 +845,7 @@ struct btrfs_fs_info { atomic_t async_submit_draining; atomic_t nr_async_bios; atomic_t async_delalloc_pages; + atomic_t async_caching_threads; /* * this is used by the balancing code to wait for all the pending @@ -950,6 +962,9 @@ struct btrfs_root { /* the node lock is held while changing the node pointer */ spinlock_t node_lock; + /* taken when updating the commit root */ + struct rw_semaphore commit_root_sem; + struct extent_buffer *commit_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; @@ -1911,7 +1926,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int 
btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_update_pinned_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int pin); + u64 bytenr, u64 num, int pin, int mark_free); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, @@ -1996,6 +2011,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); +void btrfs_free_super_mirror_extents(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); -- cgit v1.2.3-18-g5258 From 68b38550ddbea13d296184bf69edff387618b1d3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 27 Jul 2009 13:57:01 -0400 Subject: Btrfs: change how we unpin extents We are racy with async block caching and unpinning extents. This patch makes things much less complicated by only unpinning the extent if the block group is cached. We check the block_group->cached var under the block_group->lock spin lock. If it is set to BTRFS_CACHE_FINISHED then we update the pinned counters, and unpin the extent and add the free space back. If it is not set to this, we start the caching of the block group so the next time we unpin extents we can unpin the extent. This keeps us from racing with the async caching threads, lets us kill the fs wide async thread counter, and keeps us from having to set DELALLOC bits for every extent we hit if there are caching kthreads going. One thing that needed to be changed was btrfs_free_super_mirror_extents. Now instead of just looking for LOCKED extents, we also look for DIRTY extents, since we could have left some extents pinned in the previous transaction that will never get freed now that we are unmounting, which would cause us to leak memory. 
So btrfs_free_super_mirror_extents has been changed to btrfs_free_pinned_extents, and it will clear the extents locked for the super mirror, and any remaining pinned extents that may be present. Thank you, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/ctree.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 42b03c4ee49..17ad92c29cf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -845,7 +845,6 @@ struct btrfs_fs_info { atomic_t async_submit_draining; atomic_t nr_async_bios; atomic_t async_delalloc_pages; - atomic_t async_caching_threads; /* * this is used by the balancing code to wait for all the pending @@ -1926,7 +1925,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_update_pinned_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int pin, int mark_free); + u64 bytenr, u64 num, int pin); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, @@ -2011,7 +2010,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); -void btrfs_free_super_mirror_extents(struct btrfs_fs_info *info); +void btrfs_free_pinned_extents(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); -- cgit v1.2.3-18-g5258