aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZheng Yan <zheng.yan@oracle.com>2008-09-26 10:09:34 -0400
committerChris Mason <chris.mason@oracle.com>2008-09-26 10:09:34 -0400
commit1a40e23b95da45051ee4d74374c58ae87a14051c (patch)
tree77faffd3f9d3a26c22e6cf03b83762c95d687596
parent5b21f2ed3f2947b5195b65c9fdbdd9e52904cc03 (diff)
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new backref format. Before, btrfs-vol -b would break any COW links on data blocks or metadata. This was slow and caused the amount of space used to explode if a large number of snapshots were present. The new code keeps the sharing of all data extents and most of the tree blocks. To maintain the sharing of data extents, the space balance code uses a separate inode to hold data extent pointers, then updates the references to point to the new location. To maintain the sharing of tree blocks, the space balance code uses reloc trees to relocate tree blocks in reference counted roots. There is one reloc tree for each subvol, and all reloc trees share the same root key objectid. Reloc trees are snapshots of the latest committed roots of subvols (root->commit_root). To relocate a tree block referenced by a subvol, there are two steps: COW the block through the subvol's reloc tree, then update the block pointer in the subvol to point to the new block. Since all reloc trees share the same root key objectid, doing special handling for tree blocks owned by them is easy. Once a tree block has been COWed in one reloc tree, we can use the resulting new block directly when the same block needs to be COWed again through other reloc trees. In this way, relocated tree blocks are shared between reloc trees, so they are also shared between subvols. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/ctree.c155
-rw-r--r--fs/btrfs/ctree.h26
-rw-r--r--fs/btrfs/disk-io.c9
-rw-r--r--fs/btrfs/extent-tree.c2034
-rw-r--r--fs/btrfs/root-tree.c5
-rw-r--r--fs/btrfs/transaction.c15
-rw-r--r--fs/btrfs/volumes.c9
7 files changed, 1828 insertions, 425 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f9cd40967d0..50e81f43e6d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -179,7 +179,6 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *cow;
u32 nritems;
int ret = 0;
- int different_trans = 0;
int level;
int unlock_orig = 0;
@@ -233,13 +232,33 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (btrfs_header_generation(buf) != trans->transid) {
u32 nr_extents;
- different_trans = 1;
ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
if (ret)
return ret;
ret = btrfs_cache_ref(trans, root, buf, nr_extents);
WARN_ON(ret);
+ } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
+ /*
+ * There are only two places that can drop reference to
+ * tree blocks owned by living reloc trees, one is here,
+ * the other place is btrfs_merge_path. In both places,
+ * we check reference count while tree block is locked.
+ * Furthermore, if reference count is one, it won't get
+ * increased by someone else.
+ */
+ u32 refs;
+ ret = btrfs_lookup_extent_ref(trans, root, buf->start,
+ buf->len, &refs);
+ BUG_ON(ret);
+ if (refs == 1) {
+ ret = btrfs_update_ref(trans, root, buf, cow,
+ 0, nritems);
+ clean_tree_block(trans, root, buf);
+ } else {
+ ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
+ }
+ BUG_ON(ret);
} else {
ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
if (ret)
@@ -247,6 +266,14 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
clean_tree_block(trans, root, buf);
}
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_add_reloc_mapping(root, buf->start,
+ buf->len, cow->start);
+ BUG_ON(ret);
+ ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
+ WARN_ON(ret);
+ }
+
if (buf == root->node) {
WARN_ON(parent && parent != buf);
@@ -1466,6 +1493,130 @@ done:
return ret;
}
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *node_keys,
+ u64 *nodes, int lowest_level)
+{
+ struct extent_buffer *eb;
+ struct extent_buffer *parent;
+ struct btrfs_key key;
+ u64 bytenr;
+ u64 generation;
+ u32 blocksize;
+ int level;
+ int slot;
+ int key_match;
+ int ret;
+
+ eb = btrfs_lock_root_node(root);
+ ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+ BUG_ON(ret);
+
+ parent = eb;
+ while (1) {
+ level = btrfs_header_level(parent);
+ if (level == 0 || level <= lowest_level)
+ break;
+
+ ret = bin_search(parent, &node_keys[lowest_level], level,
+ &slot);
+ if (ret && slot > 0)
+ slot--;
+
+ bytenr = btrfs_node_blockptr(parent, slot);
+ if (nodes[level - 1] == bytenr)
+ break;
+
+ blocksize = btrfs_level_size(root, level - 1);
+ generation = btrfs_node_ptr_generation(parent, slot);
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
+
+ /*
+ * if node keys match and node pointer hasn't been modified
+ * in the running transaction, we can merge the path. for
+ * reloc trees, the node pointer check is skipped, this is
+ * because the reloc trees are fully controlled by the space
+ * balance code, no one else can modify them.
+ */
+ if (!nodes[level - 1] || !key_match ||
+ (generation == trans->transid &&
+ root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)) {
+next_level:
+ if (level == 1 || level == lowest_level + 1)
+ break;
+
+ eb = read_tree_block(root, bytenr, blocksize,
+ generation);
+ btrfs_tree_lock(eb);
+
+ ret = btrfs_cow_block(trans, root, eb, parent, slot,
+ &eb, 0);
+ BUG_ON(ret);
+
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ parent = eb;
+ continue;
+ }
+
+ if (generation == trans->transid) {
+ u32 refs;
+ BUG_ON(btrfs_header_owner(eb) !=
+ BTRFS_TREE_RELOC_OBJECTID);
+ /*
+ * lock the block to keep __btrfs_cow_block from
+ * changing the reference count.
+ */
+ eb = read_tree_block(root, bytenr, blocksize,
+ generation);
+ btrfs_tree_lock(eb);
+
+ ret = btrfs_lookup_extent_ref(trans, root, bytenr,
+ blocksize, &refs);
+ BUG_ON(ret);
+ /*
+ * if replace block whose reference count is one,
+ * we have to "drop the subtree". so skip it for
+ * simplicity
+ */
+ if (refs == 1) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ goto next_level;
+ }
+ }
+
+ btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
+ btrfs_set_node_ptr_generation(parent, slot, trans->transid);
+ btrfs_mark_buffer_dirty(parent);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ nodes[level - 1],
+ blocksize, parent->start,
+ btrfs_header_owner(parent),
+ btrfs_header_generation(parent),
+ level - 1, 0);
+ BUG_ON(ret);
+ ret = btrfs_free_extent(trans, root, bytenr,
+ blocksize, parent->start,
+ btrfs_header_owner(parent),
+ btrfs_header_generation(parent),
+ level - 1, 0, 1);
+ BUG_ON(ret);
+
+ if (generation == trans->transid) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ break;
+ }
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ return 0;
+}
+
/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3e62a1b0a1f..2775e270881 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -604,6 +604,7 @@ struct btrfs_fs_info {
struct mutex chunk_mutex;
struct mutex drop_mutex;
struct mutex volume_mutex;
+ struct mutex tree_reloc_mutex;
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
@@ -647,6 +648,10 @@ struct btrfs_fs_info {
struct task_struct *cleaner_kthread;
int thread_pool_size;
+ /* tree relocation related fields */
+ struct extent_io_tree reloc_mapping_tree;
+ struct list_head dead_reloc_roots;
+ struct btrfs_leaf_ref_tree reloc_ref_tree;
struct btrfs_leaf_ref_tree shared_ref_tree;
struct kobject super_kobj;
@@ -698,6 +703,7 @@ struct btrfs_root {
struct btrfs_leaf_ref_tree ref_tree_struct;
struct btrfs_dirty_root *dirty_root;
struct btrfs_root *log_root;
+ struct btrfs_root *reloc_root;
struct btrfs_root_item root_item;
struct btrfs_key root_key;
@@ -1517,7 +1523,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -1582,10 +1587,29 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_free_reloc_root(struct btrfs_root *root);
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
+int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+ u64 num_bytes, u64 new_bytenr);
+int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+ u64 num_bytes, u64 *new_bytenr);
+void btrfs_free_reloc_mappings(struct btrfs_root *root);
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf, u64 orig_start);
+int btrfs_add_dead_reloc_root(struct btrfs_root *root);
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
/* ctree.c */
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *node_keys,
+ u64 *nodes, int lowest_level);
int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *new_key);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8969fee2331..45bc3132b05 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1406,6 +1406,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->btree_inode->i_mapping, GFP_NOFS);
fs_info->do_barriers = 1;
+ extent_io_tree_init(&fs_info->reloc_mapping_tree,
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+ INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
+ btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1421,6 +1425,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
+ mutex_init(&fs_info->tree_reloc_mutex);
init_waitqueue_head(&fs_info->transaction_throttle);
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
@@ -1627,6 +1632,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
ret = btrfs_recover_log_trees(log_tree_root);
BUG_ON(ret);
}
+
+ ret = btrfs_cleanup_reloc_trees(tree_root);
+ BUG_ON(ret);
+
fs_info->last_trans_committed = btrfs_super_generation(disk_super);
return tree_root;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ab099bc01a..8043b9d584a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1834,6 +1834,7 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
u64 header_owner = btrfs_header_owner(buf);
u64 header_transid = btrfs_header_generation(buf);
if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
+ header_owner != BTRFS_TREE_RELOC_OBJECTID &&
header_transid == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
clean_tree_block(NULL, root, buf);
@@ -2487,6 +2488,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
return -ENOSPC;
}
btrfs_add_free_space(cache, start, len);
+ update_reserved_extents(root, start, len, 0);
maybe_unlock_mutex(root);
return 0;
}
@@ -2947,6 +2949,10 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
*/
if (*level == 1) {
ref = btrfs_lookup_leaf_ref(root, bytenr);
+ if (ref && ref->generation != ptr_gen) {
+ btrfs_free_leaf_ref(root, ref);
+ ref = NULL;
+ }
if (ref) {
ret = cache_drop_leaf_ref(trans, root, ref);
BUG_ON(ret);
@@ -3153,34 +3159,6 @@ out:
return ret;
}
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
-{
- struct btrfs_block_group_cache *block_group;
- struct rb_node *n;
-
- mutex_lock(&info->alloc_mutex);
- spin_lock(&info->block_group_cache_lock);
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
- block_group = rb_entry(n, struct btrfs_block_group_cache,
- cache_node);
-
- spin_unlock(&info->block_group_cache_lock);
- btrfs_remove_free_space_cache(block_group);
- spin_lock(&info->block_group_cache_lock);
-
- rb_erase(&block_group->cache_node,
- &info->block_group_cache_tree);
-
- spin_lock(&block_group->space_info->lock);
- list_del(&block_group->list);
- spin_unlock(&block_group->space_info->lock);
- kfree(block_group);
- }
- spin_unlock(&info->block_group_cache_lock);
- mutex_unlock(&info->alloc_mutex);
- return 0;
-}
-
static unsigned long calc_ra(unsigned long start, unsigned long last,
unsigned long nr)
{
@@ -3192,37 +3170,43 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
{
u64 page_start;
u64 page_end;
+ unsigned long first_index;
unsigned long last_index;
unsigned long i;
struct page *page;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct file_ra_state *ra;
- unsigned long total_read = 0;
- unsigned long ra_pages;
struct btrfs_ordered_extent *ordered;
- struct btrfs_trans_handle *trans;
+ unsigned int total_read = 0;
+ unsigned int total_dirty = 0;
+ int ret = 0;
ra = kzalloc(sizeof(*ra), GFP_NOFS);
mutex_lock(&inode->i_mutex);
- i = start >> PAGE_CACHE_SHIFT;
+ first_index = start >> PAGE_CACHE_SHIFT;
last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
- ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
+ /* make sure the dirty trick played by the caller works */
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ first_index, last_index);
+ if (ret)
+ goto out_unlock;
file_ra_state_init(ra, inode->i_mapping);
- for (; i <= last_index; i++) {
- if (total_read % ra_pages == 0) {
+ for (i = first_index ; i <= last_index; i++) {
+ if (total_read % ra->ra_pages == 0) {
btrfs_force_ra(inode->i_mapping, ra, NULL, i,
- calc_ra(i, last_index, ra_pages));
+ calc_ra(i, last_index, ra->ra_pages));
}
total_read++;
again:
if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
- goto truncate_racing;
+ BUG_ON(1);
page = grab_cache_page(inode->i_mapping, i);
if (!page) {
+ ret = -ENOMEM;
goto out_unlock;
}
if (!PageUptodate(page)) {
@@ -3231,6 +3215,7 @@ again:
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
+ ret = -EIO;
goto out_unlock;
}
}
@@ -3251,14 +3236,13 @@ again:
}
set_page_extent_mapped(page);
- /*
- * make sure page_mkwrite is called for this page if userland
- * wants to change it from mmap
- */
- clear_page_dirty_for_io(page);
-
btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (i == first_index)
+ set_extent_bits(io_tree, page_start, page_end,
+ EXTENT_BOUNDARY, GFP_NOFS);
+
set_page_dirty(page);
+ total_dirty++;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
unlock_page(page);
@@ -3266,347 +3250,1457 @@ again:
}
out_unlock:
- /* we have to start the IO in order to get the ordered extents
- * instantiated. This allows the relocation to code to wait
- * for all the ordered extents to hit the disk.
- *
- * Otherwise, it would constantly loop over the same extents
- * because the old ones don't get deleted until the IO is
- * started
- */
- btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
- WB_SYNC_NONE);
kfree(ra);
- trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
- if (trans) {
- btrfs_end_transaction(trans, BTRFS_I(inode)->root);
- mark_inode_dirty(inode);
- }
mutex_unlock(&inode->i_mutex);
- return 0;
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
+ return ret;
+}
+
+static int noinline relocate_data_extent(struct inode *reloc_inode,
+ struct btrfs_key *extent_key,
+ u64 offset)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
+ struct extent_map *em;
+
+ em = alloc_extent_map(GFP_NOFS);
+ BUG_ON(!em || IS_ERR(em));
+
+ em->start = extent_key->objectid - offset;
+ em->len = extent_key->offset;
+ em->block_start = extent_key->objectid;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ /* setup extent map to cheat btrfs_readpage */
+ mutex_lock(&BTRFS_I(reloc_inode)->extent_mutex);
+ while (1) {
+ int ret;
+ spin_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ spin_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(reloc_inode, em->start,
+ em->start + em->len - 1, 0);
+ }
+ mutex_unlock(&BTRFS_I(reloc_inode)->extent_mutex);
-truncate_racing:
- vmtruncate(inode, inode->i_size);
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- total_read);
- goto out_unlock;
+ return relocate_inode_pages(reloc_inode, extent_key->objectid - offset,
+ extent_key->offset);
}
-/*
- * The back references tell us which tree holds a ref on a block,
- * but it is possible for the tree root field in the reference to
- * reflect the original root before a snapshot was made. In this
- * case we should search through all the children of a given root
- * to find potential holders of references on a block.
- *
- * Instead, we do something a little less fancy and just search
- * all the roots for a given key/block combination.
- */
-static int find_root_for_ref(struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *key0,
- int level,
- int file_key,
- struct btrfs_root **found_root,
- u64 bytenr)
-{
- struct btrfs_key root_location;
- struct btrfs_root *cur_root = *found_root;
- struct btrfs_file_extent_item *file_extent;
- u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
- u64 found_bytenr;
- int ret;
+struct btrfs_ref_path {
+ u64 extent_start;
+ u64 nodes[BTRFS_MAX_LEVEL];
+ u64 root_objectid;
+ u64 root_generation;
+ u64 owner_objectid;
+ u64 owner_offset;
+ u32 num_refs;
+ int lowest_level;
+ int current_level;
+};
- root_location.offset = (u64)-1;
- root_location.type = BTRFS_ROOT_ITEM_KEY;
- path->lowest_level = level;
- path->reada = 0;
- while(1) {
- ret = btrfs_search_slot(NULL, cur_root, key0, path, 0, 0);
- found_bytenr = 0;
- if (ret == 0 && file_key) {
- struct extent_buffer *leaf = path->nodes[0];
- file_extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, file_extent) ==
- BTRFS_FILE_EXTENT_REG) {
- found_bytenr =
- btrfs_file_extent_disk_bytenr(leaf,
- file_extent);
- }
- } else if (!file_key) {
- if (path->nodes[level])
- found_bytenr = path->nodes[level]->start;
- }
-
- btrfs_release_path(cur_root, path);
-
- if (found_bytenr == bytenr) {
- *found_root = cur_root;
+struct disk_extent {
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 offset;
+ u64 num_bytes;
+};
+
+static int is_cowonly_root(u64 root_objectid)
+{
+ if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+ root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+ root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+ root_objectid == BTRFS_TREE_LOG_OBJECTID)
+ return 1;
+ return 0;
+}
+
+static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_ref_path *ref_path,
+ int first_time)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_extent_ref *ref;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 bytenr;
+ u32 nritems;
+ int level;
+ int ret = 1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ mutex_lock(&extent_root->fs_info->alloc_mutex);
+
+ if (first_time) {
+ ref_path->lowest_level = -1;
+ ref_path->current_level = -1;
+ goto walk_up;
+ }
+walk_down:
+ level = ref_path->current_level - 1;
+ while (level >= -1) {
+ u64 parent;
+ if (level < ref_path->lowest_level)
+ break;
+
+ if (level >= 0) {
+ bytenr = ref_path->nodes[level];
+ } else {
+ bytenr = ref_path->extent_start;
+ }
+ BUG_ON(bytenr == 0);
+
+ parent = ref_path->nodes[level + 1];
+ ref_path->nodes[level + 1] = 0;
+ ref_path->current_level = level;
+ BUG_ON(parent == 0);
+
+ key.objectid = bytenr;
+ key.offset = parent + 1;
+ key.type = BTRFS_EXTENT_REF_KEY;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ goto next;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid == bytenr &&
+ found_key.type == BTRFS_EXTENT_REF_KEY)
+ goto found;
+next:
+ level--;
+ btrfs_release_path(extent_root, path);
+ if (need_resched()) {
+ mutex_unlock(&extent_root->fs_info->alloc_mutex);
+ cond_resched();
+ mutex_lock(&extent_root->fs_info->alloc_mutex);
+ }
+ }
+ /* reached lowest level */
+ ret = 1;
+ goto out;
+walk_up:
+ level = ref_path->current_level;
+ while (level < BTRFS_MAX_LEVEL - 1) {
+ u64 ref_objectid;
+ if (level >= 0) {
+ bytenr = ref_path->nodes[level];
+ } else {
+ bytenr = ref_path->extent_start;
+ }
+ BUG_ON(bytenr == 0);
+
+ key.objectid = bytenr;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_REF_KEY;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ /* the extent was freed by someone */
+ if (ref_path->lowest_level == level)
+ goto out;
+ btrfs_release_path(extent_root, path);
+ goto walk_down;
+ }
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != bytenr ||
+ found_key.type != BTRFS_EXTENT_REF_KEY) {
+ /* the extent was freed by someone */
+ if (ref_path->lowest_level == level) {
+ ret = 1;
+ goto out;
+ }
+ btrfs_release_path(extent_root, path);
+ goto walk_down;
+ }
+found:
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref);
+ ref_objectid = btrfs_ref_objectid(leaf, ref);
+ if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ if (first_time) {
+ level = (int)ref_objectid;
+ BUG_ON(level >= BTRFS_MAX_LEVEL);
+ ref_path->lowest_level = level;
+ ref_path->current_level = level;
+ ref_path->nodes[level] = bytenr;
+ } else {
+ WARN_ON(ref_objectid != level);
+ }
+ } else {
+ WARN_ON(level != -1);
+ }
+ first_time = 0;
+
+ if (ref_path->lowest_level == level) {
+ ref_path->owner_objectid = ref_objectid;
+ ref_path->owner_offset = btrfs_ref_offset(leaf, ref);
+ ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
+ }
+
+ /*
+ * the block is tree root or the block isn't in reference
+ * counted tree.
+ */
+ if (found_key.objectid == found_key.offset ||
+ is_cowonly_root(btrfs_ref_root(leaf, ref))) {
+ ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+ ref_path->root_generation =
+ btrfs_ref_generation(leaf, ref);
+ if (level < 0) {
+ /* special reference from the tree log */
+ ref_path->nodes[0] = found_key.offset;
+ ref_path->current_level = 0;
+ }
ret = 0;
goto out;
}
- ret = btrfs_search_root(root->fs_info->tree_root,
- root_search_start, &root_search_start);
- if (ret)
- break;
- root_location.objectid = root_search_start;
- cur_root = btrfs_read_fs_root_no_name(root->fs_info,
- &root_location);
- if (!cur_root) {
- ret = 1;
- break;
+ level++;
+ BUG_ON(ref_path->nodes[level] != 0);
+ ref_path->nodes[level] = found_key.offset;
+ ref_path->current_level = level;
+
+ /*
+ * the reference was created in the running transaction,
+ * no need to continue walking up.
+ */
+ if (btrfs_ref_generation(leaf, ref) == trans->transid) {
+ ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+ ref_path->root_generation =
+ btrfs_ref_generation(leaf, ref);
+ ret = 0;
+ goto out;
+ }
+
+ btrfs_release_path(extent_root, path);
+ if (need_resched()) {
+ mutex_unlock(&extent_root->fs_info->alloc_mutex);
+ cond_resched();
+ mutex_lock(&extent_root->fs_info->alloc_mutex);
}
}
+ /* reached max tree level, but no tree root found. */
+ BUG();
out:
- path->lowest_level = 0;
+ mutex_unlock(&extent_root->fs_info->alloc_mutex);
+ btrfs_free_path(path);
return ret;
}
-/*
- * note, this releases the path
- */
-static int noinline relocate_one_reference(struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct btrfs_key *extent_key,
- u64 *last_file_objectid,
- u64 *last_file_offset,
- u64 *last_file_root,
- u64 last_extent)
-{
- struct inode *inode;
- struct btrfs_root *found_root;
- struct btrfs_key root_location;
+static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_ref_path *ref_path,
+ u64 extent_start)
+{
+ memset(ref_path, 0, sizeof(*ref_path));
+ ref_path->extent_start = extent_start;
+
+ return __next_ref_path(trans, extent_root, ref_path, 1);
+}
+
+static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_ref_path *ref_path)
+{
+ return __next_ref_path(trans, extent_root, ref_path, 0);
+}
+
+static int noinline get_new_locations(struct inode *reloc_inode,
+ struct btrfs_key *extent_key,
+ u64 offset, int no_fragment,
+ struct disk_extent **extents,
+ int *nr_extents)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct disk_extent *exts = *extents;
struct btrfs_key found_key;
- struct btrfs_extent_ref *ref;
- u64 ref_root;
- u64 ref_gen;
- u64 ref_objectid;
- u64 ref_offset;
+ u64 cur_pos;
+ u64 last_byte;
+ u32 nritems;
+ int nr = 0;
+ int max = *nr_extents;
int ret;
- int level;
- WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
+ WARN_ON(!no_fragment && *extents);
+ if (!exts) {
+ max = 1;
+ exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
+ if (!exts)
+ return -ENOMEM;
+ }
- ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_extent_ref);
- ref_root = btrfs_ref_root(path->nodes[0], ref);
- ref_gen = btrfs_ref_generation(path->nodes[0], ref);
- ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
- ref_offset = btrfs_ref_offset(path->nodes[0], ref);
- btrfs_release_path(extent_root, path);
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
- root_location.objectid = ref_root;
- if (ref_gen == 0)
- root_location.offset = 0;
- else
- root_location.offset = (u64)-1;
- root_location.type = BTRFS_ROOT_ITEM_KEY;
+ cur_pos = extent_key->objectid - offset;
+ last_byte = extent_key->objectid + extent_key->offset;
+ ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+ cur_pos, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
- found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
- &root_location);
- BUG_ON(!found_root);
- mutex_unlock(&extent_root->fs_info->alloc_mutex);
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
- if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- found_key.objectid = ref_objectid;
- found_key.type = BTRFS_EXTENT_DATA_KEY;
- found_key.offset = ref_offset;
- level = 0;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.offset != cur_pos ||
+ found_key.type != BTRFS_EXTENT_DATA_KEY ||
+ found_key.objectid != reloc_inode->i_ino)
+ break;
- if (last_extent == extent_key->objectid &&
- *last_file_objectid == ref_objectid &&
- *last_file_offset == ref_offset &&
- *last_file_root == ref_root)
- goto out;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) !=
+ BTRFS_FILE_EXTENT_REG ||
+ btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
+ break;
- ret = find_root_for_ref(extent_root, path, &found_key,
- level, 1, &found_root,
- extent_key->objectid);
+ if (nr == max) {
+ struct disk_extent *old = exts;
+ max *= 2;
+ exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
+ memcpy(exts, old, sizeof(*exts) * nr);
+ if (old != *extents)
+ kfree(old);
+ }
- if (ret)
+ exts[nr].disk_bytenr =
+ btrfs_file_extent_disk_bytenr(leaf, fi);
+ exts[nr].disk_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
+ exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
+ exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+ WARN_ON(exts[nr].offset > 0);
+ WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
+
+ cur_pos += exts[nr].num_bytes;
+ nr++;
+
+ if (cur_pos + offset >= last_byte)
+ break;
+
+ if (no_fragment) {
+ ret = 1;
goto out;
+ }
+ path->slots[0]++;
+ }
- if (last_extent == extent_key->objectid &&
- *last_file_objectid == ref_objectid &&
- *last_file_offset == ref_offset &&
- *last_file_root == ref_root)
+ WARN_ON(cur_pos + offset > last_byte);
+ if (cur_pos + offset < last_byte) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret) {
+ if (exts != *extents)
+ kfree(exts);
+ } else {
+ *extents = exts;
+ *nr_extents = nr;
+ }
+ return ret;
+}
+
+static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *extent_key,
+ struct btrfs_key *leaf_key,
+ struct btrfs_ref_path *ref_path,
+ struct disk_extent *new_extents,
+ int nr_extents)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct inode *inode = NULL;
+ struct btrfs_key key;
+ u64 lock_start = 0;
+ u64 lock_end = 0;
+ u64 num_bytes;
+ u64 ext_offset;
+ u64 first_pos;
+ u32 nritems;
+ int extent_locked = 0;
+ int ret;
+
+ first_pos = ref_path->owner_offset;
+ if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+ key.objectid = ref_path->owner_objectid;
+ key.offset = ref_path->owner_offset;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ } else {
+ memcpy(&key, leaf_key, sizeof(key));
+ }
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
goto out;
- inode = btrfs_iget_locked(extent_root->fs_info->sb,
- ref_objectid, found_root);
- if (inode->i_state & I_NEW) {
- /* the inode and parent dir are two different roots */
- BTRFS_I(inode)->root = found_root;
- BTRFS_I(inode)->location.objectid = ref_objectid;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
- btrfs_read_locked_inode(inode);
- unlock_new_inode(inode);
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+next:
+ if (extent_locked && ret > 0) {
+ /*
+ * the file extent item was modified by someone
+ * before the extent got locked.
+ */
+ mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+ lock_end, GFP_NOFS);