aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/relocation.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/relocation.c')
-rw-r--r--fs/btrfs/relocation.c3711
1 files changed, 3711 insertions, 0 deletions
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 00000000000..b23dc209ae1
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,3711 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "btrfs_inode.h"
+#include "async-thread.h"
+
+/*
+ * backref_node, mapping_node and tree_block start with this
+ */
+struct tree_entry {
+ struct rb_node rb_node;
+ u64 bytenr;
+};
+
+/*
+ * present a tree block in the backref cache
+ */
+struct backref_node {
+ struct rb_node rb_node;
+ u64 bytenr;
+ /* objectid tree block owner */
+ u64 owner;
+ /* list of upper level blocks reference this block */
+ struct list_head upper;
+ /* list of child blocks in the cache */
+ struct list_head lower;
+ /* NULL if this node is not tree root */
+ struct btrfs_root *root;
+ /* extent buffer got by COW the block */
+ struct extent_buffer *eb;
+ /* level of tree block */
+ unsigned int level:8;
+ /* 1 if the block is root of old snapshot */
+ unsigned int old_root:1;
+ /* 1 if no child blocks in the cache */
+ unsigned int lowest:1;
+ /* is the extent buffer locked */
+ unsigned int locked:1;
+ /* has the block been processed */
+ unsigned int processed:1;
+ /* have backrefs of this block been checked */
+ unsigned int checked:1;
+};
+
+/*
+ * present a block pointer in the backref cache
+ */
+struct backref_edge {
+ struct list_head list[2];
+ struct backref_node *node[2];
+ u64 blockptr;
+};
+
+#define LOWER 0
+#define UPPER 1
+
+struct backref_cache {
+ /* red black tree of all backref nodes in the cache */
+ struct rb_root rb_root;
+ /* list of backref nodes with no child block in the cache */
+ struct list_head pending[BTRFS_MAX_LEVEL];
+ spinlock_t lock;
+};
+
+/*
+ * map address of tree root to tree
+ */
+struct mapping_node {
+ struct rb_node rb_node;
+ u64 bytenr;
+ void *data;
+};
+
+struct mapping_tree {
+ struct rb_root rb_root;
+ spinlock_t lock;
+};
+
+/*
+ * present a tree block to process
+ */
+struct tree_block {
+ struct rb_node rb_node;
+ u64 bytenr;
+ struct btrfs_key key;
+ unsigned int level:8;
+ unsigned int key_ready:1;
+};
+
+/* inode vector */
+#define INODEVEC_SIZE 16
+
+struct inodevec {
+ struct list_head list;
+ struct inode *inode[INODEVEC_SIZE];
+ int nr;
+};
+
+struct reloc_control {
+ /* block group to relocate */
+ struct btrfs_block_group_cache *block_group;
+ /* extent tree */
+ struct btrfs_root *extent_root;
+ /* inode for moving data */
+ struct inode *data_inode;
+ struct btrfs_workers workers;
+ /* tree blocks have been processed */
+ struct extent_io_tree processed_blocks;
+ /* map start of tree root to corresponding reloc tree */
+ struct mapping_tree reloc_root_tree;
+ /* list of reloc trees */
+ struct list_head reloc_roots;
+ u64 search_start;
+ u64 extents_found;
+ u64 extents_skipped;
+ int stage;
+ int create_reloc_root;
+ unsigned int found_file_extent:1;
+ unsigned int found_old_snapshot:1;
+};
+
+/* stages of data relocation */
+#define MOVE_DATA_EXTENTS 0
+#define UPDATE_DATA_PTRS 1
+
+/*
+ * merge reloc tree to corresponding fs tree in worker threads
+ */
+struct async_merge {
+ struct btrfs_work work;
+ struct reloc_control *rc;
+ struct btrfs_root *root;
+ struct completion *done;
+ atomic_t *num_pending;
+};
+
+static void mapping_tree_init(struct mapping_tree *tree)
+{
+ tree->rb_root.rb_node = NULL;
+ spin_lock_init(&tree->lock);
+}
+
+static void backref_cache_init(struct backref_cache *cache)
+{
+ int i;
+ cache->rb_root.rb_node = NULL;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&cache->pending[i]);
+ spin_lock_init(&cache->lock);
+}
+
+static void backref_node_init(struct backref_node *node)
+{
+ memset(node, 0, sizeof(*node));
+ INIT_LIST_HEAD(&node->upper);
+ INIT_LIST_HEAD(&node->lower);
+ RB_CLEAR_NODE(&node->rb_node);
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct tree_entry *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *n = root->rb_node;
+ struct tree_entry *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct tree_entry, rb_node);
+
+ if (bytenr < entry->bytenr)
+ n = n->rb_left;
+ else if (bytenr > entry->bytenr)
+ n = n->rb_right;
+ else
+ return n;
+ }
+ return NULL;
+}
+
+/*
+ * walk up backref nodes until reach node presents tree root
+ */
+static struct backref_node *walk_up_backref(struct backref_node *node,
+ struct backref_edge *edges[],
+ int *index)
+{
+ struct backref_edge *edge;
+ int idx = *index;
+
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next,
+ struct backref_edge, list[LOWER]);
+ edges[idx++] = edge;
+ node = edge->node[UPPER];
+ }
+ *index = idx;
+ return node;
+}
+
+/*
+ * walk down backref nodes to find start of next reference path
+ */
+static struct backref_node *walk_down_backref(struct backref_edge *edges[],
+ int *index)
+{
+ struct backref_edge *edge;
+ struct backref_node *lower;
+ int idx = *index;
+
+ while (idx > 0) {
+ edge = edges[idx - 1];
+ lower = edge->node[LOWER];
+ if (list_is_last(&edge->list[LOWER], &lower->upper)) {
+ idx--;
+ continue;
+ }
+ edge = list_entry(edge->list[LOWER].next,
+ struct backref_edge, list[LOWER]);
+ edges[idx - 1] = edge;
+ *index = idx;
+ return edge->node[UPPER];
+ }
+ *index = 0;
+ return NULL;
+}
+
+static void drop_node_buffer(struct backref_node *node)
+{
+ if (node->eb) {
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+ free_extent_buffer(node->eb);
+ node->eb = NULL;
+ }
+}
+
+static void drop_backref_node(struct backref_cache *tree,
+ struct backref_node *node)
+{
+ BUG_ON(!node->lowest);
+ BUG_ON(!list_empty(&node->upper));
+
+ drop_node_buffer(node);
+ list_del(&node->lower);
+
+ rb_erase(&node->rb_node, &tree->rb_root);
+ kfree(node);
+}
+
+/*
+ * remove a backref node from the backref cache
+ */
+static void remove_backref_node(struct backref_cache *cache,
+ struct backref_node *node)
+{
+ struct backref_node *upper;
+ struct backref_edge *edge;
+
+ if (!node)
+ return;
+
+ BUG_ON(!node->lowest);
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next, struct backref_edge,
+ list[LOWER]);
+ upper = edge->node[UPPER];
+ list_del(&edge->list[LOWER]);
+ list_del(&edge->list[UPPER]);
+ kfree(edge);
+ /*
+ * add the node to pending list if no other
+ * child block cached.
+ */
+ if (list_empty(&upper->lower)) {
+ list_add_tail(&upper->lower,
+ &cache->pending[upper->level]);
+ upper->lowest = 1;
+ }
+ }
+ drop_backref_node(cache, node);
+}
+
+/*
+ * find reloc tree by address of tree root
+ */
+static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
+ u64 bytenr)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node;
+ struct btrfs_root *root = NULL;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ root = (struct btrfs_root *)node->data;
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+ return root;
+}
+
+static int is_cowonly_root(u64 root_objectid)
+{
+ if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+ root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+ root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+ root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+ root_objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_objectid)
+{
+ struct btrfs_key key;
+
+ key.objectid = root_objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ if (is_cowonly_root(root_objectid))
+ key.offset = 0;
+ else
+ key.offset = (u64)-1;
+
+ return btrfs_read_fs_root_no_name(fs_info, &key);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static noinline_for_stack
+struct btrfs_root *find_tree_root(struct reloc_control *rc,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_ref_v0 *ref0)
+{
+ struct btrfs_root *root;
+ u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
+ u64 generation = btrfs_ref_generation_v0(leaf, ref0);
+
+ BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
+
+ root = read_fs_root(rc->extent_root->fs_info, root_objectid);
+ BUG_ON(IS_ERR(root));
+
+ if (root->ref_cows &&
+ generation != btrfs_root_generation(&root->root_item))
+ return NULL;
+
+ return root;
+}
+#endif
+
+static noinline_for_stack
+int find_inline_backref(struct extent_buffer *leaf, int slot,
+ unsigned long *ptr, unsigned long *end)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_tree_block_info *bi;
+ u32 item_size;
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+ return 1;
+ }
+#endif
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ WARN_ON(!(btrfs_extent_flags(leaf, ei) &
+ BTRFS_EXTENT_FLAG_TREE_BLOCK));
+
+ if (item_size <= sizeof(*ei) + sizeof(*bi)) {
+ WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ return 1;
+ }
+
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ *ptr = (unsigned long)(bi + 1);
+ *end = (unsigned long)ei + item_size;
+ return 0;
+}
+
+/*
+ * build backref tree for a given tree block. root of the backref tree
+ * corresponds the tree block, leaves of the backref tree correspond
+ * roots of b-trees that reference the tree block.
+ *
+ * the basic idea of this function is check backrefs of a given block
+ * to find upper level blocks that refernece the block, and then check
+ * bakcrefs of these upper level blocks recursively. the recursion stop
+ * when tree root is reached or backrefs for the block is cached.
+ *
+ * NOTE: if we find backrefs for a block are cached, we know backrefs
+ * for all upper level blocks that directly/indirectly reference the
+ * block are also cached.
+ */
+static struct backref_node *build_backref_tree(struct reloc_control *rc,
+ struct backref_cache *cache,
+ struct btrfs_key *node_key,
+ int level, u64 bytenr)
+{
+ struct btrfs_path *path1;
+ struct btrfs_path *path2;
+ struct extent_buffer *eb;
+ struct btrfs_root *root;
+ struct backref_node *cur;
+ struct backref_node *upper;
+ struct backref_node *lower;
+ struct backref_node *node = NULL;
+ struct backref_node *exist = NULL;
+ struct backref_edge *edge;
+ struct rb_node *rb_node;
+ struct btrfs_key key;
+ unsigned long end;
+ unsigned long ptr;
+ LIST_HEAD(list);
+ int ret;
+ int err = 0;
+
+ path1 = btrfs_alloc_path();
+ path2 = btrfs_alloc_path();
+ if (!path1 || !path2) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ node = kmalloc(sizeof(*node), GFP_NOFS);
+ if (!node) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ backref_node_init(node);
+ node->bytenr = bytenr;
+ node->owner = 0;
+ node->level = level;
+ node->lowest = 1;
+ cur = node;
+again:
+ end = 0;
+ ptr = 0;
+ key.objectid = cur->bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ path1->search_commit_root = 1;
+ path1->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
+ 0, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ BUG_ON(!ret || !path1->slots[0]);
+
+ path1->slots[0]--;
+
+ WARN_ON(cur->checked);
+ if (!list_empty(&cur->upper)) {
+ /*
+ * the backref was added previously when processsing
+ * backref of type BTRFS_TREE_BLOCK_REF_KEY
+ */
+ BUG_ON(!list_is_singular(&cur->upper));
+ edge = list_entry(cur->upper.next, struct backref_edge,
+ list[LOWER]);
+ BUG_ON(!list_empty(&edge->list[UPPER]));
+ exist = edge->node[UPPER];
+ /*
+ * add the upper level block to pending list if we need
+ * check its backrefs
+ */
+ if (!exist->checked)
+ list_add_tail(&edge->list[UPPER], &list);
+ } else {
+ exist = NULL;
+ }
+
+ while (1) {
+ cond_resched();
+ eb = path1->nodes[0];
+
+ if (ptr >= end) {
+ if (path1->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(rc->extent_root, path1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0)
+ break;
+ eb = path1->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
+ if (key.objectid != cur->bytenr) {
+ WARN_ON(exist);
+ break;
+ }
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ ret = find_inline_backref(eb, path1->slots[0],
+ &ptr, &end);
+ if (ret)
+ goto next;
+ }
+ }
+
+ if (ptr < end) {
+ /* update key for inline back ref */
+ struct btrfs_extent_inline_ref *iref;
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ key.type = btrfs_extent_inline_ref_type(eb, iref);
+ key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+ WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
+ key.type != BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+
+ if (exist &&
+ ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
+ exist->owner == key.offset) ||
+ (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
+ exist->bytenr == key.offset))) {
+ exist = NULL;
+ goto next;
+ }
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
+ key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ if (key.objectid == key.offset &&
+ key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ struct btrfs_extent_ref_v0 *ref0;
+ ref0 = btrfs_item_ptr(eb, path1->slots[0],
+ struct btrfs_extent_ref_v0);
+ root = find_tree_root(rc, eb, ref0);
+ if (root)
+ cur->root = root;
+ else
+ cur->old_root = 1;
+ break;
+ }
+#else
+ BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
+#endif
+ if (key.objectid == key.offset) {
+ /*
+ * only root blocks of reloc trees use
+ * backref of this type.
+ */
+ root = find_reloc_root(rc, cur->bytenr);
+ BUG_ON(!root);
+ cur->root = root;
+ break;
+ }
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (!edge) {
+ err = -ENOMEM;
+ goto out;
+ }
+ rb_node = tree_search(&cache->rb_root, key.offset);
+ if (!rb_node) {
+ upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ if (!upper) {
+ kfree(edge);
+ err = -ENOMEM;
+ goto out;
+ }
+ backref_node_init(upper);
+ upper->bytenr = key.offset;
+ upper->owner = 0;
+ upper->level = cur->level + 1;
+ /*
+ * backrefs for the upper level block isn't
+ * cached, add the block to pending list
+ */
+ list_add_tail(&edge->list[UPPER], &list);
+ } else {
+ upper = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ list_add(&edge->list[LOWER], &cur->upper);
+ edge->node[UPPER] = upper;
+ edge->node[LOWER] = cur;
+
+ goto next;
+ } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
+ goto next;
+ }
+
+ /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+ root = read_fs_root(rc->extent_root->fs_info, key.offset);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
+
+ if (btrfs_root_level(&root->root_item) == cur->level) {
+ /* tree root */
+ BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ cur->bytenr);
+ cur->root = root;
+ break;
+ }
+
+ level = cur->level + 1;
+
+ /*
+ * searching the tree to find upper level blocks
+ * reference the block.
+ */
+ path2->search_commit_root = 1;
+ path2->skip_locking = 1;
+ path2->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
+ path2->lowest_level = 0;
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ eb = path2->nodes[level];
+ WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
+ cur->bytenr);
+
+ lower = cur;
+ for (; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path2->nodes[level]) {
+ BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ lower->bytenr);
+ lower->root = root;
+ break;
+ }
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (!edge) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ eb = path2->nodes[level];
+ rb_node = tree_search(&cache->rb_root, eb->start);
+ if (!rb_node) {
+ upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ if (!upper) {
+ kfree(edge);
+ err = -ENOMEM;
+ goto out;
+ }
+ backref_node_init(upper);
+ upper->bytenr = eb->start;
+ upper->owner = btrfs_header_owner(eb);
+ upper->level = lower->level + 1;
+
+ /*
+ * if we know the block isn't shared
+ * we can void checking its backrefs.
+ */
+ if (btrfs_block_can_be_shared(root, eb))
+ upper->checked = 0;
+ else
+ upper->checked = 1;
+
+ /*
+ * add the block to pending list if we
+ * need check its backrefs. only block
+ * at 'cur->level + 1' is added to the
+ * tail of pending list. this guarantees
+ * we check backrefs from lower level
+ * blocks to upper level blocks.
+ */
+ if (!upper->checked &&
+ level == cur->level + 1) {
+ list_add_tail(&edge->list[UPPER],
+ &list);
+ } else
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ } else {
+ upper = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ BUG_ON(!upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ list_add_tail(&edge->list[LOWER], &lower->upper);
+ edge->node[UPPER] = upper;
+ edge->node[LOWER] = lower;
+
+ if (rb_node)
+ break;
+ lower = upper;
+ upper = NULL;
+ }
+ btrfs_release_path(root, path2);
+next:
+ if (ptr < end) {
+ ptr += btrfs_extent_inline_ref_size(key.type);
+ if (ptr >= end) {
+ WARN_ON(ptr > end);
+ ptr = 0;
+ end = 0;
+ }
+ }
+ if (ptr >= end)
+ path1->slots[0]++;
+ }
+ btrfs_release_path(rc->extent_root, path1);
+
+ cur->checked = 1;
+ WARN_ON(exist);
+
+ /* the pending list isn't empty, take the first block to process */
+ if (!list_empty(&list)) {
+ edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+ list_del_init(&edge->list[UPPER]);
+ cur = edge->node[UPPER];
+ goto again;
+ }
+
+ /*
+ * everything goes well, connect backref nodes and insert backref nodes
+ * into the cache.
+ */
+ BUG_ON(!node->checked);
+ rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+ BUG_ON(rb_node);
+
+ list_for_each_entry(edge, &node->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+
+ while (!list_empty(&list)) {
+ edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+ list_del_init(&edge->list[UPPER]);
+ upper = edge->node[UPPER];
+
+ if (!RB_EMPTY_NODE(&upper->rb_node)) {
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+ continue;
+ }
+
+ BUG_ON(!upper->checked);
+ rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ BUG_ON(rb_node);
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+ }
+out:
+ btrfs_free_path(path1);
+ btrfs_free_path(path2);
+ if (err) {
+ INIT_LIST_HEAD(&list);
+ upper = node;
+ while (upper) {
+ if (RB_EMPTY_NODE(&upper->rb_node)) {
+ list_splice_tail(&upper->upper, &list);
+ kfree(upper);
+ }
+
+ if (list_empty(&list))
+ break;
+
+ edge = list_entry(list.next, struct backref_edge,
+ list[LOWER]);
+ upper = edge->node[UPPER];
+ kfree(edge);
+ }
+ return ERR_PTR(err);
+ }
+ return node;
+}
+
+/*
+ * helper to add 'address of tree root -> reloc tree' mapping
+ */
+static int __add_reloc_root(struct btrfs_root *root)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ node = kmalloc(sizeof(*node), GFP_NOFS);
+ BUG_ON(!node);
+
+ node->bytenr = root->node->start;
+ node->data = root;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ BUG_ON(rb_node);
+
+ list_add_tail(&root->root_list, &rc->reloc_roots);
+ return 0;
+}
+
+/*
+ * helper to update/delete the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root, int del)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node = NULL;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+ root->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+
+ BUG_ON((struct btrfs_root *)node->data != root);
+
+ if (!del) {
+ spin_lock(&rc->reloc_root_tree.lock);
+ node->bytenr = root->node->start;
+ rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ BUG_ON(rb_node);
+ } else {
+ list_del_init(&root->root_list);
+ kfree(node);
+ }
+ return 0;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct extent_buffer *eb;
+ struct btrfs_root_item *root_item;
+ struct btrfs_key root_key;
+ int ret;
+
+ if (root->reloc_root) {
+ reloc_root = root->reloc_root;
+ reloc_root->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (!root->fs_info->reloc_ctl ||
+ !root->fs_info->reloc_ctl->create_reloc_root ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+ BUG_ON(!root_item);
+
+ root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = root->root_key.objectid;
+
+ ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+
+ btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
+ memcpy(root_item, &root->root_item, sizeof(*root_item));
+ btrfs_set_root_refs(root_item, 1);
+ btrfs_set_root_bytenr(root_item, eb->start);
+ btrfs_set_root_level(root_item, btrfs_header_level(eb));
+ btrfs_set_root_generation(root_item, trans->transid);
+ memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
+ root_item->drop_level = 0;
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+ &root_key, root_item);
+ BUG_ON(ret);
+ kfree(root_item);
+
+ reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+ &root_key);
+ BUG_ON(IS_ERR(reloc_root));
+ reloc_root->last_trans = trans->transid;
+
+ __add_reloc_root(reloc_root);
+ root->reloc_root = reloc_root;
+ return 0;
+}
+
+/*
+ * update root item of reloc tree
+ */
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct btrfs_root_item *root_item;
+ int del = 0;
+ int ret;
+
+ if (!root->reloc_root)
+ return 0;
+
+ reloc_root = root->reloc_root;
+ root_item = &reloc_root->root_item;
+
+ if (btrfs_root_refs(root_item) == 0) {
+ root->reloc_root = NULL;
+ del = 1;
+ }
+
+ __update_reloc_root(reloc_root, del);
+
+ if (reloc_root->commit_root != reloc_root->node) {
+ btrfs_set_root_node(root_item, reloc_root->node);
+ free_extent_buffer(reloc_root->commit_root);
+ reloc_root->commit_root = btrfs_root_node(reloc_root);
+ }
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &reloc_root->root_key, root_item);
+ BUG_ON(ret);
+ return 0;
+}
+
+/*
+ * helper to find first cached inode with inode number >= objectid
+ * in a subvolume
+ */
+static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < entry->vfs_inode.i_ino)
+ node = node->rb_left;
+ else if (objectid > entry->vfs_inode.i_ino)
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= entry->vfs_inode.i_ino) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ return inode;
+ }
+
+ objectid = entry->vfs_inode.i_ino + 1;
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+ return NULL;
+}
+
+static int in_block_group(u64 bytenr,
+ struct btrfs_block_group_cache *block_group)
+{
+ if (bytenr >= block_group->key.objectid &&
+ bytenr < block_group->key.objectid + block_group->key.offset)
+ return 1;
+ return 0;
+}
+
+/*
+ * get new location of data
+ */
+static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
+ u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+ ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+ bytenr, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
+ btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi));
+
+ if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (new_bytenr)
+ *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * update file extent items in the tree leaf to point to
+ * the new locations.
+ */
+static int replace_file_extents(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct list_head *inode_list)
+{
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ struct inode *inode = NULL;
+ struct inodevec *ivec = NULL;
+ u64 parent;
+ u64 bytenr;
+ u64 new_bytenr;
+ u64 num_bytes;
+ u64 end;
+ u32 nritems;
+ u32 i;
+ int ret;
+ int first = 1;
+ int dirty = 0;
+
+ if (rc->stage != UPDATE_DATA_PTRS)
+ return 0;
+
+ /* reloc trees always use full backref */
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ parent = leaf->start;
+ else
+ parent = 0;
+
+ nritems = btrfs_header_nritems(leaf);
+ for (i = 0; i < nritems; i++) {
+ cond_resched();
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ if (bytenr == 0)
+ continue;
+ if (!in_block_group(bytenr, rc->block_group))
+ continue;
+
+ /*
+ * if we are modifying block in fs tree, wait for readpage
+ * to complete and drop the extent cache
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (!ivec || ivec->nr == INODEVEC_SIZE) {
+ ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
+ BUG_ON(!ivec);
+ ivec->nr = 0;
+ list_add_tail(&ivec->list, inode_list);
+ }
+ if (first) {
+ inode = find_next_inode(root, key.objectid);
+ if (inode)
+ ivec->inode[ivec->nr++] = inode;
+ first = 0;
+ } else if (inode && inode->i_ino < key.objectid) {
+ inode = find_next_inode(root, key.objectid);
+ if (inode)
+ ivec->inode[ivec->nr++] = inode;
+ }
+ if (inode && inode->i_ino == key.objectid) {
+ end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ WARN_ON(!IS_ALIGNED(key.offset,
+ root->sectorsize));
+ WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+ end--;
+ ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+ key.offset, end,
+ GFP_NOFS);
+ if (!ret)
+ continue;
+
+ btrfs_drop_extent_cache(inode, key.offset, end,
+ 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ key.offset, end, GFP_NOFS);
+ }
+ }
+
+ ret = get_new_location(rc->data_inode, &new_bytenr,
+ bytenr, num_bytes);
+ if (ret > 0)
+ continue;
+ BUG_ON(ret < 0);
+
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
+ dirty = 1;
+
+ key.offset -= btrfs_file_extent_offset(leaf, fi);
+ ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
+ num_bytes, parent,
+ btrfs_header_owner(leaf),
+ key.objectid, key.offset);
+ BUG_ON(ret);
+
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ parent, btrfs_header_owner(leaf),
+ key.objectid, key.offset);
+ BUG_ON(ret);
+ }
+ if (dirty)
+ btrfs_mark_buffer_dirty(leaf);
+ return 0;
+}
+
+static noinline_for_stack
+int memcmp_node_keys(struct extent_buffer *eb, int slot,
+ struct btrfs_path *path, int level)
+{
+ struct btrfs_disk_key key1;
+ struct btrfs_disk_key key2;
+ btrfs_node_key(eb, &key1, slot);
+ btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
+ return memcmp(&key1, &key2, sizeof(key1));
+}
+
+/*
+ * try to replace tree blocks in fs tree with the new blocks
+ * in reloc tree. tree blocks haven't been modified since the
+ * reloc tree was create can be replaced.
+ *
+ * if a block was replaced, level of the block + 1 is returned.
+ * if no block got replaced, 0 is retur