aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c2343
1 files changed, 2343 insertions, 0 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 00000000000..81a313874ae
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2343 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/radix-tree.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include "compat.h"
+#include "crc32c.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "async-thread.h"
+#include "locking.h"
+#include "ref-cache.h"
+#include "tree-log.h"
+
+static struct extent_io_ops btree_extent_io_ops;
+static void end_workqueue_fn(struct btrfs_work *work);
+
+/*
+ * end_io_wq structs are used to do processing in task context when an IO is
+ * complete. This is used during reads to verify checksums, and it is used
+ * by writes to insert metadata for new file extents after IO is complete.
+ */
+struct end_io_wq {
+ struct bio *bio;
+ bio_end_io_t *end_io;
+ void *private;
+ struct btrfs_fs_info *info;
+ int error;
+ int metadata;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+/*
+ * async submit bios are used to offload expensive checksumming
+ * onto the worker threads. They checksum file and metadata bios
+ * just before they are sent down the IO stack.
+ */
+struct async_submit_bio {
+ struct inode *inode;
+ struct bio *bio;
+ struct list_head list;
+ extent_submit_bio_hook_t *submit_bio_start;
+ extent_submit_bio_hook_t *submit_bio_done;
+ int rw;
+ int mirror_num;
+ unsigned long bio_flags;
+ struct btrfs_work work;
+};
+
+/*
+ * extents on the btree inode are pretty simple, there's one extent
+ * that covers the entire device
+ */
+static struct extent_map *btree_get_extent(struct inode *inode,
+ struct page *page, size_t page_offset, u64 start, u64 len,
+ int create)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret;
+
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em) {
+ em->bdev =
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ spin_unlock(&em_tree->lock);
+ goto out;
+ }
+ spin_unlock(&em_tree->lock);
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ em = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ em->start = 0;
+ em->len = (u64)-1;
+ em->block_len = (u64)-1;
+ em->block_start = 0;
+ em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ spin_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ if (ret == -EEXIST) {
+ u64 failed_start = em->start;
+ u64 failed_len = em->len;
+
+ free_extent_map(em);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em) {
+ ret = 0;
+ } else {
+ em = lookup_extent_mapping(em_tree, failed_start,
+ failed_len);
+ ret = -EIO;
+ }
+ } else if (ret) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ spin_unlock(&em_tree->lock);
+
+ if (ret)
+ em = ERR_PTR(ret);
+out:
+ return em;
+}
+
+u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+{
+ return btrfs_crc32c(seed, data, len);
+}
+
+void btrfs_csum_final(u32 crc, char *result)
+{
+ *(__le32 *)result = ~cpu_to_le32(crc);
+}
+
+/*
+ * compute the csum for a btree block, and either verify it or write it
+ * into the csum field of the block.
+ */
+static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+ int verify)
+{
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ char *result = NULL;
+ unsigned long len;
+ unsigned long cur_len;
+ unsigned long offset = BTRFS_CSUM_SIZE;
+ char *map_token = NULL;
+ char *kaddr;
+ unsigned long map_start;
+ unsigned long map_len;
+ int err;
+ u32 crc = ~(u32)0;
+ unsigned long inline_result;
+
+ len = buf->len - offset;
+ while (len > 0) {
+ err = map_private_extent_buffer(buf, offset, 32,
+ &map_token, &kaddr,
+ &map_start, &map_len, KM_USER0);
+ if (err)
+ return 1;
+ cur_len = min(len, map_len - (offset - map_start));
+ crc = btrfs_csum_data(root, kaddr + offset - map_start,
+ crc, cur_len);
+ len -= cur_len;
+ offset += cur_len;
+ unmap_extent_buffer(buf, map_token, KM_USER0);
+ }
+ if (csum_size > sizeof(inline_result)) {
+ result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+ if (!result)
+ return 1;
+ } else {
+ result = (char *)&inline_result;
+ }
+
+ btrfs_csum_final(crc, result);
+
+ if (verify) {
+ if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
+ u32 val;
+ u32 found = 0;
+ memcpy(&found, result, csum_size);
+
+ read_extent_buffer(buf, &val, 0, csum_size);
+ printk(KERN_INFO "btrfs: %s checksum verify failed "
+ "on %llu wanted %X found %X level %d\n",
+ root->fs_info->sb->s_id,
+ buf->start, val, found, btrfs_header_level(buf));
+ if (result != (char *)&inline_result)
+ kfree(result);
+ return 1;
+ }
+ } else {
+ write_extent_buffer(buf, result, 0, csum_size);
+ }
+ if (result != (char *)&inline_result)
+ kfree(result);
+ return 0;
+}
+
+/*
+ * we can't consider a given block up to date unless the transid of the
+ * block matches the transid in the parent node's pointer. This is how we
+ * detect blocks that either didn't get written at all or got written
+ * in the wrong place.
+ */
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+ struct extent_buffer *eb, u64 parent_transid)
+{
+ int ret;
+
+ if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
+ return 0;
+
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
+ if (extent_buffer_uptodate(io_tree, eb) &&
+ btrfs_header_generation(eb) == parent_transid) {
+ ret = 0;
+ goto out;
+ }
+ printk("parent transid verify failed on %llu wanted %llu found %llu\n",
+ (unsigned long long)eb->start,
+ (unsigned long long)parent_transid,
+ (unsigned long long)btrfs_header_generation(eb));
+ ret = 1;
+ clear_extent_buffer_uptodate(io_tree, eb);
+out:
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ GFP_NOFS);
+ return ret;
+}
+
+/*
+ * helper to read a given tree block, doing retries as required when
+ * the checksums don't match and we have alternate mirrors to try.
+ */
+static int btree_read_extent_buffer_pages(struct btrfs_root *root,
+ struct extent_buffer *eb,
+ u64 start, u64 parent_transid)
+{
+ struct extent_io_tree *io_tree;
+ int ret;
+ int num_copies = 0;
+ int mirror_num = 0;
+
+ io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+ while (1) {
+ ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+ btree_get_extent, mirror_num);
+ if (!ret &&
+ !verify_parent_transid(io_tree, eb, parent_transid))
+ return ret;
+
+ num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+ eb->start, eb->len);
+ if (num_copies == 1)
+ return ret;
+
+ mirror_num++;
+ if (mirror_num > num_copies)
+ return ret;
+ }
+ return -EIO;
+}
+
+/*
+ * checksum a dirty tree block before IO. This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block
+ */
+
+static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+{
+ struct extent_io_tree *tree;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 found_start;
+ int found_level;
+ unsigned long len;
+ struct extent_buffer *eb;
+ int ret;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
+ btrfs_header_generation(eb));
+ BUG_ON(ret);
+ found_start = btrfs_header_bytenr(eb);
+ if (found_start != start) {
+ WARN_ON(1);
+ goto err;
+ }
+ if (eb->first_page != page) {
+ WARN_ON(1);
+ goto err;
+ }
+ if (!PageUptodate(page)) {
+ WARN_ON(1);
+ goto err;
+ }
+ found_level = btrfs_header_level(eb);
+
+ csum_tree_block(root, eb, 0);
+err:
+ free_extent_buffer(eb);
+out:
+ return 0;
+}
+
+static int check_tree_block_fsid(struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ u8 fsid[BTRFS_UUID_SIZE];
+ int ret = 1;
+
+ read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
+ BTRFS_FSID_SIZE);
+ while (fs_devices) {
+ if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+ ret = 0;
+ break;
+ }
+ fs_devices = fs_devices->seed;
+ }
+ return ret;
+}
+
+static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state)
+{
+ struct extent_io_tree *tree;
+ u64 found_start;
+ int found_level;
+ unsigned long len;
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ int ret = 0;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+
+ found_start = btrfs_header_bytenr(eb);
+ if (found_start != start) {
+ printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
+ (unsigned long long)found_start,
+ (unsigned long long)eb->start);
+ ret = -EIO;
+ goto err;
+ }
+ if (eb->first_page != page) {
+ printk(KERN_INFO "btrfs bad first page %lu %lu\n",
+ eb->first_page->index, page->index);
+ WARN_ON(1);
+ ret = -EIO;
+ goto err;
+ }
+ if (check_tree_block_fsid(root, eb)) {
+ printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+ (unsigned long long)eb->start);
+ ret = -EIO;
+ goto err;
+ }
+ found_level = btrfs_header_level(eb);
+
+ ret = csum_tree_block(root, eb, 1);
+ if (ret)
+ ret = -EIO;
+
+ end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
+ end = eb->start + end - 1;
+err:
+ free_extent_buffer(eb);
+out:
+ return ret;
+}
+
+static void end_workqueue_bio(struct bio *bio, int err)
+{
+ struct end_io_wq *end_io_wq = bio->bi_private;
+ struct btrfs_fs_info *fs_info;
+
+ fs_info = end_io_wq->info;
+ end_io_wq->error = err;
+ end_io_wq->work.func = end_workqueue_fn;
+ end_io_wq->work.flags = 0;
+
+ if (bio->bi_rw & (1 << BIO_RW)) {
+ if (end_io_wq->metadata)
+ btrfs_queue_worker(&fs_info->endio_meta_write_workers,
+ &end_io_wq->work);
+ else
+ btrfs_queue_worker(&fs_info->endio_write_workers,
+ &end_io_wq->work);
+ } else {
+ if (end_io_wq->metadata)
+ btrfs_queue_worker(&fs_info->endio_meta_workers,
+ &end_io_wq->work);
+ else
+ btrfs_queue_worker(&fs_info->endio_workers,
+ &end_io_wq->work);
+ }
+}
+
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+ int metadata)
+{
+ struct end_io_wq *end_io_wq;
+ end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+ if (!end_io_wq)
+ return -ENOMEM;
+
+ end_io_wq->private = bio->bi_private;
+ end_io_wq->end_io = bio->bi_end_io;
+ end_io_wq->info = info;
+ end_io_wq->error = 0;
+ end_io_wq->bio = bio;
+ end_io_wq->metadata = metadata;
+
+ bio->bi_private = end_io_wq;
+ bio->bi_end_io = end_workqueue_bio;
+ return 0;
+}
+
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
+{
+ unsigned long limit = min_t(unsigned long,
+ info->workers.max_workers,
+ info->fs_devices->open_devices);
+ return 256 * limit;
+}
+
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
+{
+ return atomic_read(&info->nr_async_bios) >
+ btrfs_async_submit_limit(info);
+}
+
+static void run_one_async_start(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct async_submit_bio *async;
+
+ async = container_of(work, struct async_submit_bio, work);
+ fs_info = BTRFS_I(async->inode)->root->fs_info;
+ async->submit_bio_start(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags);
+}
+
+static void run_one_async_done(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct async_submit_bio *async;
+ int limit;
+
+ async = container_of(work, struct async_submit_bio, work);
+ fs_info = BTRFS_I(async->inode)->root->fs_info;
+
+ limit = btrfs_async_submit_limit(fs_info);
+ limit = limit * 2 / 3;
+
+ atomic_dec(&fs_info->nr_async_submits);
+
+ if (atomic_read(&fs_info->nr_async_submits) < limit &&
+ waitqueue_active(&fs_info->async_submit_wait))
+ wake_up(&fs_info->async_submit_wait);
+
+ async->submit_bio_done(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags);
+}
+
+static void run_one_async_free(struct btrfs_work *work)
+{
+ struct async_submit_bio *async;
+
+ async = container_of(work, struct async_submit_bio, work);
+ kfree(async);
+}
+
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+ int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
+ extent_submit_bio_hook_t *submit_bio_start,
+ extent_submit_bio_hook_t *submit_bio_done)
+{
+ struct async_submit_bio *async;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return -ENOMEM;
+
+ async->inode = inode;
+ async->rw = rw;
+ async->bio = bio;
+ async->mirror_num = mirror_num;
+ async->submit_bio_start = submit_bio_start;
+ async->submit_bio_done = submit_bio_done;
+
+ async->work.func = run_one_async_start;
+ async->work.ordered_func = run_one_async_done;
+ async->work.ordered_free = run_one_async_free;
+
+ async->work.flags = 0;
+ async->bio_flags = bio_flags;
+
+ atomic_inc(&fs_info->nr_async_submits);
+ btrfs_queue_worker(&fs_info->workers, &async->work);
+#if 0
+ int limit = btrfs_async_submit_limit(fs_info);
+ if (atomic_read(&fs_info->nr_async_submits) > limit) {
+ wait_event_timeout(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) < limit),
+ HZ/10);
+
+ wait_event_timeout(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_bios) < limit),
+ HZ/10);
+ }
+#endif
+ while (atomic_read(&fs_info->async_submit_draining) &&
+ atomic_read(&fs_info->nr_async_submits)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0));
+ }
+
+ return 0;
+}
+
+static int btree_csum_one_bio(struct bio *bio)
+{
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int bio_index = 0;
+ struct btrfs_root *root;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+ while (bio_index < bio->bi_vcnt) {
+ root = BTRFS_I(bvec->bv_page->mapping->host)->root;
+ csum_dirty_buffer(root, bvec->bv_page);
+ bio_index++;
+ bvec++;
+ }
+ return 0;
+}
+
+static int __btree_submit_bio_start(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ /*
+ * when we're called for a write, we're already in the async
+ * submission context. Just jump into btrfs_map_bio
+ */
+ btree_csum_one_bio(bio);
+ return 0;
+}
+
+static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ /*
+ * when we're called for a write, we're already in the async
+ * submission context. Just jump into btrfs_map_bio
+ */
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+}
+
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ int ret;
+
+ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+ bio, 1);
+ BUG_ON(ret);
+
+ if (!(rw & (1 << BIO_RW))) {
+ /*
+ * called for a read, do the setup so that checksum validation
+ * can happen in the async kernel threads
+ */
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ }
+ /*
+ * kthread helpers are used to submit writes so that checksumming
+ * can happen in parallel across all CPUs
+ */
+ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num, 0,
+ __btree_submit_bio_start,
+ __btree_submit_bio_done);
+}
+
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ if (current->flags & PF_MEMALLOC) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ return extent_write_full_page(tree, page, btree_get_extent, wbc);
+}
+
+static int btree_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(mapping->host)->io_tree;
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ u64 num_dirty;
+ u64 start = 0;
+ unsigned long thresh = 32 * 1024 * 1024;
+
+ if (wbc->for_kupdate)
+ return 0;
+
+ num_dirty = count_range_bits(tree, &start, (u64)-1,
+ thresh, EXTENT_DIRTY);
+ if (num_dirty < thresh)
+ return 0;
+ }
+ return extent_writepages(tree, mapping, btree_get_extent, wbc);
+}
+
+static int btree_readpage(struct file *file, struct page *page)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_read_full_page(tree, page, btree_get_extent);
+}
+
+static int btree_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *map;
+ int ret;
+
+ if (PageWriteback(page) || PageDirty(page))
+ return 0;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ map = &BTRFS_I(page->mapping->host)->extent_tree;
+
+ ret = try_release_extent_state(map, tree, page, gfp_flags);
+ if (!ret)
+ return 0;
+
+ ret = try_release_extent_buffer(tree, page);
+ if (ret == 1) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+
+ return ret;
+}
+
+static void btree_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ extent_invalidatepage(tree, page, offset);
+ btree_releasepage(page, GFP_NOFS);
+ if (PagePrivate(page)) {
+ printk(KERN_WARNING "btrfs warning page private not zero "
+ "on page %llu\n", (unsigned long long)page_offset(page));
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+}
+
+#if 0
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct buffer_head *bh;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct buffer_head *head;
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, root->fs_info->sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ }
+ head = page_buffers(page);
+ bh = head;
+ do {
+ if (buffer_dirty(bh))
+ csum_tree_block(root, bh, 0);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return block_write_full_page(page, btree_get_block, wbc);
+}
+#endif
+
+static struct address_space_operations btree_aops = {
+ .readpage = btree_readpage,
+ .writepage = btree_writepage,
+ .writepages = btree_writepages,
+ .releasepage = btree_releasepage,
+ .invalidatepage = btree_invalidatepage,
+ .sync_page = block_sync_page,
+};
+
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ u64 parent_transid)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ int ret = 0;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return 0;
+ read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
+ buf, 0, 0, btree_get_extent, 0);
+ free_extent_buffer(buf);
+ return ret;
+}
+
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize)
+{
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+ eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+ bytenr, blocksize, GFP_NOFS);
+ return eb;
+}
+
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize)
+{
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+
+ eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+ bytenr, blocksize, NULL, GFP_NOFS);
+ return eb;
+}
+
+
+int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+ return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
+ buf->start + buf->len - 1, WB_SYNC_ALL);
+}
+
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+ return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
+ buf->start, buf->start + buf->len - 1);
+}
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+ u32 blocksize, u64 parent_transid)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree;
+ int ret;
+
+ io_tree = &BTRFS_I(btree_inode)->io_tree;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return NULL;
+
+ ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+
+ if (ret == 0)
+ buf->flags |= EXTENT_UPTODATE;
+ else
+ WARN_ON(1);
+ return buf;
+
+}
+
+int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf)
+{
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ if (btrfs_header_generation(buf) ==
+ root->fs_info->running_transaction->transid) {
+ WARN_ON(!btrfs_tree_locked(buf));
+ clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+ buf);
+ }
+ return 0;
+}
+
+static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+ u32 stripesize, struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ root->node = NULL;
+ root->commit_root = NULL;
+ root->ref_tree = NULL;
+ root->sectorsize = sectorsize;
+ root->nodesize = nodesize;
+ root->leafsize = leafsize;
+ root->stripesize = stripesize;
+ root->ref_cows = 0;
+ root->track_dirty = 0;
+
+ root->fs_info = fs_info;
+ root->objectid = objectid;
+ root->last_trans = 0;
+ root->highest_inode = 0;
+ root->last_inode_alloc = 0;
+ root->name = NULL;
+ root->in_sysfs = 0;
+
+ INIT_LIST_HEAD(&root->dirty_list);
+ INIT_LIST_HEAD(&root->orphan_list);
+ INIT_LIST_HEAD(&root->dead_list);
+ spin_lock_init(&root->node_lock);
+ spin_lock_init(&root->list_lock);
+ mutex_init(&root->objectid_mutex);
+ mutex_init(&root->log_mutex);
+ extent_io_tree_init(&root->dirty_log_pages,
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+
+ btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
+ root->ref_tree = &root->ref_tree_struct;
+
+ memset(&root->root_key, 0, sizeof(root->root_key));
+ memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+ memset(&root->root_kobj, 0, sizeof(root->root_kobj));
+ root->defrag_trans_start = fs_info->generation;
+ init_completion(&root->kobj_unregister);
+ root->defrag_running = 0;
+ root->defrag_level = 0;
+ root->root_key.objectid = objectid;
+ root->anon_super.s_root = NULL;
+ root->anon_super.s_dev = 0;
+ INIT_LIST_HEAD(&root->anon_super.s_list);
+ INIT_LIST_HEAD(&root->anon_super.s_instances);
+ init_rwsem(&root->anon_super.s_umount);
+
+ return 0;
+}
+
+static int find_and_setup_root(struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid,
+ struct btrfs_root *root)
+{
+ int ret;
+ u32 blocksize;
+ u64 generation;
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, objectid);
+ ret = btrfs_find_last_root(tree_root, objectid,
+ &root->root_item, &root->root_key);
+ BUG_ON(ret);
+
+ generation = btrfs_root_generation(&root->root_item);
+ blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ blocksize, generation);
+ BUG_ON(!root->node);
+ return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct extent_buffer *eb;
+ struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ u64 start = 0;
+ u64 end = 0;
+ int ret;
+
+ if (!log_root_tree)
+ return 0;
+
+ while (1) {
+ ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
+ 0, &start, &end, EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ clear_extent_dirty(&log_root_tree->dirty_log_pages,
+ start, end, GFP_NOFS);
+ }
+ eb = fs_info->log_root_tree->node;
+
+ WARN_ON(btrfs_header_level(eb) != 0);
+ WARN_ON(btrfs_header_nritems(eb) != 0);
+
+ ret = btrfs_free_reserved_extent(fs_info->tree_root,
+ eb->start, eb->len);
+ BUG_ON(ret);
+
+ free_extent_buffer(eb);
+ kfree(fs_info->log_root_tree);
+ fs_info->log_root_tree = NULL;
+ return 0;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+
+ root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (!root)
+ return -ENOMEM;
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+
+ root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+ root->ref_cows = 0;
+
+ root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
+ 0, BTRFS_TREE_LOG_OBJECTID,
+ trans->transid, 0, 0, 0);
+
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_bytenr(root->node, root->node->start);
+ btrfs_set_header_generation(root->node, trans->transid);
+ btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
+
+ write_extent_buffer(root->node, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(root->node),
+ BTRFS_FSID_SIZE);
+ btrfs_mark_buffer_dirty(root->node);
+ btrfs_tree_unlock(root->node);
+ fs_info->log_root_tree = root;
+ return 0;
+}
+
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *l;
+ u64 highest_inode;
+ u64 generation;
+ u32 blocksize;
+ int ret = 0;
+
+ root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+ if (location->offset == (u64)-1) {
+ ret = find_and_setup_root(tree_root, fs_info,
+ location->objectid, root);
+ if (ret) {
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+ goto insert;
+ }
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, location->objectid);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
+ if (ret != 0) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+ l = path->nodes[0];
+ read_extent_buffer(l, &root->root_item,
+ btrfs_item_ptr_offset(l, path->slots[0]),
+ sizeof(root->root_item));
+ memcpy(&root->root_key, location, sizeof(*location));
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ btrfs_free_path(path);
+ if (ret) {
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+ generation = btrfs_root_generation(&root->root_item);
+ blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ blocksize, generation);
+ BUG_ON(!root->node);
+insert:
+ if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+ root->ref_cows = 1;
+ ret = btrfs_find_highest_inode(root, &highest_inode);
+ if (ret == 0) {
+ root->highest_inode = highest_inode;
+ root->last_inode_alloc = highest_inode;
+ }
+ }
+ return root;
+}
+
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_objectid)
+{
+ struct btrfs_root *root;
+
+ if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return fs_info->tree_root;
+ if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return fs_info->extent_root;
+
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_objectid);
+ return root;
+}
+
+struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+ int ret;
+
+ if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return fs_info->tree_root;
+ if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return fs_info->extent_root;
+ if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return fs_info->chunk_root;
+ if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+ return fs_info->dev_root;
+ if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return fs_info->csum_root;
+
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)location->objectid);
+ if (root)
+ return root;
+
+ root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
+ set_anon_super(&root->anon_super, NULL);
+
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
+ if (ret) {
+ free_extent_buffer(root->node);
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+ if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_find_dead_roots(fs_info->tree_root,
+ root->root_key.objectid, root);
+ BUG_ON(ret);
+ btrfs_orphan_cleanup(root);
+ }
+ return root;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location,
+ const char *name, int namelen)
+{
+ struct btrfs_root *root;
+ int ret;
+
+ root = btrfs_read_fs_root_no_name(fs_info, location);
+ if (!root)
+ return NULL;
+
+ if (root->in_sysfs)
+ return root;
+
+ ret = btrfs_set_root_name(root, name, namelen);
+ if (ret) {
+ free_extent_buffer(root->node);
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+#if 0
+ ret = btrfs_sysfs_add_root(root);
+ if (ret) {
+ free_extent_buffer(root->node);
+ kfree(root->name);
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+#endif
+ root->in_sysfs = 1;
+ return root;
+}
+
+static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+{
+ struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
+ int ret = 0;
+ struct list_head *cur;
+ struct btrfs_device *device;
+ struct backing_dev_info *bdi;
+#if 0
+ if ((bdi_bits & (1 << BDI_write_congested)) &&
+ btrfs_congested_async(info, 0))
+ return 1;
+#endif
+ list_for_each(cur, &info->fs_devices->devices) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+ if (!device->bdev)
+ continue;
+ bdi = blk_get_backing_dev_info(device->bdev);
+ if (bdi && bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * this unplugs every device on the box, and it is only used when page
+ * is null
+ */
+static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+ struct list_head *cur;
+ struct btrfs_device *device;
+ struct btrfs_fs_info *info;
+
+ info = (struct btrfs_fs_info *)bdi->unplug_io_data;
+ list_for_each(cur, &info->fs_devices->devices) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+ if (!device->bdev)
+ continue;
+
+ bdi = blk_get_backing_dev_info(device->bdev);
+ if (bdi->unplug_io_fn)
+ bdi->unplug_io_fn(bdi, page);
+ }
+}
+
+static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+ struct inode *inode;
+ struct extent