aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/async-thread.c120
-rw-r--r--fs/btrfs/async-thread.h4
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/btrfs_inode.h4
-rw-r--r--fs/btrfs/ctree.c17
-rw-r--r--fs/btrfs/ctree.h11
-rw-r--r--fs/btrfs/delayed-inode.c62
-rw-r--r--fs/btrfs/disk-io.c223
-rw-r--r--fs/btrfs/extent-tree.c336
-rw-r--r--fs/btrfs/extent_io.c60
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/file.c8
-rw-r--r--fs/btrfs/free-space-cache.c82
-rw-r--r--fs/btrfs/inode-map.c28
-rw-r--r--fs/btrfs/inode.c281
-rw-r--r--fs/btrfs/ioctl.c23
-rw-r--r--fs/btrfs/relocation.c4
-rw-r--r--fs/btrfs/scrub.c79
-rw-r--r--fs/btrfs/super.c125
-rw-r--r--fs/btrfs/transaction.c12
-rw-r--r--fs/btrfs/volumes.c15
-rw-r--r--fs/btrfs/volumes.h6
22 files changed, 994 insertions, 510 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef..0b394580d86 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
int idle;
};
+static int __btrfs_start_workers(struct btrfs_workers *workers);
+
/*
* btrfs_start_workers uses kthread_run, which can block waiting for memory
* for a very long time. It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
{
struct worker_start *start;
start = container_of(work, struct worker_start, work);
- btrfs_start_workers(start->queue, 1);
+ __btrfs_start_workers(start->queue);
kfree(start);
}
-static int start_new_worker(struct btrfs_workers *queue)
-{
- struct worker_start *start;
- int ret;
-
- start = kzalloc(sizeof(*start), GFP_NOFS);
- if (!start)
- return -ENOMEM;
-
- start->work.func = start_new_worker_func;
- start->queue = queue;
- ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
- if (ret)
- kfree(start);
- return ret;
-}
-
/*
* helper function to move a thread onto the idle list after it
* has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
{
struct btrfs_workers *workers = worker->workers;
+ struct worker_start *start;
unsigned long flags;
rmb();
if (!workers->atomic_start_pending)
return;
+ start = kzalloc(sizeof(*start), GFP_NOFS);
+ if (!start)
+ return;
+
+ start->work.func = start_new_worker_func;
+ start->queue = workers;
+
spin_lock_irqsave(&workers->lock, flags);
if (!workers->atomic_start_pending)
goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
workers->num_workers_starting += 1;
spin_unlock_irqrestore(&workers->lock, flags);
- start_new_worker(workers);
+ btrfs_queue_worker(workers->atomic_worker_start, &start->work);
return;
out:
+ kfree(start);
spin_unlock_irqrestore(&workers->lock, flags);
}
@@ -331,7 +325,7 @@ again:
run_ordered_completions(worker->workers, work);
check_pending_worker_creates(worker);
-
+ cond_resched();
}
spin_lock_irq(&worker->lock);
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
* starts new worker threads. This does not enforce the max worker
* count in case you need to temporarily go past it.
*/
-static int __btrfs_start_workers(struct btrfs_workers *workers,
- int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
int ret = 0;
- int i;
- for (i = 0; i < num_workers; i++) {
- worker = kzalloc(sizeof(*worker), GFP_NOFS);
- if (!worker) {
- ret = -ENOMEM;
- goto fail;
- }
+ worker = kzalloc(sizeof(*worker), GFP_NOFS);
+ if (!worker) {
+ ret = -ENOMEM;
+ goto fail;
+ }
- INIT_LIST_HEAD(&worker->pending);
- INIT_LIST_HEAD(&worker->prio_pending);
- INIT_LIST_HEAD(&worker->worker_list);
- spin_lock_init(&worker->lock);
-
- atomic_set(&worker->num_pending, 0);
- atomic_set(&worker->refs, 1);
- worker->workers = workers;
- worker->task = kthread_run(worker_loop, worker,
- "btrfs-%s-%d", workers->name,
- workers->num_workers + i);
- if (IS_ERR(worker->task)) {
- ret = PTR_ERR(worker->task);
- kfree(worker);
- goto fail;
- }
- spin_lock_irq(&workers->lock);
- list_add_tail(&worker->worker_list, &workers->idle_list);
- worker->idle = 1;
- workers->num_workers++;
- workers->num_workers_starting--;
- WARN_ON(workers->num_workers_starting < 0);
- spin_unlock_irq(&workers->lock);
+ INIT_LIST_HEAD(&worker->pending);
+ INIT_LIST_HEAD(&worker->prio_pending);
+ INIT_LIST_HEAD(&worker->worker_list);
+ spin_lock_init(&worker->lock);
+
+ atomic_set(&worker->num_pending, 0);
+ atomic_set(&worker->refs, 1);
+ worker->workers = workers;
+ worker->task = kthread_run(worker_loop, worker,
+ "btrfs-%s-%d", workers->name,
+ workers->num_workers + 1);
+ if (IS_ERR(worker->task)) {
+ ret = PTR_ERR(worker->task);
+ kfree(worker);
+ goto fail;
}
+ spin_lock_irq(&workers->lock);
+ list_add_tail(&worker->worker_list, &workers->idle_list);
+ worker->idle = 1;
+ workers->num_workers++;
+ workers->num_workers_starting--;
+ WARN_ON(workers->num_workers_starting < 0);
+ spin_unlock_irq(&workers->lock);
+
return 0;
fail:
- btrfs_stop_workers(workers);
+ spin_lock_irq(&workers->lock);
+ workers->num_workers_starting--;
+ spin_unlock_irq(&workers->lock);
return ret;
}
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+int btrfs_start_workers(struct btrfs_workers *workers)
{
spin_lock_irq(&workers->lock);
- workers->num_workers_starting += num_workers;
+ workers->num_workers_starting++;
spin_unlock_irq(&workers->lock);
- return __btrfs_start_workers(workers, num_workers);
+ return __btrfs_start_workers(workers);
}
/*
@@ -568,9 +561,10 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
struct btrfs_worker_thread *worker;
unsigned long flags;
struct list_head *fallback;
+ int ret;
-again:
spin_lock_irqsave(&workers->lock, flags);
+again:
worker = next_worker(workers);
if (!worker) {
@@ -584,7 +578,10 @@ again:
workers->num_workers_starting++;
spin_unlock_irqrestore(&workers->lock, flags);
/* we're below the limit, start another worker */
- __btrfs_start_workers(workers, 1);
+ ret = __btrfs_start_workers(workers);
+ spin_lock_irqsave(&workers->lock, flags);
+ if (ret)
+ goto fallback;
goto again;
}
}
@@ -665,7 +662,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
/*
* places a struct btrfs_work into the pending queue of one of the kthreads
*/
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
{
struct btrfs_worker_thread *worker;
unsigned long flags;
@@ -673,7 +670,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
/* don't requeue something already on a list */
if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- goto out;
+ return;
worker = find_worker(workers);
if (workers->ordered) {
@@ -712,7 +709,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
if (wake)
wake_up_process(worker->task);
spin_unlock_irqrestore(&worker->lock, flags);
-
-out:
- return 0;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746cf85..f34cc31fa3c 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
char *name;
};
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers);
int btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
struct btrfs_workers *async_starter);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8855aad3929..22c64fff1bd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
return PTR_ERR(fspath);
if (fspath > fspath_min) {
- ipath->fspath->val[i] = (u64)fspath;
+ ipath->fspath->val[i] = (u64)(unsigned long)fspath;
++ipath->fspath->elem_cnt;
ipath->fspath->bytes_left = fspath - fspath_min;
} else {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5a5d325a393..634608d2a6d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,14 +147,12 @@ struct btrfs_inode {
* the btrfs file release call will add this inode to the
* ordered operations list so that we make sure to flush out any
* new data the application may have written before commit.
- *
- * yes, its silly to have a single bitflag, but we might grow more
- * of these.
*/
unsigned ordered_data_close:1;
unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
unsigned in_defrag:1;
+ unsigned delalloc_meta_reserved:1;
/*
* always compress this one file
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0fe615e4ea3..dede441bdee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
+ /* ensure we can see the force_cow */
+ smp_rmb();
+
+ /*
+ * We do not need to cow a block if
+ * 1) this block is not created or changed in this transaction;
+ * 2) this block does not belong to TREE_RELOC tree;
+ * 3) the root is not forced COW.
+ *
+ * What is forced COW:
+ * when we create snapshot during commiting the transaction,
+ * after we've finished coping src root, we must COW the shared
+ * block to ensure the metadata consistency.
+ */
if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+ !root->force_cow)
return 0;
return 1;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9ba59ff929..67385033323 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -848,7 +848,8 @@ struct btrfs_free_cluster {
enum btrfs_caching_type {
BTRFS_CACHE_NO = 0,
BTRFS_CACHE_STARTED = 1,
- BTRFS_CACHE_FINISHED = 2,
+ BTRFS_CACHE_FAST = 2,
+ BTRFS_CACHE_FINISHED = 3,
};
enum btrfs_disk_cache_state {
@@ -1271,6 +1272,8 @@ struct btrfs_root {
* for stat. It may be used for more later
*/
dev_t anon_dev;
+
+ int force_cow;
};
struct btrfs_ioctl_defrag_range_args {
@@ -2366,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 min_reserved);
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
@@ -2686,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode, int flags);
+int btrfs_dirty_inode(struct inode *inode);
+int btrfs_update_time(struct file *file);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 3a1b939c9ae..9c1eccc2c50 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct inode *inode,
struct btrfs_delayed_node *node)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
int ret;
+ int release = false;
src_rsv = trans->block_rsv;
dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -638,8 +640,8 @@ static int btrfs_delayed_inode_reserve_metadata(
* Now if src_rsv == delalloc_block_rsv we'll let it just steal since
* we're accounted for.
*/
- if (!trans->bytes_reserved &&
- src_rsv != &root->fs_info->delalloc_block_rsv) {
+ if (!src_rsv || (!trans->bytes_reserved &&
+ src_rsv != &root->fs_info->delalloc_block_rsv)) {
ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
/*
* Since we're under a transaction reserve_metadata_bytes could
@@ -652,12 +654,65 @@ static int btrfs_delayed_inode_reserve_metadata(
if (!ret)
node->bytes_reserved = num_bytes;
return ret;
+ } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (BTRFS_I(inode)->delalloc_meta_reserved) {
+ BTRFS_I(inode)->delalloc_meta_reserved = 0;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ release = true;
+ goto migrate;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ /* Ok we didn't have space pre-reserved. This shouldn't happen
+ * too often but it can happen if we do delalloc to an existing
+ * inode which gets dirtied because of the time update, and then
+ * isn't touched again until after the transaction commits and
+ * then we try to write out the data. First try to be nice and
+ * reserve something strictly for us. If not be a pain and try
+ * to steal from the delalloc block rsv.
+ */
+ ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ if (!ret)
+ goto out;
+
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ if (!ret)
+ goto out;
+
+ /*
+ * Ok this is a problem, let's just steal from the global rsv
+ * since this really shouldn't happen that often.
+ */
+ WARN_ON(1);
+ ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+ dst_rsv, num_bytes);
+ goto out;
}
+migrate:
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+
+out:
+ /*
+ * Migrate only takes a reservation, it doesn't touch the size of the
+ * block_rsv. This is to simplify people who don't normally have things
+ * migrated from their block rsv. If they go to release their
+ * reservation, that will decrease the size as well, so if migrate
+ * reduced size we'd end up with a negative size. But for the
+ * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+ * but we could in fact do this reserve/migrate dance several times
+ * between the time we did the original reservation and we'd clean it
+ * up. So to take care of this, release the space for the meta
+ * reservation here. I think it may be time for a documentation page on
+ * how block rsvs. work.
+ */
if (!ret)
node->bytes_reserved = num_bytes;
+ if (release)
+ btrfs_block_rsv_release(root, src_rsv, num_bytes);
+
return ret;
}
@@ -1708,7 +1763,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
goto release_node;
}
- ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+ delayed_node);
if (ret)
goto release_node;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 102c176fc29..f44b3928dc2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -620,7 +620,7 @@ out:
static int btree_io_failed_hook(struct bio *failed_bio,
struct page *page, u64 start, u64 end,
- u64 mirror_num, struct extent_state *state)
+ int mirror_num, struct extent_state *state)
{
struct extent_io_tree *tree;
unsigned long len;
@@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
u64 features;
struct btrfs_key location;
struct buffer_head *bh;
- struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
+ struct btrfs_super_block *disk_super;
struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = NULL;
- struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_root *extent_root;
+ struct btrfs_root *csum_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
struct btrfs_root *log_tree_root;
-
int ret;
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
- struct btrfs_super_block *disk_super;
+ extent_root = fs_info->extent_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ csum_root = fs_info->csum_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ chunk_root = fs_info->chunk_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ dev_root = fs_info->dev_root =
+ kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- if (!extent_root || !tree_root || !tree_root->fs_info ||
- !chunk_root || !dev_root || !csum_root) {
+ if (!extent_root || !csum_root || !chunk_root || !dev_root) {
err = -ENOMEM;
goto fail;
}
- fs_info = tree_root->fs_info;
ret = init_srcu_struct(&fs_info->subvol_srcu);
if (ret) {
@@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
- fs_info->tree_root = tree_root;
- fs_info->extent_root = extent_root;
- fs_info->csum_root = csum_root;
- fs_info->chunk_root = chunk_root;
- fs_info->dev_root = dev_root;
- fs_info->fs_devices = fs_devices;
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
btrfs_mapping_init(&fs_info->mapping_tree);
@@ -2199,19 +2194,27 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->endio_meta_write_workers.idle_thresh = 2;
fs_info->readahead_workers.idle_thresh = 2;
- btrfs_start_workers(&fs_info->workers, 1);
- btrfs_start_workers(&fs_info->generic_worker, 1);
- btrfs_start_workers(&fs_info->submit_workers, 1);
- btrfs_start_workers(&fs_info->delalloc_workers, 1);
- btrfs_start_workers(&fs_info->fixup_workers, 1);
- btrfs_start_workers(&fs_info->endio_workers, 1);
- btrfs_start_workers(&fs_info->endio_meta_workers, 1);
- btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
- btrfs_start_workers(&fs_info->endio_write_workers, 1);
- btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
- btrfs_start_workers(&fs_info->delayed_workers, 1);
- btrfs_start_workers(&fs_info->caching_workers, 1);
- btrfs_start_workers(&fs_info->readahead_workers, 1);
+ /*
+ * btrfs_start_workers can really only fail because of ENOMEM so just
+ * return -ENOMEM if any of these fail.
+ */
+ ret = btrfs_start_workers(&fs_info->workers);
+ ret |= btrfs_start_workers(&fs_info->generic_worker);
+ ret |= btrfs_start_workers(&fs_info->submit_workers);
+ ret |= btrfs_start_workers(&fs_info->delalloc_workers);
+ ret |= btrfs_start_workers(&fs_info->fixup_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_write_workers);
+ ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
+ ret |= btrfs_start_workers(&fs_info->delayed_workers);
+ ret |= btrfs_start_workers(&fs_info->caching_workers);
+ ret |= btrfs_start_workers(&fs_info->readahead_workers);
+ if (ret) {
+ ret = -ENOMEM;
+ goto fail_sb_buffer;
+ }
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2465,21 +2468,20 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->caching_workers);
fail_alloc:
fail_iput:
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
-
- btrfs_close_devices(fs_info->fs_devices);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
fail_bdi:
bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
+ btrfs_close_devices(fs_info->fs_devices);
free_fs_info(fs_info);
return ERR_PTR(err);
recovery_tree_root:
-
if (!btrfs_test_opt(tree_root, RECOVERY))
goto fail_tree_roots;
@@ -2579,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device,
int errors = 0;
u32 crc;
u64 bytenr;
- int last_barrier = 0;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
- /* make sure only the last submit_bh does a barrier */
- if (do_barriers) {
- for (i = 0; i < max_mirrors; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- device->total_bytes)
- break;
- last_barrier = i;
- }
- }
-
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2640,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device,
bh->b_end_io = btrfs_end_buffer_write_sync;
}
- if (i == last_barrier && do_barriers)
- ret = submit_bh(WRITE_FLUSH_FUA, bh);
- else
- ret = submit_bh(WRITE_SYNC, bh);
-
+ /*
+ * we fua the first super. The others we allow
+ * to go down lazy.
+ */
+ ret = submit_bh(WRITE_FUA, bh);
if (ret)
errors++;
}
return errors < i ? 0 : -1;
}
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ }
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * trigger flushes for one the devices. If you pass wait == 0, the flushes are
+ * sent down. With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp are flagged as not-barrier
+ * capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+ struct bio *bio;
+ int ret = 0;
+
+ if (device->nobarriers)
+ return 0;
+
+ if (wait) {
+ bio = device->flush_bio;
+ if (!bio)
+ return 0;
+
+ wait_for_completion(&device->flush_wait);
+
+ if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+ printk("btrfs: disabling barriers on dev %s\n",
+ device->name);
+ device->nobarriers = 1;
+ }
+ if (!bio_flagged(bio, BIO_UPTODATE)) {
+ ret = -EIO;
+ }
+
+ /* drop the reference from the wait == 0 run */
+ bio_put(bio);
+ device->flush_bio = NULL;
+
+ return ret;
+ }
+
+ /*
+ * one reference for us, and we leave it for the
+ * caller
+ */
+ device->flush_bio = NULL;;
+ bio = bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_end_io = btrfs_end_empty_barrier;
+ bio->bi_bdev = device->bdev;
+ init_completion(&device->flush_wait);
+ bio->bi_private = &device->flush_wait;
+ device->flush_bio = bio;
+
+ bio_get(bio);
+ submit_bio(WRITE_FLUSH, bio);
+
+ return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ int errors = 0;
+ int ret;
+
+ /* send down all the barriers */
+ head = &info->fs_devices->devices;
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (!dev->bdev) {
+ errors++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 0);
+ if (ret)
+ errors++;
+ }
+
+ /* wait for all the barriers */
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (!dev->bdev) {
+ errors++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 1);
+ if (ret)
+ errors++;
+ }
+ if (errors)
+ return -EIO;
+ return 0;
+}
+
int write_all_supers(struct btrfs_root *root, int max_mirrors)
{
struct list_head *head;
@@ -2672,6 +2781,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
+
+ if (do_barriers)
+ barrier_all_devices(root->fs_info);
+
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
total_errors++;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9879bd47463..f5fbe576d2b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
struct btrfs_root *root,
int load_cache_only)
{
+ DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl;
int ret = 0;
- smp_mb();
- if (cache->cached != BTRFS_CACHE_NO)
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+ BUG_ON(!caching_ctl);
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ atomic_set(&caching_ctl->count, 1);
+ caching_ctl->work.func = caching_thread;
+
+ spin_lock(&cache->lock);
+ /*
+ * This should be a rare occasion, but this could happen I think in the
+ * case where one thread starts to load the space cache info, and then
+ * some other thread starts a transaction commit which tries to do an
+ * allocation while the other thread is still loading the space cache
+ * info. The previous loop should have kept us from choosing this block
+ * group, but if we've moved to the state where we will wait on caching
+ * block groups we need to first check if we're doing a fast load here,
+ * so we can wait for it to finish, otherwise we could end up allocating
+ * from a block group who's cache gets evicted for one reason or
+ * another.
+ */
+ while (cache->cached == BTRFS_CACHE_FAST) {
+ struct btrfs_caching_control *ctl;
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&cache->lock);
+
+ schedule();
+
+ finish_wait(&ctl->wait, &wait);
+ put_caching_control(ctl);
+ spin_lock(&cache->lock);
+ }
+
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ kfree(caching_ctl);
return 0;
+ }
+ WARN_ON(cache->caching_ctl);
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_FAST;
+ spin_unlock(&cache->lock);
/*
* We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
if (trans && (!trans->transaction->in_commit) &&
(root && root != root->fs_info->tree_root) &&
btrfs_test_opt(root, SPACE_CACHE)) {
- spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- return 0;
- }
- cache->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&cache->lock);
-
ret = load_free_space_cache(fs_info, cache);
spin_lock(&cache->lock);
if (ret == 1) {
+ cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_FINISHED;
cache->last_byte_to_unpin = (u64)-1;
} else {
- cache->cached = BTRFS_CACHE_NO;
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
}
spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
if (ret == 1) {
+ put_caching_control(caching_ctl);
free_excluded_extents(fs_info->extent_root, cache);
return 0;
}
+ } else {
+ /*
+ * We are not going to do the fast caching, set cached to the
+ * appropriate value and wakeup any waiters.
+ */
+ spin_lock(&cache->lock);
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
+ spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
}
- if (load_cache_only)
- return 0;
-
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
- BUG_ON(!caching_ctl);
-
- INIT_LIST_HEAD(&caching_ctl->list);
- mutex_init(&caching_ctl->mutex);
- init_waitqueue_head(&caching_ctl->wait);
- caching_ctl->block_group = cache;
- caching_ctl->progress = cache->key.objectid;
- /* one for caching kthread, one for caching block group list */
- atomic_set(&caching_ctl->count, 2);
- caching_ctl->work.func = caching_thread;
-
- spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- kfree(caching_ctl);
+ if (load_cache_only) {
+ put_caching_control(caching_ctl);
return 0;
}
- cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&cache->lock);
down_write(&fs_info->extent_commit_sem);
+ atomic_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
up_write(&fs_info->extent_commit_sem);
@@ -2781,7 +2822,7 @@ out_free:
btrfs_release_path(path);
out:
spin_lock(&block_group->lock);<