diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/async-thread.c | 120 | ||||
-rw-r--r-- | fs/btrfs/async-thread.h | 4 | ||||
-rw-r--r-- | fs/btrfs/backref.c | 2 | ||||
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 4 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 17 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 11 | ||||
-rw-r--r-- | fs/btrfs/delayed-inode.c | 62 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 223 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 336 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 60 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 2 | ||||
-rw-r--r-- | fs/btrfs/file.c | 8 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 82 | ||||
-rw-r--r-- | fs/btrfs/inode-map.c | 28 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 281 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 23 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 4 | ||||
-rw-r--r-- | fs/btrfs/scrub.c | 79 | ||||
-rw-r--r-- | fs/btrfs/super.c | 125 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 12 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 15 | ||||
-rw-r--r-- | fs/btrfs/volumes.h | 6 |
22 files changed, 994 insertions, 510 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 7ec14097fef..0b394580d86 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -64,6 +64,8 @@ struct btrfs_worker_thread { int idle; }; +static int __btrfs_start_workers(struct btrfs_workers *workers); + /* * btrfs_start_workers uses kthread_run, which can block waiting for memory * for a very long time. It will actually throttle on page writeback, @@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work) { struct worker_start *start; start = container_of(work, struct worker_start, work); - btrfs_start_workers(start->queue, 1); + __btrfs_start_workers(start->queue); kfree(start); } -static int start_new_worker(struct btrfs_workers *queue) -{ - struct worker_start *start; - int ret; - - start = kzalloc(sizeof(*start), GFP_NOFS); - if (!start) - return -ENOMEM; - - start->work.func = start_new_worker_func; - start->queue = queue; - ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); - if (ret) - kfree(start); - return ret; -} - /* * helper function to move a thread onto the idle list after it * has finished some requests. @@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) static void check_pending_worker_creates(struct btrfs_worker_thread *worker) { struct btrfs_workers *workers = worker->workers; + struct worker_start *start; unsigned long flags; rmb(); if (!workers->atomic_start_pending) return; + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return; + + start->work.func = start_new_worker_func; + start->queue = workers; + spin_lock_irqsave(&workers->lock, flags); if (!workers->atomic_start_pending) goto out; @@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker) workers->num_workers_starting += 1; spin_unlock_irqrestore(&workers->lock, flags); - start_new_worker(workers); + btrfs_queue_worker(workers->atomic_worker_start, &start->work); return; out: + kfree(start); spin_unlock_irqrestore(&workers->lock, flags); } @@ -331,7 +325,7 @@ again: run_ordered_completions(worker->workers, work); check_pending_worker_creates(worker); - + cond_resched(); } spin_lock_irq(&worker->lock); @@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -static int __btrfs_start_workers(struct btrfs_workers *workers, - int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; int ret = 0; - int i; - for (i = 0; i < num_workers; i++) { - worker = kzalloc(sizeof(*worker), GFP_NOFS); - if (!worker) { - ret = -ENOMEM; - goto fail; - } + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } - INIT_LIST_HEAD(&worker->pending); - INIT_LIST_HEAD(&worker->prio_pending); - INIT_LIST_HEAD(&worker->worker_list); - spin_lock_init(&worker->lock); - - atomic_set(&worker->num_pending, 0); - atomic_set(&worker->refs, 1); - worker->workers = workers; - worker->task = kthread_run(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + i); - if (IS_ERR(worker->task)) { - ret = PTR_ERR(worker->task); - kfree(worker); - goto fail; - } - spin_lock_irq(&workers->lock); - list_add_tail(&worker->worker_list, &workers->idle_list); - worker->idle = 1; - workers->num_workers++; - workers->num_workers_starting--; - WARN_ON(workers->num_workers_starting < 0); - spin_unlock_irq(&workers->lock); + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + + atomic_set(&worker->num_pending, 0); + atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + 1); + if (IS_ERR(worker->task)) { + ret = PTR_ERR(worker->task); + kfree(worker); + goto fail; } + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + return 0; fail: - btrfs_stop_workers(workers); + spin_lock_irq(&workers->lock); + workers->num_workers_starting--; + spin_unlock_irq(&workers->lock); return ret; } -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +int btrfs_start_workers(struct btrfs_workers *workers) { spin_lock_irq(&workers->lock); - workers->num_workers_starting += num_workers; + workers->num_workers_starting++; spin_unlock_irq(&workers->lock); - return __btrfs_start_workers(workers, num_workers); + return __btrfs_start_workers(workers); } /* @@ -568,9 +561,10 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) struct btrfs_worker_thread *worker; unsigned long flags; struct list_head *fallback; + int ret; -again: spin_lock_irqsave(&workers->lock, flags); +again: worker = next_worker(workers); if (!worker) { @@ -584,7 +578,10 @@ again: workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - __btrfs_start_workers(workers, 1); + ret = __btrfs_start_workers(workers); + spin_lock_irqsave(&workers->lock, flags); + if (ret) + goto fallback; goto again; } } @@ -665,7 +662,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work) /* * places a struct btrfs_work into the pending queue of one of the kthreads */ -int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) { struct btrfs_worker_thread *worker; unsigned long flags; @@ -673,7 +670,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) /* don't requeue something already on a list */ if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - goto out; + return; worker = find_worker(workers); if (workers->ordered) { @@ -712,7 +709,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) if (wake) wake_up_process(worker->task); spin_unlock_irqrestore(&worker->lock, flags); - -out: - return 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 5077746cf85..f34cc31fa3c 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -109,8 +109,8 @@ struct btrfs_workers { char *name; }; -int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers); int btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, struct btrfs_workers *async_starter); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8855aad3929..22c64fff1bd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { - ipath->fspath->val[i] = (u64)fspath; + ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5a5d325a393..634608d2a6d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -147,14 +147,12 @@ struct btrfs_inode { * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. */ unsigned ordered_data_close:1; unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; unsigned in_defrag:1; + unsigned delalloc_meta_reserved:1; /* * always compress this one file diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0fe615e4ea3..dede441bdee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + /* ensure we can see the force_cow */ + smp_rmb(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during commiting the transaction, + * after we've finished coping src root, we must COW the shared + * block to ensure the metadata consistency. + */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !root->force_cow) return 0; return 1; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b9ba59ff929..67385033323 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -848,7 +848,8 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, }; enum btrfs_disk_cache_state { @@ -1271,6 +1272,8 @@ struct btrfs_root { * for stat. It may be used for more later */ dev_t anon_dev; + + int force_cow; }; struct btrfs_ioctl_defrag_range_args { @@ -2366,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved); +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); @@ -2686,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); -void btrfs_dirty_inode(struct inode *inode, int flags); +int btrfs_dirty_inode(struct inode *inode); +int btrfs_update_time(struct file *file); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 3a1b939c9ae..9c1eccc2c50 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *inode, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; + int release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -638,8 +640,8 @@ static int btrfs_delayed_inode_reserve_metadata( * Now if src_rsv == delalloc_block_rsv we'll let it just steal since * we're accounted for. */ - if (!trans->bytes_reserved && - src_rsv != &root->fs_info->delalloc_block_rsv) { + if (!src_rsv || (!trans->bytes_reserved && + src_rsv != &root->fs_info->delalloc_block_rsv)) { ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); /* * Since we're under a transaction reserve_metadata_bytes could @@ -652,12 +654,65 @@ static int btrfs_delayed_inode_reserve_metadata( if (!ret) node->bytes_reserved = num_bytes; return ret; + } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->delalloc_meta_reserved) { + BTRFS_I(inode)->delalloc_meta_reserved = 0; + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + WARN_ON(1); + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; } +migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ if (!ret) node->bytes_reserved = num_bytes; + if (release) + btrfs_block_rsv_release(root, src_rsv, num_bytes); + return ret; } @@ -1708,7 +1763,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); if (ret) goto release_node; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 102c176fc29..f44b3928dc2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -620,7 +620,7 @@ out: static int btree_io_failed_hook(struct bio *failed_bio, struct page *page, u64 start, u64 end, - u64 mirror_num, struct extent_state *state) + int mirror_num, struct extent_state *state) { struct extent_io_tree *tree; unsigned long len; @@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = NULL; - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - struct btrfs_super_block *disk_super; + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!extent_root || !tree_root || !tree_root->fs_info || - !chunk_root || !dev_root || !csum_root) { + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); @@ -2199,19 +2194,27 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->endio_meta_write_workers.idle_thresh = 2; fs_info->readahead_workers.idle_thresh = 2; - btrfs_start_workers(&fs_info->workers, 1); - btrfs_start_workers(&fs_info->generic_worker, 1); - btrfs_start_workers(&fs_info->submit_workers, 1); - btrfs_start_workers(&fs_info->delalloc_workers, 1); - btrfs_start_workers(&fs_info->fixup_workers, 1); - btrfs_start_workers(&fs_info->endio_workers, 1); - btrfs_start_workers(&fs_info->endio_meta_workers, 1); - btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); - btrfs_start_workers(&fs_info->endio_write_workers, 1); - btrfs_start_workers(&fs_info->endio_freespace_worker, 1); - btrfs_start_workers(&fs_info->delayed_workers, 1); - btrfs_start_workers(&fs_info->caching_workers, 1); - btrfs_start_workers(&fs_info->readahead_workers, 1); + /* + * btrfs_start_workers can really only fail because of ENOMEM so just + * return -ENOMEM if any of these fail. + */ + ret = btrfs_start_workers(&fs_info->workers); + ret |= btrfs_start_workers(&fs_info->generic_worker); + ret |= btrfs_start_workers(&fs_info->submit_workers); + ret |= btrfs_start_workers(&fs_info->delalloc_workers); + ret |= btrfs_start_workers(&fs_info->fixup_workers); + ret |= btrfs_start_workers(&fs_info->endio_workers); + ret |= btrfs_start_workers(&fs_info->endio_meta_workers); + ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); + ret |= btrfs_start_workers(&fs_info->endio_write_workers); + ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); + ret |= btrfs_start_workers(&fs_info->delayed_workers); + ret |= btrfs_start_workers(&fs_info->caching_workers); + ret |= btrfs_start_workers(&fs_info->readahead_workers); + if (ret) { + ret = -ENOMEM; + goto fail_sb_buffer; + } fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2465,21 +2468,20 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_close_devices(fs_info->fs_devices); free_fs_info(fs_info); return ERR_PTR(err); recovery_tree_root: - if (!btrfs_test_opt(tree_root, RECOVERY)) goto fail_tree_roots; @@ -2579,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2640,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2672,6 +2781,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9879bd47463..f5fbe576d2b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_root *root, int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + caching_ctl->work.func = caching_thread; + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); /* * We can't do the read from on-disk cache during a commit since we need @@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, if (trans && (!trans->transaction->in_commit) && (root && root != root->fs_info->tree_root) && btrfs_test_opt(root, SPACE_CACHE)) { - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - return 0; - } - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); - ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); if (ret == 1) { + cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; } else { - cache->cached = BTRFS_CACHE_NO; + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } } spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); if (ret == 1) { + put_caching_control(caching_ctl); free_excluded_extents(fs_info->extent_root, cache); return 0; } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); } - if (load_cache_only) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); - caching_ctl->work.func = caching_thread; - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); + if (load_cache_only) { + put_caching_control(caching_ctl); return 0; } - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); down_write(&fs_info->extent_commit_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); @@ -2781,7 +2822,7 @@ out_free: btrfs_release_path(path); out: spin_lock(&block_group->lock);< |