diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/acl.c | 6 | ||||
-rw-r--r-- | fs/btrfs/async-thread.c | 81 | ||||
-rw-r--r-- | fs/btrfs/async-thread.h | 10 | ||||
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 16 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 38 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 60 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 724 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 134 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 31 | ||||
-rw-r--r-- | fs/btrfs/extent_map.c | 2 | ||||
-rw-r--r-- | fs/btrfs/file.c | 81 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 2 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 442 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 69 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.c | 99 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.h | 4 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 4 | ||||
-rw-r--r-- | fs/btrfs/root-tree.c | 2 | ||||
-rw-r--r-- | fs/btrfs/super.c | 9 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 72 | ||||
-rw-r--r-- | fs/btrfs/transaction.h | 5 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 56 | ||||
-rw-r--r-- | fs/btrfs/tree-log.h | 3 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 4 | ||||
-rw-r--r-- | fs/btrfs/xattr.c | 2 |
25 files changed, 1508 insertions, 448 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index f128427b995..36160424427 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,7 +27,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .set = btrfs_xattr_acl_access_set, }; -#else /* CONFIG_FS_POSIX_ACL */ +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ int btrfs_acl_chmod(struct inode *inode) { @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -#endif /* CONFIG_FS_POSIX_ACL */ +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 282ca085c2f..c0861e781cd 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -64,6 +64,51 @@ struct btrfs_worker_thread { }; /* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. + */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + +/* * helper function to move a thread onto the idle list after it * has finished some requests. */ @@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker) goto out; workers->atomic_start_pending = 0; - if (workers->num_workers >= workers->max_workers) + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) goto out; + workers->num_workers_starting += 1; spin_unlock_irqrestore(&workers->lock, flags); - btrfs_start_workers(workers, 1); + start_new_worker(workers); return; out: @@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers) /* * simple init on struct btrfs_workers */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) { workers->num_workers = 0; + workers->num_workers_starting = 0; INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); @@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) workers->name = name; workers->ordered = 0; workers->atomic_start_pending = 0; - workers->atomic_worker_start = 0; + workers->atomic_worker_start = async_helper; } /* * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) { struct btrfs_worker_thread *worker; int ret = 0; @@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) list_add_tail(&worker->worker_list, &workers->idle_list); worker->idle = 1; workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); spin_unlock_irq(&workers->lock); } return 0; @@ -452,6 +504,14 @@ fail: return ret; } +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + /* * run through the list and find a worker thread that doesn't have a lot * to do right now. This can return null if we aren't yet at the thread @@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; struct list_head *next; - int enforce_min = workers->num_workers < workers->max_workers; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; /* * if we find an idle thread, don't move it to the end of the @@ -509,15 +572,17 @@ again: worker = next_worker(workers); if (!worker) { - if (workers->num_workers >= workers->max_workers) { + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { goto fallback; } else if (workers->atomic_worker_start) { workers->atomic_start_pending = 1; goto fallback; } else { + workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - btrfs_start_workers(workers, 1); + __btrfs_start_workers(workers, 1); goto again; } } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index fc089b95ec1..5077746cf85 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ struct btrfs_workers { /* current number of running workers */ int num_workers; + int num_workers_starting; + /* max number of workers allowed. changed by btrfs_start_workers */ int max_workers; @@ -78,9 +80,10 @@ struct btrfs_workers { /* * are we allowed to sleep while starting workers or are we required - * to start them at a later time? + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. */ - int atomic_worker_start; + struct btrfs_workers *atomic_worker_start; /* list with all the work threads. The workers on the idle thread * may be actively servicing jobs, but they haven't yet hit the @@ -109,7 +112,8 @@ struct btrfs_workers { int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); int btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 82ee56bba29..f6783a42f01 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -86,6 +86,12 @@ struct btrfs_inode { * transid of the trans_handle that last modified this inode */ u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + /* * transid that last logged this inode */ @@ -128,6 +134,16 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. + */ + spinlock_t accounting_lock; + int reserved_extents; + int outstanding_extents; + + /* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80599b4e42b..444b3e9b92a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,21 +675,28 @@ struct btrfs_space_info { current allocations */ u64 bytes_readonly; /* total bytes that are read only */ u64 bytes_super; /* total bytes reserved for the super blocks */ - - /* delalloc accounting */ - u64 bytes_delalloc; /* number of bytes reserved for allocation, - this space is not necessarily reserved yet - by the allocator */ + u64 bytes_root; /* the number of bytes needed to commit a + transaction */ u64 bytes_may_use; /* number of bytes that may be used for - delalloc */ + delalloc/allocations */ + u64 bytes_delalloc; /* number of bytes currently reserved for + delayed allocation */ int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for this space */ + int force_delalloc; /* make people start doing filemap_flush until + we're under a threshold */ struct list_head list; + /* for controlling how we free up space for allocations */ + wait_queue_head_t allocate_wait; + wait_queue_head_t flush_wait; + int allocating_chunk; + int flushing; + /* for block groups in our same type */ struct list_head block_groups; spinlock_t lock; @@ -903,6 +910,7 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ + struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; struct btrfs_workers endio_workers; @@ -910,6 +918,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers submit_workers; + struct btrfs_workers enospc_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -1000,7 +1009,10 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; unsigned long log_transid; + unsigned long last_log_commit; unsigned long log_batch; + pid_t log_start_pid; + bool log_multiple_pids; u64 objectid; u64 last_trans; @@ -1141,6 +1153,7 @@ struct btrfs_root { #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2022,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -int btrfs_check_metadata_free_space(struct btrfs_root *root); +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct btrfs_root *root, @@ -2314,7 +2332,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); -extern struct dentry_operations btrfs_dentry_operations; +extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -2326,7 +2344,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); -extern struct file_operations btrfs_file_operations; +extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 locked_end, @@ -2357,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 644e796fd64..02b6afbd745 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -822,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, int btrfs_write_tree_block(struct extent_buffer *buf) { - return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, - buf->start + buf->len - 1, WB_SYNC_ALL); + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, - buf->start, buf->start + buf->len - 1); + return filemap_fdatawait_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); } struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, @@ -917,6 +917,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); root->log_batch = 0; root->log_transid = 0; + root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -1087,6 +1088,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->last_log_commit = 0; return 0; } @@ -1630,7 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; - fs_info->metadata_ratio = 8; + fs_info->metadata_ratio = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1746,21 +1748,25 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -EINVAL; goto fail_iput; } -printk("thread pool is %d\n", fs_info->thread_pool_size); - /* - * we need to start all the end_io workers up front because the - * queue work function gets called at interrupt time, and so it - * cannot dynamically grow. - */ + + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size)); + fs_info->thread_pool_size), + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->enospc_workers, "enospc", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1774,15 +1780,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->delalloc_workers.idle_thresh = 2; fs_info->delalloc_workers.ordered = 1; - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size); + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1794,12 +1805,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; - fs_info->endio_workers.atomic_worker_start = 1; - fs_info->endio_meta_workers.atomic_worker_start = 1; - fs_info->endio_write_workers.atomic_worker_start = 1; - fs_info->endio_meta_write_workers.atomic_worker_start = 1; - btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); btrfs_start_workers(&fs_info->submit_workers, 1); btrfs_start_workers(&fs_info->delalloc_workers, 1); btrfs_start_workers(&fs_info->fixup_workers, 1); @@ -1807,6 +1814,7 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); btrfs_start_workers(&fs_info->endio_meta_workers, 1); btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->enospc_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2012,6 +2020,7 @@ fail_chunk_root: free_extent_buffer(chunk_root->node); free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2020,6 +2029,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2437,6 +2447,7 @@ int close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2445,6 +2456,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 993f93ff7ba..94627c4cc19 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, struct extent_buffer **must_clean); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -1566,23 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#ifdef BIO_RW_DISCARD static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, DISCARD_FL_BARRIER); } -#endif static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { -#ifdef BIO_RW_DISCARD int ret; u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; + if (!btrfs_test_opt(root, DISCARD)) + return 0; + /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, bytenr, &map_length, &multi, 0); @@ -1602,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } return ret; -#else - return 0; -#endif } int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -2765,67 +2764,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) alloc_target); } +static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) +{ + u64 num_bytes; + int level; + + level = BTRFS_MAX_LEVEL - 2; + /* + * NOTE: these calculations are absolutely the worst possible case. + * This assumes that _every_ item we insert will require a new leaf, and + * that the tree has grown to its maximum level size. + */ + + /* + * for every item we insert we could insert both an extent item and a + * extent ref item. Then for ever item we insert, we will need to cow + * both the original leaf, plus the leaf to the left and right of it. + * + * Unless we are talking about the extent root, then we just want the + * number of items * 2, since we just need the extent item plus its ref. + */ + if (root == root->fs_info->extent_root) + num_bytes = num_items * 2; + else + num_bytes = (num_items + (2 * num_items)) * 3; + + /* + * num_bytes is total number of leaves we could need times the leaf + * size, and then for every leaf we could end up cow'ing 2 nodes per + * level, down to the leaf level. + */ + num_bytes = (num_bytes * root->leafsize) + + (num_bytes * (level * 2)) * root->nodesize; + + return num_bytes; +} + /* - * for now this just makes sure we have at least 5% of our metadata space free - * for use. + * Unreserve metadata space for delalloc. If we have less reserved credits than + * we have extents, this function does nothing. */ -int btrfs_check_metadata_free_space(struct btrfs_root *root) +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *meta_sinfo; - u64 alloc_target, thresh; - int committed = 0, ret; + u64 num_bytes; + u64 alloc_target; + bool bug = false; /* get the space info for where the metadata will live */ alloc_target = btrfs_get_alloc_profile(root, 0); meta_sinfo = __find_space_info(info, alloc_target); - if (!meta_sinfo) - goto alloc; -again: + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); + spin_lock(&meta_sinfo->lock); - if (!meta_sinfo->full) - thresh = meta_sinfo->total_bytes * 80; - else - thresh = meta_sinfo->total_bytes * 95; + spin_lock(&BTRFS_I(inode)->accounting_lock); + if (BTRFS_I(inode)->reserved_extents <= + BTRFS_I(inode)->outstanding_extents) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + spin_unlock(&meta_sinfo->lock); + return 0; + } + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + BTRFS_I(inode)->reserved_extents--; + BUG_ON(BTRFS_I(inode)->reserved_extents < 0); + + if (meta_sinfo->bytes_delalloc < num_bytes) { + bug = true; + meta_sinfo->bytes_delalloc = 0; + } else { + meta_sinfo->bytes_delalloc -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + return 0; +} + +static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +{ + u64 thresh; + + thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use; + + thresh = meta_sinfo->total_bytes - thresh; + thresh *= 80; do_div(thresh, 100); + if (thresh <= meta_sinfo->bytes_delalloc) + meta_sinfo->force_delalloc = 1; + else + meta_sinfo->force_delalloc = 0; +} - if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super > thresh) { - struct btrfs_trans_handle *trans; - if (!meta_sinfo->full) { - meta_sinfo->force_alloc = 1; +struct async_flush { + struct btrfs_root *root; + struct btrfs_space_info *info; + struct btrfs_work work; +}; + +static noinline void flush_delalloc_async(struct btrfs_work *work) +{ + struct async_flush *async; + struct btrfs_root *root; + struct btrfs_space_info *info; + + async = container_of(work, struct async_flush, work); + root = async->root; + info = async->info; + + btrfs_start_delalloc_inodes(root); + wake_up(&info->flush_wait); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); + + kfree(async); +} + +static void wait_on_flush(struct btrfs_space_info *info) +{ + DEFINE_WAIT(wait); + u64 used; + + while (1) { + prepare_to_wait(&info->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_lock(&info->lock); + if (!info->flushing) { + spin_unlock(&info->lock); + break; + } + + used = info->bytes_used + info->bytes_reserved + + info->bytes_pinned + info->bytes_readonly + + info->bytes_super + info->bytes_root + + info->bytes_may_use + info->bytes_delalloc; + if (used < info->total_bytes) { + spin_unlock(&info->lock); + break; + } + spin_unlock(&info->lock); + schedule(); + } + finish_wait(&info->flush_wait, &wait); +} + +static void flush_delalloc(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct async_flush *async; + bool wait = false; + + spin_lock(&info->lock); + + if (!info->flushing) { + info->flushing = 1; + init_waitqueue_head(&info->flush_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_on_flush(info); + return; + } + + async = kzalloc(sizeof(*async), GFP_NOFS); + if (!async) + goto flush; + + async->root = root; + async->info = info; + async->work.func = flush_delalloc_async; + + btrfs_queue_worker(&root->fs_info->enospc_workers, + &async->work); + wait_on_flush(info); + return; + +flush: + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); +} + +static int maybe_allocate_chunk(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + bool wait = false; + int ret = 0; + u64 min_metadata; + u64 free_space; + + free_space = btrfs_super_total_bytes(disk_super); + /* + * we allow the metadata to grow to a max of either 10gb or 5% of the + * space in the volume. + */ + min_metadata = min((u64)10 * 1024 * 1024 * 1024, + div64_u64(free_space * 5, 100)); + if (info->total_bytes >= min_metadata) { + spin_unlock(&info->lock); + return 0; + } + + if (info->full) { + spin_unlock(&info->lock); + return 0; + } + + if (!info->allocating_chunk) { + info->force_alloc = 1; + info->allocating_chunk = 1; + init_waitqueue_head(&info->allocate_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_event(info->allocate_wait, + !info->allocating_chunk); + return 1; + } + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 4096 + 2 * 1024 * 1024, + info->flags, 0); + btrfs_end_transaction(trans, root); + if (ret) + goto out; +out: + spin_lock(&info->lock); + info->allocating_chunk |