diff options
Diffstat (limited to 'fs/btrfs')
92 files changed, 73082 insertions, 18068 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 7bb3c020e57..a66768ebc8d 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,16 +1,26 @@ config BTRFS_FS - tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" - depends on EXPERIMENTAL - select LIBCRC32C + tristate "Btrfs filesystem support" + select CRYPTO + select CRYPTO_CRC32C select ZLIB_INFLATE select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS + select RAID6_PQ + select XOR_BLOCKS + help - Btrfs is a new filesystem with extents, writable snapshotting, - support for multiple devices and many more features. + Btrfs is a general purpose copy-on-write filesystem with extents, + writable snapshotting, support for multiple devices and many more + features focused on fault tolerance, repair and easy administration. + + The filesystem disk format is no longer unstable, and it's not + expected to change unless there are strong reasons to do so. If there + is a format change, file systems with a unchanged format will + continue to be mountable and usable by newer kernels. - Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET - FINALIZED. You should say N here unless you are interested in - testing Btrfs with non-critical data. + For more information, please see the web pages at + http://btrfs.wiki.kernel.org. To compile this file system support as a module, choose M here. The module will be called btrfs. @@ -29,3 +39,52 @@ config BTRFS_FS_POSIX_ACL Linux website <http://acl.bestbits.at/>. If you don't know what Access Control Lists are, say N + +config BTRFS_FS_CHECK_INTEGRITY + bool "Btrfs with integrity check tool compiled in (DANGEROUS)" + depends on BTRFS_FS + help + Adds code that examines all block write requests (including + writes of the super block). The goal is to verify that the + state of the filesystem on disk is always consistent, i.e., + after a power-loss or kernel panic event the filesystem is + in a consistent state. + + If the integrity check tool is included and activated in + the mount options, plenty of kernel memory is used, and + plenty of additional CPU cycles are spent. Enabling this + functionality is not intended for normal use. + + In most cases, unless you are a btrfs developer who needs + to verify the integrity of (super)-block write requests + during the run of a regression test, say N + +config BTRFS_FS_RUN_SANITY_TESTS + bool "Btrfs will run sanity tests upon loading" + depends on BTRFS_FS + help + This will run some basic sanity tests on the free space cache + code to make sure it is acting as it should. These are mostly + regression tests and are only really interesting to btrfs + developers. + + If unsure, say N. + +config BTRFS_DEBUG + bool "Btrfs debugging support" + depends on BTRFS_FS + help + Enable run-time debugging support for the btrfs filesystem. This may + enable additional and expensive checks with negative impact on + performance, or export extra information via sysfs. + + If unsure, say N. + +config BTRFS_ASSERT + bool "Btrfs assert support" + depends on BTRFS_FS + help + Enable run-time assertion checking. This will result in panics if + any of the assertions trip. This is meant for btrfs developers only. + + If unsure, say N. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a35eb36b32f..6d1d0b93b1a 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,5 +6,14 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o delayed-ref.o relocation.o + export.o tree-log.o free-space-cache.o zlib.o lzo.o \ + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + uuid-tree.o props.o hash.o + +btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o +btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o + +btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ + tests/extent-buffer-tests.o tests/btrfs-tests.o \ + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6ef7b26724e..9a0124a9585 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -28,19 +28,13 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - -static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { int size; const char *name; char *value = NULL; struct posix_acl *acl; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -58,67 +52,42 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(value, size); - set_cached_acl(inode, type, acl); - } - kfree(value); + } + if (size > 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; - set_cached_acl(inode, type, acl); } else { acl = ERR_PTR(-EIO); } + kfree(value); - return acl; -} - -static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int ret = 0; - - acl = btrfs_get_acl(dentry->d_inode, type); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(acl, value, size); - posix_acl_release(acl); + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); - return ret; + return acl; } /* * Needs to be called with fs_mutex held */ -static int btrfs_set_acl(struct btrfs_trans_handle *trans, +static int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; const char *name; char *value = NULL; - mode_t mode; - - if (acl) { - ret = posix_acl_valid(acl); - if (ret < 0) - return ret; - ret = 0; - } switch (type) { case ACL_TYPE_ACCESS: - mode = inode->i_mode; name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &mode); + ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) return ret; - inode->i_mode = mode; + if (ret == 0) + acl = NULL; } ret = 0; break; @@ -139,7 +108,7 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, goto out; } - ret = posix_acl_to_xattr(acl, value, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (ret < 0) goto out; } @@ -154,44 +123,9 @@ out: return ret; } -static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - int ret; - struct posix_acl *acl = NULL; - - if (value) { - acl = posix_acl_from_xattr(value, size); - if (acl == NULL) { - value = NULL; - size = 0; - } else if (IS_ERR(acl)) { - return PTR_ERR(acl); - } - } - - ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); - - posix_acl_release(acl); - - return ret; -} - -int btrfs_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - int error = -EAGAIN; - - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } - - return error; + return __btrfs_set_acl(NULL, inode, acl, type); } /* @@ -202,111 +136,31 @@ int btrfs_check_acl(struct inode *inode, int mask) int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; int ret = 0; /* this happens with subvols */ if (!dir) return 0; - if (!S_ISLNK(inode->i_mode)) { - if (IS_POSIXACL(dir)) { - acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } + ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (ret) + return ret; - if (!acl) - inode->i_mode &= ~current_umask(); + if (default_acl) { + ret = __btrfs_set_acl(trans, inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); } - if (IS_POSIXACL(dir) && acl) { - struct posix_acl *clone; - mode_t mode; - - if (S_ISDIR(inode->i_mode)) { - ret = btrfs_set_acl(trans, inode, acl, - ACL_TYPE_DEFAULT); - if (ret) - goto failed; - } - clone = posix_acl_clone(acl, GFP_NOFS); - ret = -ENOMEM; - if (!clone) - goto failed; - - mode = inode->i_mode; - ret = posix_acl_create_masq(clone, &mode); - if (ret >= 0) { - inode->i_mode = mode; - if (ret > 0) { - /* we need an acl */ - ret = btrfs_set_acl(trans, inode, clone, - ACL_TYPE_ACCESS); - } - } - posix_acl_release(clone); + if (acl) { + if (!ret) + ret = __btrfs_set_acl(trans, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); } -failed: - posix_acl_release(acl); - - return ret; -} - -int btrfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl, *clone; - int ret = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!IS_POSIXACL(inode)) - return 0; - - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - - ret = posix_acl_chmod_masq(clone, inode->i_mode); - if (!ret) - ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); - - posix_acl_release(clone); + if (!default_acl && !acl) + cache_no_acl(inode); return ret; } - -struct xattr_handler btrfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; - -struct xattr_handler btrfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; - -#else /* CONFIG_BTRFS_FS_POSIX_ACL */ - -int btrfs_acl_chmod(struct inode *inode) -{ - return 0; -} - -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) -{ - return 0; -} - -#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 462859a3014..5a201d81049 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -21,697 +22,315 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> +#include <linux/workqueue.h> #include "async-thread.h" +#include "ctree.h" + +#define WORK_DONE_BIT 0 +#define WORK_ORDER_DONE_BIT 1 +#define WORK_HIGH_PRIO_BIT 2 + +#define NO_THRESHOLD (-1) +#define DFT_THRESHOLD (32) + +struct __btrfs_workqueue { + struct workqueue_struct *normal_wq; + /* List head pointing to ordered work list */ + struct list_head ordered_list; + + /* Spinlock for ordered_list */ + spinlock_t list_lock; + + /* Thresholding related variants */ + atomic_t pending; + int max_active; + int current_max; + int thresh; + unsigned int count; + spinlock_t thres_lock; +}; -#define WORK_QUEUED_BIT 0 -#define WORK_DONE_BIT 1 -#define WORK_ORDER_DONE_BIT 2 -#define WORK_HIGH_PRIO_BIT 3 - -/* - * container for the kthread task pointer and the list of pending work - * One of these is allocated per thread. - */ -struct btrfs_worker_thread { - /* pool we belong to */ - struct btrfs_workers *workers; - - /* list of struct btrfs_work that are waiting for service */ - struct list_head pending; - struct list_head prio_pending; - - /* list of worker threads from struct btrfs_workers */ - struct list_head worker_list; +struct btrfs_workqueue { + struct __btrfs_workqueue *normal; + struct __btrfs_workqueue *high; +}; - /* kthread */ - struct task_struct *task; +static inline struct __btrfs_workqueue +*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, + int thresh) +{ + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); - /* number of things on the pending list */ - atomic_t num_pending; + if (unlikely(!ret)) + return NULL; - /* reference counter for this struct */ - atomic_t refs; + ret->max_active = max_active; + atomic_set(&ret->pending, 0); + if (thresh == 0) + thresh = DFT_THRESHOLD; + /* For low threshold, disabling threshold is a better choice */ + if (thresh < DFT_THRESHOLD) { + ret->current_max = max_active; + ret->thresh = NO_THRESHOLD; + } else { + ret->current_max = 1; + ret->thresh = thresh; + } - unsigned long sequence; + if (flags & WQ_HIGHPRI) + ret->normal_wq = alloc_workqueue("%s-%s-high", flags, + ret->max_active, + "btrfs", name); + else + ret->normal_wq = alloc_workqueue("%s-%s", flags, + ret->max_active, "btrfs", + name); + if (unlikely(!ret->normal_wq)) { + kfree(ret); + return NULL; + } - /* protects the pending list. */ - spinlock_t lock; + INIT_LIST_HEAD(&ret->ordered_list); + spin_lock_init(&ret->list_lock); + spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); + return ret; +} - /* set to non-zero when this thread is already awake and kicking */ - int working; +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); - /* are we currently idle */ - int idle; -}; +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + int flags, + int max_active, + int thresh) +{ + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); -/* - * btrfs_start_workers uses kthread_run, which can block waiting for memory - * for a very long time. It will actually throttle on page writeback, - * and so it may not make progress until after our btrfs worker threads - * process all of the pending work structs in their queue - * - * This means we can't use btrfs_start_workers from inside a btrfs worker - * thread that is used as part of cleaning dirty memory, which pretty much - * involves all of the worker threads. - * - * Instead we have a helper queue who never has more than one thread - * where we scheduler thread start operations. This worker_start struct - * is used to contain the work and hold a pointer to the queue that needs - * another worker. - */ -struct worker_start { - struct btrfs_work work; - struct btrfs_workers *queue; -}; + if (unlikely(!ret)) + return NULL; -static void start_new_worker_func(struct btrfs_work *work) -{ - struct worker_start *start; - start = container_of(work, struct worker_start, work); - btrfs_start_workers(start->queue, 1); - kfree(start); -} + ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + max_active, thresh); + if (unlikely(!ret->normal)) { + kfree(ret); + return NULL; + } -static int start_new_worker(struct btrfs_workers *queue) -{ - struct worker_start *start; - int ret; - - start = kzalloc(sizeof(*start), GFP_NOFS); - if (!start) - return -ENOMEM; - - start->work.func = start_new_worker_func; - start->queue = queue; - ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); - if (ret) - kfree(start); + if (flags & WQ_HIGHPRI) { + ret->high = __btrfs_alloc_workqueue(name, flags, max_active, + thresh); + if (unlikely(!ret->high)) { + __btrfs_destroy_workqueue(ret->normal); + kfree(ret); + return NULL; + } + } return ret; } /* - * helper function to move a thread onto the idle list after it - * has finished some requests. + * Hook for threshold which will be called in btrfs_queue_work. + * This hook WILL be called in IRQ handler context, + * so workqueue_set_max_active MUST NOT be called in this hook */ -static void check_idle_worker(struct btrfs_worker_thread *worker) +static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) { - if (!worker->idle && atomic_read(&worker->num_pending) < - worker->workers->idle_thresh / 2) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 1; - - /* the list may be empty if the worker is just starting */ - if (!list_empty(&worker->worker_list)) { - list_move(&worker->worker_list, - &worker->workers->idle_list); - } - spin_unlock_irqrestore(&worker->workers->lock, flags); - } + if (wq->thresh == NO_THRESHOLD) + return; + atomic_inc(&wq->pending); } /* - * helper function to move a thread off the idle list after new - * pending work is added. + * Hook for threshold which will be called before executing the work, + * This hook is called in kthread content. + * So workqueue_set_max_active is called here. */ -static void check_busy_worker(struct btrfs_worker_thread *worker) -{ - if (worker->idle && atomic_read(&worker->num_pending) >= - worker->workers->idle_thresh) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 0; - - if (!list_empty(&worker->worker_list)) { - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - } - spin_unlock_irqrestore(&worker->workers->lock, flags); - } -} - -static void check_pending_worker_creates(struct btrfs_worker_thread *worker) +static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) { - struct btrfs_workers *workers = worker->workers; - unsigned long flags; + int new_max_active; + long pending; + int need_change = 0; - rmb(); - if (!workers->atomic_start_pending) + if (wq->thresh == NO_THRESHOLD) return; - spin_lock_irqsave(&workers->lock, flags); - if (!workers->atomic_start_pending) - goto out; - - workers->atomic_start_pending = 0; - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) - goto out; - - workers->num_workers_starting += 1; - spin_unlock_irqrestore(&workers->lock, flags); - start_new_worker(workers); - return; + atomic_dec(&wq->pending); + spin_lock(&wq->thres_lock); + /* + * Use wq->count to limit the calling frequency of + * workqueue_set_max_active. + */ + wq->count++; + wq->count %= (wq->thresh / 4); + if (!wq->count) + goto out; + new_max_active = wq->current_max; + /* + * pending may be changed later, but it's OK since we really + * don't need it so accurate to calculate new_max_active. + */ + pending = atomic_read(&wq->pending); + if (pending > wq->thresh) + new_max_active++; + if (pending < wq->thresh / 2) + new_max_active--; + new_max_active = clamp_val(new_max_active, 1, wq->max_active); + if (new_max_active != wq->current_max) { + need_change = 1; + wq->current_max = new_max_active; + } out: - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&wq->thres_lock); + + if (need_change) { + workqueue_set_max_active(wq->normal_wq, wq->current_max); + } } -static noinline int run_ordered_completions(struct btrfs_workers *workers, - struct btrfs_work *work) +static void run_ordered_work(struct __btrfs_workqueue *wq) { - if (!workers->ordered) - return 0; - - set_bit(WORK_DONE_BIT, &work->flags); - - spin_lock(&workers->order_lock); + struct list_head *list = &wq->ordered_list; + struct btrfs_work *work; + spinlock_t *lock = &wq->list_lock; + unsigned long flags; while (1) { - if (!list_empty(&workers->prio_order_list)) { - work = list_entry(workers->prio_order_list.next, - struct btrfs_work, order_list); - } else if (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - } else { + spin_lock_irqsave(lock, flags); + if (list_empty(list)) break; - } + work = list_entry(list->next, struct btrfs_work, + ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; - /* we are going to call the ordered done function, but + /* + * we are going to call the ordered done function, but * we leave the work item on the list as a barrier so * that later work items that are done don't have their * functions called before this one returns */ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) break; - - spin_unlock(&workers->order_lock); - + trace_btrfs_ordered_sched(work); + spin_unlock_irqrestore(lock, flags); work->ordered_func(work); - /* now take the lock again and call the freeing code */ - spin_lock(&workers->order_lock); - list_del(&work->order_list); - work->ordered_free(work); - } + /* now take the lock again and drop our item from the list */ + spin_lock_irqsave(lock, flags); + list_del(&work->ordered_list); + spin_unlock_irqrestore(lock, flags); - spin_unlock(&workers->order_lock); - return 0; -} - -static void put_worker(struct btrfs_worker_thread *worker) -{ - if (atomic_dec_and_test(&worker->refs)) - kfree(worker); -} - -static int try_worker_shutdown(struct btrfs_worker_thread *worker) -{ - int freeit = 0; - - spin_lock_irq(&worker->lock); - spin_lock(&worker->workers->lock); - if (worker->workers->num_workers > 1 && - worker->idle && - !worker->working && - !list_empty(&worker->worker_list) && - list_empty(&worker->prio_pending) && - list_empty(&worker->pending) && - atomic_read(&worker->num_pending) == 0) { - freeit = 1; - list_del_init(&worker->worker_list); - worker->workers->num_workers--; + /* + * we don't want to call the ordered free functions + * with the lock held though + */ + work->ordered_free(work); + trace_btrfs_all_work_done(work); } - spin_unlock(&worker->workers->lock); - spin_unlock_irq(&worker->lock); - - if (freeit) - put_worker(worker); - return freeit; + spin_unlock_irqrestore(lock, flags); } -static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, - struct list_head *prio_head, - struct list_head *head) +static void normal_work_helper(struct work_struct *arg) { - struct btrfs_work *work = NULL; - struct list_head *cur = NULL; - - if(!list_empty(prio_head)) - cur = prio_head->next; - - smp_mb(); - if (!list_empty(&worker->prio_pending)) - goto refill; - - if (!list_empty(head)) - cur = head->next; - - if (cur) - goto out; - -refill: - spin_lock_irq(&worker->lock); - list_splice_tail_init(&worker->prio_pending, prio_head); - list_splice_tail_init(&worker->pending, head); - - if (!list_empty(prio_head)) - cur = prio_head->next; - else if (!list_empty(head)) - cur = head->next; - spin_unlock_irq(&worker->lock); - - if (!cur) - goto out_fail; - -out: - work = list_entry(cur, struct btrfs_work, list); - -out_fail: - return work; -} - -/* - * main loop for servicing work items - */ -static int worker_loop(void *arg) -{ - struct btrfs_worker_thread *worker = arg; - struct list_head head; - struct list_head prio_head; struct btrfs_work *work; + struct __btrfs_workqueue *wq; + int need_order = 0; - INIT_LIST_HEAD(&head); - INIT_LIST_HEAD(&prio_head); - - do { -again: - while (1) { - - - work = get_next_work(worker, &prio_head, &head); - if (!work) - break; - - list_del(&work->list); - clear_bit(WORK_QUEUED_BIT, &work->flags); - - work->worker = worker; - - work->func(work); - - atomic_dec(&worker->num_pending); - /* - * unless this is an ordered work queue, - * 'work' was probably freed by func above. - */ - run_ordered_completions(worker->workers, work); - - check_pending_worker_creates(worker); - - } - - spin_lock_irq(&worker->lock); - check_idle_worker(worker); - - if (freezing(current)) { - worker->working = 0; - spin_unlock_irq(&worker->lock); - refrigerator(); - } else { - spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) { - cpu_relax(); - /* - * we've dropped the lock, did someone else - * jump_in? - */ - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - /* - * this short schedule allows more work to - * come in without the queue functions - * needing to go through wake_up_process() - * - * worker->working is still 1, so nobody - * is going to try and wake us up - */ - schedule_timeout(1); - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - if (kthread_should_stop()) - break; - - /* still no more work?, sleep for real */ - spin_lock_irq(&worker->lock); - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) { - spin_unlock_irq(&worker->lock); - goto again; - } - - /* - * this makes sure we get a wakeup when someone - * adds something new to the queue - */ - worker->working = 0; - spin_unlock_irq(&worker->lock); - - if (!kthread_should_stop()) { - schedule_timeout(HZ * 120); - if (!worker->working && - try_worker_shutdown(worker)) { - return 0; - } - } - } - __set_current_state(TASK_RUNNING); - } - } while (!kthread_should_stop()); - return 0; -} - -/* - * this will wait for all the worker threads to shutdown - */ -int btrfs_stop_workers(struct btrfs_workers *workers) -{ - struct list_head *cur; - struct btrfs_worker_thread *worker; - int can_stop; - - spin_lock_irq(&workers->lock); - list_splice_init(&workers->idle_list, &workers->worker_list); - while (!list_empty(&workers->worker_list)) { - cur = workers->worker_list.next; - worker = list_entry(cur, struct btrfs_worker_thread, - worker_list); - - atomic_inc(&worker->refs); - workers->num_workers -= 1; - if (!list_empty(&worker->worker_list)) { - list_del_init(&worker->worker_list); - put_worker(worker); - can_stop = 1; - } else - can_stop = 0; - spin_unlock_irq(&workers->lock); - if (can_stop) - kthread_stop(worker->task); - spin_lock_irq(&workers->lock); - put_worker(worker); + work = container_of(arg, struct btrfs_work, normal_work); + /* + * We should not touch things inside work in the following cases: + * 1) after work->func() if it has no ordered_free + * Since the struct is freed in work->func(). + * 2) after setting WORK_DONE_BIT + * The work may be freed in other threads almost instantly. + * So we save the needed things here. + */ + if (work->ordered_func) + need_order = 1; + wq = work->wq; + + trace_btrfs_work_sched(work); + thresh_exec_hook(wq); + work->func(work); + if (need_order) { + set_bit(WORK_DONE_BIT, &work->flags); + run_ordered_work(wq); } - spin_unlock_irq(&workers->lock); - return 0; + if (!need_order) + trace_btrfs_all_work_done(work); } -/* - * simple init on struct btrfs_workers - */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_helper) +void btrfs_init_work(struct btrfs_work *work, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free) { - workers->num_workers = 0; - workers->num_workers_starting = 0; - INIT_LIST_HEAD(&workers->worker_list); - INIT_LIST_HEAD(&workers->idle_list); - INIT_LIST_HEAD(&workers->order_list); - INIT_LIST_HEAD(&workers->prio_order_list); - spin_lock_init(&workers->lock); - spin_lock_init(&workers->order_lock); - workers->max_workers = max; - workers->idle_thresh = 32; - workers->name = name; - workers->ordered = 0; - workers->atomic_start_pending = 0; - workers->atomic_worker_start = async_helper; + work->func = func; + work->ordered_func = ordered_func; + work->ordered_free = ordered_free; + INIT_WORK(&work->normal_work, normal_work_helper); + INIT_LIST_HEAD(&work->ordered_list); + work->flags = 0; } -/* - * starts new worker threads. This does not enforce the max worker - * count in case you need to temporarily go past it. - */ -static int __btrfs_start_workers(struct btrfs_workers *workers, - int num_workers) +static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, + struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - int ret = 0; - int i; - - for (i = 0; i < num_workers; i++) { - worker = kzalloc(sizeof(*worker), GFP_NOFS); - if (!worker) { - ret = -ENOMEM; - goto fail; - } + unsigned long flags; - INIT_LIST_HEAD(&worker->pending); - INIT_LIST_HEAD(&worker->prio_pending); - INIT_LIST_HEAD(&worker->worker_list); - spin_lock_init(&worker->lock); - - atomic_set(&worker->num_pending, 0); - atomic_set(&worker->refs, 1); - worker->workers = workers; - worker->task = kthread_run(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + i); - if (IS_ERR(worker->task)) { - ret = PTR_ERR(worker->task); - kfree(worker); - goto fail; - } - spin_lock_irq(&workers->lock); - list_add_tail(&worker->worker_list, &workers->idle_list); - worker->idle = 1; - workers->num_workers++; - workers->num_workers_starting--; - WARN_ON(workers->num_workers_starting < 0); - spin_unlock_irq(&workers->lock); + work->wq = wq; + thresh_queue_hook(wq); + if (work->ordered_func) { + spin_lock_irqsave(&wq->list_lock, flags); + list_add_tail(&work->ordered_list, &wq->ordered_list); + spin_unlock_irqrestore(&wq->list_lock, flags); } - return 0; -fail: - btrfs_stop_workers(workers); - return ret; + queue_work(wq->normal_wq, &work->normal_work); + trace_btrfs_work_queued(work); } -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work) { - spin_lock_irq(&workers->lock); - workers->num_workers_starting += num_workers; - spin_unlock_irq(&workers->lock); - return __btrfs_start_workers(workers, num_workers); -} + struct __btrfs_workqueue *dest_wq; -/* - * run through the list and find a worker thread that doesn't have a lot - * to do right now. This can return null if we aren't yet at the thread - * count limit and all of the threads are busy. - */ -static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) -{ - struct btrfs_worker_thread *worker; - struct list_head *next; - int enforce_min; - - enforce_min = (workers->num_workers + workers->num_workers_starting) < - workers->max_workers; - - /* - * if we find an idle thread, don't move it to the end of the - * idle list. This improves the chance that the next submission - * will reuse the same thread, and maybe catch it while it is still - * working - */ - if (!list_empty(&workers->idle_list)) { - next = workers->idle_list.next; - worker = list_entry(next, struct btrfs_worker_thread, - worker_list); - return worker; - } - if (enforce_min || list_empty(&workers->worker_list)) - return NULL; - - /* - * if we pick a busy task, move the task to the end of the list. - * hopefully this will keep things somewhat evenly balanced. - * Do the move in batches based on the sequence number. This groups - * requests submitted at roughly the same time onto the same worker. - */ - next = workers->worker_list.next; - worker = list_entry(next, struct btrfs_worker_thread, worker_list); - worker->sequence++; - - if (worker->sequence % workers->idle_thresh == 0) - list_move_tail(next, &workers->worker_list); - return worker; + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) + dest_wq = wq->high; + else + dest_wq = wq->normal; + __btrfs_queue_work(dest_wq, work); } -/* - * selects a worker thread to take the next job. This will either find - * an idle worker, start a new worker up to the max count, or just return - * one of the existing busy workers. - */ -static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) { - struct btrfs_worker_thread *worker; - unsigned long flags; - struct list_head *fallback; - -again: - spin_lock_irqsave(&workers->lock, flags); - worker = next_worker(workers); - - if (!worker) { - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) { - goto fallback; - } else if (workers->atomic_worker_start) { - workers->atomic_start_pending = 1; - goto fallback; - } else { - workers->num_workers_starting++; - spin_unlock_irqrestore(&workers->lock, flags); - /* we're below the limit, start another worker */ - __btrfs_start_workers(workers, 1); - goto again; - } - } - goto found; - -fallback: - fallback = NULL; - /* - * we have failed to find any workers, just - * return the first one we can find. - */ - if (!list_empty(&workers->worker_list)) - fallback = workers->worker_list.next; - if (!list_empty(&workers->idle_list)) - fallback = workers->idle_list.next; - BUG_ON(!fallback); - worker = list_entry(fallback, - struct btrfs_worker_thread, worker_list); -found: - /* - * this makes sure the worker doesn't exit before it is placed - * onto a busy/idle list - */ - atomic_inc(&worker->num_pending); - spin_unlock_irqrestore(&workers->lock, flags); - return worker; + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); + kfree(wq); } -/* - * btrfs_requeue_work just puts the work item back on the tail of the list - * it was taken from. It is intended for use with long running work functions - * that make some progress and want to give the cpu up for others. - */ -int btrfs_requeue_work(struct btrfs_work *work) +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { - struct btrfs_worker_thread *worker = work->worker; - unsigned long flags; - int wake = 0; - - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - goto out; - - spin_lock_irqsave(&worker->lock, flags); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - atomic_inc(&worker->num_pending); - - /* by definition we're busy, take ourselves off the idle - * list - */ - if (worker->idle) { - spin_lock(&worker->workers->lock); - worker->idle = 0; - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - spin_unlock(&worker->workers->lock); - } - if (!worker->working) { - wake = 1; - worker->working = 1; - } - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); -out: - - return 0; + if (!wq) + return; + if (wq->high) + __btrfs_destroy_workqueue(wq->high); + __btrfs_destroy_workqueue(wq->normal); + kfree(wq); } -void btrfs_set_work_high_prio(struct btrfs_work *work) +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) { - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); + if (!wq) + return; + wq->normal->max_active = max; + if (wq->high) + wq->high->max_active = max; } -/* - * places a struct btrfs_work into the pending queue of one of the kthreads - */ -int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +void btrfs_set_work_high_priority(struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - unsigned long flags; - int wake = 0; - - /* don't requeue something already on a list */ - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - goto out; - - worker = find_worker(workers); - if (workers->ordered) { - /* - * you're not allowed to do ordered queues from an - * interrupt handler - */ - spin_lock(&workers->order_lock); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { - list_add_tail(&work->order_list, - &workers->prio_order_list); - } else { - list_add_tail(&work->order_list, &workers->order_list); - } - spin_unlock(&workers->order_lock); - } else { - INIT_LIST_HEAD(&work->order_list); - } - - spin_lock_irqsave(&worker->lock, flags); - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - check_busy_worker(worker); - - /* - * avoid calling into wake_up_process if this thread has already - * been kicked - */ - if (!worker->working) - wake = 1; - worker->working = 1; - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); - -out: - return 0; + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 5077746cf85..9c6b66d15fb 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -19,101 +20,35 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ -struct btrfs_worker_thread; +struct btrfs_workqueue; +/* Internal use only */ +struct __btrfs_workqueue; +struct btrfs_work; +typedef void (*btrfs_func_t)(struct btrfs_work *arg); -/* - * This is similar to a workqueue, but it is meant to spread the operations - * across all available cpus instead of just the CPU that was used to - * queue the work. There is also some batching introduced to try and - * cut down on context switches. - * - * By default threads are added on demand up to 2 * the number of cpus. - * Changing struct btrfs_workers->max_workers is one way to prevent - * demand creation of kthreads. - * - * the basic model of these worker threads is to embed a btrfs_work - * structure in your own data struct, and use container_of in a - * work function to get back to your data struct. - */ struct btrfs_work { - /* - * func should be set to the function you want called - * your work struct is passed as the only arg - * - * ordered_func must be set for work sent to an ordered work queue, - * and it is called to complete a given work item in the same - * order they were sent to the queue. - */ - void (*func)(struct btrfs_work *work); - void (*ordered_func)(struct btrfs_work *work); - void (*ordered_free)(struct btrfs_work *work); - - /* - * flags should be set to zero. It is used to make sure the - * struct is only inserted once into the list. - */ + btrfs_func_t func; + btrfs_func_t ordered_func; + btrfs_func_t ordered_free; + + /* Don't touch things below */ + struct work_struct normal_work; + struct list_head ordered_list; + struct __btrfs_workqueue *wq; unsigned long flags; - - /* don't touch these */ - struct btrfs_worker_thread *worker; - struct list_head list; - struct list_head order_list; -}; - -struct btrfs_workers { - /* current number of running workers */ - int num_workers; - - int num_workers_starting; - - /* max number of workers allowed. changed by btrfs_start_workers */ - int max_workers; - - /* once a worker has this many requests or fewer, it is idle */ - int idle_thresh; - - /* force completions in the order they were queued */ - int ordered; - - /* more workers required, but in an interrupt handler */ - int atomic_start_pending; - - /* - * are we allowed to sleep while starting workers or are we required - * to start them at a later time? If we can't sleep, this indicates - * which queue we need to use to schedule thread creation. - */ - struct btrfs_workers *atomic_worker_start; - - /* list with all the work threads. The workers on the idle thread - * may be actively servicing jobs, but they haven't yet hit the - * idle thresh limit above. - */ - struct list_head worker_list; - struct list_head idle_list; - - /* - * when operating in ordered mode, this maintains the list - * of work items waiting for completion - */ - struct list_head order_list; - struct list_head prio_order_list; - - /* lock for finding the next worker thread to queue on */ - spinlock_t lock; - - /* lock for the ordered lists */ - spinlock_t order_lock; - - /* extra name for this worker, used for current->name */ - char *name; }; -int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); -int btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_starter); -int btrfs_requeue_work(struct btrfs_work *work); -void btrfs_set_work_high_prio(struct btrfs_work *work); +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + int flags, + int max_active, + int thresh); +void btrfs_init_work(struct btrfs_work *work, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free); +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work); +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); +void btrfs_set_work_high_priority(struct btrfs_work *work); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c new file mode 100644 index 00000000000..e25564bfcb4 --- /dev/null +++ b/fs/btrfs/backref.c @@ -0,0 +1,1883 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/vmalloc.h> +#include "ctree.h" +#include "disk-io.h" +#include "backref.h" +#include "ulist.h" +#include "transaction.h" +#include "delayed-ref.h" +#include "locking.h" + +struct extent_inode_elem { + u64 inum; + u64 offset; + struct extent_inode_elem *next; +}; + +static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, + struct btrfs_file_extent_item *fi, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 offset = 0; + struct extent_inode_elem *e; + + if (!btrfs_file_extent_compression(eb, fi) && + !btrfs_file_extent_encryption(eb, fi) && + !btrfs_file_extent_other_encoding(eb, fi)) { + u64 data_offset; + u64 data_len; + + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + offset = extent_item_pos - data_offset; + } + + e = kmalloc(sizeof(*e), GFP_NOFS); + if (!e) + return -ENOMEM; + + e->next = *eie; + e->inum = key->objectid; + e->offset = key->offset + offset; + *eie = e; + + return 0; +} + +static void free_inode_elem_list(struct extent_inode_elem *eie) +{ + struct extent_inode_elem *eie_next; + + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } +} + +static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int slot; + int nritems; + int extent_type; + int ret; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != wanted_disk_byte) + continue; + + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * this structure records all encountered refs on the way up to the root + */ +struct __prelim_ref { + struct list_head list; + u64 root_id; + struct btrfs_key key_for_search; + int level; + int count; + struct extent_inode_elem *inode_list; + u64 parent; + u64 wanted_disk_byte; +}; + +static struct kmem_cache *btrfs_prelim_ref_cache; + +int __init btrfs_prelim_ref_init(void) +{ + btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", + sizeof(struct __prelim_ref), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_prelim_ref_cache) + return -ENOMEM; + return 0; +} + +void btrfs_prelim_ref_exit(void) +{ + if (btrfs_prelim_ref_cache) + kmem_cache_destroy(btrfs_prelim_ref_cache); +} + +/* + * the rules for all callers of this function are: + * - obtaining the parent is the goal + * - if you add a key, you must know that it is a correct key + * - if you cannot add the parent or a correct key, then we will look into the + * block later to set a correct key + * + * delayed refs + * ============ + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | - | - + * key to resolve | - | y | y | y + * tree block logical | - | - | - | - + * root for resolving | y | y | y | y + * + * - column 1: we've the parent -> done + * - column 2, 3, 4: we use the key to find the parent + * + * on disk refs (inline or keyed) + * ============================== + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | y | - + * key to resolve | - | - | - | y + * tree block logical | y | y | y | y + * root for resolving | - | y | y | y + * + * - column 1, 3: we've the parent -> done + * - column 2: we take the first key from the block to find the parent + * (see __add_missing_keys) + * - column 4: we use the key to find the parent + * + * additional information that's available but not required to find the parent + * block might help in merging entries to gain some speed. + */ + +static int __add_prelim_ref(struct list_head *head, u64 root_id, + struct btrfs_key *key, int level, + u64 parent, u64 wanted_disk_byte, int count, + gfp_t gfp_mask) +{ + struct __prelim_ref *ref; + + if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID) + return 0; + + ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask); + if (!ref) + return -ENOMEM; + + ref->root_id = root_id; + if (key) + ref->key_for_search = *key; + else + memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + + ref->inode_list = NULL; + ref->level = level; + ref->count = count; + ref->parent = parent; + ref->wanted_disk_byte = wanted_disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, + struct ulist *parents, struct __prelim_ref *ref, + int level, u64 time_seq, const u64 *extent_item_pos, + u64 total_refs) +{ + int ret = 0; + int slot; + struct extent_buffer *eb; + struct btrfs_key key; + struct btrfs_key *key_for_search = &ref->key_for_search; + struct btrfs_file_extent_item *fi; + struct extent_inode_elem *eie = NULL, *old = NULL; + u64 disk_byte; + u64 wanted_disk_byte = ref->wanted_disk_byte; + u64 count = 0; + + if (level != 0) { + eb = path->nodes[level]; + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (ret < 0) + return ret; + return 0; + } + + /* + * We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot==nritems. In that case, go to the next leaf before we continue. + */ + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + ret = btrfs_next_old_leaf(root, path, time_seq); + + while (!ret && count < total_refs) { + eb = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(eb, &key, slot); + + if (key.objectid != key_for_search->objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + + if (disk_byte == wanted_disk_byte) { + eie = NULL; + old = NULL; + count++; + if (extent_item_pos) { + ret = check_extent_in_eb(&key, eb, fi, + *extent_item_pos, + &eie); + if (ret < 0) + break; + } + if (ret > 0) + goto next; + ret = ulist_add_merge(parents, eb->start, + (uintptr_t)eie, + (u64 *)&old, GFP_NOFS); + if (ret < 0) + break; + if (!ret && extent_item_pos) { + while (old->next) + old = old->next; + old->next = eie; + } + eie = NULL; + } +next: + ret = btrfs_next_old_item(root, path, time_seq); + } + + if (ret > 0) + ret = 0; + else if (ret < 0) + free_inode_elem_list(eie); + return ret; +} + +/* + * resolve an indirect backref in the form (root_id, key, level) + * to a logical address + */ +static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 time_seq, + struct __prelim_ref *ref, + struct ulist *parents, + const u64 *extent_item_pos, u64 total_refs) +{ + struct btrfs_root *root; + struct btrfs_key root_key; + struct extent_buffer *eb; + int ret = 0; + int root_level; + int level = ref->level; + int index; + + root_key.objectid = ref->root_id; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = PTR_ERR(root); + goto out; + } + + if (path->search_commit_root) + root_level = btrfs_header_level(root->commit_root); + else + root_level = btrfs_old_root_level(root, time_seq); + + if (root_level + 1 == level) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + goto out; + } + + path->lowest_level = level; + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + + /* root node has been locked, we can release @subvol_srcu safely here */ + srcu_read_unlock(&fs_info->subvol_srcu, index); + + pr_debug("search slot in root %llu (level %d, ref count %d) returned " + "%d for key (%llu %u %llu)\n", + ref->root_id, level, ref->count, ret, + ref->key_for_search.objectid, ref->key_for_search.type, + ref->key_for_search.offset); + if (ret < 0) + goto out; + + eb = path->nodes[level]; + while (!eb) { + if (WARN_ON(!level)) { + ret = 1; + goto out; + } + level--; + eb = path->nodes[level]; + } + + ret = add_all_parents(root, path, parents, ref, level, time_seq, + extent_item_pos, total_refs); +out: + path->lowest_level = 0; + btrfs_release_path(path); + return ret; +} + +/* + * resolve all indirect backrefs from the list + */ +static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 time_seq, + struct list_head *head, + const u64 *extent_item_pos, u64 total_refs) +{ + int err; + int ret = 0; + struct __prelim_ref *ref; + struct __prelim_ref *ref_safe; + struct __prelim_ref *new_ref; + struct ulist *parents; + struct ulist_node *node; + struct ulist_iterator uiter; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + /* + * _safe allows us to insert directly after the current item without + * iterating over the newly inserted items. + * we're also allowed to re-assign ref during iteration. + */ + list_for_each_entry_safe(ref, ref_safe, head, list) { + if (ref->parent) /* already direct */ + continue; + if (ref->count == 0) + continue; + err = __resolve_indirect_ref(fs_info, path, time_seq, ref, + parents, extent_item_pos, + total_refs); + /* + * we can only tolerate ENOENT,otherwise,we should catch error + * and return directly. + */ + if (err == -ENOENT) { + continue; + } else if (err) { + ret = err; + goto out; + } + + /* we put the first parent into the ref at hand */ + ULIST_ITER_INIT(&uiter); + node = ulist_next(parents, &uiter); + ref->parent = node ? node->val : 0; + ref->inode_list = node ? + (struct extent_inode_elem *)(uintptr_t)node->aux : NULL; + + /* additional parents require new refs being added here */ + while ((node = ulist_next(parents, &uiter))) { + new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache, + GFP_NOFS); + if (!new_ref) { + ret = -ENOMEM; + goto out; + } + memcpy(new_ref, ref, sizeof(*ref)); + new_ref->parent = node->val; + new_ref->inode_list = (struct extent_inode_elem *) + (uintptr_t)node->aux; + list_add(&new_ref->list, &ref->list); + } + ulist_reinit(parents); + } +out: + ulist_free(parents); + return ret; +} + +static inline int ref_for_same_block(struct __prelim_ref *ref1, + struct __prelim_ref *ref2) +{ + if (ref1->level != ref2->level) + return 0; + if (ref1->root_id != ref2->root_id) + return 0; + if (ref1->key_for_search.type != ref2->key_for_search.type) + return 0; + if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) + return 0; + if (ref1->key_for_search.offset != ref2->key_for_search.offset) + return 0; + if (ref1->parent != ref2->parent) + return 0; + + return 1; +} + +/* + * read tree blocks and add keys where required. + */ +static int __add_missing_keys(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + struct list_head *pos; + struct extent_buffer *eb; + + list_for_each(pos, head) { + struct __prelim_ref *ref; + ref = list_entry(pos, struct __prelim_ref, list); + + if (ref->parent) + continue; + if (ref->key_for_search.type) + continue; + BUG_ON(!ref->wanted_disk_byte); + eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, + fs_info->tree_root->leafsize, 0); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } + btrfs_tree_read_lock(eb); + if (btrfs_header_level(eb) == 0) + btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); + else + btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return 0; +} + +/* + * merge two lists of backrefs and adjust counts accordingly + * + * mode = 1: merge identical keys, if key is set + * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. + * additionally, we could even add a key range for the blocks we + * looked into to merge even more (-> replace unresolved refs by those + * having a parent). + * mode = 2: merge identical parents + */ +static void __merge_refs(struct list_head *head, int mode) +{ + struct list_head *pos1; + + list_for_each(pos1, head) { + struct list_head *n2; + struct list_head *pos2; + struct __prelim_ref *ref1; + + ref1 = list_entry(pos1, struct __prelim_ref, list); + + for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; + pos2 = n2, n2 = pos2->next) { + struct __prelim_ref *ref2; + struct __prelim_ref *xchg; + struct extent_inode_elem *eie; + + ref2 = list_entry(pos2, struct __prelim_ref, list); + + if (mode == 1) { + if (!ref_for_same_block(ref1, ref2)) + continue; + if (!ref1->parent && ref2->parent) { + xchg = ref1; + ref1 = ref2; + ref2 = xchg; + } + } else { + if (ref1->parent != ref2->parent) + continue; + } + + eie = ref1->inode_list; + while (eie && eie->next) + eie = eie->next; + if (eie) + eie->next = ref2->inode_list; + else + ref1->inode_list = ref2->inode_list; + ref1->count += ref2->count; + + list_del(&ref2->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref2); + } + + } +} + +/* + * add all currently queued delayed refs from this head whose seq nr is + * smaller or equal that seq to the list + */ +static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, + struct list_head *prefs, u64 *total_refs) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + struct rb_node *n = &head->node.rb_node; + struct btrfs_key key; + struct btrfs_key op_key = {0}; + int sgn; + int ret = 0; + + if (extent_op && extent_op->update_key) + btrfs_disk_key_to_cpu(&op_key, &extent_op->key); + + spin_lock(&head->lock); + n = rb_first(&head->ref_root); + while (n) { + struct btrfs_delayed_ref_node *node; + node = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + n = rb_next(n); + if (node->seq > seq) + continue; + + switch (node->action) { + case BTRFS_ADD_DELAYED_EXTENT: + case BTRFS_UPDATE_DELAYED_HEAD: + WARN_ON(1); + continue; + case BTRFS_ADD_DELAYED_REF: + sgn = 1; + break; + case BTRFS_DROP_DELAYED_REF: + sgn = -1; + break; + default: + BUG_ON(1); + } + *total_refs += (node->ref_mod * sgn); + switch (node->type) { + case BTRFS_TREE_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, &op_key, + ref->level + 1, 0, node->bytenr, + node->ref_mod * sgn, GFP_ATOMIC); + break; + } + case BTRFS_SHARED_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, NULL, + ref->level + 1, ref->parent, + node->bytenr, + node->ref_mod * sgn, GFP_ATOMIC); + break; + } + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, + node->bytenr, + node->ref_mod * sgn, GFP_ATOMIC); + break; + } + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, + ref->parent, node->bytenr, + node->ref_mod * sgn, GFP_ATOMIC); + break; + } + default: + WARN_ON(1); + } + if (ret) + break; + } + spin_unlock(&head->lock); + return ret; +} + +/* + * add all inline backrefs for bytenr to the list + */ +static int __add_inline_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + int *info_level, struct list_head *prefs, + u64 *total_refs) +{ + int ret = 0; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + unsigned long ptr; + unsigned long end; + struct btrfs_extent_item *ei; + u64 flags; + u64 item_size; + + /* + * enumerate all inline refs + */ + leaf = path->nodes[0]; + slot = path->slots[0]; + + item_size = btrfs_item_size_nr(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + *total_refs += btrfs_extent_refs(leaf, ei); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)ptr; + *info_level = btrfs_tree_block_level(leaf, info); + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) { + *info_level = found_key.offset; + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + u64 offset; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + + switch (type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, NULL, + *info_level + 1, offset, + bytenr, 1, GFP_NOFS); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, + bytenr, count, GFP_NOFS); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, offset, NULL, + *info_level + 1, 0, + bytenr, 1, GFP_NOFS); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count, GFP_NOFS); + break; + } + default: + WARN_ON(1); + } + if (ret) + return ret; + ptr += btrfs_extent_inline_ref_size(type); + } + + return 0; +} + +/* + * add all non-inline backrefs for bytenr to the list + */ +static int __add_keyed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + int info_level, struct list_head *prefs) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + + while (1) { + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + break; + if (ret) { + ret = 0; + break; + } + + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid != bytenr) + break; + if (key.type < BTRFS_TREE_BLOCK_REF_KEY) + continue; + if (key.type > BTRFS_SHARED_DATA_REF_KEY) + break; + + switch (key.type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, NULL, + info_level + 1, key.offset, + bytenr, 1, GFP_NOFS); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = btrfs_item_ptr(leaf, slot, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, + bytenr, count, GFP_NOFS); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, key.offset, NULL, + info_level + 1, 0, + bytenr, 1, GFP_NOFS); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_data_ref); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count, GFP_NOFS); + break; + } + default: + WARN_ON(1); + } + if (ret) + return ret; + + } + + return ret; +} + +/* + * this adds all existing backrefs (inline backrefs, backrefs and delayed + * refs) for the given bytenr to the refs list, merges duplicates and resolves + * indirect refs to their parent bytenr. + * When roots are found, they're added to the roots list + * + * FIXME some caching might speed things up + */ +static int find_parent_nodes(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist *refs, + struct ulist *roots, const u64 *extent_item_pos) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_delayed_ref_root *delayed_refs = NULL; + struct btrfs_delayed_ref_head *head; + int info_level = 0; + int ret; + struct list_head prefs_delayed; + struct list_head prefs; + struct __prelim_ref *ref; + struct extent_inode_elem *eie = NULL; + u64 total_refs = 0; + + INIT_LIST_HEAD(&prefs); + INIT_LIST_HEAD(&prefs_delayed); + + key.objectid = bytenr; + key.offset = (u64)-1; + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + if (!trans) { + path->search_commit_root = 1; + path->skip_locking = 1; + } + + /* + * grab both a lock on the path and a lock on the delayed ref head. + * We need both to get a consistent picture of how the refs look + * at a specified point in time + */ +again: + head = NULL; + + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (trans && likely(trans->type != __TRANS_DUMMY)) { +#else + if (trans) { +#endif + /* + * look if there are updates for this ref queued and lock the + * head + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + spin_unlock(&delayed_refs->lock); + ret = __add_delayed_refs(head, time_seq, + &prefs_delayed, &total_refs); + mutex_unlock(&head->mutex); + if (ret) + goto out; + } else { + spin_unlock(&delayed_refs->lock); + } + } + + if (path->slots[0]) { + struct extent_buffer *leaf; + int slot; + + path->slots[0]--; + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == bytenr && + (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY)) { + ret = __add_inline_refs(fs_info, path, bytenr, + &info_level, &prefs, + &total_refs); + if (ret) + goto out; + ret = __add_keyed_refs(fs_info, path, bytenr, + info_level, &prefs); + if (ret) + goto out; + } + } + btrfs_release_path(path); + + list_splice_init(&prefs_delayed, &prefs); + + ret = __add_missing_keys(fs_info, &prefs); + if (ret) + goto out; + + __merge_refs(&prefs, 1); + + ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, + extent_item_pos, total_refs); + if (ret) + goto out; + + __merge_refs(&prefs, 2); + + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + WARN_ON(ref->count < 0); + if (roots && ref->count && ref->root_id && ref->parent == 0) { + /* no parent == root of tree */ + ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + if (ret < 0) + goto out; + } + if (ref->count && ref->parent) { + if (extent_item_pos && !ref->inode_list && + ref->level == 0) { + u32 bsz; + struct extent_buffer *eb; + bsz = btrfs_level_size(fs_info->extent_root, + ref->level); + eb = read_tree_block(fs_info->extent_root, + ref->parent, bsz, 0); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + ret = find_extent_in_eb(eb, bytenr, + *extent_item_pos, &eie); + free_extent_buffer(eb); + if (ret < 0) + goto out; + ref->inode_list = eie; + } + ret = ulist_add_merge(refs, ref->parent, + (uintptr_t)ref->inode_list, + (u64 *)&eie, GFP_NOFS); + if (ret < 0) + goto out; + if (!ret && extent_item_pos) { + /* + * we've recorded that parent, so we must extend + * its inode list here + */ + BUG_ON(!eie); + while (eie->next) + eie = eie->next; + eie->next = ref->inode_list; + } + eie = NULL; + } + list_del(&ref->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref); + } + +out: + btrfs_free_path(path); + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref); + } + while (!list_empty(&prefs_delayed)) { + ref = list_first_entry(&prefs_delayed, struct __prelim_ref, + list); + list_del(&ref->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref); + } + if (ret < 0) + free_inode_elem_list(eie); + return ret; +} + +static void free_leaf_list(struct ulist *blocks) +{ + struct ulist_node *node = NULL; + struct extent_inode_elem *eie; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(blocks, &uiter))) { + if (!node->aux) + continue; + eie = (struct extent_inode_elem *)(uintptr_t)node->aux; + free_inode_elem_list(eie); + node->aux = 0; + } + + ulist_free(blocks); +} + +/* + * Finds all leafs with a reference to the specified combination of bytenr and + * offset. key_list_head will point to a list of corresponding keys (caller must + * free each list element). The leafs will be stored in the leafs ulist, which + * must be freed with ulist_free. + * + * returns 0 on success, <0 on error + */ +static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **leafs, + const u64 *extent_item_pos) +{ + int ret; + + *leafs = ulist_alloc(GFP_NOFS); + if (!*leafs) + return -ENOMEM; + + ret = find_parent_nodes(trans, fs_info, bytenr, + time_seq, *leafs, NULL, extent_item_pos); + if (ret < 0 && ret != -ENOENT) { + free_leaf_list(*leafs); + return ret; + } + + return 0; +} + +/* + * walk all backrefs for a given extent to find all roots that reference this + * extent. Walking a backref means finding all extents that reference this + * extent and in turn walk the backrefs of those, too. Naturally this is a + * recursive process, but here it is implemented in an iterative fashion: We + * find all referencing extents for the extent in question and put them on a + * list. In turn, we find all referencing extents for those, further appending + * to the list. The way we iterate the list allows adding more elements after + * the current while iterating. The process stops when we reach the end of the + * list. Found roots are added to the roots list. + * + * returns 0 on success, < 0 on error. + */ +static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) +{ + struct ulist *tmp; + struct ulist_node *node = NULL; + struct ulist_iterator uiter; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *roots = ulist_alloc(GFP_NOFS); + if (!*roots) { + ulist_free(tmp); + return -ENOMEM; + } + + ULIST_ITER_INIT(&uiter); + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, + time_seq, tmp, *roots, NULL); + if (ret < 0 && ret != -ENOENT) { + ulist_free(tmp); + ulist_free(*roots); + return ret; + } + node = ulist_next(tmp, &uiter); + if (!node) + break; + bytenr = node->val; + cond_resched(); + } + + ulist_free(tmp); + return 0; +} + +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) +{ + int ret; + + if (!trans) + down_read(&fs_info->commit_root_sem); + ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots); + if (!trans) + up_read(&fs_info->commit_root_sem); + return ret; +} + +/* + * this makes the path point to (inum INODE_ITEM ioff) + */ +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct btrfs_key key; + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_ITEM_KEY, &key); +} + +static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path, + struct btrfs_key *found_key) +{ + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_REF_KEY, found_key); +} + +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off) +{ + int ret, slot; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + unsigned long ptr; + + key.objectid = inode_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = start_off; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If the item at offset is not found, + * btrfs_search_slot will point us to the slot + * where it should be inserted. In our case + * that will be the slot directly before the + * next INODE_REF_KEY_V2 item. In the case + * that we're pointing to the last slot in a + * leaf, we must move one leaf over. + */ + ret = btrfs_next_leaf(root, path); + if (ret) { + if (ret >= 1) + ret = -ENOENT; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* + * Check that we're still looking at an extended ref key for + * this particular objectid. If we have different + * objectid or type then there are no more to be found + * in the tree and we can exit. + */ + ret = -ENOENT; + if (found_key.objectid != inode_objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY) + break; + + ret = 0; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + extref = (struct btrfs_inode_extref *)ptr; + *ret_extref = extref; + if (found_off) + *found_off = found_key.offset; + break; + } + + return ret; +} + +/* + * this iterates to turn a name (from iref/extref) into a full filesystem path. + * Elements of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + int slot; + u64 next_inum; + int ret; + s64 bytes_left = ((s64)size) - 1; + struct extent_buffer *eb = eb_in; + struct btrfs_key found_key; + int leave_spinning = path->leave_spinning; + struct btrfs_inode_ref *iref; + + if (bytes_left >= 0) + dest[bytes_left] = '\0'; + + path->leave_spinning = 1; + while (1) { + bytes_left -= name_len; + if (bytes_left >= 0) + read_extent_buffer(eb, dest + bytes_left, + name_off, name_len); + if (eb != eb_in) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + } + ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret > 0) + ret = -ENOENT; + if (ret) + break; + + next_inum = found_key.offset; + + /* regular exit ahead */ + if (parent == next_inum) + break; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + if (eb != eb_in) { + atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } + btrfs_release_path(path); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + + parent = next_inum; + --bytes_left; + if (bytes_left >= 0) + dest[bytes_left] = '/'; + } + + btrfs_release_path(path); + path->leave_spinning = leave_spinning; + + if (ret) + return ERR_PTR(ret); + + return dest + bytes_left; +} + +/* + * this makes the path point to (logical EXTENT_ITEM *) + * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for + * tree blocks and <0 on error. + */ +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags_ret) +{ + int ret; + u64 flags; + u64 size = 0; + u32 item_size; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ret; + } + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + if (found_key->type == BTRFS_METADATA_ITEM_KEY) + size = fs_info->extent_root->leafsize; + else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) + size = found_key->offset; + + if (found_key->objectid > logical || + found_key->objectid + size <= logical) { + pr_debug("logical %llu is not within any extent\n", logical); + return -ENOENT; + } + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + pr_debug("logical %llu is at position %llu within the extent (%llu " + "EXTENT_ITEM %llu) flags %#llx size %u\n", + logical, logical - found_key->objectid, found_key->objectid, + found_key->offset, flags, item_size); + + WARN_ON(!flags_ret); + if (flags_ret) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else if (flags & BTRFS_EXTENT_FLAG_DATA) + *flags_ret = BTRFS_EXTENT_FLAG_DATA; + else + BUG_ON(1); + return 0; + } + + return -EIO; +} + +/* + * helper function to iterate extent inline refs. ptr must point to a 0 value + * for the first call and may be modified. it is used to track state. + * if more refs exist, 0 is returned and the next call to + * __get_extent_inline_ref must pass the modified ptr parameter to get the + * next ref. after the last ref was processed, 1 is returned. + * returns <0 on error + */ +static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_key *key, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) +{ + unsigned long end; + u64 flags; + struct btrfs_tree_block_info *info; + + if (!*ptr) { + /* first call */ + flags = btrfs_extent_flags(eb, ei); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (key->type == BTRFS_METADATA_ITEM_KEY) { + /* a skinny metadata extent */ + *out_eiref = + (struct btrfs_extent_inline_ref *)(ei + 1); + } else { + WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY); + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } + } else { + *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + *ptr = (unsigned long)*out_eiref; + if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size) + return -ENOENT; + } + + end = (unsigned long)ei + item_size; + *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); + *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); + + *ptr += btrfs_extent_inline_ref_size(*out_type); + WARN_ON(*ptr > end); + if (*ptr == end) + return 1; /* last */ + + return 0; +} + +/* + * reads the tree block backref for an extent. tree level and root are returned + * through out_level and out_root. ptr must point to a 0 value for the first + * call and may be modified (see __get_extent_inline_ref comment). + * returns 0 if data was provided, 1 if there was no more data to provide or + * <0 on error. + */ +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level) +{ + int ret; + int type; + struct btrfs_tree_block_info *info; + struct btrfs_extent_inline_ref *eiref; + + if (*ptr == (unsigned long)-1) + return 1; + + while (1) { + ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size, + &eiref, &type); + if (ret < 0) + return ret; + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + break; + + if (ret == 1) + return 1; + } + + /* we can treat both ref types equally here */ + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_root = btrfs_extent_inline_ref_offset(eb, eiref); + *out_level = btrfs_tree_block_level(eb, info); + + if (ret == 1) + *ptr = (unsigned long)-1; + + return 0; +} + +static int iterate_leaf_refs(struct extent_inode_elem *inode_list, + u64 root, u64 extent_item_objectid, + iterate_extent_inodes_t *iterate, void *ctx) +{ + struct extent_inode_elem *eie; + int ret = 0; + + for (eie = inode_list; eie; eie = eie->next) { + pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " + "root %llu\n", extent_item_objectid, + eie->inum, eie->offset, root); + ret = iterate(eie->inum, eie->offset, root, ctx); + if (ret) { + pr_debug("stopping iteration for %llu due to ret=%d\n", + extent_item_objectid, ret); + break; + } + } + + return ret; +} + +/* + * calls iterate() for every inode that references the extent identified by + * the given parameters. + * when the iterator function returns a non-zero value, iteration stops. + */ +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + u64 extent_item_objectid, u64 extent_item_pos, + int search_commit_root, + iterate_extent_inodes_t *iterate, void *ctx) +{ + int ret; + struct btrfs_trans_handle *trans = NULL; + struct ulist *refs = NULL; + struct ulist *roots = NULL; + struct ulist_node *ref_node = NULL; + struct ulist_node *root_node = NULL; + struct seq_list tree_mod_seq_elem = {}; + struct ulist_iterator ref_uiter; + struct ulist_iterator root_uiter; + + pr_debug("resolving all inodes for extent %llu\n", + extent_item_objectid); + + if (!search_commit_root) { + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + } else { + down_read(&fs_info->commit_root_sem); + } + + ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, + tree_mod_seq_elem.seq, &refs, + &extent_item_pos); + if (ret) + goto out; + + ULIST_ITER_INIT(&ref_uiter); + while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { + ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val, + tree_mod_seq_elem.seq, &roots); + if (ret) + break; + ULIST_ITER_INIT(&root_uiter); + while (!ret && (root_node = ulist_next(roots, &root_uiter))) { + pr_debug("root %llu references leaf %llu, data list " + "%#llx\n", root_node->val, ref_node->val, + ref_node->aux); + ret = iterate_leaf_refs((struct extent_inode_elem *) + (uintptr_t)ref_node->aux, + root_node->val, + extent_item_objectid, + iterate, ctx); + } + ulist_free(roots); + } + + free_leaf_list(refs); +out: + if (!search_commit_root) { + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_end_transaction(trans, fs_info->extent_root); + } else { + up_read(&fs_info->commit_root_sem); + } + + return ret; +} + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx) +{ + int ret; + u64 extent_item_pos; + u64 flags = 0; + struct btrfs_key found_key; + int search_commit_root = path->search_commit_root; + + ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); + btrfs_release_path(path); + if (ret < 0) + return ret; + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return -EINVAL; + + extent_item_pos = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, search_commit_root, + iterate, ctx); + + return ret; +} + +typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx); + +static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret = 0; + int slot; + u32 cur; + u32 len; + u32 name_len; + u64 parent = 0; + int found = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + + while (!ret) { + ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, + &found_key); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + parent = found_key.offset; + slot = path->slots[0]; + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_release_path(path); + + item = btrfs_item_nr(slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! */ + pr_debug("following ref at offset %u for inode %llu in " + "tree %llu\n", cur, found_key.objectid, + fs_root->objectid); + ret = iterate(parent, name_len, + (unsigned long)(iref + 1), eb, ctx); + if (ret) + break; + len = sizeof(*iref) + name_len; + iref = (struct btrfs_inode_ref *)((char *)iref + len); + } + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + } + + btrfs_release_path(path); + + return ret; +} + +static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u64 offset = 0; + u64 parent; + int found = 0; + struct extent_buffer *eb; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u32 item_size; + u32 cur_offset; + unsigned long ptr; + + while (1) { + ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref, + &offset); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + slot = path->slots[0]; + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_release_path(path); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + cur_offset = 0; + + while (cur_offset < item_size) { + u32 name_len; + + extref = (struct btrfs_inode_extref *)(ptr + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + name_len = btrfs_inode_extref_name_len(eb, extref); + ret = iterate(parent, name_len, + (unsigned long)&extref->name, eb, ctx); + if (ret) + break; + + cur_offset += btrfs_inode_extref_name_len(leaf, extref); + cur_offset += sizeof(*extref); + } + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + + offset++; + } + + btrfs_release_path(path); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, iterate_irefs_t *iterate, + void *ctx) +{ + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; +} + +/* + * returns 0 if the path could be dumped (probably truncated) + * returns <0 in case of an error + */ +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx) +{ + struct inode_fs_paths *ipath = ctx; + char *fspath; + char *fspath_min; + int i = ipath->fspath->elem_cnt; + const int s_ptr = sizeof(char *); + u32 bytes_left; + + bytes_left = ipath->fspath->bytes_left > s_ptr ? + ipath->fspath->bytes_left - s_ptr : 0; + + fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; + fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, + name_off, eb, inum, fspath_min, bytes_left); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); + + if (fspath > fspath_min) { + ipath->fspath->val[i] = (u64)(unsigned long)fspath; + ++ipath->fspath->elem_cnt; + ipath->fspath->bytes_left = fspath - fspath_min; + } else { + ++ipath->fspath->elem_missed; + ipath->fspath->bytes_missing += fspath_min - fspath; + ipath->fspath->bytes_left = 0; + } + + return 0; +} + +/* + * this dumps all file system paths to the inode into the ipath struct, provided + * is has been created large enough. each path is zero-terminated and accessed + * from ipath->fspath->val[i]. + * when it returns, there are ipath->fspath->elem_cnt number of paths available + * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the + * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would + * have been needed to return all paths. + */ +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) +{ + return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, + inode_to_path, ipath); +} + +struct btrfs_data_container *init_data_container(u32 total_bytes) +{ + struct btrfs_data_container *data; + size_t alloc_bytes; + + alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); + data = vmalloc(alloc_bytes); + if (!data) + return ERR_PTR(-ENOMEM); + + if (total_bytes >= sizeof(*data)) { + data->bytes_left = total_bytes - sizeof(*data); + data->bytes_missing = 0; + } else { + data->bytes_missing = sizeof(*data) - total_bytes; + data->bytes_left = 0; + } + + data->elem_cnt = 0; + data->elem_missed = 0; + + return data; +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct inode_fs_paths *ifp; + struct btrfs_data_container *fspath; + + fspath = init_data_container(total_bytes); + if (IS_ERR(fspath)) + return (void *)fspath; + + ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + if (!ifp) { + kfree(fspath); + return ERR_PTR(-ENOMEM); + } + + ifp->btrfs_path = path; + ifp->fspath = fspath; + ifp->fs_root = fs_root; + + return ifp; +} + +void free_ipath(struct inode_fs_paths *ipath) +{ + if (!ipath) + return; + vfree(ipath->fspath); + kfree(ipath); +} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h new file mode 100644 index 00000000000..86fc20fec28 --- /dev/null +++ b/fs/btrfs/backref.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_BACKREF__ +#define __BTRFS_BACKREF__ + +#include <linux/btrfs.h> +#include "ulist.h" +#include "extent_io.h" + +struct inode_fs_paths { + struct btrfs_path *btrfs_path; + struct btrfs_root *fs_root; + struct btrfs_data_container *fspath; +}; + +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, + void *ctx); + +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path); + +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags); + +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level); + +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + u64 extent_item_objectid, + u64 extent_offset, int search_commit_root, + iterate_extent_inodes_t *iterate, void *ctx); + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx); + +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); + +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots); +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size); + +struct btrfs_data_container *init_data_container(u32 total_bytes); +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path); +void free_ipath(struct inode_fs_paths *ipath); + +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off); + +int __init btrfs_prelim_ref_init(void); +void btrfs_prelim_ref_exit(void); +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7a4dee19983..4794923c410 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -19,9 +19,31 @@ #ifndef __BTRFS_I__ #define __BTRFS_I__ +#include <linux/hash.h> #include "extent_map.h" #include "extent_io.h" #include "ordered-data.h" +#include "delayed-inode.h" + +/* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + */ +#define BTRFS_INODE_ORDERED_DATA_CLOSE 0 +#define BTRFS_INODE_ORPHAN_META_RESERVED 1 +#define BTRFS_INODE_DUMMY 2 +#define BTRFS_INODE_IN_DEFRAG 3 +#define BTRFS_INODE_DELALLOC_META_RESERVED 4 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 +#define BTRFS_INODE_NEEDS_FULL_SYNC 7 +#define BTRFS_INODE_COPY_EVERYTHING 8 +#define BTRFS_INODE_IN_DELALLOC_LIST 9 +#define BTRFS_INODE_READDIO_NEED_LOCK 10 +#define BTRFS_INODE_HAS_PROPS 11 /* in memory btrfs inode */ struct btrfs_inode { @@ -33,6 +55,9 @@ struct btrfs_inode { */ struct btrfs_key location; + /* Lock for counters */ + spinlock_t lock; + /* the extent_tree has caches of all the extent mappings to disk */ struct extent_map_tree extent_tree; @@ -47,12 +72,12 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; + /* held while doing delalloc reservations */ + struct mutex delalloc_mutex; + /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; - /* for keeping track of orphaned inodes */ - struct list_head i_orphan; - /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used * to walk them all. @@ -68,42 +93,39 @@ struct btrfs_inode { /* node for the red-black tree that links inodes in subvolume root */ struct rb_node rb_node; - /* the space_info for where this inode's data allocations are done */ - struct btrfs_space_info *space_info; + unsigned long runtime_flags; + + /* Keep track of who's O_SYNC/fsyncing currently */ + atomic_t sync_writers; /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ u64 generation; - /* sequence number for NFS changes */ - u64 sequence; - /* * transid of the trans_handle that last modified this inode */ u64 last_trans; /* - * log transid when this inode was last modified + * transid that last logged this inode */ - u64 last_sub_trans; + u64 logged_trans; /* - * transid that last logged this inode + * log transid when this inode was last modified */ - u64 logged_trans; + int last_sub_trans; + + /* a local copy of root's last_log_commit */ + int last_log_commit; /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file */ u64 delalloc_bytes; - /* total number of bytes that may be used for this inode for - * delalloc - */ - u64 reserved_bytes; - /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk @@ -111,17 +133,14 @@ struct btrfs_inode { */ u64 disk_i_size; - /* flags field from the on disk inode */ - u32 flags; - /* * if this is a directory then index_cnt is the counter for the index * number for new files that are created */ u64 index_cnt; - /* the start of block group preferred for allocations. */ - u64 block_group; + /* Cache the directory index number to speed the dir/file remove */ + u64 dir_index; /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before @@ -131,45 +150,140 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Number of bytes outstanding that are going to need csums. This is + * used in ENOSPC accounting. + */ + u64 csum_bytes; + + /* flags field from the on disk inode */ + u32 flags; + + /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent * items we think we'll end up using, and reserved_extents is the number * of extent items we've reserved metadata for. */ - spinlock_t accounting_lock; - int reserved_extents; - int outstanding_extents; - - /* - * ordered_data_close is set by truncate when a file that used - * to have good data has been truncated to zero. When it is set - * the btrfs file release call will add this inode to the - * ordered operations list so that we make sure to flush out any - * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. - */ - unsigned ordered_data_close:1; - unsigned dummy_inode:1; + unsigned outstanding_extents; + unsigned reserved_extents; /* * always compress this one file */ - unsigned force_compress:1; + unsigned force_compress; + + struct btrfs_delayed_node *delayed_node; struct inode vfs_inode; }; +extern unsigned char btrfs_filetype_table[]; + static inline struct btrfs_inode *BTRFS_I(struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); } +static inline unsigned long btrfs_inode_hash(u64 objectid, + const struct btrfs_root *root) +{ + u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME); + +#if BITS_PER_LONG == 32 + h = (h >> 32) ^ (h & 0xffffffff); +#endif + + return (unsigned long)h; +} + +static inline void btrfs_insert_inode_hash(struct inode *inode) +{ + unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root); + + __insert_inode_hash(inode, h); +} + +static inline u64 btrfs_ino(struct inode *inode) +{ + u64 ino = BTRFS_I(inode)->location.objectid; + + /* + * !ino: btree_inode + * type == BTRFS_ROOT_ITEM_KEY: subvol dir + */ + if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY) + ino = inode->i_ino; + return ino; +} + static inline void btrfs_i_size_write(struct inode *inode, u64 size) { i_size_write(inode, size); BTRFS_I(inode)->disk_i_size = size; } +static inline bool btrfs_is_free_space_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (root == root->fs_info->tree_root && + btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) + return true; + if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + return true; + return false; +} + +static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) +{ + if (BTRFS_I(inode)->logged_trans == generation && + BTRFS_I(inode)->last_sub_trans <= + BTRFS_I(inode)->last_log_commit && + BTRFS_I(inode)->last_sub_trans <= + BTRFS_I(inode)->root->last_log_commit) + return 1; + return 0; +} + +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + /* orig_bio is our btrfs_io_bio */ + struct bio *orig_bio; + + /* dio_bio came from fs/direct-io.c */ + struct bio *dio_bio; + u8 csum[0]; +}; + +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read. + */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); + smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb__before_atomic(); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags); +} + +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c new file mode 100644 index 00000000000..ce92ae30250 --- /dev/null +++ b/fs/btrfs/check-integrity.c @@ -0,0 +1,3295 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* + * This module can be used to catch cases when the btrfs kernel + * code executes write requests to the disk that bring the file + * system in an inconsistent state. In such a state, a power-loss + * or kernel panic event would cause that the data on disk is + * lost or at least damaged. + * + * Code is added that examines all block write requests during + * runtime (including writes of the super block). Three rules + * are verified and an error is printed on violation of the + * rules: + * 1. It is not allowed to write a disk block which is + * currently referenced by the super block (either directly + * or indirectly). + * 2. When a super block is written, it is verified that all + * referenced (directly or indirectly) blocks fulfill the + * following requirements: + * 2a. All referenced blocks have either been present when + * the file system was mounted, (i.e., they have been + * referenced by the super block) or they have been + * written since then and the write completion callback + * was called and no write error was indicated and a + * FLUSH request to the device where these blocks are + * located was received and completed. + * 2b. All referenced blocks need to have a generation + * number which is equal to the parent's number. + * + * One issue that was found using this module was that the log + * tree on disk became temporarily corrupted because disk blocks + * that had been in use for the log tree had been freed and + * reused too early, while being referenced by the written super + * block. + * + * The search term in the kernel log that can be used to filter + * on the existence of detected integrity issues is + * "btrfs: attempt". + * + * The integrity check is enabled via mount options. These + * mount options are only supported if the integrity check + * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. + * + * Example #1, apply integrity checks to all metadata: + * mount /dev/sdb1 /mnt -o check_int + * + * Example #2, apply integrity checks to all metadata and + * to data extents: + * mount /dev/sdb1 /mnt -o check_int_data + * + * Example #3, apply integrity checks to all metadata and dump + * the tree that the super block references to kernel messages + * each time after a super block was written: + * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 + * + * If the integrity check tool is included and activated in + * the mount options, plenty of kernel memory is used, and + * plenty of additional CPU cycles are spent. Enabling this + * functionality is not intended for normal use. In most + * cases, unless you are a btrfs developer who needs to verify + * the integrity of (super)-block write requests, do not + * enable the config option BTRFS_FS_CHECK_INTEGRITY to + * include and compile the integrity check tool. + * + * Expect millions of lines of information in the kernel log with an + * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the + * kernel config to at least 26 (which is 64MB). Usually the value is + * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be + * changed like this before LOG_BUF_SHIFT can be set to a high value: + * config LOG_BUF_SHIFT + * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + * range 12 30 + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/mutex.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" +#include "extent_io.h" +#include "volumes.h" +#include "print-tree.h" +#include "locking.h" +#include "check-integrity.h" +#include "rcu-string.h" + +#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 +#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 +#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 +#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 +#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 +#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, + * excluding " [...]" */ +#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) + +/* + * The definition of the bitmask fields for the print_mask. + * They are specified with the mount option check_integrity_print_mask. + */ +#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 +#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 +#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 +#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 +#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 +#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 +#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 +#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 +#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 +#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 +#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 +#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000 + +struct btrfsic_dev_state; +struct btrfsic_state; + +struct btrfsic_block { + u32 magic_num; /* only used for debug purposes */ + unsigned int is_metadata:1; /* if it is meta-data, not data-data */ + unsigned int is_superblock:1; /* if it is one of the superblocks */ + unsigned int is_iodone:1; /* if is done by lower subsystem */ + unsigned int iodone_w_error:1; /* error was indicated to endio */ + unsigned int never_written:1; /* block was added because it was + * referenced, not because it was + * written */ + unsigned int mirror_num; /* large enough to hold + * BTRFS_SUPER_MIRROR_MAX */ + struct btrfsic_dev_state *dev_state; + u64 dev_bytenr; /* key, physical byte num on disk */ + u64 logical_bytenr; /* logical byte num on disk */ + u64 generation; + struct btrfs_disk_key disk_key; /* extra info to print in case of + * issues, will not always be correct */ + struct list_head collision_resolving_node; /* list node */ + struct list_head all_blocks_node; /* list node */ + + /* the following two lists contain block_link items */ + struct list_head ref_to_list; /* list */ + struct list_head ref_from_list; /* list */ + struct btrfsic_block *next_in_same_bio; + void *orig_bio_bh_private; + union { + bio_end_io_t *bio; + bh_end_io_t *bh; + } orig_bio_bh_end_io; + int submit_bio_bh_rw; + u64 flush_gen; /* only valid if !never_written */ +}; + +/* + * Elements of this type are allocated dynamically and required because + * each block object can refer to and can be ref from multiple blocks. + * The key to lookup them in the hashtable is the dev_bytenr of + * the block ref to plus the one from the block refered from. + * The fact that they are searchable via a hashtable and that a + * ref_cnt is maintained is not required for the btrfs integrity + * check algorithm itself, it is only used to make the output more + * beautiful in case that an error is detected (an error is defined + * as a write operation to a block while that block is still referenced). + */ +struct btrfsic_block_link { + u32 magic_num; /* only used for debug purposes */ + u32 ref_cnt; + struct list_head node_ref_to; /* list node */ + struct list_head node_ref_from; /* list node */ + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block *block_ref_to; + struct btrfsic_block *block_ref_from; + u64 parent_generation; +}; + +struct btrfsic_dev_state { + u32 magic_num; /* only used for debug purposes */ + struct block_device *bdev; + struct btrfsic_state *state; + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block dummy_block_for_bio_bh_flush; + u64 last_flush_gen; + char name[BDEVNAME_SIZE]; +}; + +struct btrfsic_block_hashtable { + struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_link_hashtable { + struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; +}; + +struct btrfsic_dev_state_hashtable { + struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_data_ctx { + u64 start; /* virtual bytenr */ + u64 dev_bytenr; /* physical bytenr on device */ + u32 len; + struct btrfsic_dev_state *dev; + char **datav; + struct page **pagev; + void *mem_to_free; +}; + +/* This structure is used to implement recursion without occupying + * any stack space, refer to btrfsic_process_metablock() */ +struct btrfsic_stack_frame { + u32 magic; + u32 nr; + int error; + int i; + int limit_nesting; + int num_copies; + int mirror_num; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx *block_ctx; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfs_header *hdr; + struct btrfsic_stack_frame *prev; +}; + +/* Some state per mounted filesystem */ +struct btrfsic_state { + u32 print_mask; + int include_extent_data; + int csum_size; + struct list_head all_blocks_list; + struct btrfsic_block_hashtable block_hashtable; + struct btrfsic_block_link_hashtable block_link_hashtable; + struct btrfs_root *root; + u64 max_superblock_generation; + struct btrfsic_block *latest_superblock; + u32 metablock_size; + u32 datablock_size; +}; + +static void btrfsic_block_init(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_alloc(void); +static void btrfsic_block_free(struct btrfsic_block *b); +static void btrfsic_block_link_init(struct btrfsic_block_link *n); +static struct btrfsic_block_link *btrfsic_block_link_alloc(void); +static void btrfsic_block_link_free(struct btrfsic_block_link *n); +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h); +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices); +static int btrfsic_process_metablock(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + int limit_nesting, int force_iodone_flag); +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dst, u32 offset, size_t len); +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx + *block_ctx, u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation); +static int btrfsic_handle_extent_data(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag); +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num); +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out); +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx); +static void btrfsic_dump_database(struct btrfsic_state *state); +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + char **datav, unsigned int num_pages); +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw); +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const block, + struct btrfs_super_block *const super_hdr); +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); +static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level); +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level); +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block); +static void btrfsic_dump_tree(const struct btrfsic_state *state); +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level); +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation); +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created); +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev); +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr); + +static struct mutex btrfsic_mutex; +static int btrfsic_is_initialized; +static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; + + +static void btrfsic_block_init(struct btrfsic_block *b) +{ + b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; + b->dev_state = NULL; + b->dev_bytenr = 0; + b->logical_bytenr = 0; + b->generation = BTRFSIC_GENERATION_UNKNOWN; + b->disk_key.objectid = 0; + b->disk_key.type = 0; + b->disk_key.offset = 0; + b->is_metadata = 0; + b->is_superblock = 0; + b->is_iodone = 0; + b->iodone_w_error = 0; + b->never_written = 0; + b->mirror_num = 0; + b->next_in_same_bio = NULL; + b->orig_bio_bh_private = NULL; + b->orig_bio_bh_end_io.bio = NULL; + INIT_LIST_HEAD(&b->collision_resolving_node); + INIT_LIST_HEAD(&b->all_blocks_node); + INIT_LIST_HEAD(&b->ref_to_list); + INIT_LIST_HEAD(&b->ref_from_list); + b->submit_bio_bh_rw = 0; + b->flush_gen = 0; +} + +static struct btrfsic_block *btrfsic_block_alloc(void) +{ + struct btrfsic_block *b; + + b = kzalloc(sizeof(*b), GFP_NOFS); + if (NULL != b) + btrfsic_block_init(b); + + return b; +} + +static void btrfsic_block_free(struct btrfsic_block *b) +{ + BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); + kfree(b); +} + +static void btrfsic_block_link_init(struct btrfsic_block_link *l) +{ + l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; + l->ref_cnt = 1; + INIT_LIST_HEAD(&l->node_ref_to); + INIT_LIST_HEAD(&l->node_ref_from); + INIT_LIST_HEAD(&l->collision_resolving_node); + l->block_ref_to = NULL; + l->block_ref_from = NULL; +} + +static struct btrfsic_block_link *btrfsic_block_link_alloc(void) +{ + struct btrfsic_block_link *l; + + l = kzalloc(sizeof(*l), GFP_NOFS); + if (NULL != l) + btrfsic_block_link_init(l); + + return l; +} + +static void btrfsic_block_link_free(struct btrfsic_block_link *l) +{ + BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); + kfree(l); +} + +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) +{ + ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; + ds->bdev = NULL; + ds->state = NULL; + ds->name[0] = '\0'; + INIT_LIST_HEAD(&ds->collision_resolving_node); + ds->last_flush_gen = 0; + btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); + ds->dummy_block_for_bio_bh_flush.is_iodone = 1; + ds->dummy_block_for_bio_bh_flush.dev_state = ds; +} + +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) +{ + struct btrfsic_dev_state *ds; + + ds = kzalloc(sizeof(*ds), GFP_NOFS); + if (NULL != ds) + btrfsic_dev_state_init(ds); + + return ds; +} + +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) +{ + BUG_ON(!(NULL == ds || + BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); + kfree(ds); +} + +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(b->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)b->dev_state->bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + + list_add(&b->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) +{ + list_del(&b->collision_resolving_node); +} + +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block *const b = + list_entry(elem, struct btrfsic_block, + collision_resolving_node); + + if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) + return b; + } + + return NULL; +} + +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ + ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ + ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) + & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + list_add(&l->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) +{ + list_del(&l->collision_resolving_node); +} + +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ + ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ + ((unsigned int)((uintptr_t)bdev_ref_to)) ^ + ((unsigned int)((uintptr_t)bdev_ref_from))) & + (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block_link *const l = + list_entry(elem, struct btrfsic_block_link, + collision_resolving_node); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + if (l->block_ref_to->dev_state->bdev == bdev_ref_to && + l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && + l->block_ref_from->dev_state->bdev == bdev_ref_from && + l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) + return l; + } + + return NULL; +} + +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)ds->bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + + list_add(&ds->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) +{ + list_del(&ds->collision_resolving_node); +} + +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_dev_state *const ds = + list_entry(elem, struct btrfsic_dev_state, + collision_resolving_node); + + if (ds->bdev == bdev) + return ds; + } + + return NULL; +} + +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices) +{ + int ret = 0; + struct btrfs_super_block *selected_super; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + struct btrfsic_dev_state *selected_dev_state = NULL; + int pass; + + BUG_ON(NULL == state); + selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); + if (NULL == selected_super) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return -1; + } + + list_for_each_entry(device, dev_head, dev_list) { + int i; + struct btrfsic_dev_state *dev_state; + + if (!device->bdev || !device->name) + continue; + + dev_state = btrfsic_dev_state_lookup(device->bdev); + BUG_ON(NULL == dev_state); + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + ret = btrfsic_process_superblock_dev_mirror( + state, dev_state, device, i, + &selected_dev_state, selected_super); + if (0 != ret && 0 == i) { + kfree(selected_super); + return ret; + } + } + } + + if (NULL == state->latest_superblock) { + printk(KERN_INFO "btrfsic: no superblock found!\n"); + kfree(selected_super); + return -1; + } + + state->csum_size = btrfs_super_csum_size(selected_super); + + for (pass = 0; pass < 3; pass++) { + int num_copies; + int mirror_num; + u64 next_bytenr; + + switch (pass) { + case 0: + next_bytenr = btrfs_super_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", next_bytenr); + break; + case 1: + next_bytenr = btrfs_super_chunk_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", next_bytenr); + break; + case 2: + next_bytenr = btrfs_super_log_root(selected_super); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(state->root->fs_info, + next_bytenr, state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(root @%llu," + " mirror %d) failed!\n", + next_bytenr, mirror_num); + kfree(selected_super); + return -1; + } + + next_block = btrfsic_block_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + &state->block_hashtable); + BUG_ON(NULL == next_block); + + l = btrfsic_block_link_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + state->latest_superblock->dev_state-> + bdev, + state->latest_superblock->dev_bytenr, + &state->block_link_hashtable); + BUG_ON(NULL == l); + + ret = btrfsic_read_block(state, &tmp_next_block_ctx); + if (ret < (int)PAGE_CACHE_SIZE) { + printk(KERN_INFO + "btrfsic: read @logical %llu failed!\n", + tmp_next_block_ctx.start); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + kfree(selected_super); + return -1; + } + + ret = btrfsic_process_metablock(state, + next_block, + &tmp_next_block_ctx, + BTRFS_MAX_LEVEL + 3, 1); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + } + } + + kfree(selected_super); + return ret; +} + +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super) +{ + struct btrfs_super_block *super_tmp; + u64 dev_bytenr; + struct buffer_head *bh; + struct btrfsic_block *superblock_tmp; + int pass; + struct block_device *const superblock_bdev = device->bdev; + + /* super block bytenr is always the unmapped device bytenr */ + dev_bytenr = btrfs_sb_offset(superblock_mirror_num); + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + return -1; + bh = __bread(superblock_bdev, dev_bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (NULL == bh) + return -1; + super_tmp = (struct btrfs_super_block *) + (bh->b_data + (dev_bytenr & 4095)); + + if (btrfs_super_bytenr(super_tmp) != dev_bytenr || + btrfs_super_magic(super_tmp) != BTRFS_MAGIC || + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || + btrfs_super_nodesize(super_tmp) != state->metablock_size || + btrfs_super_leafsize(super_tmp) != state->metablock_size || + btrfs_super_sectorsize(super_tmp) != state->datablock_size) { + brelse(bh); + return 0; + } + + superblock_tmp = + btrfsic_block_hashtable_lookup(superblock_bdev, + dev_bytenr, + &state->block_hashtable); + if (NULL == superblock_tmp) { + superblock_tmp = btrfsic_block_alloc(); + if (NULL == superblock_tmp) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + brelse(bh); + return -1; + } + /* for superblock, only the dev_bytenr makes sense */ + superblock_tmp->dev_bytenr = dev_bytenr; + superblock_tmp->dev_state = dev_state; + superblock_tmp->logical_bytenr = dev_bytenr; + superblock_tmp->generation = btrfs_super_generation(super_tmp); + superblock_tmp->is_metadata = 1; + superblock_tmp->is_superblock = 1; + superblock_tmp->is_iodone = 1; + superblock_tmp->never_written = 0; + superblock_tmp->mirror_num = 1 + superblock_mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, + rcu_str_deref(device->name), dev_bytenr, + dev_state->name, dev_bytenr, + superblock_mirror_num); + list_add(&superblock_tmp->all_blocks_node, + &state->all_blocks_list); + btrfsic_block_hashtable_add(superblock_tmp, + &state->block_hashtable); + } + + /* select the one with the highest generation field */ + if (btrfs_super_generation(super_tmp) > + state->max_superblock_generation || + 0 == state->max_superblock_generation) { + memcpy(selected_super, super_tmp, sizeof(*selected_super)); + *selected_dev_state = dev_state; + state->max_superblock_generation = + btrfs_super_generation(super_tmp); + state->latest_superblock = superblock_tmp; + } + + for (pass = 0; pass < 3; pass++) { + u64 next_bytenr; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + switch (pass) { + case 0: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); + additional_string = "initial root "; + next_bytenr = btrfs_super_root(super_tmp); + break; + case 1: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "initial chunk "; + next_bytenr = btrfs_super_chunk_root(super_tmp); + break; + case 2: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); + additional_string = "initial log "; + next_bytenr = btrfs_super_log_root(super_tmp); + if (0 == next_bytenr) + continue; + break; + } + + num_copies = + btrfs_num_copies(state->root->fs_info, + next_bytenr, state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + if (btrfsic_map_block(state, next_bytenr, + state->metablock_size, + &tmp_next_block_ctx, + mirror_num)) { + printk(KERN_INFO "btrfsic: btrfsic_map_block(" + "bytenr @%llu, mirror %d) failed!\n", + next_bytenr, mirror_num); + brelse(bh); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, &tmp_next_block_ctx, + additional_string, 1, 1, 0, + mirror_num, NULL); + if (NULL == next_block) { + btrfsic_release_block_ctx(&tmp_next_block_ctx); + brelse(bh); + return -1; + } + + next_block->disk_key = tmp_disk_key; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, &tmp_next_block_ctx, + next_block, superblock_tmp, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) { + brelse(bh); + return -1; + } + } + } + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) + btrfsic_dump_tree_sub(state, superblock_tmp, 0); + + brelse(bh); + return 0; +} + +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) +{ + struct btrfsic_stack_frame *sf; + + sf = kzalloc(sizeof(*sf), GFP_NOFS); + if (NULL == sf) + printk(KERN_INFO "btrfsic: alloc memory failed!\n"); + else + sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; + return sf; +} + +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) +{ + BUG_ON(!(NULL == sf || + BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); + kfree(sf); +} + +static int btrfsic_process_metablock( + struct btrfsic_state *state, + struct btrfsic_block *const first_block, + struct btrfsic_block_data_ctx *const first_block_ctx, + int first_limit_nesting, int force_iodone_flag) +{ + struct btrfsic_stack_frame initial_stack_frame = { 0 }; + struct btrfsic_stack_frame *sf; + struct btrfsic_stack_frame *next_stack; + struct btrfs_header *const first_hdr = + (struct btrfs_header *)first_block_ctx->datav[0]; + + BUG_ON(!first_hdr); + sf = &initial_stack_frame; + sf->error = 0; + sf->i = -1; + sf->limit_nesting = first_limit_nesting; + sf->block = first_block; + sf->block_ctx = first_block_ctx; + sf->next_block = NULL; + sf->hdr = first_hdr; + sf->prev = NULL; + +continue_with_new_stack_frame: + sf->block->generation = le64_to_cpu(sf->hdr->generation); + if (0 == sf->hdr->level) { + struct btrfs_leaf *const leafhdr = + (struct btrfs_leaf *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = btrfs_stack_header_nritems(&leafhdr->header); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "leaf %llu items %d generation %llu" + " owner %llu\n", + sf->block_ctx->start, sf->nr, + btrfs_stack_header_generation( + &leafhdr->header), + btrfs_stack_header_owner( + &leafhdr->header)); + } + +continue_with_current_leaf_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_item disk_item; + u32 disk_item_offset = + (uintptr_t)(leafhdr->items + sf->i) - + (uintptr_t)leafhdr; + struct btrfs_disk_key *disk_key; + u8 type; + u32 item_offset; + u32 item_size; + + if (disk_item_offset + sizeof(struct btrfs_item) > + sf->block_ctx->len) { +leaf_item_out_of_bounce_error: + printk(KERN_INFO + "btrfsic: leaf item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data(sf->block_ctx, + &disk_item, + disk_item_offset, + sizeof(struct btrfs_item)); + item_offset = btrfs_stack_item_offset(&disk_item); + item_size = btrfs_stack_item_size(&disk_item); + disk_key = &disk_item.key; + type = btrfs_disk_key_type(disk_key); + + if (BTRFS_ROOT_ITEM_KEY == type) { + struct btrfs_root_item root_item; + u32 root_item_offset; + u64 next_bytenr; + + root_item_offset = item_offset + + offsetof(struct btrfs_leaf, items); + if (root_item_offset + item_size > + sf->block_ctx->len) + goto leaf_item_out_of_bounce_error; + btrfsic_read_from_block_data( + sf->block_ctx, &root_item, + root_item_offset, + item_size); + next_bytenr = btrfs_root_bytenr(&root_item); + + sf->error = + btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + disk_key, + btrfs_root_generation( + &root_item)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.datav[0]; + + next_stack = + btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + sf->error = -1; + btrfsic_release_block_ctx( + &sf-> + next_block_ctx); + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = + &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + } else if (BTRFS_EXTENT_DATA_KEY == type && + state->include_extent_data) { + sf->error = btrfsic_handle_extent_data( + state, + sf->block, + sf->block_ctx, + item_offset, + force_iodone_flag); + if (sf->error) + goto one_stack_frame_backwards; + } + + goto continue_with_current_leaf_stack_frame; + } + } else { + struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = btrfs_stack_header_nritems(&nodehdr->header); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "node %llu level %d items %d" + " generation %llu owner %llu\n", + sf->block_ctx->start, + nodehdr->header.level, sf->nr, + btrfs_stack_header_generation( + &nodehdr->header), + btrfs_stack_header_owner( + &nodehdr->header)); + } + +continue_with_current_node_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_key_ptr key_ptr; + u32 key_ptr_offset; + u64 next_bytenr; + + key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - + (uintptr_t)nodehdr; + if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > + sf->block_ctx->len) { + printk(KERN_INFO + "btrfsic: node item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data( + sf->block_ctx, &key_ptr, key_ptr_offset, + sizeof(struct btrfs_key_ptr)); + next_bytenr = btrfs_stack_key_blockptr(&key_ptr); + + sf->error = btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + &key_ptr.key, + btrfs_stack_key_generation(&key_ptr)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.datav[0]; + + next_stack = btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + sf->error = -1; + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + + goto continue_with_current_node_stack_frame; + } + } + +one_stack_frame_backwards: + if (NULL != sf->prev) { + struct btrfsic_stack_frame *const prev = sf->prev; + + /* the one for the initial block is freed in the caller */ + btrfsic_release_block_ctx(sf->block_ctx); + + if (sf->error) { + prev->error = sf->error; + btrfsic_stack_frame_free(sf); + sf = prev; + goto one_stack_frame_backwards; + } + + btrfsic_stack_frame_free(sf); + sf = prev; + goto continue_with_new_stack_frame; + } else { + BUG_ON(&initial_stack_frame != sf); + } + + return sf->error; +} + +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dstv, u32 offset, size_t len) +{ + size_t cur; + size_t offset_in_page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(offset + len > block_ctx->len); + offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); + BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT); + kaddr = block_ctx->datav[i]; + memcpy(dst, kaddr + offset_in_page, cur); + + dst += cur; + len -= cur; + offset_in_page = 0; + i++; + } +} + +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation) +{ + struct btrfsic_block *next_block = NULL; + int ret; + struct btrfsic_block_link *l; + int did_alloc_block_link; + int block_was_created; + + *next_blockp = NULL; + if (0 == *num_copiesp) { + *num_copiesp = + btrfs_num_copies(state->root->fs_info, + next_bytenr, state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, *num_copiesp); + *mirror_nump = 1; + } + + if (*mirror_nump > *num_copiesp) + return 0; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_create_link_to_next_block(mirror_num=%d)\n", + *mirror_nump); + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, + next_block_ctx, *mirror_nump); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + next_bytenr, *mirror_nump); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + next_block = btrfsic_block_lookup_or_add(state, + next_block_ctx, "referenced ", + 1, force_iodone_flag, + !force_iodone_flag, + *mirror_nump, + &block_was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + if (block_was_created) { + l = NULL; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + } else { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch (!= stored %llu).\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, next_block), + next_block->logical_bytenr); + } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, next_block)); + next_block->logical_bytenr = next_bytenr; + + next_block->mirror_num = *mirror_nump; + l = btrfsic_block_link_hashtable_lookup( + next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_link_hashtable); + } + + next_block->disk_key = *disk_key; + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + did_alloc_block_link = 1; + l->block_ref_to = next_block; + l->block_ref_from = block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + did_alloc_block_link = 0; + if (0 == limit_nesting) { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + } + + if (limit_nesting > 0 && did_alloc_block_link) { + ret = btrfsic_read_block(state, next_block_ctx); + if (ret < (int)next_block_ctx->len) { + printk(KERN_INFO + "btrfsic: read block @logical %llu failed!\n", + next_bytenr); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + *next_blockp = next_block; + } else { + *next_blockp = NULL; + } + (*mirror_nump)++; + + return 0; +} + +static int btrfsic_handle_extent_data( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag) +{ + int ret; + struct btrfs_file_extent_item file_extent_item; + u64 file_extent_item_offset; + u64 next_bytenr; + u64 num_bytes; + u64 generation; + struct btrfsic_block_link *l; + + file_extent_item_offset = offsetof(struct btrfs_leaf, items) + + item_offset; + if (file_extent_item_offset + + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + offsetof(struct btrfs_file_extent_item, disk_num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || + btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", + file_extent_item.type, + btrfs_stack_file_extent_disk_bytenr( + &file_extent_item)); + return 0; + } + + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + sizeof(struct btrfs_file_extent_item)); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); + if (btrfs_stack_file_extent_compression(&file_extent_item) == + BTRFS_COMPRESS_NONE) { + next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + } else { + num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); + } + generation = btrfs_stack_file_extent_generation(&file_extent_item); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," + " offset = %llu, num_bytes = %llu\n", + file_extent_item.type, + btrfs_stack_file_extent_disk_bytenr(&file_extent_item), + btrfs_stack_file_extent_offset(&file_extent_item), + num_bytes); + while (num_bytes > 0) { + u32 chunk_len; + int num_copies; + int mirror_num; + + if (num_bytes > state->datablock_size) + chunk_len = state->datablock_size; + else + chunk_len = num_bytes; + + num_copies = + btrfs_num_copies(state->root->fs_info, + next_bytenr, state->datablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfsic_block *next_block; + int block_was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "btrfsic_handle_extent_data(" + "mirror_num=%d)\n", mirror_num); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO + "\tdisk_bytenr = %llu, num_bytes %u\n", + next_bytenr, chunk_len); + ret = btrfsic_map_block(state, next_bytenr, + chunk_len, &next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + next_bytenr, mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &next_block_ctx, + "referenced ", + 0, + force_iodone_flag, + !force_iodone_flag, + mirror_num, + &block_was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&next_block_ctx); + return -1; + } + if (!block_was_created) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block" + " @%llu (%s/%llu/%d)" + " found in hash table, D," + " bytenr mismatch" + " (!= stored %llu).\n", + next_bytenr, + next_block_ctx.dev->name, + next_block_ctx.dev_bytenr, + mirror_num, + next_block->logical_bytenr); + } + next_block->logical_bytenr = next_bytenr; + next_block->mirror_num = mirror_num; + } + + l = btrfsic_block_link_lookup_or_add(state, + &next_block_ctx, + next_block, block, + generation); + btrfsic_release_block_ctx(&next_block_ctx); + if (NULL == l) + return -1; + } + + next_bytenr += chunk_len; + num_bytes -= chunk_len; + } + + return 0; +} + +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num) +{ + int ret; + u64 length; + struct btrfs_bio *multi = NULL; + struct btrfs_device *device; + + length = len; + ret = btrfs_map_block(state->root->fs_info, READ, + bytenr, &length, &multi, mirror_num); + + if (ret) { + block_ctx_out->start = 0; + block_ctx_out->dev_bytenr = 0; + block_ctx_out->len = 0; + block_ctx_out->dev = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + return ret; + } + + device = multi->stripes[0].dev; + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + kfree(multi); + if (NULL == block_ctx_out->dev) { + ret = -ENXIO; + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); + } + + return ret; +} + +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out) +{ + block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); + block_ctx_out->dev_bytenr = bytenr; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + if (NULL != block_ctx_out->dev) { + return 0; + } else { + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); + return -ENXIO; + } +} + +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) +{ + if (block_ctx->mem_to_free) { + unsigned int num_pages; + + BUG_ON(!block_ctx->datav); + BUG_ON(!block_ctx->pagev); + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + while (num_pages > 0) { + num_pages--; + if (block_ctx->datav[num_pages]) { + kunmap(block_ctx->pagev[num_pages]); + block_ctx->datav[num_pages] = NULL; + } + if (block_ctx->pagev[num_pages]) { + __free_page(block_ctx->pagev[num_pages]); + block_ctx->pagev[num_pages] = NULL; + } + } + + kfree(block_ctx->mem_to_free); + block_ctx->mem_to_free = NULL; + block_ctx->pagev = NULL; + block_ctx->datav = NULL; + } +} + +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx) +{ + unsigned int num_pages; + unsigned int i; + u64 dev_bytenr; + int ret; + + BUG_ON(block_ctx->datav); + BUG_ON(block_ctx->pagev); + BUG_ON(block_ctx->mem_to_free); + if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: read_block() with unaligned bytenr %llu\n", + block_ctx->dev_bytenr); + return -1; + } + + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + + sizeof(*block_ctx->pagev)) * + num_pages, GFP_NOFS); + if (!block_ctx->mem_to_free) + return -1; + block_ctx->datav = block_ctx->mem_to_free; + block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); + for (i = 0; i < num_pages; i++) { + block_ctx->pagev[i] = alloc_page(GFP_NOFS); + if (!block_ctx->pagev[i]) + return -1; + } + + dev_bytenr = block_ctx->dev_bytenr; + for (i = 0; i < num_pages;) { + struct bio *bio; + unsigned int j; + + bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); + if (!bio) { + printk(KERN_INFO + "btrfsic: bio_alloc() for %u pages failed!\n", + num_pages - i); + return -1; + } + bio->bi_bdev = block_ctx->dev->bdev; + bio->bi_iter.bi_sector = dev_bytenr >> 9; + + for (j = i; j < num_pages; j++) { + ret = bio_add_page(bio, block_ctx->pagev[j], + PAGE_CACHE_SIZE, 0); + if (PAGE_CACHE_SIZE != ret) + break; + } + if (j == i) { + printk(KERN_INFO + "btrfsic: error, failed to add a single page!\n"); + return -1; + } + if (submit_bio_wait(READ, bio)) { + printk(KERN_INFO + "btrfsic: read error at logical %llu dev %s!\n", + block_ctx->start, block_ctx->dev->name); + bio_put(bio); + return -1; + } + bio_put(bio); + dev_bytenr += (j - i) * PAGE_CACHE_SIZE; + i = j; + } + for (i = 0; i < num_pages; i++) { + block_ctx->datav[i] = kmap(block_ctx->pagev[i]); + if (!block_ctx->datav[i]) { + printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n", + block_ctx->dev->name); + return -1; + } + } + + return block_ctx->len; +} + +static void btrfsic_dump_database(struct btrfsic_state *state) +{ + struct list_head *elem_all; + + BUG_ON(NULL == state); + + printk(KERN_INFO "all_blocks_list:\n"); + list_for_each(elem_all, &state->all_blocks_list) { + const struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *elem_ref_from; + + printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num); + + list_for_each(elem_ref_to, &b_all->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " refers %u* to" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + } + + list_for_each(elem_ref_from, &b_all->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, + struct btrfsic_block_link, + node_ref_from); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " is ref %u* from" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + } + + printk(KERN_INFO "\n"); + } +} + +/* + * Test whether the disk block contains a tree block (leaf or node) + * (note that this test fails for the super block) + */ +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + char **datav, unsigned int num_pages) +{ + struct btrfs_header *h; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + unsigned int i; + + if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) + return 1; /* not metadata */ + num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; + h = (struct btrfs_header *)datav[0]; + + if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) + return 1; + + for (i = 0; i < num_pages; i++) { + u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); + size_t sublen = i ? PAGE_CACHE_SIZE : + (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); + + crc = btrfs_crc32c(crc, data, sublen); + } + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, state->csum_size)) + return 1; + + return 0; /* is metadata */ +} + +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw) +{ + int is_metadata; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx block_ctx; + int ret; + struct btrfsic_state *state = dev_state->state; + struct block_device *bdev = dev_state->bdev; + unsigned int processed_len; + + if (NULL != bio_is_patched) + *bio_is_patched = 0; + +again: + if (num_pages == 0) + return; + + processed_len = 0; + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, + num_pages)); + + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, + &state->block_hashtable); + if (NULL != block) { + u64 bytenr = 0; + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + if (block->is_superblock) { + bytenr = btrfs_super_bytenr((struct btrfs_super_block *) + mapped_datav[0]); + if (num_pages * PAGE_CACHE_SIZE < + BTRFS_SUPER_INFO_SIZE) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + is_metadata = 1; + BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); + processed_len = BTRFS_SUPER_INFO_SIZE; + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { + printk(KERN_INFO + "[before new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } + if (is_metadata) { + if (!block->is_superblock) { + if (num_pages * PAGE_CACHE_SIZE < + state->metablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->metablock_size; + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, + dev_state, + dev_bytenr); + } + if (block->logical_bytenr != bytenr && + !(!block->is_metadata && + block->logical_bytenr == 0)) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch" + " (!= stored %llu).\n", + bytenr, dev_state->name, dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block), + block->logical_bytenr); + else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + bytenr, dev_state->name, dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + block->logical_bytenr = bytenr; + } else { + if (num_pages * PAGE_CACHE_SIZE < + state->datablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->datablock_size; + bytenr = block->logical_bytenr; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + bytenr, dev_state->name, dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "ref_to_list: %cE, ref_from_list: %cE\n", + list_empty(&block->ref_to_list) ? ' ' : '!', + list_empty(&block->ref_from_list) ? ' ' : '!'); + if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), old(gen=%llu," + " objectid=%llu, type=%d, offset=%llu)," + " new(gen=%llu)," + " which is referenced by most recent superblock" + " (superblockgen=%llu)!\n", + btrfsic_get_block_type(state, block), bytenr, + dev_state->name, dev_bytenr, block->mirror_num, + block->generation, + btrfs_disk_key_objectid(&block->disk_key), + block->disk_key.type, + btrfs_disk_key_offset(&block->disk_key), + btrfs_stack_header_generation( + (struct btrfs_header *) mapped_datav[0]), + state->max_superblock_generation); + btrfsic_dump_tree(state); + } + + if (!block->is_iodone && !block->never_written) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," + " which is not yet iodone!\n", + btrfsic_get_block_type(state, block), bytenr, + dev_state->name, dev_bytenr, block->mirror_num, + block->generation, + btrfs_stack_header_generation( + (struct btrfs_header *) + mapped_datav[0])); + /* it would not be safe to go on */ + btrfsic_dump_tree(state); + goto continue_loop; + } + + /* + * Clear all references of this block. Do not free + * the block itself even if is not referenced anymore + * because it still carries valueable information + * like whether it was ever written and IO completed. + */ + list_for_each_safe(elem_ref_to, tmp_ref_to, + &block->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + l->ref_cnt--; + if (0 == l->ref_cnt) { + list_del(&l->node_ref_to); + list_del(&l->node_ref_from); + btrfsic_block_link_hashtable_remove(l); + btrfsic_block_link_free(l); + } + } + + if (block->is_superblock) + ret = btrfsic_map_superblock(state, bytenr, + processed_len, + bdev, &block_ctx); + else + ret = btrfsic_map_block(state, bytenr, processed_len, + &block_ctx, 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", bytenr); + goto continue_loop; + } + block_ctx.datav = mapped_datav; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + if (is_metadata || state->include_extent_data) { + block->never_written = 0; + block->iodone_w_error = 0; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = + bio->bi_private; + block->orig_bio_bh_end_io.bio = + bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io. + bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + } + + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (is_metadata) { + block->logical_bytenr = bytenr; + block->is_metadata = 1; + if (block->is_superblock) { + BUG_ON(PAGE_CACHE_SIZE != + BTRFS_SUPER_INFO_SIZE); + ret = btrfsic_process_written_superblock( + state, + block, + (struct btrfs_super_block *) + mapped_datav[0]); + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { + printk(KERN_INFO + "[after new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } else { + block->mirror_num = 0; /* unknown */ + ret = btrfsic_process_metablock( + state, + block, + &block_ctx, + 0, 0); + } + if (ret) + printk(KERN_INFO + "btrfsic: btrfsic_process_metablock" + "(root @%llu) failed!\n", + dev_bytenr); + } else { + block->is_metadata = 0; + block->mirror_num = 0; /* unknown */ + block->generation = BTRFSIC_GENERATION_UNKNOWN; + if (!state->include_extent_data + && list_empty(&block->ref_from_list)) { + /* + * disk block is overwritten with extent + * data (not meta data) and we are configured + * to not include extent data: take the + * chance and free the block's memory + */ + btrfsic_block_hashtable_remove(block); + list_del(&block->all_blocks_node); + btrfsic_block_free(block); + } + } + btrfsic_release_block_ctx(&block_ctx); + } else { + /* block has not been found in hash table */ + u64 bytenr; + + if (!is_metadata) { + processed_len = state->datablock_size; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "Written block (%s/%llu/?)" + " !found in hash table, D.\n", + dev_state->name, dev_bytenr); + if (!state->include_extent_data) { + /* ignore that written D block */ + goto continue_loop; + } + + /* this is getting ugly for the + * include_extent_data case... */ + bytenr = 0; /* unknown */ + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.mem_to_free = NULL; + block_ctx.pagev = NULL; + } else { + processed_len = state->metablock_size; + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, + dev_bytenr); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/?)" + " !found in hash table, M.\n", + bytenr, dev_state->name, dev_bytenr); + + ret = btrfsic_map_block(state, bytenr, processed_len, + &block_ctx, 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", + dev_bytenr); + goto continue_loop; + } + } + block_ctx.datav = mapped_datav; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&block_ctx); + goto continue_loop; + } + block->dev_state = dev_state; + block->dev_bytenr = dev_bytenr; + block->logical_bytenr = bytenr; + block->is_metadata = is_metadata; + block->never_written = 0; + block->iodone_w_error = 0; + block->mirror_num = 0; /* unknown */ + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io.bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New written %c-block @%llu (%s/%llu/%d)\n", + is_metadata ? 'M' : 'D', + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + + if (is_metadata) { + ret = btrfsic_process_metablock(state, block, + &block_ctx, 0, 0); + if (ret) + printk(KERN_INFO + "btrfsic: process_metablock(root @%llu)" + " failed!\n", + dev_bytenr); + } + btrfsic_release_block_ctx(&block_ctx); + } + +continue_loop: + BUG_ON(!processed_len); + dev_bytenr += processed_len; + mapped_datav += processed_len >> PAGE_CACHE_SHIFT; + num_pages -= processed_len >> PAGE_CACHE_SHIFT; + goto again; +} + +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + int iodone_w_error; + + /* mutex is not held! This is not save if IO is not yet completed + * on umount */ + iodone_w_error = 0; + if (bio_error_status) + iodone_w_error = 1; + + BUG_ON(NULL == block); + bp->bi_private = block->orig_bio_bh_private; + bp->bi_end_io = block->orig_bio_bh_end_io.bio; + + do { + struct btrfsic_block *next_block; + struct btrfsic_dev_state *const dev_state = block->dev_state; + + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + bio_error_status, + btrfsic_get_block_type(dev_state->state, block), + block->logical_bytenr, dev_state->name, + block->dev_bytenr, block->mirror_num); + next_block = block->next_in_same_bio; + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io() new %s flush_gen=%llu\n", + dev_state->name, + dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is + * on disk */ + block->is_iodone = 1; /* for FLUSH, this releases the block */ + block = next_block; + } while (NULL != block); + + bp->bi_end_io(bp, bio_error_status); +} + +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; + int iodone_w_error = !uptodate; + struct btrfsic_dev_state *dev_state; + + BUG_ON(NULL == block); + dev_state = block->dev_state; + if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", + iodone_w_error, + btrfsic_get_block_type(dev_state->state, block), + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); + + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io() new %s flush_gen=%llu\n", + dev_state->name, dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is on disk */ + + bh->b_private = block->orig_bio_bh_private; + bh->b_end_io = block->orig_bio_bh_end_io.bh; + block->is_iodone = 1; /* for FLUSH, this releases the block */ + bh->b_end_io(bh, uptodate); +} + +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const superblock, + struct btrfs_super_block *const super_hdr) +{ + int pass; + + superblock->generation = btrfs_super_generation(super_hdr); + if (!(superblock->generation > state->max_superblock_generation || + 0 == state->max_superblock_generation)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: superblock @%llu (%s/%llu/%d)" + " with old gen %llu <= %llu\n", + superblock->logical_bytenr, + superblock->dev_state->name, + superblock->dev_bytenr, superblock->mirror_num, + btrfs_super_generation(super_hdr), + state->max_superblock_generation); + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: got new superblock @%llu (%s/%llu/%d)" + " with new gen %llu > %llu\n", + superblock->logical_bytenr, + superblock->dev_state->name, + superblock->dev_bytenr, superblock->mirror_num, + btrfs_super_generation(super_hdr), + state->max_superblock_generation); + + state->max_superblock_generation = + btrfs_super_generation(super_hdr); + state->latest_superblock = superblock; + } + + for (pass = 0; pass < 3; pass++) { + int ret; + u64 next_bytenr; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key = {0}; + + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_ITEM_KEY); + btrfs_set_disk_key_objectid(&tmp_disk_key, 0); + + switch (pass) { + case 0: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); + additional_string = "root "; + next_bytenr = btrfs_super_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", next_bytenr); + break; + case 1: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "chunk "; + next_bytenr = btrfs_super_chunk_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", next_bytenr); + break; + case 2: + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); + additional_string = "log "; + next_bytenr = btrfs_super_log_root(super_hdr); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(state->root->fs_info, + next_bytenr, BTRFS_SUPER_INFO_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + int was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_process_written_superblock(" + "mirror_num=%d)\n", mirror_num); + ret = btrfsic_map_block(state, next_bytenr, + BTRFS_SUPER_INFO_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + next_bytenr, mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &tmp_next_block_ctx, + additional_string, + 1, 0, 1, + mirror_num, + &was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + return -1; + } + + next_block->disk_key = tmp_disk_key; + if (was_created) + next_block->generation = + BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, + &tmp_next_block_ctx, + next_block, + superblock, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) + return -1; + } + } + + if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0))) + btrfsic_dump_tree(state); + + return 0; +} + +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level) +{ + struct list_head *elem_ref_to; + int ret = 0; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* + * Note that this situation can happen and does not + * indicate an error in regular cases. It happens + * when disk blocks are freed and later reused. + * The check-integrity module is not aware of any + * block free operations, it just recognizes block + * write operations. Therefore it keeps the linkage + * information for a block until a block is + * rewritten. This can temporarily cause incorrect + * and even circular linkage informations. This + * causes no harm unless such blocks are referenced + * by the most recent super block. + */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 1).\n"); + + return ret; + } + + /* + * This algorithm is recursive because the amount of used stack + * space is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " %u* refers to %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + if (l->block_ref_to->never_written) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is never written!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (!l->block_ref_to->is_iodone) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not yet iodone!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->block_ref_to->iodone_w_error) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which has write error!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->parent_generation != + l->block_ref_to->generation && + BTRFSIC_GENERATION_UNKNOWN != + l->parent_generation && + BTRFSIC_GENERATION_UNKNOWN != + l->block_ref_to->generation) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " with generation %llu !=" + " parent generation %llu!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + l->block_ref_to->generation, + l->parent_generation); + ret = -1; + } else if (l->block_ref_to->flush_gen > + l->block_ref_to->dev_state->last_flush_gen) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not flushed out of disk's write cache" + " (block flush_gen=%llu," + " dev->flush_gen=%llu)!\n", + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, block->flush_gen, + l->block_ref_to->dev_state->last_flush_gen); + ret = -1; + } else if (-1 == btrfsic_check_all_ref_blocks(state, + l->block_ref_to, + recursion_level + + 1)) { + ret = -1; + } + } + + return ret; +} + +static int btrfsic_is_block_ref_by_superblock( + const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level) +{ + struct list_head *elem_ref_from; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* refer to comment at "abort cyclic linkage (case 1)" */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 2).\n"); + + return 0; + } + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_from, &block->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, struct btrfsic_block_link, + node_ref_from); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " is ref %u* from %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + if (l->block_ref_from->is_superblock && + state->latest_superblock->dev_bytenr == + l->block_ref_from->dev_bytenr && + state->latest_superblock->dev_state->bdev == + l->block_ref_from->dev_state->bdev) + return 1; + else if (btrfsic_is_block_ref_by_superblock(state, + l->block_ref_from, + recursion_level + + 1)) + return 1; + } + + return 0; +} + +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Add %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Rem %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block) +{ + if (block->is_superblock && + state->latest_superblock->dev_bytenr == block->dev_bytenr && + state->latest_superblock->dev_state->bdev == block->dev_state->bdev) + return 'S'; + else if (block->is_superblock) + return 's'; + else if (block->is_metadata) + return 'M'; + else + return 'D'; +} + +static void btrfsic_dump_tree(const struct btrfsic_state *state) +{ + btrfsic_dump_tree_sub(state, state->latest_superblock, 0); +} + +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level) +{ + struct list_head *elem_ref_to; + int indent_add; + static char buf[80]; + int cursor_position; + + /* + * Should better fill an on-stack buffer with a complete line and + * dump it at once when it is time to print a newline character. + */ + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + btrfsic_get_block_type(state, block), + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); + if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + return; + } + printk(buf); + indent_level += indent_add; + if (list_empty(&block->ref_to_list)) { + printk("\n"); + return; + } + if (block->mirror_num > 1 && + !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { + printk(" [...]\n"); + return; + } + + cursor_position = indent_level; + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + while (cursor_position < indent_level) { + printk(" "); + cursor_position++; + } + if (l->ref_cnt > 1) + indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); + else + indent_add = sprintf(buf, " --> "); + if (indent_level + indent_add > + BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + cursor_position = 0; + continue; + } + + printk(buf); + + btrfsic_dump_tree_sub(state, l->block_ref_to, + indent_level + indent_add); + cursor_position = 0; + } +} + +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation) +{ + struct btrfsic_block_link *l; + + l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + from_block->dev_state->bdev, + from_block->dev_bytenr, + &state->block_link_hashtable); + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO + "btrfsic: error, kmalloc" " failed!\n"); + return NULL; + } + + l->block_ref_to = next_block; + l->block_ref_from = from_block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &from_block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + + return l; +} + +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created) +{ + struct btrfsic_block *block; + + block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_hashtable); + if (NULL == block) { + struct btrfsic_dev_state *dev_state; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return NULL; + } + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + if (NULL == dev_state) { + printk(KERN_INFO + "btrfsic: error, lookup dev_state failed!\n"); + btrfsic_block_free(block); + return NULL; + } + block->dev_state = dev_state; + block->dev_bytenr = block_ctx->dev_bytenr; + block->logical_bytenr = block_ctx->start; + block->is_metadata = is_metadata; + block->is_iodone = is_iodone; + block->never_written = never_written; + block->mirror_num = mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New %s%c-block @%llu (%s/%llu/%d)\n", + additional_string, + btrfsic_get_block_type(state, block), + block->logical_bytenr, dev_state->name, + block->dev_bytenr, mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + if (NULL != was_created) + *was_created = 1; + } else { + if (NULL != was_created) + *was_created = 0; + } + + return block; +} + +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr) +{ + int num_copies; + int mirror_num; + int ret; + struct btrfsic_block_data_ctx block_ctx; + int match = 0; + + num_copies = btrfs_num_copies(state->root->fs_info, + bytenr, state->metablock_size); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, state->metablock_size, + &block_ctx, mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(logical @%llu," + " mirror %d) failed!\n", + bytenr, mirror_num); + continue; + } + + if (dev_state->bdev == block_ctx.dev->bdev && + dev_bytenr == block_ctx.dev_bytenr) { + match++; + btrfsic_release_block_ctx(&block_ctx); + break; + } + btrfsic_release_block_ctx(&block_ctx); + } + + if (WARN_ON(!match)) { + printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," + " buffer->log_bytenr=%llu, submit_bio(bdev=%s," + " phys_bytenr=%llu)!\n", + bytenr, dev_state->name, dev_bytenr); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, + state->metablock_size, + &block_ctx, mirror_num); + if (ret) + continue; + + printk(KERN_INFO "Read logical bytenr @%llu maps to" + " (%s/%llu/%d)\n", + bytenr, block_ctx.dev->name, + block_ctx.dev_bytenr, mirror_num); + } + } +} + +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev) +{ + struct btrfsic_dev_state *ds; + + ds = btrfsic_dev_state_hashtable_lookup(bdev, + &btrfsic_dev_state_hashtable); + return ds; +} + +int btrfsic_submit_bh(int rw, struct buffer_head *bh) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return submit_bh(rw, bh); + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bh() might also be called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + + /* Only called to write the superblock (incl. FLUSH/FUA) */ + if (NULL != dev_state && + (rw & WRITE) && bh->b_size > 0) { + u64 dev_bytenr; + + dev_bytenr = 4096 * bh->b_blocknr; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu)," + " size=%zu, data=%p, bdev=%p)\n", + rw, (unsigned long long)bh->b_blocknr, + dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); + btrfsic_process_written_block(dev_state, dev_bytenr, + &bh->b_data, 1, NULL, + NULL, bh, rw); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", + rw, bh->b_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bh(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + return submit_bh(rw, bh); +} + +static void __btrfsic_submit_bio(int rw, struct bio *bio) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bio() is also called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + if (NULL != dev_state && + (rw & WRITE) && NULL != bio->bi_io_vec) { + unsigned int i; + u64 dev_bytenr; + u64 cur_bytenr; + int bio_is_patched; + char **mapped_datav; + + dev_bytenr = 512 * bio->bi_iter.bi_sector; + bio_is_patched = 0; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x, bi_vcnt=%u," + " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + rw, bio->bi_vcnt, + (unsigned long long)bio->bi_iter.bi_sector, + dev_bytenr, bio->bi_bdev); + + mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, + GFP_NOFS); + if (!mapped_datav) + goto leave; + cur_bytenr = dev_bytenr; + for (i = 0; i < bio->bi_vcnt; i++) { + BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); + mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); + if (!mapped_datav[i]) { + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); + goto leave; + } + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) + printk(KERN_INFO + "#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bio->bi_io_vec[i].bv_len, + bio->bi_io_vec[i].bv_offset); + cur_bytenr += bio->bi_io_vec[i].bv_len; + } + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_datav, bio->bi_vcnt, + bio, &bio_is_patched, + NULL, rw); + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", + rw, bio->bi_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bio(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } + } +leave: + mutex_unlock(&btrfsic_mutex); +} + +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); + submit_bio(rw, bio); +} + +int btrfsic_submit_bio_wait(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); + return submit_bio_wait(rw, bio); +} + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask) +{ + int ret; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (root->nodesize != root->leafsize) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d != leafsize %d!\n", + root->nodesize, root->leafsize); + return -1; + } + if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->nodesize, PAGE_CACHE_SIZE); + return -1; + } + if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->leafsize, PAGE_CACHE_SIZE); + return -1; + } + if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->sectorsize, PAGE_CACHE_SIZE); + return -1; + } + state = kzalloc(sizeof(*state), GFP_NOFS); + if (NULL == state) { + printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); + return -1; + } + + if (!btrfsic_is_initialized) { + mutex_init(&btrfsic_mutex); + btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); + btrfsic_is_initialized = 1; + } + mutex_lock(&btrfsic_mutex); + state->root = root; + state->print_mask = print_mask; + state->include_extent_data = including_extent_data; + state->csum_size = 0; + state->metablock_size = root->nodesize; + state->datablock_size = root->sectorsize; + INIT_LIST_HEAD(&state->all_blocks_list); + btrfsic_block_hashtable_init(&state->block_hashtable); + btrfsic_block_link_hashtable_init(&state->block_link_hashtable); + state->max_superblock_generation = 0; + state->latest_superblock = NULL; + + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + char *p; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_alloc(); + if (NULL == ds) { + printk(KERN_INFO + "btrfs check-integrity: kmalloc() failed!\n"); + mutex_unlock(&btrfsic_mutex); + return -1; + } + ds->bdev = device->bdev; + ds->state = state; + bdevname(ds->bdev, ds->name); + ds->name[BDEVNAME_SIZE - 1] = '\0'; + for (p = ds->name; *p != '\0'; p++); + while (p > ds->name && *p != '/') + p--; + if (*p == '/') + p++; + strlcpy(ds->name, p, sizeof(ds->name)); + btrfsic_dev_state_hashtable_add(ds, + &btrfsic_dev_state_hashtable); + } + + ret = btrfsic_process_superblock(state, fs_devices); + if (0 != ret) { + mutex_unlock(&btrfsic_mutex); + btrfsic_unmount(root, fs_devices); + return ret; + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) + btrfsic_dump_database(state); + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) + btrfsic_dump_tree(state); + + mutex_unlock(&btrfsic_mutex); + return 0; +} + +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices) +{ + struct list_head *elem_all; + struct list_head *tmp_all; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + + state = NULL; + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_hashtable_lookup( + device->bdev, + &btrfsic_dev_state_hashtable); + if (NULL != ds) { + state = ds->state; + btrfsic_dev_state_hashtable_remove(ds); + btrfsic_dev_state_free(ds); + } + } + + if (NULL == state) { + printk(KERN_INFO + "btrfsic: error, cannot find state information" + " on umount!\n"); + mutex_unlock(&btrfsic_mutex); + return; + } + + /* + * Don't care about keeping the lists' state up to date, + * just free all memory that was allocated dynamically. + * Free the blocks and the block_links. + */ + list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { + struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + list_for_each_safe(elem_ref_to, tmp_ref_to, + &b_all->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + + l->ref_cnt--; + if (0 == l->ref_cnt) + btrfsic_block_link_free(l); + } + + if (b_all->is_iodone || b_all->never_written) + btrfsic_block_free(b_all); + else + printk(KERN_INFO "btrfs: attempt to free %c-block" + " @%llu (%s/%llu/%d) on umount which is" + " not yet iodone!\n", + btrfsic_get_block_type(state, b_all), + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num); + } + + mutex_unlock(&btrfsic_mutex); + + kfree(state); +} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h new file mode 100644 index 00000000000..13b8566c97a --- /dev/null +++ b/fs/btrfs/check-integrity.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_CHECK_INTEGRITY__) +#define __BTRFS_CHECK_INTEGRITY__ + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +int btrfsic_submit_bh(int rw, struct buffer_head *bh); +void btrfsic_submit_bio(int rw, struct bio *bio); +int btrfsic_submit_bio_wait(int rw, struct bio *bio); +#else +#define btrfsic_submit_bh submit_bh +#define btrfsic_submit_bio submit_bio +#define btrfsic_submit_bio_wait submit_bio_wait +#endif + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask); +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices); + +#endif diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h deleted file mode 100644 index 7c4503ef6ef..00000000000 --- a/fs/btrfs/compat.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _COMPAT_H_ -#define _COMPAT_H_ - -#define btrfs_drop_nlink(inode) drop_nlink(inode) -#define btrfs_inc_nlink(inode) inc_nlink(inode) - -#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 396039b3a8a..1daea0b4718 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -32,7 +32,6 @@ #include <linux/writeback.h> #include <linux/bit_spinlock.h> #include <linux/slab.h> -#include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -62,6 +61,9 @@ struct compressed_bio { /* number of bytes on disk */ unsigned long compressed_len; + /* the compression algorithm for this bio */ + int compress_type; + /* number of compressed pages in the array */ unsigned long nr_pages; @@ -79,10 +81,15 @@ struct compressed_bio { u32 sums; }; +static int btrfs_decompress_biovec(int type, struct page **pages_in, + u64 disk_start, struct bio_vec *bvec, + int vcnt, size_t srclen); + static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + return sizeof(struct compressed_bio) + ((disk_size + root->sectorsize - 1) / root->sectorsize) * csum_size; @@ -91,23 +98,10 @@ static inline int compressed_bio_size(struct btrfs_root *root, static struct bio *compressed_bio_alloc(struct block_device *bdev, u64 first_byte, gfp_t gfp_flags) { - struct bio *bio; int nr_vecs; nr_vecs = bio_get_nr_vecs(bdev); - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio->bi_size = 0; - bio->bi_bdev = bdev; - bio->bi_sector = first_byte >> 9; - } - return bio; + return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); } static int check_compressed_csum(struct inode *inode, @@ -115,7 +109,6 @@ static int check_compressed_csum(struct inode *inode, u64 disk_start) { int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; struct page *page; unsigned long i; char *kaddr; @@ -129,17 +122,16 @@ static int check_compressed_csum(struct inode *inode, page = cb->compressed_pages[i]; csum = ~(u32)0; - kaddr = kmap_atomic(page, KM_USER0); - csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + kaddr = kmap_atomic(page); + csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE); btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (csum != *cb_sum) { - printk(KERN_INFO "btrfs csum failed ino %lu " - "extent %llu csum %u " - "wanted %u mirror %d\n", inode->i_ino, - (unsigned long long)disk_start, - csum, *cb_sum, cb->mirror_num); + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu extent %llu csum %u wanted %u mirror %d", + btrfs_ino(inode), disk_start, csum, *cb_sum, + cb->mirror_num); ret = -EIO; goto fail; } @@ -163,7 +155,6 @@ fail: */ static void end_compressed_bio_read(struct bio *bio, int err) { - struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; @@ -180,19 +171,20 @@ static void end_compressed_bio_read(struct bio *bio, int err) goto out; inode = cb->inode; - ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + ret = check_compressed_csum(inode, cb, + (u64)bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; /* ok, we're the last bio for this extent, lets start * the decompression. */ - tree = &BTRFS_I(inode)->io_tree; - ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, - cb->start, - cb->orig_bio->bi_io_vec, - cb->orig_bio->bi_vcnt, - cb->compressed_len); + ret = btrfs_decompress_biovec(cb->compress_type, + cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); csum_failed: if (ret) cb->errors = 1; @@ -209,18 +201,16 @@ csum_failed: if (cb->errors) { bio_io_error(cb->orig_bio); } else { - int bio_index = 0; - struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + int i; + struct bio_vec *bvec; /* * we have verified the checksum already, set page * checked so the end_io handlers know about it */ - while (bio_index < cb->orig_bio->bi_vcnt) { + bio_for_each_segment_all(bvec, cb->orig_bio, i) SetPageChecked(bvec->bv_page); - bvec++; - bio_index++; - } + bio_endio(cb->orig_bio, 0); } @@ -235,8 +225,8 @@ out: * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline int end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) +static noinline void end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; @@ -262,7 +252,6 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start, index += ret; } /* the inode may be gone now */ - return 0; } /* @@ -343,14 +332,17 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, struct compressed_bio *cb; unsigned long bytes_left; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - int page_index = 0; + int pg_index = 0; struct page *page; u64 first_byte = disk_start; struct block_device *bdev; int ret; + int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + return -ENOMEM; atomic_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -365,17 +357,21 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + if (!bio) { + kfree(cb); + return -ENOMEM; + } bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; atomic_inc(&cb->pending_bios); /* create and submit bios for the compressed pages */ bytes_left = compressed_len; - for (page_index = 0; page_index < cb->nr_pages; page_index++) { - page = compressed_pages[page_index]; + for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; - if (bio->bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + if (bio->bi_iter.bi_size) + ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, PAGE_CACHE_SIZE, bio, 0); else @@ -394,23 +390,28 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, */ atomic_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, + start, 1); + BUG_ON(ret); /* -ENOMEM */ + } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + BUG_ON(!bio); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); } if (bytes_left < PAGE_CACHE_SIZE) { - printk("bytes left %lu compress len %lu nr %lu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "bytes left %lu compress len %lu nr %lu", bytes_left, cb->compressed_len, cb->nr_pages); } bytes_left -= PAGE_CACHE_SIZE; @@ -420,13 +421,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_get(bio); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); /* -ENOMEM */ + } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); return 0; @@ -437,7 +440,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, struct compressed_bio *cb) { unsigned long end_index; - unsigned long page_index; + unsigned long pg_index; u64 last_offset; u64 isize = i_size_read(inode); int ret; @@ -461,15 +464,15 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; while (last_offset < compressed_end) { - page_index = last_offset >> PAGE_CACHE_SHIFT; + pg_index = last_offset >> PAGE_CACHE_SHIFT; - if (page_index > end_index) + if (pg_index > end_index) break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, page_index); + page = radix_tree_lookup(&mapping->page_tree, pg_index); rcu_read_unlock(); - if (page) { + if (page && !radix_tree_exceptional_entry(page)) { misses++; if (misses > 4) break; @@ -481,7 +484,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (!page) break; - if (add_to_page_cache_lru(page, mapping, page_index, + if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { page_cache_release(page); goto next; @@ -494,7 +497,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * sure they map to this compressed extent on disk. */ set_page_extent_mapped(page); - lock_extent(tree, last_offset, end, GFP_NOFS); + lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, PAGE_CACHE_SIZE); @@ -502,9 +505,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (!em || last_offset < em->start || (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || - (em->block_start >> 9) != cb->orig_bio->bi_sector) { + (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; @@ -518,10 +521,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (zero_offset) { int zeros; zeros = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, zeros); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); } } @@ -532,7 +535,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, nr_pages++; page_cache_release(page); } else { - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; @@ -548,7 +551,7 @@ next: * in it. We don't actually do IO on those pages but allocate new ones * to hold the compressed pages on disk. * - * bio->bi_sector points to the compressed extent on disk + * bio->bi_iter.bi_sector points to the compressed extent on disk * bio->bi_io_vec points to all of the inode pages * bio->bi_vcnt is a count of pages * @@ -565,15 +568,16 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; unsigned long compressed_len; unsigned long nr_pages; - unsigned long page_index; + unsigned long pg_index; struct page *page; struct block_device *bdev; struct bio *comp_bio; - u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; u64 em_len; u64 em_start; struct extent_map *em; - int ret; + int ret = -ENOMEM; + int faili = 0; u32 *sums; tree = &BTRFS_I(inode)->io_tree; @@ -585,9 +589,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); read_unlock(&em_tree->lock); + if (!em) + return -EIO; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + goto out; + atomic_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -603,38 +612,54 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; cb->compressed_len = compressed_len; + cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; - cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + if (!cb->compressed_pages) + goto fail1; + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - for (page_index = 0; page_index < nr_pages; page_index++) { - cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | + for (pg_index = 0; pg_index < nr_pages; pg_index++) { + cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!cb->compressed_pages[pg_index]) { + faili = pg_index - 1; + ret = -ENOMEM; + goto fail2; + } } + faili = nr_pages - 1; cb->nr_pages = nr_pages; - add_ra_bio_pages(inode, em_start + em_len, cb); + /* In the parent-locked case, we only locked the range we are + * interested in. In all other cases, we can opportunistically + * cache decompressed data that goes beyond the requested range. */ + if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED)) + add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; cb->len = uncompressed_len; comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + if (!comp_bio) + goto fail2; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; atomic_inc(&cb->pending_bios); - for (page_index = 0; page_index < nr_pages; page_index++) { - page = cb->compressed_pages[page_index]; + for (pg_index = 0; pg_index < nr_pages; pg_index++) { + page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; page->index = em_start >> PAGE_CACHE_SHIFT; - if (comp_bio->bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + if (comp_bio->bi_iter.bi_size) + ret = tree->ops->merge_bio_hook(READ, page, 0, PAGE_CACHE_SIZE, comp_bio, 0); else @@ -646,7 +671,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ /* * inc the count before we submit the bio so @@ -657,20 +682,23 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, atomic_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - btrfs_lookup_bio_sums(root, inode, comp_bio, - sums); + ret = btrfs_lookup_bio_sums(root, inode, + comp_bio, sums); + BUG_ON(ret); /* -ENOMEM */ } - sums += (comp_bio->bi_size + root->sectorsize - 1) / - root->sectorsize; + sums += (comp_bio->bi_iter.bi_size + + root->sectorsize - 1) / root->sectorsize; ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + BUG_ON(!comp_bio); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -681,14 +709,347 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + BUG_ON(ret); /* -ENOMEM */ + } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); return 0; + +fail2: + while (faili >= 0) { + __free_page(cb->compressed_pages[faili]); + faili--; + } + + kfree(cb->compressed_pages); +fail1: + kfree(cb); +out: + free_extent_map(em); + return ret; +} + +static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; +static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; +static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; +static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; +static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; + +static struct btrfs_compress_op *btrfs_compress_op[] = { + &btrfs_zlib_compress, + &btrfs_lzo_compress, +}; + +void __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + INIT_LIST_HEAD(&comp_idle_workspace[i]); + spin_lock_init(&comp_workspace_lock[i]); + atomic_set(&comp_alloc_workspace[i], 0); + init_waitqueue_head(&comp_workspace_wait[i]); + } +} + +/* + * this finds an available workspace or allocates a new one + * ERR_PTR is returned if things go bad. + */ +static struct list_head *find_workspace(int type) +{ + struct list_head *workspace; + int cpus = num_online_cpus(); + int idx = type - 1; + + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; +again: + spin_lock(workspace_lock); + if (!list_empty(idle_workspace)) { + workspace = idle_workspace->next; + list_del(workspace); + (*num_workspace)--; + spin_unlock(workspace_lock); + return workspace; + + } + if (atomic_read(alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(workspace_lock); + prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + schedule(); + finish_wait(workspace_wait, &wait); + goto again; + } + atomic_inc(alloc_workspace); + spin_unlock(workspace_lock); + + workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (IS_ERR(workspace)) { + atomic_dec(alloc_workspace); + wake_up(workspace_wait); + } + return workspace; +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static void free_workspace(int type, struct list_head *workspace) +{ + int idx = type - 1; + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; + + spin_lock(workspace_lock); + if (*num_workspace < num_online_cpus()) { + list_add(workspace, idle_workspace); + (*num_workspace)++; + spin_unlock(workspace_lock); + goto wake; + } + spin_unlock(workspace_lock); + + btrfs_compress_op[idx]->free_workspace(workspace); + atomic_dec(alloc_workspace); +wake: + smp_mb(); + if (waitqueue_active(workspace_wait)) + wake_up(workspace_wait); +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct list_head *workspace; + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + while (!list_empty(&comp_idle_workspace[i])) { + workspace = comp_idle_workspace[i].next; + list_del(workspace); + btrfs_compress_op[i]->free_workspace(workspace); + atomic_dec(&comp_alloc_workspace[i]); + } + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. + * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return PTR_ERR(workspace); + + ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + start, len, pages, + nr_dest_pages, out_pages, + total_in, total_out, + max_out); + free_workspace(type, workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +static int btrfs_decompress_biovec(int type, struct page **pages_in, + u64 disk_start, struct bio_vec *bvec, + int vcnt, size_t srclen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return PTR_ERR(workspace); + + ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, + disk_start, + bvec, vcnt, srclen); + free_workspace(type, workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return PTR_ERR(workspace); + + ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + dest_page, start_byte, + srclen, destlen); + + free_workspace(type, workspace); + return ret; +} + +void btrfs_exit_compress(void) +{ + free_workspaces(); +} + +/* + * Copy uncompressed data from working buffer to pages. + * + * buf_start is the byte offset we're of the start of our workspace buffer. + * + * total_out is the last byte of the buffer + */ +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *pg_index, + unsigned long *pg_offset) +{ + unsigned long buf_offset; + unsigned long current_buf_start; + unsigned long start_byte; + unsigned long working_bytes = total_out - buf_start; + unsigned long bytes; + char *kaddr; + struct page *page_out = bvec[*pg_index].bv_page; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + return 1; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - *pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out); + memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + if (*pg_index == (vcnt - 1) && *pg_offset == 0) + memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); + kunmap_atomic(kaddr); + flush_dcache_page(page_out); + + *pg_offset += bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (*pg_offset == PAGE_CACHE_SIZE) { + (*pg_index)++; + if (*pg_index >= vcnt) + return 0; + + page_out = bvec[*pg_index].bv_page; + *pg_offset = 0; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; + + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } + } + } + + return 1; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 421f5b4aa71..0c803b4fbf9 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,24 +19,25 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen); -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen); -void btrfs_zlib_exit(void); +void btrfs_init_compress(void); +void btrfs_exit_compress(void); + +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *pg_index, + unsigned long *pg_offset); + int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, @@ -44,4 +45,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long nr_pages); int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); + +struct btrfs_compress_op { + struct list_head *(*alloc_workspace)(void); + + void (*free_workspace)(struct list_head *workspace); + + int (*compress_pages)(struct list_head *workspace, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); + + int (*decompress_biovec)(struct list_head *workspace, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); + + int (*decompress)(struct list_head *workspace, + unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +}; + +extern struct btrfs_compress_op btrfs_zlib_compress; +extern struct btrfs_compress_op btrfs_lzo_compress; + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6795a713b20..aeab453b8e2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/rbtree.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -36,20 +37,15 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot); -static int setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); - +static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, + int level, int slot); +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb); struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); - if (path) - path->reada = 1; return path; } @@ -61,8 +57,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) { int i; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (p->nodes[i] && p->locks[i]) - btrfs_set_lock_blocking(p->nodes[i]); + if (!p->nodes[i] || !p->locks[i]) + continue; + btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_READ_LOCK) + p->locks[i] = BTRFS_READ_LOCK_BLOCKING; + else if (p->locks[i] == BTRFS_WRITE_LOCK) + p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; } } @@ -75,7 +76,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) * for held */ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held) + struct extent_buffer *held, int held_rw) { int i; @@ -86,26 +87,38 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, * really sure by forcing the path to blocking before we clear * the path blocking. */ - if (held) - btrfs_set_lock_blocking(held); + if (held) { + btrfs_set_lock_blocking_rw(held, held_rw); + if (held_rw == BTRFS_WRITE_LOCK) + held_rw = BTRFS_WRITE_LOCK_BLOCKING; + else if (held_rw == BTRFS_READ_LOCK) + held_rw = BTRFS_READ_LOCK_BLOCKING; + } btrfs_set_path_blocking(p); #endif for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { - if (p->nodes[i] && p->locks[i]) - btrfs_clear_lock_blocking(p->nodes[i]); + if (p->nodes[i] && p->locks[i]) { + btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) + p->locks[i] = BTRFS_WRITE_LOCK; + else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) + p->locks[i] = BTRFS_READ_LOCK; + } } #ifdef CONFIG_DEBUG_LOCK_ALLOC if (held) - btrfs_clear_lock_blocking(held); + btrfs_clear_lock_blocking_rw(held, held_rw); #endif } /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { - btrfs_release_path(NULL, p); + if (!p) + return; + btrfs_release_path(p); kmem_cache_free(btrfs_path_cachep, p); } @@ -115,7 +128,7 @@ void btrfs_free_path(struct btrfs_path *p) * * It is safe to call this on paths that no locks or extent buffers held. */ -noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) +noinline void btrfs_release_path(struct btrfs_path *p) { int i; @@ -124,7 +137,7 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) if (!p->nodes[i]) continue; if (p->locks[i]) { - btrfs_tree_unlock(p->nodes[i]); + btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); p->locks[i] = 0; } free_extent_buffer(p->nodes[i]); @@ -145,10 +158,24 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - spin_lock(&root->node_lock); - eb = root->node; - extent_buffer_get(eb); - spin_unlock(&root->node_lock); + + while (1) { + rcu_read_lock(); + eb = rcu_dereference(root->node); + + /* + * RCU really hurts here, we could free up the root node because + * it was cow'ed but we may not get the new root node yet so do + * the inc_not_zero dance and if it doesn't work then + * synchronize_rcu and try again. + */ + if (atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + synchronize_rcu(); + } return eb; } @@ -163,30 +190,46 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) while (1) { eb = btrfs_root_node(root); btrfs_tree_lock(eb); - - spin_lock(&root->node_lock); - if (eb == root->node) { - spin_unlock(&root->node_lock); + if (eb == root->node) break; - } - spin_unlock(&root->node_lock); - btrfs_tree_unlock(eb); free_extent_buffer(eb); } return eb; } +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_read_lock(eb); + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + /* cowonly root (everything not a reference counted cow subvolume), just get * put onto a simple dirty list. transaction.c walks this to make sure they * get properly updated on disk. */ static void add_root_to_dirty_list(struct btrfs_root *root) { - if (root->track_dirty && list_empty(&root->dirty_list)) { + spin_lock(&root->fs_info->trans_lock); + if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && + list_empty(&root->dirty_list)) { list_add(&root->dirty_list, &root->fs_info->dirty_cowonly_roots); } + spin_unlock(&root->fs_info->trans_lock); } /* @@ -200,17 +243,16 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid) { struct extent_buffer *cow; - u32 nritems; int ret = 0; int level; struct btrfs_disk_key disk_key; - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); - nritems = btrfs_header_nritems(buf); if (level == 0) btrfs_item_key(buf, &disk_key, 0); else @@ -233,15 +275,14 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); if (ret) return ret; @@ -251,6 +292,666 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } +enum mod_log_op { + MOD_LOG_KEY_REPLACE, + MOD_LOG_KEY_ADD, + MOD_LOG_KEY_REMOVE, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, + MOD_LOG_MOVE_KEYS, + MOD_LOG_ROOT_REPLACE, +}; + +struct tree_mod_move { + int dst_slot; + int nr_items; +}; + +struct tree_mod_root { + u64 logical; + u8 level; +}; + +struct tree_mod_elem { + struct rb_node node; + u64 index; /* shifted logical */ + u64 seq; + enum mod_log_op op; + + /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ + int slot; + + /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ + u64 generation; + + /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ + struct btrfs_disk_key key; + u64 blockptr; + + /* this is used for op == MOD_LOG_MOVE_KEYS */ + struct tree_mod_move move; + + /* this is used for op == MOD_LOG_ROOT_REPLACE */ + struct tree_mod_root old_root; +}; + +static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) +{ + read_lock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) +{ + read_unlock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) +{ + write_lock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) +{ + write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * Pull a new tree mod seq number for our operation. + */ +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +{ + return atomic64_inc_return(&fs_info->tree_mod_seq); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. + */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + tree_mod_log_write_lock(fs_info); + spin_lock(&fs_info->tree_mod_seq_lock); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + } + spin_unlock(&fs_info->tree_mod_seq_lock); + tree_mod_log_write_unlock(fs_info); + + return elem->seq; +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct seq_list *cur_elem; + struct tree_mod_elem *tm; + u64 min_seq = (u64)-1; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + spin_lock(&fs_info->tree_mod_seq_lock); + list_del(&elem->list); + elem->seq = 0; + + list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { + if (cur_elem->seq < min_seq) { + if (seq_putting > cur_elem->seq) { + /* + * blocker with lower sequence number exists, we + * cannot remove anything from the log + */ + spin_unlock(&fs_info->tree_mod_seq_lock); + return; + } + min_seq = cur_elem->seq; + } + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + /* + * anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + tree_mod_log_write_lock(fs_info); + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = container_of(node, struct tree_mod_elem, node); + if (tm->seq > min_seq) + continue; + rb_erase(node, tm_root); + kfree(tm); + } + tree_mod_log_write_unlock(fs_info); +} + +/* + * key order of the log: + * index -> sequence + * + * the index is the shifted logical of the *new* root node for root replace + * operations, or the shifted logical of the affected block for all other + * operations. + * + * Note: must be called with write lock (tree_mod_log_write_lock). + */ +static noinline int +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + + BUG_ON(!tm); + + tm->seq = btrfs_inc_tree_mod_seq(fs_info); + + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = container_of(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->index < tm->index) + new = &((*new)->rb_left); + else if (cur->index > tm->index) + new = &((*new)->rb_right); + else if (cur->seq < tm->seq) + new = &((*new)->rb_left); + else if (cur->seq > tm->seq) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); + return 0; +} + +/* + * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it + * returns zero with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * call tree_mod_log_write_unlock() to release. + */ +static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) { + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 1; + if (eb && btrfs_header_level(eb) == 0) + return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + tree_mod_log_write_unlock(fs_info); + return 1; + } + + return 0; +} + +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 0; + if (eb && btrfs_header_level(eb) == 0) + return 0; + + return 1; +} + +static struct tree_mod_elem * +alloc_tree_mod_elem(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return NULL; + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + if (op != MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); + + return tm; +} + +static noinline int +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(fs_info, eb)) + return 0; + + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(fs_info, eb)) { + kfree(tm); + return 0; + } + + ret = __tree_mod_log_insert(fs_info, tm); + tree_mod_log_write_unlock(fs_info); + if (ret) + kfree(tm); + + return ret; +} + +static noinline int +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int dst_slot, int src_slot, + int nr_items, gfp_t flags) +{ + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; + int i; + int locked = 0; + + if (!tree_mod_need_log(fs_info, eb)) + return 0; + + tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + locked = 1; + + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. + */ + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) + goto free_tms; + } + + ret = __tree_mod_log_insert(fs_info, tm); + if (ret) + goto free_tms; + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + kfree(tm); + + return ret; +} + +static inline int +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) +{ + int i, j; + int ret; + + for (i = nritems - 1; i >= 0; i--) { + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } + } + + return 0; +} + +static noinline int +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *old_root, + struct extent_buffer *new_root, gfp_t flags, + int log_removal) +{ + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + flags); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->index = new_root->start >> PAGE_CACHE_SHIFT; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = MOD_LOG_ROOT_REPLACE; + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = __tree_mod_log_insert(fs_info, tm); + + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; +} + +static struct tree_mod_elem * +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, + int smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + u64 index = start >> PAGE_CACHE_SHIFT; + + tree_mod_log_read_lock(fs_info); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = container_of(node, struct tree_mod_elem, node); + if (cur->index < index) { + node = node->rb_left; + } else if (cur->index > index) { + node = node->rb_right; + } else if (cur->seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* we want the node with the highest seq */ + if (found) + BUG_ON(found->seq > cur->seq); + found = cur; + node = node->rb_left; + } else if (cur->seq > min_seq) { + /* we want the node with the smallest seq */ + if (found) + BUG_ON(found->seq < cur->seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + tree_mod_log_read_unlock(fs_info); + + return found; +} + +/* + * this returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, + u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 1); +} + +/* + * this returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 0); +} + +static noinline int +tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + struct extent_buffer *src, unsigned long dst_offset, + unsigned long src_offset, int nr_items) +{ + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; + int i; + int locked = 0; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return 0; + + tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; + for (i = 0; i < nr_items; i++) { + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = 1; + + for (i = 0; i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return ret; +} + +static inline void +tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + int dst_offset, int src_offset, int nr_items) +{ + int ret; + ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, + nr_items, GFP_NOFS); + BUG_ON(ret < 0); +} + +static noinline void +tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, int atomic) +{ + int ret; + + ret = tree_mod_log_insert_key(fs_info, eb, slot, + MOD_LOG_KEY_REPLACE, + atomic ? GFP_ATOMIC : GFP_NOFS); + BUG_ON(ret < 0); +} + +static noinline int +tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +{ + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (btrfs_header_level(eb) == 0) + return 0; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; +} + +static noinline void +tree_mod_log_set_root_pointer(struct btrfs_root *root, + struct extent_buffer *new_root_node, + int log_removal) +{ + int ret; + ret = tree_mod_log_insert_root(root->fs_info, root->node, + new_root_node, GFP_NOFS, log_removal); + BUG_ON(ret < 0); +} + /* * check if the tree block can be shared by multiple trees */ @@ -263,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. */ - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && buf != root->node && buf != root->commit_root && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) return 1; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) return 1; #endif @@ -280,7 +981,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow) + struct extent_buffer *cow, + int *last_ref) { u64 refs; u64 owner; @@ -307,9 +1009,15 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, root, buf->start, - buf->len, &refs, &flags); - BUG_ON(ret); - BUG_ON(refs == 0); + btrfs_header_level(buf), 1, + &refs, &flags); + if (ret) + return ret; + if (refs == 0) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + return ret; + } } else { refs = 1; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || @@ -327,45 +1035,49 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); - BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, buf, 1, 1); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); - BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1); - BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, buf, 0, 1); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_inc_ref(trans, root, cow, 1, 1); + BUG_ON(ret); /* -ENOMEM */ } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); + BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { + int level = btrfs_header_level(buf); + ret = btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len, - new_flags, 0); - BUG_ON(ret); + new_flags, level, 0); + if (ret) + return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1); - BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_dec_ref(trans, root, buf, 1, 1); + BUG_ON(ret); /* -ENOMEM */ } clean_tree_block(trans, root, buf); + *last_ref = 1; } return 0; } @@ -391,7 +1103,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_disk_key disk_key; struct extent_buffer *cow; - int level; + int level, ret; + int last_ref = 0; int unlock_orig = 0; u64 parent_start; @@ -400,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(buf); - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); @@ -438,11 +1152,20 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow); + ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (ret) + return ret; + } if (buf == root->node) { WARN_ON(parent && parent != buf); @@ -452,13 +1175,12 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else parent_start = 0; - spin_lock(&root->node_lock); - root->node = cow; extent_buffer_get(cow); - spin_unlock(&root->node_lock); + tree_mod_log_set_root_pointer(root, cow, 1); + rcu_assign_pointer(root->node, cow); - btrfs_free_tree_block(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, level); + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -468,30 +1190,345 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; WARN_ON(trans->transid != btrfs_header_generation(parent)); + tree_mod_log_insert_key(root->fs_info, parent, parent_slot, + MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - btrfs_free_tree_block(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, level); + if (last_ref) { + ret = tree_mod_log_free_eb(root->fs_info, buf); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); - free_extent_buffer(buf); + free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; return 0; } +/* + * returns the logical address of the oldest predecessor of the given root. + * entries older than time_seq are ignored. + */ +static struct tree_mod_elem * +__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb_root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = eb_root->start; + int looped = 0; + + if (!time_seq) + return NULL; + + /* + * the very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). this has the index of the *new* + * root, making it the very first operation that's logged for this root. + */ + while (1) { + tm = tree_mod_log_search_oldest(fs_info, root_logical, + time_seq); + if (!looped && !tm) + return NULL; + /* + * if there are no tree operation for the oldest root, we simply + * return it. this should only happen if that (old) root is at + * level 0. + */ + if (!tm) + break; + + /* + * if there's an operation that's not a root replacement, we + * found the oldest version of our root. normally, we'll find a + * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. + */ + if (tm->op != MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + looped = 1; + } + + /* if there's no old root to return, return what we found instead */ + if (!found) + found = tm; + + return found; +} + +/* + * tm is a pointer to the first operation to rewind within eb. then, all + * previous operations will be rewinded (until we reach something older than + * time_seq). + */ +static void +__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + u64 time_seq, struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + + n = btrfs_header_nritems(eb); + tree_mod_log_read_lock(fs_info); + while (tm && tm->seq >= time_seq) { + /* + * all the operations are recorded with the operator used for + * the modification. as we're going backwards, we do the + * opposite of each operation here. + */ + switch (tm->op) { + case MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + /* Fallthrough */ + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + break; + case MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case MOD_LOG_KEY_ADD: + /* if a move operation is needed it's in the log */ + n--; + break; + case MOD_LOG_MOVE_KEYS: + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + memmove_extent_buffer(eb, o_dst, o_src, + tm->move.nr_items * p_size); + break; + case MOD_LOG_ROOT_REPLACE: + /* + * this operation is special. for roots, this must be + * handled explicitly before rewinding. + * for non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. in the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. we simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = container_of(next, struct tree_mod_elem, node); + if (tm->index != first_tm->index) + break; + } + tree_mod_log_read_unlock(fs_info); + btrfs_set_header_nritems(eb, n); +} + +/* + * Called with eb read locked. If the buffer cannot be rewinded, the same buffer + * is returned. If rewind operations happen, a fresh buffer is returned. The + * returned buffer is always read-locked. If the returned buffer is not the + * input buffer, the lock on the input buffer is released and the input buffer + * is freed (its refcount is decremented). + */ +static struct extent_buffer * +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct extent_buffer *eb, u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + btrfs_set_path_blocking(path); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(eb->start, + fs_info->tree_root->nodesize); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } + } + + btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK); + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + + extent_buffer_get(eb_rewin); + btrfs_tree_read_lock(eb_rewin); + __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); + WARN_ON(btrfs_header_nritems(eb_rewin) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); + + return eb_rewin; +} + +/* + * get_old_root() rewinds the state of @root's root node to the given @time_seq + * value. If there are no changes, the current root->root_node is returned. If + * anything changed in between, there's a fresh buffer allocated on which the + * rewind operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). + */ +static inline struct extent_buffer * +get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; + struct extent_buffer *old; + struct tree_mod_root *old_root = NULL; + u64 old_generation = 0; + u64 logical; + u32 blocksize; + + eb_root = btrfs_read_lock_root_node(root); + tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); + if (!tm) + return eb_root; + + if (tm->op == MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + } else { + logical = eb_root->start; + } + + tm = tree_mod_log_search(root->fs_info, logical, time_seq); + if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + blocksize = btrfs_level_size(root, old_root->level); + old = read_tree_block(root, logical, blocksize, 0); + if (WARN_ON(!old || !extent_buffer_uptodate(old))) { + free_extent_buffer(old); + btrfs_warn(root->fs_info, + "failed to read tree block %llu from get_old_root", logical); + } else { + eb = btrfs_clone_extent_buffer(old); + free_extent_buffer(old); + } + } else if (old_root) { + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + eb = alloc_dummy_extent_buffer(logical, root->nodesize); + } else { + btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); + eb = btrfs_clone_extent_buffer(eb_root); + btrfs_tree_read_unlock_blocking(eb_root); + free_extent_buffer(eb_root); + } + + if (!eb) + return NULL; + extent_buffer_get(eb); + btrfs_tree_read_lock(eb); + if (old_root) { + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + } + if (tm) + __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm); + else + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); + + return eb; +} + +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + int level; + struct extent_buffer *eb_root = btrfs_root_node(root); + + tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); + if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { + level = tm->old_root.level; + } else { + level = btrfs_header_level(eb_root); + } + free_extent_buffer(eb_root); + + return level; +} + static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif + /* ensure we can see the force_cow */ + smp_rmb(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during commiting the transaction, + * after we've finished coping src root, we must COW the shared + * block to ensure the metadata consistency. + */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; return 1; } @@ -509,19 +1546,14 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (trans->transaction != root->fs_info->running_transaction) { - printk(KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long) + if (trans->transaction != root->fs_info->running_transaction) + WARN(1, KERN_CRIT "trans %llu running %llu\n", + trans->transid, root->fs_info->running_transaction->transid); - WARN_ON(1); - } - if (trans->transid != root->fs_info->generation) { - printk(KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } + + if (trans->transid != root->fs_info->generation) + WARN(1, KERN_CRIT "trans %llu running %llu\n", + trans->transid, root->fs_info->generation); if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; @@ -536,6 +1568,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0); + + trace_btrfs_cow_block(root, buf, *cow_ret); + return ret; } @@ -591,7 +1626,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) */ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, + int start_slot, u64 *last_ret, struct btrfs_key *progress) { struct extent_buffer *cur; @@ -611,13 +1646,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; parent_level = btrfs_header_level(parent); - if (cache_only && parent_level != 1) - return 0; - if (trans->transaction != root->fs_info->running_transaction) - WARN_ON(1); - if (trans->transid != root->fs_info->generation) - WARN_ON(1); + WARN_ON(trans->transaction != root->fs_info->running_transaction); + WARN_ON(trans->transid != root->fs_info->generation); parent_nritems = btrfs_header_nritems(parent); blocksize = btrfs_level_size(root, parent_level - 1); @@ -631,14 +1662,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, for (i = start_slot; i < end_slot; i++) { int close = 1; - if (!parent->map_token) { - map_extent_buffer(parent, - btrfs_node_key_ptr_offset(i), - sizeof(struct btrfs_key_ptr), - &parent->map_token, &parent->kaddr, - &parent->map_start, &parent->map_len, - KM_USER1); - } btrfs_node_key(parent, &disk_key, i); if (!progress_passed && comp_keys(&disk_key, progress) < 0) continue; @@ -661,27 +1684,26 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, last_block = blocknr; continue; } - if (parent->map_token) { - unmap_extent_buffer(parent, parent->map_token, - KM_USER1); - parent->map_token = NULL; - } cur = btrfs_find_tree_block(root, blocknr, blocksize); if (cur) - uptodate = btrfs_buffer_uptodate(cur, gen); + uptodate = btrfs_buffer_uptodate(cur, gen, 0); else uptodate = 0; if (!cur || !uptodate) { - if (cache_only) { - free_extent_buffer(cur); - continue; - } if (!cur) { cur = read_tree_block(root, blocknr, blocksize, gen); + if (!cur || !extent_buffer_uptodate(cur)) { + free_extent_buffer(cur); + return -EIO; + } } else if (!uptodate) { - btrfs_read_buffer(cur, gen); + err = btrfs_read_buffer(cur, gen); + if (err) { + free_extent_buffer(cur); + return err; + } } } if (search_start == 0) @@ -704,11 +1726,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, btrfs_tree_unlock(cur); free_extent_buffer(cur); } - if (parent->map_token) { - unmap_extent_buffer(parent, parent->map_token, - KM_USER1); - parent->map_token = NULL; - } return err; } @@ -726,122 +1743,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root, return btrfs_item_offset_nr(leaf, nr - 1); } -/* - * extra debugging checks to make sure all the items in a key are - * well formed and in the proper order - */ -static int check_node(struct btrfs_root *root, struct btrfs_path *path, - int level) -{ - struct extent_buffer *parent = NULL; - struct extent_buffer *node = path->nodes[level]; - struct btrfs_disk_key parent_key; - struct btrfs_disk_key node_key; - int parent_slot; - int slot; - struct btrfs_key cpukey; - u32 nritems = btrfs_header_nritems(node); - - if (path->nodes[level + 1]) - parent = path->nodes[level + 1]; - - slot = path->slots[level]; - BUG_ON(nritems == 0); - if (parent) { - parent_slot = path->slots[level + 1]; - btrfs_node_key(parent, &parent_key, parent_slot); - btrfs_node_key(node, &node_key, 0); - BUG_ON(memcmp(&parent_key, &node_key, - sizeof(struct btrfs_disk_key))); - BUG_ON(btrfs_node_blockptr(parent, parent_slot) != - btrfs_header_bytenr(node)); - } - BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); - if (slot != 0) { - btrfs_node_key_to_cpu(node, &cpukey, slot - 1); - btrfs_node_key(node, &node_key, slot); - BUG_ON(comp_keys(&node_key, &cpukey) <= 0); - } - if (slot < nritems - 1) { - btrfs_node_key_to_cpu(node, &cpukey, slot + 1); - btrfs_node_key(node, &node_key, slot); - BUG_ON(comp_keys(&node_key, &cpukey) >= 0); - } - return 0; -} - -/* - * extra checking to make sure all the items in a leaf are - * well formed and in the proper order - */ -static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, - int level) -{ - struct extent_buffer *leaf = path->nodes[level]; - struct extent_buffer *parent = NULL; - int parent_slot; - struct btrfs_key cpukey; - struct btrfs_disk_key parent_key; - struct btrfs_disk_key leaf_key; - int slot = path->slots[0]; - - u32 nritems = btrfs_header_nritems(leaf); - - if (path->nodes[level + 1]) - parent = path->nodes[level + 1]; - - if (nritems == 0) - return 0; - - if (parent) { - parent_slot = path->slots[level + 1]; - btrfs_node_key(parent, &parent_key, parent_slot); - btrfs_item_key(leaf, &leaf_key, 0); - - BUG_ON(memcmp(&parent_key, &leaf_key, - sizeof(struct btrfs_disk_key))); - BUG_ON(btrfs_node_blockptr(parent, parent_slot) != - btrfs_header_bytenr(leaf)); - } - if (slot != 0 && slot < nritems - 1) { - btrfs_item_key(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); - if (comp_keys(&leaf_key, &cpukey) <= 0) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d offset bad key\n", slot); - BUG_ON(1); - } - if (btrfs_item_offset_nr(leaf, slot - 1) != - btrfs_item_end_nr(leaf, slot)) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d offset bad\n", slot); - BUG_ON(1); - } - } - if (slot < nritems - 1) { - btrfs_item_key(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); - BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d offset bad\n", slot); - BUG_ON(1); - } - } - BUG_ON(btrfs_item_offset_nr(leaf, 0) + - btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); - return 0; -} - -static noinline int check_block(struct btrfs_root *root, - struct btrfs_path *path, int level) -{ - return 0; - if (level == 0) - return check_leaf(root, path, level); - return check_node(root, path, level); -} /* * search for key in the extent_buffer. The items start at offset p, @@ -865,7 +1766,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, struct btrfs_disk_key *tmp = NULL; struct btrfs_disk_key unaligned; unsigned long offset; - char *map_token = NULL; char *kaddr = NULL; unsigned long map_start = 0; unsigned long map_len = 0; @@ -875,18 +1775,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb, mid = (low + high) / 2; offset = p + mid * item_size; - if (!map_token || offset < map_start || + if (!kaddr || offset < map_start || (offset + sizeof(struct btrfs_disk_key)) > map_start + map_len) { - if (map_token) { - unmap_extent_buffer(eb, map_token, KM_USER0); - map_token = NULL; - } err = map_private_extent_buffer(eb, offset, sizeof(struct btrfs_disk_key), - &map_token, &kaddr, - &map_start, &map_len, KM_USER0); + &kaddr, &map_start, &map_len); if (!err) { tmp = (struct btrfs_disk_key *)(kaddr + offset - @@ -909,14 +1804,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb, high = mid; else { *slot = mid; - if (map_token) - unmap_extent_buffer(eb, map_token, KM_USER0); return 0; } } *slot = low; - if (map_token) - unmap_extent_buffer(eb, map_token, KM_USER0); return 1; } @@ -927,20 +1818,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb, static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot) { - if (level == 0) { + if (level == 0) return generic_bin_search(eb, offsetof(struct btrfs_leaf, items), sizeof(struct btrfs_item), key, btrfs_header_nritems(eb), slot); - } else { + else return generic_bin_search(eb, offsetof(struct btrfs_node, ptrs), sizeof(struct btrfs_key_ptr), key, btrfs_header_nritems(eb), slot); - } - return -1; } int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, @@ -949,6 +1838,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, return bin_search(eb, key, level, slot); } +static void root_add_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) + size); + spin_unlock(&root->accounting_lock); +} + +static void root_sub_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) - size); + spin_unlock(&root->accounting_lock); +} + /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). * NULL is returned on error. @@ -957,6 +1862,8 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); + struct extent_buffer *eb; + if (slot < 0) return NULL; if (slot >= btrfs_header_nritems(parent)) @@ -964,9 +1871,15 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, BUG_ON(level == 0); - return read_tree_block(root, btrfs_node_blockptr(parent, slot), - btrfs_level_size(root, level - 1), - btrfs_node_ptr_generation(parent, slot)); + eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); + if (eb && !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + eb = NULL; + } + + return eb; } /* @@ -986,7 +1899,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, int wret; int pslot; int orig_slot = path->slots[level]; - int err_on_enospc = 0; u64 orig_ptr; if (level == 0) @@ -994,14 +1906,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = path->nodes[level]; - WARN_ON(!path->locks[level]); + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && + path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } /* * deal with the case where there is only one pointer in the root @@ -1015,15 +1929,23 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - BUG_ON(!child); + if (!child) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } + btrfs_tree_lock(child); btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + goto enospc; + } - spin_lock(&root->node_lock); - root->node = child; - spin_unlock(&root->node_lock); + tree_mod_log_set_root_pointer(root, child, 1); + rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); btrfs_tree_unlock(child); @@ -1034,19 +1956,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); - ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, - 0, root->root_key.objectid, level); + + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ - free_extent_buffer(mid); - return ret; + free_extent_buffer_stale(mid); + return 0; } if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(mid) < 2) - err_on_enospc = 1; - left = read_node_slot(root, parent, pslot - 1); if (left) { btrfs_tree_lock(left); @@ -1076,8 +1996,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = push_node_left(trans, root, left, mid, 1); if (wret < 0) ret = wret; - if (btrfs_header_nritems(mid) < 2) - err_on_enospc = 1; } /* @@ -1088,26 +2006,18 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - u64 bytenr = right->start; - u32 blocksize = right->len; - clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - free_extent_buffer(right); + del_ptr(root, path, level + 1, pslot + 1); + root_sub_used(root, right->len); + btrfs_free_tree_block(trans, root, right, 0, 1); + free_extent_buffer_stale(right); right = NULL; - wret = del_ptr(trans, root, path, level + 1, pslot + - 1); - if (wret) - ret = wret; - wret = btrfs_free_tree_block(trans, root, - bytenr, blocksize, 0, - root->root_key.objectid, - level); - if (wret) - ret = wret; } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1122,7 +2032,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * otherwise we would have pulled some pointers from the * right */ - BUG_ON(!left); + if (!left) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } wret = balance_node_right(trans, root, mid, left); if (wret < 0) { ret = wret; @@ -1136,25 +2050,19 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - /* we've managed to empty the middle node, drop it */ - u64 bytenr = mid->start; - u32 blocksize = mid->len; - clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - free_extent_buffer(mid); + del_ptr(root, path, level + 1, pslot); + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + free_extent_buffer_stale(mid); mid = NULL; - wret = del_ptr(trans, root, path, level + 1, pslot); - if (wret) - ret = wret; - wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, - 0, root->root_key.objectid, level); - if (wret) - ret = wret; } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -1177,7 +2085,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } /* double check we haven't messed things up */ - check_block(root, path, level); if (orig_ptr != btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); @@ -1210,18 +2117,17 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, int wret; int pslot; int orig_slot = path->slots[level]; - u64 orig_ptr; if (level == 0) return 1; mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); - orig_ptr = btrfs_node_blockptr(mid, orig_slot); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } if (!parent) return 1; @@ -1254,6 +2160,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1305,6 +2213,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1341,6 +2251,7 @@ static void reada_for_search(struct btrfs_root *root, u64 search; u64 target; u64 nread = 0; + u64 gen; int direction = path->reada; struct extent_buffer *eb; u32 nr; @@ -1367,6 +2278,7 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; + while (1) { if (direction < 0) { if (nr == 0) @@ -1385,8 +2297,8 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, nr); if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { - readahead_tree_block(root, search, blocksize, - btrfs_node_ptr_generation(node, nr)); + gen = btrfs_node_ptr_generation(node, nr); + readahead_tree_block(root, search, blocksize, gen); nread += blocksize; } nscan++; @@ -1395,12 +2307,8 @@ static void reada_for_search(struct btrfs_root *root, } } -/* - * returns -EAGAIN if it had to drop the path, or zero if everything was in - * cache - */ -static noinline int reada_for_balance(struct btrfs_root *root, - struct btrfs_path *path, int level) +static noinline void reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) { int slot; int nritems; @@ -1409,12 +2317,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, u64 gen; u64 block1 = 0; u64 block2 = 0; - int ret = 0; int blocksize; parent = path->nodes[level + 1]; if (!parent) - return 0; + return; nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; @@ -1424,7 +2331,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); eb = btrfs_find_tree_block(root, block1, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen)) + /* + * if we get -eagain from btrfs_buffer_uptodate, we + * don't want to return eagain here. That will loop + * forever + */ + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block1 = 0; free_extent_buffer(eb); } @@ -1432,32 +2344,15 @@ static noinline int reada_for_balance(struct btrfs_root *root, block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen)) + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); } - if (block1 || block2) { - ret = -EAGAIN; - - /* release the whole path */ - btrfs_release_path(root, path); - /* read the blocks */ - if (block1) - readahead_tree_block(root, block1, blocksize, 0); - if (block2) - readahead_tree_block(root, block2, blocksize, 0); - - if (block1) { - eb = read_tree_block(root, block1, blocksize, 0); - free_extent_buffer(eb); - } - if (block2) { - eb = read_tree_block(root, block2, blocksize, 0); - free_extent_buffer(eb); - } - } - return ret; + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); } @@ -1475,7 +2370,8 @@ static noinline int reada_for_balance(struct btrfs_root *root, * if lowest_unlock is 1, level 0 won't be unlocked */ static noinline void unlock_up(struct btrfs_path *path, int level, - int lowest_unlock) + int lowest_unlock, int min_write_lock_level, + int *write_lock_level) { int i; int skip_level = level; @@ -1505,8 +2401,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level, t = path->nodes[i]; if (i >= lowest_unlock && i > skip_level && path->locks[i]) { - btrfs_tree_unlock(t); + btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; + if (write_lock_level && + i > min_write_lock_level && + i <= *write_lock_level) { + *write_lock_level = i - 1; + } } } } @@ -1532,7 +2433,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) continue; if (!path->locks[i]) continue; - btrfs_tree_unlock(path->nodes[i]); + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0; } } @@ -1549,7 +2450,7 @@ static int read_block_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, struct extent_buffer **eb_ret, int level, int slot, - struct btrfs_key *key) + struct btrfs_key *key, u64 time_seq) { u64 blocknr; u64 gen; @@ -1563,13 +2464,30 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocksize = btrfs_level_size(root, level - 1); tmp = btrfs_find_tree_block(root, blocknr, blocksize); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { - /* - * we found an up to date block without sleeping, return - * right away + if (tmp) { + /* first we do an atomic uptodate check */ + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + *eb_ret = tmp; + return 0; + } + + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. + * We must do this without dropping locks so + * we can trust our generation number */ - *eb_ret = tmp; - return 0; + btrfs_set_path_blocking(p); + + /* now we're allowed to do a blocking uptodate check */ + ret = btrfs_read_buffer(tmp, gen); + if (!ret) { + *eb_ret = tmp; + return 0; + } + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } /* @@ -1582,15 +2500,14 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_unlock_up_safe(p, level + 1); btrfs_set_path_blocking(p); - if (tmp) - free_extent_buffer(tmp); + free_extent_buffer(tmp); if (p->reada) reada_for_search(root, p, level, slot, key->objectid); - btrfs_release_path(NULL, p); + btrfs_release_path(p); ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, gen); + tmp = read_tree_block(root, blocknr, blocksize, 0); if (tmp) { /* * If the read above didn't mark this buffer up to date, @@ -1598,7 +2515,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!btrfs_buffer_uptodate(tmp, 0)) + if (!btrfs_buffer_uptodate(tmp, 0, 0)) ret = -EIO; free_extent_buffer(tmp); } @@ -1617,20 +2534,24 @@ read_block_for_search(struct btrfs_trans_handle *trans, static int setup_nodes_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer *b, int level, int ins_len) + struct extent_buffer *b, int level, int ins_len, + int *write_lock_level) { int ret; if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { int sret; - sret = reada_for_balance(root, p, level); - if (sret) + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); goto again; + } btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(sret > 0); if (sret) { @@ -1642,13 +2563,16 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { int sret; - sret = reada_for_balance(root, p, level); - if (sret) + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); goto again; + } btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); if (sret) { ret = sret; @@ -1656,7 +2580,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, } b = p->nodes[level]; if (!b) { - btrfs_release_path(NULL, p); + btrfs_release_path(p); goto again; } BUG_ON(btrfs_header_nritems(b) == 1); @@ -1669,6 +2593,83 @@ done: return ret; } +static void key_search_validate(struct extent_buffer *b, + struct btrfs_key *key, + int level) +{ +#ifdef CONFIG_BTRFS_ASSERT + struct btrfs_disk_key disk_key; + + btrfs_cpu_key_to_disk(&disk_key, key); + + if (level == 0) + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_leaf, items[0].key), + sizeof(disk_key))); + else + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_node, ptrs[0].key), + sizeof(disk_key))); +#endif +} + +static int key_search(struct extent_buffer *b, struct btrfs_key *key, + int level, int *prev_cmp, int *slot) +{ + if (*prev_cmp != 0) { + *prev_cmp = bin_search(b, key, level, slot); + return *prev_cmp; + } + + key_search_validate(b, key, level); + *slot = 0; + + return 0; +} + +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, + u64 iobjectid, u64 ioff, u8 key_type, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_path *path; + + key.type = key_type; + key.objectid = iobjectid; + key.offset = ioff; + + if (found_path == NULL) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } else + path = found_path; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if ((ret < 0) || (found_key == NULL)) { + if (path != found_path) + btrfs_free_path(path); + return ret; + } + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || + found_key->objectid != key.objectid) + return 1; + + return 0; +} + /* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -1692,27 +2693,88 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root int err; int level; int lowest_unlock = 1; + int root_lock; + /* everything at write_lock_level or lower must be write locked */ + int write_lock_level = 0; u8 lowest_level = 0; + int min_write_lock_level; + int prev_cmp; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); + BUG_ON(!cow && ins_len); - if (ins_len < 0) + if (ins_len < 0) { lowest_unlock = 2; + /* when we are removing items, we might have to go up to level + * two as we update tree pointers Make sure we keep write + * for those levels as well + */ + write_lock_level = 2; + } else if (ins_len > 0) { + /* + * for inserting items, make sure we have a write lock on + * level 1 so we can update keys + */ + write_lock_level = 1; + } + + if (!cow) + write_lock_level = -1; + + if (cow && (p->keep_locks || p->lowest_level)) + write_lock_level = BTRFS_MAX_LEVEL; + + min_write_lock_level = write_lock_level; + again: + prev_cmp = -1; + /* + * we try very hard to do read locks on the root + */ + root_lock = BTRFS_READ_LOCK; + level = 0; if (p->search_commit_root) { + /* + * the commit roots are read only + * so we always do read locks + */ + if (p->need_commit_sem) + down_read(&root->fs_info->commit_root_sem); b = root->commit_root; extent_buffer_get(b); + level = btrfs_header_level(b); + if (p->need_commit_sem) + up_read(&root->fs_info->commit_root_sem); if (!p->skip_locking) - btrfs_tree_lock(b); + btrfs_tree_read_lock(b); } else { - if (p->skip_locking) + if (p->skip_locking) { b = btrfs_root_node(root); - else - b = btrfs_lock_root_node(root); + level = btrfs_header_level(b); + } else { + /* we don't know the level of the root node + * until we actually have it read locked + */ + b = btrfs_read_lock_root_node(root); + level = btrfs_header_level(b); + if (level <= write_lock_level) { + /* whoops, must trade for write lock */ + btrfs_tree_read_unlock(b); + free_extent_buffer(b); + b = btrfs_lock_root_node(root); + root_lock = BTRFS_WRITE_LOCK; + + /* the level might have changed, check again */ + level = btrfs_header_level(b); + } + } } + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; while (b) { level = btrfs_header_level(b); @@ -1721,10 +2783,6 @@ again: * setup the path here so we can release it under lock * contention with the cow code */ - p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = 1; - if (cow) { /* * if we don't really need to cow this block @@ -1736,26 +2794,30 @@ again: btrfs_set_path_blocking(p); + /* + * must have write locks on this node and the + * parent + */ + if (level > write_lock_level || + (level + 1 > write_lock_level && + level + 1 < BTRFS_MAX_LEVEL && + p->nodes[level + 1])) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], &b); if (err) { - free_extent_buffer(b); ret = err; goto done; } } cow_done: - BUG_ON(!cow && ins_len); - if (level != btrfs_header_level(b)) - WARN_ON(1); - level = btrfs_header_level(b); - p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = 1; - - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); /* * we have a lock on b and as long as we aren't changing @@ -1763,21 +2825,21 @@ cow_done: * It is safe to drop the lock on our parent before we * go through the expensive btree search on b. * - * If cow is true, then we might be changing slot zero, - * which may require changing the parent. So, we can't - * drop the lock until after we know which slot we're - * operating on. + * If we're inserting or deleting (ins_len != 0), then we might + * be changing slot zero, which may require changing the parent. + * So, we can't drop the lock until after we know which slot + * we're operating on. */ - if (!cow) - btrfs_unlock_up_safe(p, level + 1); + if (!ins_len && !p->keep_locks) { + int u = level + 1; - ret = check_block(root, p, level); - if (ret) { - ret = -1; - goto done; + if (u < BTRFS_MAX_LEVEL && p->locks[u]) { + btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); + p->locks[u] = 0; + } } - ret = bin_search(b, key, level, &slot); + ret = key_search(b, key, level, &prev_cmp, &slot); if (level != 0) { int dec = 0; @@ -1787,7 +2849,7 @@ cow_done: } p->slots[level] = slot; err = setup_nodes_for_search(trans, root, p, b, level, - ins_len); + ins_len, &write_lock_level); if (err == -EAGAIN) goto again; if (err) { @@ -1797,7 +2859,21 @@ cow_done: b = p->nodes[level]; slot = p->slots[level]; - unlock_up(p, level, lowest_unlock); + /* + * slot 0 is special, if we change the key + * we have to update the parent pointer + * which means we must have a write lock + * on the parent + */ + if (slot == 0 && ins_len && + write_lock_level < level + 1) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); if (level == lowest_level) { if (dec) @@ -1806,7 +2882,7 @@ cow_done: } err = read_block_for_search(trans, root, p, - &b, level, slot, key); + &b, level, slot, key, 0); if (err == -EAGAIN) goto again; if (err) { @@ -1815,23 +2891,42 @@ cow_done: } if (!p->skip_locking) { - btrfs_clear_path_blocking(p, NULL); - err = btrfs_try_spin_lock(b); - - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - btrfs_clear_path_blocking(p, b); + level = btrfs_header_level(b); + if (level <= write_lock_level) { + err = btrfs_try_tree_write_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_WRITE_LOCK); + } + p->locks[level] = BTRFS_WRITE_LOCK; + } else { + err = btrfs_try_tree_read_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + p->locks[level] = BTRFS_READ_LOCK; } + p->nodes[level] = b; } } else { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { + if (write_lock_level < 1) { + write_lock_level = 1; + btrfs_release_path(p); + goto again; + } + btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(err > 0); if (err) { @@ -1840,7 +2935,8 @@ cow_done: } } if (!p->search_for_split) - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); goto done; } } @@ -1853,26 +2949,209 @@ done: if (!p->leave_spinning) btrfs_set_path_blocking(p); if (ret < 0) - btrfs_release_path(root, p); + btrfs_release_path(p); return ret; } /* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. + */ +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + int prev_cmp = -1; + + lowest_level = p->lowest_level; + WARN_ON(p->nodes[0] != NULL); + + if (p->search_commit_root) { + BUG_ON(time_seq); + return btrfs_search_slot(NULL, root, key, p, 0, 0); + } + +again: + b = get_old_root(root, time_seq); + level = btrfs_header_level(b); + p->locks[level] = BTRFS_READ_LOCK; + + while (b) { + level = btrfs_header_level(b); + p->nodes[level] = b; + btrfs_clear_path_blocking(p, NULL, 0); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + */ + btrfs_unlock_up_safe(p, level + 1); + + /* + * Since we can unwind eb's we want to do a real search every + * time. + */ + prev_cmp = -1; + ret = key_search(b, key, level, &prev_cmp, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(NULL, root, p, &b, level, + slot, key, time_seq); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + level = btrfs_header_level(b); + err = btrfs_try_tree_read_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + b = tree_mod_log_rewind(root->fs_info, p, b, time_seq); + if (!b) { + ret = -ENOMEM; + goto done; + } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; + } else { + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } + } + ret = 1; +done: + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(p); + + return ret; +} + +/* + * helper to use instead of search slot if no exact match is needed but + * instead the next or previous item should be returned. + * When find_higher is true, the next higher item is returned, the next lower + * otherwise. + * When return_any and find_higher are both true, and no higher item is found, + * return the next lower instead. + * When return_any is true and find_higher is false, and no lower item is found, + * return the next higher instead. + * It returns 0 if any item is found, 1 if none is found (tree empty), and + * < 0 on error + */ +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any) +{ + int ret; + struct extent_buffer *leaf; + +again: + ret = btrfs_search_slot(NULL, root, key, p, 0, 0); + if (ret <= 0) + return ret; + /* + * a return value of 1 means the path is at the position where the + * item should be inserted. Normally this is the next bigger item, + * but in case the previous item is the last in a leaf, path points + * to the first free slot in the previous leaf, i.e. at an invalid + * item. + */ + leaf = p->nodes[0]; + + if (find_higher) { + if (p->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, p); + if (ret <= 0) + return ret; + if (!return_any) + return 1; + /* + * no higher item found, return the next + * lower instead + */ + return_any = 0; + find_higher = 0; + btrfs_release_path(p); + goto again; + } + } else { + if (p->slots[0] == 0) { + ret = btrfs_prev_leaf(root, p); + if (ret < 0) + return ret; + if (!ret) { + leaf = p->nodes[0]; + if (p->slots[0] == btrfs_header_nritems(leaf)) + p->slots[0]--; + return 0; + } + if (!return_any) + return 1; + /* + * no lower item found, return the next + * higher instead + */ + return_any = 0; + find_higher = 1; + btrfs_release_path(p); + goto again; + } else { + --p->slots[0]; + } + } + return 0; +} + +/* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. * This is used after shifting pointers to the left, so it stops * fixing up pointers when a given leaf/node is not in slot 0 of the * higher levels * - * If this fails to write a tree block, it returns -1, but continues - * fixing up the blocks in ram so the tree is consistent. */ -static int fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_disk_key *key, int level) +static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) { int i; - int ret = 0; struct extent_buffer *t; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1880,12 +3159,12 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, if (!path->nodes[i]) break; t = path->nodes[i]; + tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) break; } - return ret; } /* @@ -1894,9 +3173,8 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key) +void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; struct extent_buffer *eb; @@ -1906,21 +3184,18 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - if (comp_keys(&disk_key, new_key) >= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) >= 0); } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - if (comp_keys(&disk_key, new_key) <= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) <= 0); } btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(eb, &disk_key, slot); btrfs_mark_buffer_dirty(eb); if (slot == 0) - fixup_low_keys(trans, root, path, &disk_key, 1); - return 0; + fixup_low_keys(root, path, &disk_key, 1); } /* @@ -1966,12 +3241,22 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { + /* + * don't call tree_mod_log_eb_move here, key removal was already + * fully logged by tree_mod_log_eb_copy above. + */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), (src_nritems - push_items) * @@ -2025,11 +3310,18 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -2076,6 +3368,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, if (IS_ERR(c)) return PTR_ERR(c); + root_add_used(root, root->nodesize); + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); @@ -2084,13 +3378,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); - write_extent_buffer(c, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(c), + write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(c, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(c), - BTRFS_UUID_SIZE); + btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE); btrfs_set_node_key(c, &lower_key, 0); btrfs_set_node_blockptr(c, 0, lower->start); @@ -2101,10 +3393,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); - spin_lock(&root->node_lock); old = root->node; - root->node = c; - spin_unlock(&root->node_lock); + tree_mod_log_set_root_pointer(root, c, 0); + rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ free_extent_buffer(old); @@ -2112,7 +3403,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; return 0; } @@ -2123,35 +3414,42 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. - * - * returns zero on success and < 0 on any error */ -static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, struct btrfs_disk_key - *key, u64 bytenr, int slot, int level) +static void insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; + int ret; BUG_ON(!path->nodes[level]); + btrfs_assert_tree_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); - if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) - BUG(); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != nritems) { + if (level) + tree_mod_log_eb_move(root->fs_info, lower, slot + 1, + slot, nritems - slot); memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), btrfs_node_key_ptr_offset(slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } + if (level) { + ret = tree_mod_log_insert_key(root->fs_info, lower, slot, + MOD_LOG_KEY_ADD, GFP_NOFS); + BUG_ON(ret < 0); + } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); btrfs_mark_buffer_dirty(lower); - return 0; } /* @@ -2172,13 +3470,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; int mid; int ret; - int wret; u32 c_nritems; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { - /* trying to split the root, lets make a new one */ + /* + * trying to split the root, lets make a new one + * + * tree mod log: We don't log_removal old root in + * insert_new_root, because that root buffer will be kept as a + * normal node. We are going to log removal of half of the + * elements below with tree_mod_log_eb_copy. We're holding a + * tree lock on the buffer, which is why we cannot race with + * other tree_mod_log users. + */ ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; @@ -2202,6 +3508,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, if (IS_ERR(split)) return PTR_ERR(split); + root_add_used(root, root->nodesize); + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); @@ -2209,13 +3517,17 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); write_extent_buffer(split, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(split), - BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(split, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(split), + btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - + ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, + mid, c_nritems - mid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -2227,11 +3539,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - wret = insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, - level + 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -2253,14 +3562,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, */ static int leaf_space_used(struct extent_buffer *l, int start, int nr) { + struct btrfs_item *start_item; + struct btrfs_item *end_item; + struct btrfs_map_token token; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - data_len = btrfs_item_end_nr(l, start); - data_len = data_len - btrfs_item_offset_nr(l, end); + btrfs_init_map_token(&token); + start_item = btrfs_item_nr(start); + end_item = btrfs_item_nr(end); + data_len = btrfs_token_item_offset(l, start_item, &token) + + btrfs_token_item_size(l, start_item, &token); + data_len = data_len - btrfs_token_item_offset(l, end_item, &token); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; @@ -2278,23 +3594,29 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, int ret; ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); if (ret < 0) { - printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " - "used %d nritems %d\n", + btrfs_crit(root->fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), leaf_space_used(leaf, 0, nritems), nritems); } return ret; } +/* + * min slot controls the lowest index we're willing to push to the + * right. We'll push up to and including min_slot, but no lower + */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, - int free_space, u32 left_nritems) + int free_space, u32 left_nritems, + u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; + struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -2306,10 +3628,12 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 data_end; u32 this_item_size; + btrfs_init_map_token(&token); + if (empty) nr = 0; else - nr = 1; + nr = max_t(u32, 1, min_slot); if (path->slots[0] >= left_nritems) push_space += data_size; @@ -2317,7 +3641,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { - item = btrfs_item_nr(left, i); + item = btrfs_item_nr(i); if (!empty && push_items > 0) { if (path->slots[0] > i) @@ -2332,14 +3656,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] == i) push_space += data_size; - if (!left->map_token) { - map_extent_buffer(left, (unsigned long)item, - sizeof(struct btrfs_item), - &left->map_token, &left->kaddr, - &left->map_start, &left->map_len, - KM_USER1); - } - this_item_size = btrfs_item_size(left, item); if (this_item_size + sizeof(*item) + push_space > free_space) break; @@ -2350,16 +3666,11 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, break; i--; } - if (left->map_token) { - unmap_extent_buffer(left, left->map_token, KM_USER1); - left->map_token = NULL; - } if (push_items == 0) goto out_unlock; - if (!empty && push_items == left_nritems) - WARN_ON(1); + WARN_ON(!empty && push_items == left_nritems); /* push left to right */ right_nritems = btrfs_header_nritems(right); @@ -2394,27 +3705,19 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - push_space -= btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + item = btrfs_item_nr(i); + push_space -= btrfs_token_item_size(right, item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } left_nritems -= push_items; btrfs_set_header_nritems(left, left_nritems); if (left_nritems) btrfs_mark_buffer_dirty(left); + else + clean_tree_block(trans, root, left); + btrfs_mark_buffer_dirty(right); btrfs_item_key(right, &disk_key, 0); @@ -2448,10 +3751,14 @@ out_unlock: * * returns 1 if the push failed because the other node didn't have enough * room, 0 if everything worked out and < 0 if there were major errors. + * + * this will push starting from min_slot to the end of the leaf. It won't + * push any slot lower than min_slot */ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, + int min_data_size, int data_size, + int empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -2472,6 +3779,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); + if (right == NULL) + return 1; + btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2493,8 +3803,21 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; - return __push_leaf_right(trans, root, path, data_size, empty, - right, free_space, left_nritems); + if (path->slots[0] == left_nritems && !empty) { + /* Key greater than all keys in the leaf, right neighbor has + * enough room for it and we're not emptying our leaf to delete + * it, therefore use right neighbor to insert the new item and + * no need to touch/dirty our left leaft. */ + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1]++; + return 0; + } + + return __push_leaf_right(trans, root, path, min_data_size, empty, + right, free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -2504,16 +3827,20 @@ out_unlock: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the + * items */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, - int free_space, int right_nritems) + int free_space, u32 right_nritems, + u32 max_slot) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; - int slot; int i; int push_space = 0; int push_items = 0; @@ -2521,26 +3848,19 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 old_left_nritems; u32 nr; int ret = 0; - int wret; u32 this_item_size; u32 old_left_item_size; + struct btrfs_map_token token; - slot = path->slots[1]; + btrfs_init_map_token(&token); if (empty) - nr = right_nritems; + nr = min(right_nritems, max_slot); else - nr = right_nritems - 1; + nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { - item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } + item = btrfs_item_nr(i); if (!empty && push_items > 0) { if (path->slots[0] < i) @@ -2563,17 +3883,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, push_space += this_item_size + sizeof(*item); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - if (push_items == 0) { ret = 1; goto out; } - if (!empty && push_items == btrfs_header_nritems(right)) - WARN_ON(1); + WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ copy_extent_buffer(left, right, @@ -2596,31 +3910,19 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - item = btrfs_item_nr(left, i); - if (!left->map_token) { - map_extent_buffer(left, (unsigned long)item, - sizeof(struct btrfs_item), - &left->map_token, &left->kaddr, - &left->map_start, &left->map_len, - KM_USER1); - } + item = btrfs_item_nr(i); - ioff = btrfs_item_offset(left, item); - btrfs_set_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + ioff = btrfs_token_item_offset(left, item, &token); + btrfs_set_token_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), + &token); } btrfs_set_header_nritems(left, old_left_nritems + push_items); - if (left->map_token) { - unmap_extent_buffer(left, left->map_token, KM_USER1); - left->map_token = NULL; - } /* fixup right node */ - if (push_items > right_nritems) { - printk(KERN_CRIT "push items %d nr %u\n", push_items, + if (push_items > right_nritems) + WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, right_nritems); - WARN_ON(1); - } if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - @@ -2639,38 +3941,25 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); - - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } + item = btrfs_item_nr(i); - push_space = push_space - btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); - } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; + push_space = push_space - btrfs_token_item_size(right, + item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } btrfs_mark_buffer_dirty(left); if (right_nritems) btrfs_mark_buffer_dirty(right); + else + clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(root, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = left; @@ -2691,10 +3980,14 @@ out: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the + * items */ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) { struct extent_buffer *right = path->nodes[0]; struct extent_buffer *left; @@ -2716,6 +4009,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); + if (left == NULL) + return 1; + btrfs_tree_lock(left); btrfs_set_lock_blocking(left); @@ -2730,7 +4026,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root path->nodes[1], slot - 1, &left); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; + if (ret == -ENOSPC) + ret = 1; goto out; } @@ -2740,8 +4037,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - return __push_leaf_left(trans, root, path, data_size, - empty, left, free_space, right_nritems); + return __push_leaf_left(trans, root, path, min_data_size, + empty, left, free_space, right_nritems, + max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -2751,22 +4049,21 @@ out: /* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. - * - * returns 0 if all went well and < 0 on failure. */ -static noinline int copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline void copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { int data_copy_size; int rt_data_off; int i; - int ret = 0; - int wret; struct btrfs_disk_key disk_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -2785,33 +4082,18 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, btrfs_item_end_nr(l, mid); for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); + struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); - } - - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; + ioff = btrfs_token_item_offset(right, item, &token); + btrfs_set_token_item_offset(right, item, + ioff + rt_data_off, &token); } btrfs_set_header_nritems(l, mid); - ret = 0; btrfs_item_key(right, &disk_key, 0); - wret = insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -2829,8 +4111,67 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); +} - return ret; +/* + * double splits happen when we need to insert a big item in the middle + * of a leaf. A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + int space_needed = data_size; + + slot = path->slots[0]; + if (slot < btrfs_header_nritems(path->nodes[0])) + space_needed -= btrfs_leaf_free_space(root, path->nodes[0]); + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; } /* @@ -2855,6 +4196,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int wret; int split; int num_doubles = 0; + int tried_avoid_double = 0; l = path->nodes[0]; slot = path->slots[0]; @@ -2863,12 +4205,19 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { - wret = push_leaf_right(trans, root, path, data_size, 0); + if (data_size && path->nodes[1]) { + int space_needed = data_size; + + if (slot < btrfs_header_nritems(l)) + space_needed -= btrfs_leaf_free_space(root, l); + + wret = push_leaf_right(trans, root, path, space_needed, + space_needed, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, 0); + wret = push_leaf_left(trans, root, path, space_needed, + space_needed, 0, (u32)-1); if (wret < 0) return wret; } @@ -2902,6 +4251,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2; } } @@ -2918,7 +4269,9 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { - split = 2 ; + if (data_size && !tried_avoid_double) + goto push_for_double; + split = 2; } } } @@ -2932,10 +4285,10 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, &disk_key, 0, l->start, 0); - if (IS_ERR(right)) { - BUG_ON(1); + if (IS_ERR(right)) return PTR_ERR(right); - } + + root_add_used(root, root->leafsize); memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); @@ -2944,22 +4297,17 @@ again: btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); write_extent_buffer(right, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(right), - BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(right, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(right), + btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -2967,29 +4315,20 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, + insert_ptr(trans, root, path, &disk_key, right->start, path->slots[1], 1); - if (wret) - ret = wret; btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); - if (wret) - ret = wret; - } + if (path->slots[1] == 0) + fixup_low_keys(root, path, &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; } - ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); - BUG_ON(ret); + copy_for_split(trans, root, path, l, right, slot, mid, nritems); if (split == 2) { BUG_ON(num_doubles != 0); @@ -2997,7 +4336,14 @@ again: goto again; } - return ret; + return 0; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + goto again; } static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, @@ -3026,7 +4372,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); extent_len = btrfs_file_extent_num_bytes(leaf, fi); } - btrfs_release_path(root, path); + btrfs_release_path(path); path->keep_locks = 1; path->search_for_split = 1; @@ -3054,7 +4400,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &key, path, ins_len, 1); - BUG_ON(ret); + if (ret) + goto err; path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); @@ -3085,7 +4432,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(path); - item = btrfs_item_nr(leaf, path->slots[0]); + item = btrfs_item_nr(path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); @@ -3108,7 +4455,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(leaf, &disk_key, slot); - new_item = btrfs_item_nr(leaf, slot); + new_item = btrfs_item_nr(slot); btrfs_set_item_offset(leaf, new_item, orig_offset); btrfs_set_item_size(leaf, new_item, item_size - split_offset); @@ -3191,11 +4538,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - ret = setup_items_for_insert(trans, root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); - BUG_ON(ret); - + setup_items_for_insert(root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -3210,14 +4555,10 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -int btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, + u32 new_size, int from_end) { - int ret = 0; int slot; - int slot_orig; struct extent_buffer *leaf; struct btrfs_item *item; u32 nritems; @@ -3226,14 +4567,16 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); - slot_orig = path->slots[0]; leaf = path->nodes[0]; slot = path->slots[0]; old_size = btrfs_item_size_nr(leaf, slot); if (old_size == new_size) - return 0; + return; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3251,23 +4594,11 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); - - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + size_diff); - } + item = btrfs_item_nr(i); - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + size_diff, &token); } /* shift the data */ @@ -3308,31 +4639,26 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_set_item_size(leaf, item, new_size); btrfs_mark_buffer_dirty(leaf); - ret = 0; if (btrfs_leaf_free_space(root, leaf) < 0) { btrfs_print_leaf(root, leaf); BUG(); } - return ret; } /* - * make the item pointed to by the path bigger, data_size is the new size. + * make the item pointed to by the path bigger, data_size is the added size. */ -int btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u32 data_size) +void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) { - int ret = 0; int slot; - int slot_orig; struct extent_buffer *leaf; struct btrfs_item *item; u32 nritems; @@ -3340,8 +4666,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); - slot_orig = path->slots[0]; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -3357,7 +4685,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, BUG_ON(slot < 0); if (slot >= nritems) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d too large, nritems %d\n", + btrfs_crit(root->fs_info, "slot %d too large, nritems %d", slot, nritems); BUG_ON(1); } @@ -3368,22 +4696,11 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); - - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - data_size); - } + item = btrfs_item_nr(i); - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - data_size, &token); } /* shift the data */ @@ -3393,168 +4710,14 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, data_end = old_data; old_size = btrfs_item_size_nr(leaf, slot); - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_set_item_size(leaf, item, old_size + data_size); btrfs_mark_buffer_dirty(leaf); - ret = 0; if (btrfs_leaf_free_space(root, leaf) < 0) { btrfs_print_leaf(root, leaf); BUG(); } - return ret; -} - -/* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. - * Returns the number of keys that were inserted. - */ -int btrfs_insert_some_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) -{ - struct extent_buffer *leaf; - struct btrfs_item *item; - int ret = 0; - int slot; - int i; - u32 nritems; - u32 total_data = 0; - u32 total_size = 0; - unsigned int data_end; - struct btrfs_disk_key disk_key; - struct btrfs_key found_key; - - for (i = 0; i < nr; i++) { - if (total_size + data_size[i] + sizeof(struct btrfs_item) > - BTRFS_LEAF_DATA_SIZE(root)) { - break; - nr = i; - } - total_data += data_size[i]; - total_size += data_size[i] + sizeof(struct btrfs_item); - } - BUG_ON(nr == 0); - - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - - nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); - - if (btrfs_leaf_free_space(root, leaf) < total_size) { - for (i = nr; i >= 0; i--) { - total_data -= data_size[i]; - total_size -= data_size[i] + sizeof(struct btrfs_item); - if (total_size < btrfs_leaf_free_space(root, leaf)) - break; - } - nr = i; - } - - slot = path->slots[0]; - BUG_ON(slot < 0); - - if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); - - item = btrfs_item_nr(leaf, slot); - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - /* figure out how many keys we can insert in here */ - total_data = data_size[0]; - for (i = 1; i < nr; i++) { - if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) - break; - total_data += data_size[i]; - } - nr = i; - - if (old_data < data_end) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", - slot, old_data, data_end); - BUG_ON(1); - } - /* - * item0..itemN ... dataN.offset..dataN.size .. data0.size - */ - /* first correct the data pointers */ - WARN_ON(leaf->map_token); - for (i = slot; i < nritems; i++) { - u32 ioff; - - item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); - } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - - /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - - /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + - data_end, old_data - data_end); - data_end = old_data; - } else { - /* - * this sucks but it has to be done, if we are inserting at - * the end of the leaf only insert 1 of the items, since we - * have no way of knowing whats on the next leaf and we'd have - * to drop our current locks to figure it out - */ - nr = 1; - } - - /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); - btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); - data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); - } - btrfs_set_header_nritems(leaf, nritems + nr); - btrfs_mark_buffer_dirty(leaf); - - ret = 0; - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); - } - - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } -out: - if (!ret) - ret = nr; - return ret; } /* @@ -3562,20 +4725,20 @@ out: * to save stack depth by doing the bulk of the work in a function * that doesn't call btrfs_search_slot */ -static noinline_for_stack int -setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) +void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) { struct btrfs_item *item; int i; u32 nritems; unsigned int data_end; struct btrfs_disk_key disk_key; - int ret; struct extent_buffer *leaf; int slot; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3585,7 +4748,7 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, if (btrfs_leaf_free_space(root, leaf) < total_size) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "not enough freespace need %u have %d\n", + btrfs_crit(root->fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(root, leaf)); BUG(); } @@ -3595,7 +4758,7 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, if (old_data < data_end) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); BUG_ON(1); } @@ -3603,27 +4766,14 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - WARN_ON(leaf->map_token); for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + item = btrfs_item_nr( i); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), btrfs_item_nr_offset(slot), @@ -3640,19 +4790,18 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, for (i = 0; i < nr; i++) { btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + item = btrfs_item_nr(slot + i); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); - ret = 0; if (slot == 0) { - struct btrfs_disk_key disk_key; btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); @@ -3661,7 +4810,6 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return ret; } /* @@ -3674,7 +4822,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_key *cpu_key, u32 *data_size, int nr) { - struct extent_buffer *leaf; int ret = 0; int slot; int i; @@ -3689,17 +4836,14 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, if (ret == 0) return -EEXIST; if (ret < 0) - goto out; + return ret; - leaf = path->nodes[0]; slot = path->slots[0]; BUG_ON(slot < 0); - ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + setup_items_for_insert(root, path, cpu_key, data_size, total_data, total_size, nr); - -out: - return ret; + return 0; } /* @@ -3716,7 +4860,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root unsigned long ptr; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (!ret) { leaf = path->nodes[0]; @@ -3734,22 +4879,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * the tree should have been previously balanced so the deletion does not * empty a node. */ -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) +static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, + int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; - int ret = 0; - int wret; + int ret; nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { + if (level) + tree_mod_log_eb_move(root->fs_info, parent, slot, + slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(slot), btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); + } else if (level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + BUG_ON(ret < 0); } + nritems--; btrfs_set_header_nritems(parent, nritems); if (nritems == 0 && parent == root->node) { @@ -3760,12 +4912,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); - if (wret) - ret = wret; + fixup_low_keys(root, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); - return ret; } /* @@ -3778,17 +4927,13 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { - int ret; - WARN_ON(btrfs_header_generation(leaf) != trans->transid); - ret = del_ptr(trans, root, path, 1, path->slots[1]); - if (ret) - return ret; + del_ptr(root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we @@ -3796,9 +4941,11 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, - 0, root->root_key.objectid, 0); - return ret; + root_sub_used(root, leaf->len); + + extent_buffer_get(leaf); + btrfs_free_tree_block(trans, root, leaf, 0, 1); + free_extent_buffer_stale(leaf); } /* * delete the item at the leaf level in path. If that empties @@ -3815,6 +4962,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -3835,21 +4985,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = slot + nr; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + dsize); - } - - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; + item = btrfs_item_nr(i); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + dsize, &token); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -3865,8 +5004,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_set_path_blocking(path); + clean_tree_block(trans, root, leaf); + btrfs_del_leaf(trans, root, path, leaf); } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -3874,10 +5014,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, - &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(root, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ @@ -3890,22 +5027,24 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, extent_buffer_get(leaf); btrfs_set_path_blocking(path); - wret = push_leaf_left(trans, root, path, 1, 1); + wret = push_leaf_left(trans, root, path, 1, 1, + 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, 1); + wret = push_leaf_right(trans, root, path, 1, + 1, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_del_leaf(trans, root, path, leaf); free_extent_buffer(leaf); + ret = 0; } else { /* if we're still in the path, make sure * we're dirty. Otherwise, one of the @@ -3939,30 +5078,44 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - if (key.offset > 0) + if (key.offset > 0) { key.offset--; - else if (key.type > 0) + } else if (key.type > 0) { key.type--; - else if (key.objectid > 0) + key.offset = (u64)-1; + } else if (key.objectid > 0) { key.objectid--; - else + key.type = (u8)-1; + key.offset = (u64)-1; + } else { return 1; + } - btrfs_release_path(root, path); + btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); - if (ret < 0) + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) return 0; return 1; } /* * A helper function to walk down the tree starting at min_key, and looking - * for nodes or leaves that are either in cache or have a minimum - * transaction id. This is used by the btree defrag code, and tree logging + * for nodes or leaves that are have a minimum transaction id. + * This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the @@ -3982,8 +5135,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) * was nothing in the tree that matched the search criteria. */ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, + struct btrfs_path *path, u64 min_trans) { struct extent_buffer *cur; @@ -3996,11 +5148,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, WARN_ON(!path->keep_locks); again: - cur = btrfs_lock_root_node(root); + cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); WARN_ON(path->nodes[level]); path->nodes[level] = cur; - path->locks[level] = 1; + path->locks[level] = BTRFS_READ_LOCK; if (btrfs_header_generation(cur) < min_trans) { ret = 1; @@ -4023,43 +5175,18 @@ again: if (sret && slot > 0) slot--; /* - * check this node pointer against the cache_only and - * min_trans parameters. If it isn't in cache or is too - * old, skip to the next one. + * check this node pointer against the min_trans parameters. + * If it is too old, old, skip to the next one. */ while (slot < nritems) { - u64 blockptr; u64 gen; - struct extent_buffer *tmp; - struct btrfs_disk_key disk_key; - blockptr = btrfs_node_blockptr(cur, slot); gen = btrfs_node_ptr_generation(cur, slot); if (gen < min_trans) { slot++; continue; } - if (!cache_only) - break; - - if (max_key) { - btrfs_node_key(cur, &disk_key, slot); - if (comp_keys(&disk_key, max_key) >= 0) { - ret = 1; - goto out; - } - } - - tmp = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { - free_extent_buffer(tmp); - break; - } - if (tmp) - free_extent_buffer(tmp); - slot++; + break; } find_next_key: /* @@ -4070,9 +5197,9 @@ find_next_key: path->slots[level] = slot; btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, - cache_only, min_trans); + min_trans); if (sret == 0) { - btrfs_release_path(root, path); + btrfs_release_path(path); goto again; } else { goto out; @@ -4083,18 +5210,19 @@ find_next_key: path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); + BUG_ON(!cur); /* -ENOMEM */ - btrfs_tree_lock(cur); + btrfs_tree_read_lock(cur); - path->locks[level - 1] = 1; + path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; - unlock_up(path, level, 1); - btrfs_clear_path_blocking(path, NULL); + unlock_up(path, level, 1, 0, NULL); + btrfs_clear_path_blocking(path, NULL, 0); } out: if (ret == 0) @@ -4103,11 +5231,362 @@ out: return ret; } +static void tree_move_down(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level) +{ + BUG_ON(*level == 0); + path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], + path->slots[*level]); + path->slots[*level - 1] = 0; + (*level)--; +} + +static int tree_move_next_or_upnext(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level) +{ + int ret = 0; + int nritems; + nritems = btrfs_header_nritems(path->nodes[*level]); + + path->slots[*level]++; + + while (path->slots[*level] >= nritems) { + if (*level == root_level) + return -1; + + /* move upnext */ + path->slots[*level] = 0; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + (*level)++; + path->slots[*level]++; + + nritems = btrfs_header_nritems(path->nodes[*level]); + ret = 1; + } + return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. + */ +static int tree_advance(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level, + int allow_down, + struct btrfs_key *key) +{ + int ret; + + if (*level == 0 || !allow_down) { + ret = tree_move_next_or_upnext(root, path, level, root_level); + } else { + tree_move_down(root, path, level, root_level); + ret = 0; + } + if (ret >= 0) { + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + } + return ret; +} + +static int tree_compare_item(struct btrfs_root *left_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + char *tmp_buf) +{ + int cmp; + int len1, len2; + unsigned long off1, off2; + + len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + + off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); + off2 = btrfs_item_ptr_offset(right_path->nodes[0], + right_path->slots[0]); + + read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + + cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); + if (cmp) + return 1; + return 0; +} + +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +/* + * This function compares two trees and calls the provided callback for + * every changed/new/deleted item it finds. + * If shared tree blocks are encountered, whole subtrees are skipped, making + * the compare pretty fast on snapshotted subvolumes. + * + * This currently works on commit roots only. As commit roots are read only, + * we don't do any locking. The commit roots are protected with transactions. + * Transactions are ended and rejoined when a commit is tried in between. + * + * This function checks for modifications done to the trees while comparing. + * If it detects a change, it aborts immediately. + */ +int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t changed_cb, void *ctx) +{ + int ret; + int cmp; + struct btrfs_path *left_path = NULL; + struct btrfs_path *right_path = NULL; + struct btrfs_key left_key; + struct btrfs_key right_key; + char *tmp_buf = NULL; + int left_root_level; + int right_root_level; + int left_level; + int right_level; + int left_end_reached; + int right_end_reached; + int advance_left; + int advance_right; + u64 left_blockptr; + u64 right_blockptr; + u64 left_gen; + u64 right_gen; + + left_path = btrfs_alloc_path(); + if (!left_path) { + ret = -ENOMEM; + goto out; + } + right_path = btrfs_alloc_path(); + if (!right_path) { + ret = -ENOMEM; + goto out; + } + + tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } + + left_path->search_commit_root = 1; + left_path->skip_locking = 1; + right_path->search_commit_root = 1; + right_path->skip_locking = 1; + + /* + * Strategy: Go to the first items of both trees. Then do + * + * If both trees are at level 0 + * Compare keys of current items + * If left < right treat left item as new, advance left tree + * and repeat + * If left > right treat right item as deleted, advance right tree + * and repeat + * If left == right do deep compare of items, treat as changed if + * needed, advance both trees and repeat + * If both trees are at the same level but not at level 0 + * Compare keys of current nodes/leafs + * If left < right advance left tree and repeat + * If left > right advance right tree and repeat + * If left == right compare blockptrs of the next nodes/leafs + * If they match advance both trees but stay at the same level + * and repeat + * If they don't match advance both trees while allowing to go + * deeper and repeat + * If tree levels are different + * Advance the tree that needs it and repeat + * + * Advancing a tree means: + * If we are at level 0, try to go to the next slot. If that's not + * possible, go one level up and repeat. Stop when we found a level + * where we could go to the next slot. We may at this point be on a + * node or a leaf. + * + * If we are not at level 0 and not on shared tree blocks, go one + * level deeper. + * + * If we are not at level 0 and on shared tree blocks, go one slot to + * the right if possible or go up and right. + */ + + down_read(&left_root->fs_info->commit_root_sem); + left_level = btrfs_header_level(left_root->commit_root); + left_root_level = left_level; + left_path->nodes[left_level] = left_root->commit_root; + extent_buffer_get(left_path->nodes[left_level]); + + right_level = btrfs_header_level(right_root->commit_root); + right_root_level = right_level; + right_path->nodes[right_level] = right_root->commit_root; + extent_buffer_get(right_path->nodes[right_level]); + up_read(&left_root->fs_info->commit_root_sem); + + if (left_level == 0) + btrfs_item_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + else + btrfs_node_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + if (right_level == 0) + btrfs_item_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + else + btrfs_node_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + + left_end_reached = right_end_reached = 0; + advance_left = advance_right = 0; + + while (1) { + if (advance_left && !left_end_reached) { + ret = tree_advance(left_root, left_path, &left_level, + left_root_level, + advance_left != ADVANCE_ONLY_NEXT, + &left_key); + if (ret < 0) + left_end_reached = ADVANCE; + advance_left = 0; + } + if (advance_right && !right_end_reached) { + ret = tree_advance(right_root, right_path, &right_level, + right_root_level, + advance_right != ADVANCE_ONLY_NEXT, + &right_key); + if (ret < 0) + right_end_reached = ADVANCE; + advance_right = 0; + } + + if (left_end_reached && right_end_reached) { + ret = 0; + goto out; + } else if (left_end_reached) { + if (right_level == 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + } + advance_right = ADVANCE; + continue; + } else if (right_end_reached) { + if (left_level == 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + } + advance_left = ADVANCE; + continue; + } + + if (left_level == 0 && right_level == 0) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + } else if (cmp > 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + advance_right = ADVANCE; + } else { + enum btrfs_compare_tree_result cmp; + + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); + ret = tree_compare_item(left_root, left_path, + right_path, tmp_buf); + if (ret) + cmp = BTRFS_COMPARE_TREE_CHANGED; + else + cmp = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, cmp, ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } else if (left_level == right_level) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + advance_left = ADVANCE; + } else if (cmp > 0) { + advance_right = ADVANCE; + } else { + left_blockptr = btrfs_node_blockptr( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_blockptr = btrfs_node_blockptr( + right_path->nodes[right_level], + right_path->slots[right_level]); + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { + /* + * As we're on a shared block, don't + * allow to go deeper. + */ + advance_left = ADVANCE_ONLY_NEXT; + advance_right = ADVANCE_ONLY_NEXT; + } else { + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } + } else if (left_level < right_level) { + advance_right = ADVANCE; + } else { + advance_left = ADVANCE; + } + } + +out: + btrfs_free_path(left_path); + btrfs_free_path(right_path); + kfree(tmp_buf); + return ret; +} + /* * this is similar to btrfs_next_leaf, but does not try to preserve * and fixup the path. It looks for and returns the next key in the - * tree based on the current path and the cache_only and min_trans - * parameters. + * tree based on the current path and the min_trans parameters. * * 0 is returned if another key is found, < 0 if there are any errors * and 1 is returned if there are no higher keys in the tree @@ -4116,8 +5595,7 @@ out: * calling this function. */ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int level, - int cache_only, u64 min_trans) + struct btrfs_key *key, int level, u64 min_trans) { int slot; struct extent_buffer *c; @@ -4150,7 +5628,7 @@ next: btrfs_node_key_to_cpu(c, &cur_key, slot); orig_lowest = path->lowest_level; - btrfs_release_path(root, path); + btrfs_release_path(path); path->lowest_level = level; ret = btrfs_search_slot(NULL, root, &cur_key, path, 0, 0); @@ -4168,21 +5646,8 @@ next: if (level == 0) btrfs_item_key_to_cpu(c, key, slot); else { - u64 blockptr = btrfs_node_blockptr(c, slot); u64 gen = btrfs_node_ptr_generation(c, slot); - if (cache_only) { - struct extent_buffer *cur; - cur = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - if (!cur || !btrfs_buffer_uptodate(cur, gen)) { - slot++; - if (cur) - free_extent_buffer(cur); - goto next; - } - free_extent_buffer(cur); - } if (gen < min_trans) { slot++; goto next; @@ -4201,6 +5666,12 @@ next: */ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) { + return btrfs_next_old_leaf(root, path, 0); +} + +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq) +{ int slot; int level; struct extent_buffer *c; @@ -4209,32 +5680,26 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) u32 nritems; int ret; int old_spinning = path->leave_spinning; - int force_blocking = 0; + int next_rw_lock = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; - /* - * we take the blocks in an order that upsets lockdep. Using - * blocking mode is the only way around it. - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - force_blocking = 1; -#endif - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); again: level = 1; next = NULL; - btrfs_release_path(root, path); + next_rw_lock = 0; + btrfs_release_path(path); path->keep_locks = 1; + path->leave_spinning = 1; - if (!force_blocking) - path->leave_spinning = 1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (time_seq) + ret = btrfs_search_old_slot(root, &key, path, time_seq); + else + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; if (ret < 0) @@ -4253,6 +5718,24 @@ again: ret = 0; goto done; } + /* + * So the above check misses one case: + * - after releasing the path above, someone has removed the item that + * used to be at the very end of the block, and balance between leafs + * gets another one with bigger key.offset to replace it. + * + * This one should be returned as well, or we can get leaf corruption + * later(esp. in __btrfs_drop_extents()). + * + * And a bit more explanation about this check, + * with ret > 0, the key isn't found, the path points to the slot + * where it should be inserted, so the path->slots[0] item must be the + * bigger one. + */ + if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { + ret = 0; + goto done; + } while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) { @@ -4272,31 +5755,44 @@ again: } if (next) { - btrfs_tree_unlock(next); + btrfs_tree_unlock_rw(next, next_rw_lock); free_extent_buffer(next); } next = c; + next_rw_lock = path->locks[level]; ret = read_block_for_search(NULL, root, path, &next, level, - slot, &key); + slot, &key, 0); if (ret == -EAGAIN) goto again; if (ret < 0) { - btrfs_release_path(root, path); + btrfs_release_path(path); goto done; } if (!path->skip_locking) { - ret = btrfs_try_spin_lock(next); + ret = btrfs_try_tree_read_lock(next); + if (!ret && time_seq) { + /* + * If we don't get the lock, we may be racing + * with push_leaf_left, holding that lock while + * itself waiting for the leaf we've currently + * locked. To solve this situation, we give up + * on our lock and cycle. + */ + free_extent_buffer(next); + btrfs_release_path(path); + cond_resched(); + goto again; + } if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_lock(next); - if (!force_blocking) - btrfs_clear_path_blocking(path, next); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); } - if (force_blocking) - btrfs_set_lock_blocking(next); + next_rw_lock = BTRFS_READ_LOCK; } break; } @@ -4305,43 +5801,40 @@ again: level--; c = path->nodes[level]; if (path->locks[level]) - btrfs_tree_unlock(c); + btrfs_tree_unlock_rw(c, path->locks[level]); free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) - path->locks[level] = 1; - + path->locks[level] = next_rw_lock; if (!level) break; ret = read_block_for_search(NULL, root, path, &next, level, - 0, &key); + 0, &key, 0); if (ret == -EAGAIN) goto again; if (ret < 0) { - btrfs_release_path(root, path); + btrfs_release_path(path); goto done; } if (!path->skip_locking) { - btrfs_assert_tree_locked(path->nodes[level]); - ret = btrfs_try_spin_lock(next); + ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_lock(next); - if (!force_blocking) - btrfs_clear_path_blocking(path, next); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); } - if (force_blocking) - btrfs_set_lock_blocking(next); + next_rw_lock = BTRFS_READ_LOCK; } } ret = 0; done: - unlock_up(path, 0, 1); + unlock_up(path, 0, 1, 0, NULL); path->leave_spinning = old_spinning; if (!old_spinning) btrfs_set_path_blocking(path); @@ -4391,3 +5884,46 @@ int btrfs_previous_item(struct btrfs_root *root, } return 1; } + +/* + * search in extent tree to find a previous Metadata/Data extent item with + * min objecitd. + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY || + found_key.type == BTRFS_METADATA_ITEM_KEY) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < BTRFS_EXTENT_ITEM_KEY) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 746a7248678..be91397f4e9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -19,28 +19,44 @@ #ifndef __BTRFS_CTREE__ #define __BTRFS_CTREE__ -#include <linux/version.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/fs.h> +#include <linux/rwsem.h> +#include <linux/semaphore.h> #include <linux/completion.h> #include <linux/backing-dev.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/kobject.h> +#include <trace/events/btrfs.h> #include <asm/kmap_types.h> +#include <linux/pagemap.h> +#include <linux/btrfs.h> +#include <linux/workqueue.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" struct btrfs_trans_handle; struct btrfs_transaction; +struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; +extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; -#define BTRFS_MAGIC "_BHRfS_M" +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +#define STATIC noinline +#else +#define STATIC static noinline +#endif + +#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ + +#define BTRFS_MAX_MIRRORS 3 #define BTRFS_MAX_LEVEL 8 @@ -80,6 +96,15 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL +/* holds quota configuration and tracking */ +#define BTRFS_QUOTA_TREE_OBJECTID 8ULL + +/* for storing items that use the BTRFS_UUID_KEY* types */ +#define BTRFS_UUID_TREE_OBJECTID 9ULL + +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -98,6 +123,15 @@ struct btrfs_ordered_sum; */ #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL +/* For storing free space cache */ +#define BTRFS_FREE_SPACE_OBJECTID -11ULL + +/* + * The inode number assigned to the special inode for storing + * free ino cache + */ +#define BTRFS_FREE_INO_OBJECTID -12ULL + /* dummy objectid represents multiple objectids */ #define BTRFS_MULTIPLE_OBJECTIDS -255ULL @@ -119,12 +153,27 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 +#define BTRFS_DEV_REPLACE_DEVID 0ULL + +/* + * the max metadata block size. This limit is somewhat artificial, + * but the memmove costs go through the roof for larger blocks. + */ +#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 + /* * we can actually store much bigger names, but lets not confuse the rest * of linux */ #define BTRFS_NAME_LEN 255 +/* + * Theoretical limit is larger, but we keep this down to a sane + * value. That should limit greatly the possibility of collisions on + * inode ref items. + */ +#define BTRFS_LINK_MAX 65535U + /* 32 bytes in various csum fields */ #define BTRFS_CSUM_SIZE 32 @@ -136,6 +185,9 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 +/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +#define REQ_GET_READ_MIRRORS (1 << 30) + #define BTRFS_FT_UNKNOWN 0 #define BTRFS_FT_REG_FILE 1 #define BTRFS_FT_DIR 2 @@ -147,6 +199,11 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_FT_XATTR 8 #define BTRFS_FT_MAX 9 +/* ioprio of readahead is set to idle */ +#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) + +#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. @@ -180,7 +237,6 @@ struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; -#define BTRFS_UUID_SIZE 16 struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid; @@ -264,6 +320,22 @@ struct btrfs_chunk { /* additional stripes go here */ } __attribute__ ((__packed__)); +#define BTRFS_FREE_SPACE_EXTENT 1 +#define BTRFS_FREE_SPACE_BITMAP 2 + +struct btrfs_free_space_entry { + __le64 offset; + __le64 bytes; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_header { + struct btrfs_disk_key location; + __le64 generation; + __le64 num_entries; + __le64 num_bitmaps; +} __attribute__ ((__packed__)); + static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -271,9 +343,21 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -#define BTRFS_FSID_SIZE 16 #define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) #define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* + * File system states + */ +#define BTRFS_FS_STATE_ERROR 0 +#define BTRFS_FS_STATE_REMOUNTING 1 +#define BTRFS_FS_STATE_TRANS_ABORTED 2 +#define BTRFS_FS_STATE_DEV_REPLACING 3 + +/* Super block flags */ +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) #define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) @@ -324,6 +408,47 @@ struct btrfs_header { #define BTRFS_LABEL_SIZE 256 /* + * just in case we somehow lose the roots and are not able to mount, + * we store an array of the roots from previous transactions + * in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unused_64[4]; + + u8 tree_root_level; + u8 chunk_root_level; + u8 extent_root_level; + u8 fs_root_level; + u8 dev_root_level; + u8 csum_root_level; + /* future and to align */ + u8 unused_8[10]; +} __attribute__ ((__packed__)); + +/* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ @@ -364,9 +489,13 @@ struct btrfs_super_block { char label[BTRFS_LABEL_SIZE]; + __le64 cache_generation; + __le64 uuid_tree_generation; + /* future expansion */ - __le64 reserved[32]; + __le64 reserved[30]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); /* @@ -374,13 +503,49 @@ struct btrfs_super_block { * ones specified below then we will fail to mount */ #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0) +#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) +#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) +/* + * some patches floated around with a second compression method + * lets save that incompat here for when they do get in + * Note we don't actually support it, we're just reserving the + * number + */ +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) + +/* + * older kernels tried to do bigger metadata blocks, but the + * code was pretty buggy. Lets not let them try anymore. + */ +#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) + +#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) +#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) +#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) +#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL) +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES) + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL /* * A leaf is full of items. offset and size tell us where to find @@ -445,6 +610,7 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; + unsigned int need_commit_sem:1; }; /* @@ -473,6 +639,12 @@ struct btrfs_extent_item_v0 { /* use full backrefs for extent pointers in the block */ #define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) +/* + * this flag is only used internally by scrub and may be changed at any time + * it is only declared here to avoid collisions + */ +#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) + struct btrfs_tree_block_info { struct btrfs_disk_key key; u8 level; @@ -521,15 +693,25 @@ struct btrfs_inode_ref { /* name goes here */ } __attribute__ ((__packed__)); +struct btrfs_inode_extref { + __le64 parent_objectid; + __le64 index; + __le16 name_len; + __u8 name[0]; + /* name goes here */ +} __attribute__ ((__packed__)); + struct btrfs_timespec { __le64 sec; __le32 nsec; } __attribute__ ((__packed__)); enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LAST = 2, + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, }; struct btrfs_inode_item { @@ -573,6 +755,14 @@ struct btrfs_dir_item { u8 type; } __attribute__ ((__packed__)); +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + +/* + * Internal in-memory flag that a subvolume has been marked for deletion but + * still visible as a directory + */ +#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -586,6 +776,36 @@ struct btrfs_root_item { struct btrfs_disk_key drop_progress; u8 drop_level; u8 level; + + /* + * The following fields appear after subvol_uuids+subvol_times + * were introduced. + */ + + /* + * This generation number is used to test if the new fields are valid + * and up to date while reading the root item. Everytime the root item + * is written out, the "generation" field is copied into this field. If + * anyone ever mounted the fs with an older kernel, we will have + * mismatching generation values here and thus must invalidate the + * new fields. See btrfs_update_root and btrfs_find_last_root for + * details. + * the offset of generation_v2 is also used as the start for the memset + * when invalidating the fields. + */ + __le64 generation_v2; + u8 uuid[BTRFS_UUID_SIZE]; + u8 parent_uuid[BTRFS_UUID_SIZE]; + u8 received_uuid[BTRFS_UUID_SIZE]; + __le64 ctransid; /* updated when an inode changes */ + __le64 otransid; /* trans when created */ + __le64 stransid; /* trans when sent. non-zero for received subvol */ + __le64 rtransid; /* trans when received. non-zero for received subvol */ + struct btrfs_timespec ctime; + struct btrfs_timespec otime; + struct btrfs_timespec stime; + struct btrfs_timespec rtime; + __le64 reserved[8]; /* for future */ } __attribute__ ((__packed__)); /* @@ -597,6 +817,57 @@ struct btrfs_root_ref { __le16 name_len; } __attribute__ ((__packed__)); +struct btrfs_disk_balance_args { + /* + * profiles to operate on, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 profiles; + + /* usage filter */ + __le64 usage; + + /* devid filter */ + __le64 devid; + + /* devid subset filter [pstart..pend) */ + __le64 pstart; + __le64 pend; + + /* btrfs virtual address space subset filter [vstart..vend) */ + __le64 vstart; + __le64 vend; + + /* + * profile to convert to, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 target; + + /* BTRFS_BALANCE_ARGS_* */ + __le64 flags; + + /* BTRFS_BALANCE_ARGS_LIMIT value */ + __le64 limit; + + __le64 unused[7]; +} __attribute__ ((__packed__)); + +/* + * store balance parameters to disk so that balance can be properly + * resumed after crash or unmount + */ +struct btrfs_balance_item { + /* BTRFS_BALANCE_* */ + __le64 flags; + + struct btrfs_disk_balance_args data; + struct btrfs_disk_balance_args meta; + struct btrfs_disk_balance_args sys; + + __le64 unused[4]; +} __attribute__ ((__packed__)); + #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 @@ -655,14 +926,130 @@ struct btrfs_csum_item { u8 csum; } __attribute__ ((__packed__)); +struct btrfs_dev_stats_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; +} __attribute__ ((__packed__)); + +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 +#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 + +struct btrfs_dev_replace { + u64 replace_state; /* see #define above */ + u64 time_started; /* seconds since 1-Jan-1970 */ + u64 time_stopped; /* seconds since 1-Jan-1970 */ + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + u64 cont_reading_from_srcdev_mode; /* see #define above */ + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + pid_t lock_owner; + atomic_t nesting_level; + struct mutex lock_finishing_cancel_unmount; + struct mutex lock_management_lock; + struct mutex lock; + + struct btrfs_scrub_progress scrub_progress; +}; + +struct btrfs_dev_replace_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 src_devid; + __le64 cursor_left; + __le64 cursor_right; + __le64 cont_reading_from_srcdev_mode; + + __le64 replace_state; + __le64 time_started; + __le64 time_stopped; + __le64 num_write_errors; + __le64 num_uncorrectable_read_errors; +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1 << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) -#define BTRFS_BLOCK_GROUP_DUP (1 << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) +#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) +#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ + BTRFS_SPACE_INFO_GLOBAL_RSV) + +enum btrfs_raid_types { + BTRFS_RAID_RAID10, + BTRFS_RAID_RAID1, + BTRFS_RAID_DUP, + BTRFS_RAID_RAID0, + BTRFS_RAID_SINGLE, + BTRFS_RAID_RAID5, + BTRFS_RAID_RAID6, + BTRFS_NR_RAID_TYPES +}; + +#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ + BTRFS_BLOCK_GROUP_SYSTEM | \ + BTRFS_BLOCK_GROUP_METADATA) + +#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6 | \ + BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID10) +/* + * We need a bit for restriper to be able to tell when chunks of type + * SINGLE are available. This "extended" profile format is used in + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields + * (on-disk). The corresponding on-disk bit in chunk.type is reserved + * to avoid remappings between two formats in future. + */ +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) + +/* + * A fake block group type that is used to communicate global block reserve + * size to userspace via the SPACE_INFO ioctl. + */ +#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) + +#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_AVAIL_ALLOC_BIT_SINGLE) + +static inline u64 chunk_to_extended(u64 flags) +{ + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) + flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + return flags; +} +static inline u64 extended_to_chunk(u64 flags) +{ + return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; +} struct btrfs_block_group_item { __le64 used; @@ -670,44 +1057,147 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +/* + * is subvolume quota turned on? + */ +#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) +/* + * RESCAN is set during the initialization phase + */ +#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) +/* + * Some qgroup entries are known to be out of date, + * either because the configuration has changed in a way that + * makes a rescan necessary, or because the fs has been mounted + * with a non-qgroup-aware version. + * Turning qouta off and on again makes it inconsistent, too. + */ +#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) + +#define BTRFS_QGROUP_STATUS_VERSION 1 + +struct btrfs_qgroup_status_item { + __le64 version; + /* + * the generation is updated during every commit. As older + * versions of btrfs are not aware of qgroups, it will be + * possible to detect inconsistencies by checking the + * generation on mount time + */ + __le64 generation; + + /* flag definitions see above */ + __le64 flags; + + /* + * only used during scanning to record the progress + * of the scan. It contains a logical address + */ + __le64 rescan; +} __attribute__ ((__packed__)); + +struct btrfs_qgroup_info_item { + __le64 generation; + __le64 rfer; + __le64 rfer_cmpr; + __le64 excl; + __le64 excl_cmpr; +} __attribute__ ((__packed__)); + +/* flags definition for qgroup limits */ +#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) +#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) +#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) +#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) + +struct btrfs_qgroup_limit_item { + /* + * only updated when any of the other values change + */ + __le64 flags; + __le64 max_rfer; + __le64 max_excl; + __le64 rsv_rfer; + __le64 rsv_excl; +} __attribute__ ((__packed__)); + +/* For raid type sysfs entries */ +struct raid_kobject { + int raid_type; + struct kobject kobj; +}; + struct btrfs_space_info { - u64 flags; + spinlock_t lock; - u64 total_bytes; /* total bytes in the space */ - u64 bytes_used; /* total bytes used on disk */ + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ + u64 bytes_used; /* total bytes used, + this doesn't take mirrors into account */ u64 bytes_pinned; /* total bytes pinned, will be freed when the transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - u64 bytes_super; /* total bytes reserved for the super blocks */ - u64 bytes_root; /* the number of bytes needed to commit a - transaction */ u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ - u64 bytes_delalloc; /* number of bytes currently reserved for - delayed allocation */ + u64 bytes_readonly; /* total bytes that are read only */ - int full; /* indicates that we cannot allocate any more + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ - int force_alloc; /* set if we need to force a chunk alloc for - this space */ - int force_delalloc; /* make people start doing filemap_flush until - we're under a threshold */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - struct list_head list; + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ - /* for controlling how we free up space for allocations */ - wait_queue_head_t allocate_wait; - wait_queue_head_t flush_wait; - int allocating_chunk; - int flushing; + u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ + u64 flags; + + /* + * bytes_pinned is kept in line with what is actually pinned, as in + * we've called update_block_group and dropped the bytes_used counter + * and increased the bytes_pinned counter. However this means that + * bytes_pinned does not reflect the bytes that will be pinned once the + * delayed refs are flushed, so this counter is inc'ed everytime we call + * btrfs_free_extent so it is a realtime count of what will be freed + * once the transaction is committed. It will be zero'ed everytime the + * transaction commits. + */ + struct percpu_counter total_bytes_pinned; + + struct list_head list; + + struct rw_semaphore groups_sem; /* for block groups in our same type */ - struct list_head block_groups; + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; + wait_queue_head_t wait; + + struct kobject kobj; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; +}; + +#define BTRFS_BLOCK_RSV_GLOBAL 1 +#define BTRFS_BLOCK_RSV_DELALLOC 2 +#define BTRFS_BLOCK_RSV_TRANS 3 +#define BTRFS_BLOCK_RSV_CHUNK 4 +#define BTRFS_BLOCK_RSV_DELOPS 5 +#define BTRFS_BLOCK_RSV_EMPTY 6 +#define BTRFS_BLOCK_RSV_TEMP 7 + +struct btrfs_block_rsv { + u64 size; + u64 reserved; + struct btrfs_space_info *space_info; spinlock_t lock; - struct rw_semaphore groups_sem; - atomic_t caching_threads; + unsigned short full; + unsigned short type; + unsigned short failfast; }; /* @@ -726,9 +1216,6 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; - /* if this cluster simply points at a bitmap in the block group */ - bool points_to_bitmap; - struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -741,13 +1228,24 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, + BTRFS_CACHE_ERROR = 4, +}; + +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN = 0, + BTRFS_DC_ERROR = 1, + BTRFS_DC_CLEAR = 2, + BTRFS_DC_SETUP = 3, + BTRFS_DC_NEED_WRITE = 4, }; struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; + struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; atomic_t count; @@ -757,17 +1255,31 @@ struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; struct btrfs_fs_info *fs_info; + struct inode *inode; spinlock_t lock; u64 pinned; u64 reserved; + u64 delalloc_bytes; u64 bytes_super; u64 flags; u64 sectorsize; - int extents_thresh; - int free_extents; - int total_bitmaps; - int ro; - int dirty; + u64 cache_generation; + + /* + * It is just used for the delayed data space allocation because + * only the data space allocation and the relative metadata update + * can be done cross the transaction. + */ + struct rw_semaphore data_rwsem; + + /* for raid56, this is a full stripe, without parity */ + unsigned long full_stripe_len; + + unsigned int ro:1; + unsigned int dirty:1; + unsigned int iref:1; + + int disk_cache_state; /* cache tracking stuff */ int cached; @@ -777,9 +1289,7 @@ struct btrfs_block_group_cache { struct btrfs_space_info *space_info; /* free space cache stuff */ - spinlock_t tree_lock; - struct rb_root free_space_offset; - u64 free_space; + struct btrfs_free_space_ctl *free_space_ctl; /* block group cache stuff */ struct rb_node cache_node; @@ -794,11 +1304,47 @@ struct btrfs_block_group_cache { * Today it will only have one thing on it, but that may change */ struct list_head cluster_list; + + /* For delayed block group creation */ + struct list_head new_bg_list; +}; + +/* delayed seq elem */ +struct seq_list { + struct list_head list; + u64 seq; +}; + +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + wait_queue_head_t wait; + spinlock_t lock; +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; }; +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + +void btrfs_init_async_reclaim_work(struct work_struct *work); + +/* fs_info */ struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_balance_control; +struct btrfs_delayed_root; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; @@ -808,6 +1354,8 @@ struct btrfs_fs_info { struct btrfs_root *dev_root; struct btrfs_root *fs_root; struct btrfs_root *csum_root; + struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -817,43 +1365,100 @@ struct btrfs_fs_info { /* block group cache stuff */ spinlock_t block_group_cache_lock; + u64 first_logical_byte; struct rb_root block_group_cache_tree; + /* keep track of unallocated space */ + spinlock_t free_chunk_lock; + u64 free_chunk_space; + struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; + /* + * block reservation for extent, checksum, root tree and + * delayed dir index item + */ + struct btrfs_block_rsv global_block_rsv; + /* block reservation for delay allocation */ + struct btrfs_block_rsv delalloc_block_rsv; + /* block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + /* block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; + + struct btrfs_block_rsv empty_block_rsv; + u64 generation; u64 last_trans_committed; + u64 avg_delayed_ref_runtime; /* * this is updated to the current trans every time a full commit * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - u64 open_ioctl_trans; unsigned long mount_opt; + unsigned long compress_type:4; + int commit_interval; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ u64 max_inline; + /* + * Protected by ->chunk_mutex and sb->s_umount. + * + * The reason that we use two lock to protect it is because only + * remount and mount operations can change it and these two operations + * are under sb->s_umount, but the read side (chunk allocation) can not + * acquire sb->s_umount or the deadlock would happen. So we use two + * locks to protect it. On the write side, we must acquire two locks, + * and on the read side, we just need acquire one of them. + */ u64 alloc_start; struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; - struct btrfs_super_block super_copy; - struct btrfs_super_block super_for_commit; + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. + */ + spinlock_t super_lock; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; struct backing_dev_info bdi; - struct mutex trans_mutex; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; struct mutex volume_mutex; + + /* this is used during read/modify/write to make sure + * no two ios are trying to mod the same stripe at the same + * time + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + /* * this protects the ordered operations list only while we are * processing all of the entries on it. This way we make @@ -862,52 +1467,67 @@ struct btrfs_fs_info { * before jumping into the main commit. */ struct mutex ordered_operations_mutex; - struct rw_semaphore extent_commit_sem; + + /* + * Same as ordered_operations_mutex except this is for ordered extents + * and not the operations. + */ + struct mutex ordered_extent_flush_mutex; + + struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; struct rw_semaphore subvol_sem; struct srcu_struct subvol_srcu; + spinlock_t trans_lock; + /* + * the reloc mutex goes with the trans lock, it is taken + * during commit to protect us from the relocation code + */ + struct mutex reloc_mutex; + struct list_head trans_list; - struct list_head hashers; struct list_head dead_roots; struct list_head caching_block_groups; spinlock_t delayed_iput_lock; struct list_head delayed_iputs; + /* this protects tree_mod_seq_list */ + spinlock_t tree_mod_seq_lock; + atomic64_t tree_mod_seq; + struct list_head tree_mod_seq_list; + + /* this protects tree_mod_log */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + atomic_t nr_async_submits; atomic_t async_submit_draining; atomic_t nr_async_bios; atomic_t async_delalloc_pages; + atomic_t open_ioctl_trans; /* - * this is used by the balancing code to wait for all the pending - * ordered extents + * this is used to protect the following list -- ordered_roots. */ - spinlock_t ordered_extent_lock; + spinlock_t ordered_root_lock; /* - * all of the data=ordered extents pending writeback + * all fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ - struct list_head ordered_extents; + struct list_head ordered_roots; - /* - * all of the inodes that have delalloc bytes. It is possible for - * this list to be empty even when there is still dirty data=ordered - * extents waiting to finish IO. - */ - struct list_head delalloc_inodes; - - /* - * special rename and truncate targets that must be on disk before - * we're allowed to commit. This is basically the ext3 style - * data=ordered list. - */ - struct list_head ordered_operations; + struct mutex delalloc_root_mutex; + spinlock_t delalloc_root_lock; + /* all fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; /* * there is a pool of worker threads for checksumming during writes @@ -919,26 +1539,37 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ - struct btrfs_workers generic_worker; - struct btrfs_workers workers; - struct btrfs_workers delalloc_workers; - struct btrfs_workers endio_workers; - struct btrfs_workers endio_meta_workers; - struct btrfs_workers endio_meta_write_workers; - struct btrfs_workers endio_write_workers; - struct btrfs_workers submit_workers; - struct btrfs_workers enospc_workers; + struct btrfs_workqueue *workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct btrfs_workqueue *endio_workers; + struct btrfs_workqueue *endio_meta_workers; + struct btrfs_workqueue *endio_raid56_workers; + struct btrfs_workqueue *rmw_workers; + struct btrfs_workqueue *endio_meta_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *submit_workers; + struct btrfs_workqueue *caching_workers; + struct btrfs_workqueue *readahead_workers; + /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens * for the sys_munmap function call path */ - struct btrfs_workers fixup_workers; + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; + + /* the extent workers do delayed refs on the extent allocation tree */ + struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; struct kobject super_kobj; + struct kobject *space_info_kobj; + struct kobject *device_dir_kobj; struct completion kobj_unregister; int do_barriers; int closing; @@ -946,10 +1577,12 @@ struct btrfs_fs_info { u64 total_pinned; - /* protected by the delalloc lock, used to keep from writing - * metadata until there is a nice batch - */ - u64 dirty_metadata_bytes; + /* used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -961,11 +1594,9 @@ struct btrfs_fs_info { */ struct list_head space_info; - struct reloc_control *reloc_ctl; + struct btrfs_space_info *data_sinfo; - spinlock_t delalloc_lock; - spinlock_t new_trans_lock; - u64 delalloc_bytes; + struct reloc_control *reloc_ctl; /* data_alloc_cluster is only used in ssd mode */ struct btrfs_free_cluster data_alloc_cluster; @@ -973,22 +1604,152 @@ struct btrfs_fs_info { /* all metadata allocations go through this cluster */ struct btrfs_free_cluster meta_alloc_cluster; - spinlock_t ref_cache_lock; - u64 total_ref_cache_size; + /* auto defrag inodes go here */ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; + /* + * these three are in extended format (availability of single + * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other + * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) + */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; - u64 data_alloc_profile; - u64 metadata_alloc_profile; - u64 system_alloc_profile; + + /* restriper state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_running; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; unsigned data_chunk_allocations; unsigned metadata_ratio; void *bdev_holder; + + /* private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + int scrub_workers_refcnt; + struct btrfs_workqueue *scrub_workers; + struct btrfs_workqueue *scrub_wr_completion_workers; + struct btrfs_workqueue *scrub_nocow_workers; + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* + * quota information + */ + unsigned int quota_enabled:1; + + /* + * quota_enabled only changes state after a commit. This holds the + * next state. + */ + unsigned int pending_quota_state:1; + + /* is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* holds configuration and tracking. Protected by qgroup_lock */ + struct rb_root qgroup_tree; + struct rb_root qgroup_op_tree; + spinlock_t qgroup_lock; + spinlock_t qgroup_op_lock; + atomic_t qgroup_op_seq; + + /* + * used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + + /* protect user change for quota operations */ + struct mutex qgroup_ioctl_lock; + + /* list of dirty qgroups to be written at next commit */ + struct list_head dirty_qgroups; + + /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + u64 qgroup_seq; + + /* qgroup rescan items */ + struct mutex qgroup_rescan_lock; /* protects the progress item */ + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workqueue *qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; + + /* filesystem state */ + unsigned long fs_state; + + struct btrfs_delayed_root *delayed_root; + + /* readahead tree */ + spinlock_t reada_lock; + struct radix_tree_root reada_tree; + + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + struct radix_tree_root buffer_radix; + + /* next backup root to be overwritten */ + int backup_root_index; + + int num_tolerated_disk_barrier_failures; + + /* device replace state */ + struct btrfs_dev_replace dev_replace; + + atomic_t mutually_exclusive_operation_running; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; + + struct semaphore uuid_tree_rescan_sem; + unsigned int update_uuid_tree_gen:1; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; }; +struct btrfs_subvolume_writers { + struct percpu_counter counter; + wait_queue_head_t wait; +}; + +/* + * The state of btrfs root + */ +/* + * btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ +#define BTRFS_ROOT_IN_TRANS_SETUP 0 +#define BTRFS_ROOT_REF_COWS 1 +#define BTRFS_ROOT_TRACK_DIRTY 2 +#define BTRFS_ROOT_IN_RADIX 3 +#define BTRFS_ROOT_DUMMY_ROOT 4 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 +#define BTRFS_ROOT_DEFRAG_RUNNING 6 +#define BTRFS_ROOT_FORCE_COW 7 +#define BTRFS_ROOT_MULTI_LOG_TASKS 8 + /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. @@ -996,13 +1757,11 @@ struct btrfs_fs_info { struct btrfs_root { struct extent_buffer *node; - /* the node lock is held while changing the node pointer */ - spinlock_t node_lock; - struct extent_buffer *commit_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; + unsigned long state; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; @@ -1012,16 +1771,31 @@ struct btrfs_root { struct completion kobj_unregister; struct mutex objectid_mutex; + spinlock_t accounting_lock; + struct btrfs_block_rsv *block_rsv; + + /* free ino cache stuff */ + struct btrfs_free_space_ctl *free_ino_ctl; + enum btrfs_caching_type cached; + spinlock_t cache_lock; + wait_queue_head_t cache_wait; + struct btrfs_free_space_ctl *free_ino_pinned; + u64 cache_progress; + struct inode *cache_inode; + struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; - unsigned long log_transid; - unsigned long last_log_commit; - unsigned long log_batch; + atomic_t log_batch; + int log_transid; + /* No matter the commit succeeds or not*/ + int log_transid_committed; + /* Just be updated when the commit succeeds. */ + int last_log_commit; pid_t log_start_pid; - bool log_multiple_pids; u64 objectid; u64 last_trans; @@ -1040,37 +1814,115 @@ struct btrfs_root { u32 type; u64 highest_objectid; - int ref_cows; - int track_dirty; - int in_radix; - int clean_orphans; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + u64 alloc_bytenr; +#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; - int defrag_running; char *name; - int in_sysfs; /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; struct list_head root_list; - spinlock_t list_lock; - struct list_head orphan_list; + spinlock_t log_extents_lock[2]; + struct list_head logged_list[2]; + + spinlock_t orphan_lock; + atomic_t orphan_inodes; + struct btrfs_block_rsv *orphan_block_rsv; + int orphan_cleanup_state; spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ struct rb_root inode_tree; /* + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock + */ + struct radix_tree_root delayed_nodes_tree; + /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later */ - struct super_block anon_super; + dev_t anon_dev; + + spinlock_t root_item_lock; + atomic_t refs; + + struct mutex delalloc_mutex; + spinlock_t delalloc_lock; + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ + struct list_head delalloc_inodes; + struct list_head delalloc_root; + u64 nr_delalloc_inodes; + + struct mutex ordered_extent_mutex; + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_extents; + struct list_head ordered_root; + u64 nr_ordered_extents; + + /* + * Number of currently running SEND ioctls to prevent + * manipulation with the read-only status via SUBVOL_SETFLAGS + */ + int send_in_progress; + struct btrfs_subvolume_writers *subv_writers; + atomic_t will_be_snapshoted; +}; + +struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; + + /* number of bytes to defrag, use (u64)-1 to say all */ + __u64 len; + + /* + * flags for the operation, which can include turning + * on compression for this one defrag + */ + __u64 flags; + + /* + * any extent bigger than this will be considered + * already defragged. Use 0 to take the kernel default + * Use 1 to say every single extent must be rewritten + */ + __u32 extent_thresh; + + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + + /* spare for later */ + __u32 unused[4]; }; + /* * inode items have the data typically returned from stat and store other * info about object characteristics. There is one for every file and dir in @@ -1078,6 +1930,7 @@ struct btrfs_root { */ #define BTRFS_INODE_ITEM_KEY 1 #define BTRFS_INODE_REF_KEY 12 +#define BTRFS_INODE_EXTREF_KEY 13 #define BTRFS_XATTR_ITEM_KEY 24 #define BTRFS_ORPHAN_ITEM_KEY 48 /* reserve 2-15 close to the inode for later flexibility */ @@ -1126,6 +1979,12 @@ struct btrfs_root { */ #define BTRFS_EXTENT_ITEM_KEY 168 +/* + * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know + * the length, so we save the level in key->offset instead of the length. + */ +#define BTRFS_METADATA_ITEM_KEY 169 + #define BTRFS_TREE_BLOCK_REF_KEY 176 #define BTRFS_EXTENT_DATA_REF_KEY 178 @@ -1147,11 +2006,67 @@ struct btrfs_root { #define BTRFS_CHUNK_ITEM_KEY 228 /* + * Records the overall state of the qgroups. + * There's only one instance of this key present, + * (0, BTRFS_QGROUP_STATUS_KEY, 0) + */ +#define BTRFS_QGROUP_STATUS_KEY 240 +/* + * Records the currently used space of the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). + */ +#define BTRFS_QGROUP_INFO_KEY 242 +/* + * Contains the user configured limits for the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). + */ +#define BTRFS_QGROUP_LIMIT_KEY 244 +/* + * Records the child-parent relationship of qgroups. For + * each relation, 2 keys are present: + * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) + * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) + */ +#define BTRFS_QGROUP_RELATION_KEY 246 + +#define BTRFS_BALANCE_ITEM_KEY 248 + +/* + * Persistantly stores the io stats in the device tree. + * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). + */ +#define BTRFS_DEV_STATS_KEY 249 + +/* + * Persistantly stores the device replace state in the device tree. + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). + */ +#define BTRFS_DEV_REPLACE_KEY 250 + +/* + * Stores items that allow to quickly map UUIDs to something else. + * These items are part of the filesystem UUID tree. + * The key is built like this: + * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). + */ +#if BTRFS_UUID_SIZE != 16 +#error "UUID items require BTRFS_UUID_SIZE == 16!" +#endif +#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ +#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to + * received subvols */ + +/* * string items are for debugging. They just store a short string of * data in the FS */ #define BTRFS_STRING_ITEM_KEY 253 +/* + * Flags for mount options. + * + * Note: don't forget to add new options to btrfs_show_options() + */ #define BTRFS_MOUNT_NODATASUM (1 << 0) #define BTRFS_MOUNT_NODATACOW (1 << 1) #define BTRFS_MOUNT_NOBARRIER (1 << 2) @@ -1164,11 +2079,41 @@ struct btrfs_root { #define BTRFS_MOUNT_NOSSD (1 << 9) #define BTRFS_MOUNT_DISCARD (1 << 10) #define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) +#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) +#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) +#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) +#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) +#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +#define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) +#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) +#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) +#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) +#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Inode flags */ @@ -1183,6 +2128,20 @@ struct btrfs_root { #define BTRFS_INODE_NODUMP (1 << 8) #define BTRFS_INODE_NOATIME (1 << 9) #define BTRFS_INODE_DIRSYNC (1 << 10) +#define BTRFS_INODE_COMPRESS (1 << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) + +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +static inline void btrfs_init_map_token (struct btrfs_map_token *token) +{ + token->kaddr = NULL; +} /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple @@ -1204,26 +2163,67 @@ struct btrfs_root { offsetof(type, member), \ sizeof(((type *)0)->member))) -#ifndef BTRFS_SETGET_FUNCS +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, \ + struct btrfs_map_token *token); \ +void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val, \ + struct btrfs_map_token *token); \ +static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off) \ +{ \ + return btrfs_get_token_##bits(eb, ptr, off, NULL); \ +} \ +static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ +} + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ -void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); -#endif +static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \ + struct btrfs_map_token *token) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \ +} \ +static inline void btrfs_set_token_##name(struct extent_buffer *eb, \ + type *s, u##bits val, \ + struct btrfs_map_token *token) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \ +} #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ { \ - type *p = kmap_atomic(eb->first_page, KM_USER0); \ + type *p = page_address(eb->pages[0]); \ u##bits res = le##bits##_to_cpu(p->member); \ - kunmap_atomic(p, KM_USER0); \ return res; \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = kmap_atomic(eb->first_page, KM_USER0); \ + type *p = page_address(eb->pages[0]); \ p->member = cpu_to_le##bits(val); \ - kunmap_atomic(p, KM_USER0); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ @@ -1271,14 +2271,14 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, generation, 64); -static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) { - return (char *)d + offsetof(struct btrfs_dev_item, uuid); + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); } -static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) { - return (char *)d + offsetof(struct btrfs_dev_item, fsid); + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); } BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); @@ -1336,26 +2336,12 @@ static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); } -static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, - struct btrfs_chunk *c, int nr, - u64 val) -{ - btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); -} - static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); } -static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, - struct btrfs_chunk *c, int nr, - u64 val) -{ - btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); -} - /* struct btrfs_block_group_item */ BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); @@ -1375,6 +2361,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags, BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + /* struct btrfs_inode_item */ BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); @@ -1388,6 +2381,23 @@ BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, + nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); static inline struct btrfs_timespec * btrfs_inode_atime(struct btrfs_inode_item *inode_item) @@ -1413,16 +2423,10 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item) return (struct btrfs_timespec *)ptr; } -static inline struct btrfs_timespec * -btrfs_inode_otime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, otime); - return (struct btrfs_timespec *)ptr; -} - BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, @@ -1433,10 +2437,10 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); -static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) { unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); - return (u8 *)((unsigned long)dev + ptr); + return (unsigned long)dev + ptr; } BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); @@ -1504,6 +2508,10 @@ BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, + blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) { @@ -1560,6 +2568,8 @@ static inline void btrfs_set_node_key(struct extent_buffer *eb, /* struct btrfs_item */ BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); static inline unsigned long btrfs_item_nr_offset(int nr) { @@ -1567,8 +2577,7 @@ static inline unsigned long btrfs_item_nr_offset(int nr) sizeof(struct btrfs_item) * nr; } -static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, - int nr) +static inline struct btrfs_item *btrfs_item_nr(int nr) { return (struct btrfs_item *)btrfs_item_nr_offset(nr); } @@ -1581,30 +2590,30 @@ static inline u32 btrfs_item_end(struct extent_buffer *eb, static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) { - return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); + return btrfs_item_end(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) { - return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); + return btrfs_item_offset(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) { - return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); + return btrfs_item_size(eb, btrfs_item_nr(nr)); } static inline void btrfs_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(eb, nr); + struct btrfs_item *item = btrfs_item_nr(nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } static inline void btrfs_set_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(eb, nr); + struct btrfs_item *item = btrfs_item_nr(nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } @@ -1622,6 +2631,13 @@ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, + data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, + name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, + transid, 64); static inline void btrfs_dir_item_key(struct extent_buffer *eb, struct btrfs_dir_item *item, @@ -1637,6 +2653,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, write_eb_member(eb, item, struct btrfs_dir_item, location, key); } +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + /* struct btrfs_disk_key */ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); @@ -1703,6 +2740,12 @@ BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, + nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) { @@ -1738,43 +2781,14 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, btrfs_set_header_flags(eb, flags); } -static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_fsid(void) { - unsigned long ptr = offsetof(struct btrfs_header, fsid); - return (u8 *)ptr; + return offsetof(struct btrfs_header, fsid); } -static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) { - unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); - return (u8 *)ptr; -} - -static inline u8 *btrfs_super_fsid(struct extent_buffer *eb) -{ - unsigned long ptr = offsetof(struct btrfs_super_block, fsid); - return (u8 *)ptr; -} - -static inline u8 *btrfs_header_csum(struct extent_buffer *eb) -{ - unsigned long ptr = offsetof(struct btrfs_header, csum); - return (u8 *)ptr; -} - -static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb) -{ - return NULL; -} - -static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb) -{ - return NULL; -} - -static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb) -{ - return NULL; + return offsetof(struct btrfs_header, chunk_tree_uuid); } static inline int btrfs_is_leaf(struct extent_buffer *eb) @@ -1800,9 +2814,158 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, + ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, + otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, + stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, + rtransid, 64); + +static inline bool btrfs_root_readonly(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} -/* struct btrfs_super_block */ +static inline bool btrfs_root_dead(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void +btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); +} + +static inline void +btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); +} +/* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, @@ -1848,11 +3011,18 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, incompat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); static inline int btrfs_super_csum_size(struct btrfs_super_block *s) { - int t = btrfs_super_csum_type(s); - BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + u16 t = btrfs_super_csum_type(s); + /* + * csum type is validated at mount time + */ return btrfs_csum_sizes[t]; } @@ -1863,6 +3033,18 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) /* struct btrfs_file_extent_item */ BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) @@ -1896,15 +3078,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); -/* this returns the number of file bytes represented by the inline item. - * If an item is compressed, this is the uncompressed size - */ -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - struct btrfs_file_extent_item *e) -{ - return btrfs_file_extent_ram_bytes(eb, e); -} - /* * this returns the number of bytes used by the item on disk, minus the * size of any extent headers. If a file is compressed on disk, this is @@ -1918,25 +3091,145 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } -static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + int slot, + struct btrfs_file_extent_item *fi) { - return sb->s_fs_info; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + /* + * return the space used on disk if this item isn't + * compressed or encoded + */ + if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 && + btrfs_token_file_extent_encryption(eb, fi, &token) == 0 && + btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) { + return btrfs_file_extent_inline_item_len(eb, + btrfs_item_nr(slot)); + } + + /* otherwise use the ram bytes field */ + return btrfs_token_file_extent_ram_bytes(eb, fi, &token); } -static inline int btrfs_set_root_name(struct btrfs_root *root, - const char *name, int len) + +/* btrfs_dev_stats_item */ +static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index) { - /* if we already have a name just free it */ - kfree(root->name); + u64 val; - root->name = kmalloc(len+1, GFP_KERNEL); - if (!root->name) - return -ENOMEM; + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} - memcpy(root->name, name, len); - root->name[len] = '\0'; +static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); +} - return 0; +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); + +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; } static inline u32 btrfs_level_size(struct btrfs_root *root, int level) @@ -1955,43 +3248,72 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level) ((unsigned long)(btrfs_leaf_data(leaf) + \ btrfs_item_offset_nr(leaf, slot))) -static inline struct dentry *fdentry(struct file *file) +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) { - return file->f_path.dentry; + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_mask(mapping) & ~__GFP_FS; } /* extent-tree.c */ +static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, + unsigned num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + 2 * num_items; +} + +/* + * Doing a truncate won't result in new nodes or leaves, just what we need for + * COW. + */ +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, + unsigned num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + num_items; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); -int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, + u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct btrfs_root *root, + struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); -int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); -u64 btrfs_find_block_group(struct btrfs_root *root, - u64 search_start, u64 search_hint, int owner); +int get_block_group_index(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); -int btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 parent, u64 root_objectid, int level); -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - int level); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, @@ -2000,34 +3322,34 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data); +int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data); + int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); - -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int no_quota); + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, + int delalloc); +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len); +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + u64 root_objectid, u64 owner, u64 offset, int no_quota); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -2041,27 +3363,82 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group); - -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); -int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes); -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + /* + * Flushing delalloc may cause deadlock somewhere, in this + * case, use FLUSH LIMIT + */ + BTRFS_RESERVE_FLUSH_LIMIT, + BTRFS_RESERVE_FLUSH_ALL, +}; + +int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode); +void btrfs_orphan_release_metadata(struct inode *inode); +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, + u64 *qgroup_reserved, bool use_global_rsv); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved); +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type); +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes); +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor); +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +void btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_error_unpin_extent_range(struct btrfs_root *root, + u64 start, u64 end); +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int __get_raid_index(u64 flags); +int btrfs_start_nocow_write(struct btrfs_root *root); +void btrfs_end_nocow_write(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2069,18 +3446,34 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); -int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key); +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid); +void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, - int cache_only, u64 min_trans); + u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, + struct btrfs_path *path, u64 min_trans); +enum btrfs_compare_tree_result { + BTRFS_COMPARE_TREE_NEW, + BTRFS_COMPARE_TREE_DELETED, + BTRFS_COMPARE_TREE_CHANGED, + BTRFS_COMPARE_TREE_SAME, +}; +typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, + struct btrfs_root *right_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx); +int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t cb, void *ctx); int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, @@ -2091,12 +3484,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, u32 data_size); -int btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end); +void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, + u32 data_size); +void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, + u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2106,17 +3497,26 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq); +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, + int start_slot, u64 *last_ret, struct btrfs_key *progress); -void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); +void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held, int held_rw); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2128,13 +3528,11 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } +void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, void *data, u32 data_size); -int btrfs_insert_some_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2151,12 +3549,71 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq); +static inline int btrfs_next_old_item(struct btrfs_root *root, + struct btrfs_path *p, u64 time_seq) +{ + ++p->slots[0]; + if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) + return btrfs_next_old_leaf(root, p, time_seq); + return 0; +} +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + return btrfs_next_old_item(root, p, 0); +} int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* + * Get synced with close_ctree() + */ + smp_mb(); + return fs_info->closing; +} + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root) +{ + return (root->fs_info->sb->s_flags & MS_RDONLY || + btrfs_fs_closing(root->fs_info)); +} + +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->balance_ctl); + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->quota_root); + kfree(fs_info->uuid_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kfree(fs_info); +} + +/* tree mod log functions from ctree.c */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); + /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, struct btrfs_path *path, @@ -2174,21 +3631,37 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); -int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item); -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct - btrfs_root_item *item, struct btrfs_key *key); -int btrfs_search_root(struct btrfs_root *root, u64 search_start, - u64 *found_objectid); -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); int btrfs_find_orphan_roots(struct btrfs_root *tree_root); -int btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +/* uuid-tree.c */ +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, + int (*check_func)(struct btrfs_fs_info *, u8 *, u8, + u64)); + /* dir-item.c */ +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, - int name_len, u64 dir, + int name_len, struct inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2205,9 +3678,6 @@ struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2222,6 +3692,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -2230,12 +3703,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); -/* inode-map.c */ -int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, - struct btrfs_root *fs_root, - u64 dirid, u64 *objectid); -int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid); - /* inode-item.c */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2252,11 +3719,28 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow); + +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, + u64 ref_objectid, const char *name, + int name_len, + struct btrfs_inode_extref **extref_ret); + /* file-item.c */ +struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct btrfs_dio_private *dip, struct bio *bio, + u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -2272,18 +3756,34 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums); int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); -int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, - u64 start, unsigned long len); -struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, int cow); -int btrfs_csum_truncate(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u64 isize); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, - u64 end, struct list_head *list); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em); + /* inode.c */ +struct btrfs_delalloc_work { + struct inode *inode; + int wait; + int delay_iput; + struct completion completion; + struct list_head list; + struct btrfs_work work; +}; + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput); +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); + +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create); +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes); /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ #if defined(ClearPageFsMisc) && !defined(ClearPageChecked) @@ -2292,6 +3792,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, #define PageChecked PageFsMisc #endif +/* This forces readahead on a given range of bytes in an inode */ +static inline void btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, unsigned long req_size) +{ + page_cache_sync_readahead(mapping, ra, file, offset, req_size); +} + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -2305,84 +3813,114 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, u64 objectid, const char *name, int name_len); +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, + int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 new_size, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); -int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - u64 new_dirid, u64 alloc_hint); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, unsigned long bio_flags); - -unsigned long btrfs_force_ra(struct address_space *mapping, - struct file_ra_state *ra, struct file *file, - pgoff_t offset, pgoff_t last_index); + struct btrfs_root *parent_root, + u64 new_dirid); +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); -void btrfs_delete_inode(struct inode *inode); -void btrfs_put_inode(struct inode *inode); +void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); -void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); -void btrfs_drop_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); -int btrfs_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to); struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, - size_t page_offset, u64 start, u64 end, + size_t pg_offset, u64 start, u64 end, int create); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); -int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); -void btrfs_orphan_cleanup(struct btrfs_root *root); -int btrfs_cont_expand(struct inode *inode, loff_t size); -int btrfs_invalidate_inodes(struct btrfs_root *root); +int btrfs_orphan_cleanup(struct btrfs_root *root); +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); +void btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); +int btrfs_is_empty_uuid(u8 *uuid); +int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_pages); +void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space); +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs); + /* file.c */ -int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned); -int btrfs_check_file(struct btrfs_root *root, struct inode *inode); +int btrfs_auto_defrag_init(void); +void btrfs_auto_defrag_exit(void); +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct inode *inode); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache); +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only); + struct btrfs_root *root); /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); -int btrfs_sysfs_add_super(struct btrfs_fs_info *fs); -int btrfs_sysfs_add_root(struct btrfs_root *root); -void btrfs_sysfs_del_root(struct btrfs_root *root); -void btrfs_sysfs_del_super(struct btrfs_fs_info *root); +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); @@ -2391,15 +3929,151 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); -/* acl.c */ -#ifdef CONFIG_BTRFS_FS_POSIX_ACL -int btrfs_check_acl(struct inode *inode, int mask); +#ifdef CONFIG_PRINTK +__printf(2, 3) +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #else -#define btrfs_check_acl NULL +static inline __printf(2, 3) +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} #endif + +#define btrfs_emerg(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +#ifdef DEBUG +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#endif + +#ifdef CONFIG_BTRFS_ASSERT + +static inline void assfail(char *expr, char *file, int line) +{ + pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", + expr, file, line); + BUG(); +} + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) ((void)0) +#endif + +#define btrfs_assert() +__printf(5, 6) +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + + +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *function, + unsigned int line, int errno); + +#define btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, "setting %llu feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_incompat_flags(disk_super) & flag); +} + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ + +#define btrfs_abort_transaction(trans, root, errno) \ +do { \ + __btrfs_abort_transaction(trans, root, __func__, \ + __LINE__, errno); \ +} while (0) + +#define btrfs_std_error(fs_info, errno) \ +do { \ + if ((errno)) \ + __btrfs_std_error((fs_info), __func__, \ + __LINE__, (errno), NULL); \ +} while (0) + +#define btrfs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_std_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +__printf(5, 6) +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + +/* acl.c */ +#ifdef CONFIG_BTRFS_FS_POSIX_ACL +struct posix_acl *btrfs_get_acl(struct inode *inode, int type); +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); -int btrfs_acl_chmod(struct inode *inode); +#else +#define btrfs_get_acl NULL +#define btrfs_set_acl NULL +static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* relocation.c */ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); @@ -2409,4 +4083,66 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); + +/* scrub.c */ +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace); +void btrfs_scrub_pause(struct btrfs_root *root); +void btrfs_scrub_continue(struct btrfs_root *root); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, + struct btrfs_device *dev); +int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, + struct btrfs_scrub_progress *progress); + +/* dev-replace.c */ +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); + +/* reada.c */ +struct reada_control { + struct btrfs_root *root; /* tree to prefetch */ + struct btrfs_key key_start; + struct btrfs_key key_end; /* exclusive */ + atomic_t elems; + struct kref refcnt; + wait_queue_head_t wait; +}; +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *start, struct btrfs_key *end); +int btrfs_reada_wait(void *handle); +void btrfs_reada_detach(void *handle); +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err); + +static inline int is_fstree(u64 rootid) +{ + if (rootid == BTRFS_FS_TREE_OBJECTID || + (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + +/* Sanity test specific functions */ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode); +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); +#endif + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c new file mode 100644 index 00000000000..da775bfdebc --- /dev/null +++ b/fs/btrfs/delayed-inode.c @@ -0,0 +1,1980 @@ +/* + * Copyright (C) 2011 Fujitsu. All rights reserved. + * Written by Miao Xie <miaox@cn.fujitsu.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/slab.h> +#include "delayed-inode.h" +#include "disk-io.h" +#include "transaction.h" +#include "ctree.h" + +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 +#define BTRFS_DELAYED_BATCH 16 + +static struct kmem_cache *delayed_node_cache; + +int __init btrfs_delayed_inode_init(void) +{ + delayed_node_cache = kmem_cache_create("btrfs_delayed_node", + sizeof(struct btrfs_delayed_node), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!delayed_node_cache) + return -ENOMEM; + return 0; +} + +void btrfs_delayed_inode_exit(void) +{ + if (delayed_node_cache) + kmem_cache_destroy(delayed_node_cache); +} + +static inline void btrfs_init_delayed_node( + struct btrfs_delayed_node *delayed_node, + struct btrfs_root *root, u64 inode_id) +{ + delayed_node->root = root; + delayed_node->inode_id = inode_id; + atomic_set(&delayed_node->refs, 0); + delayed_node->count = 0; + delayed_node->flags = 0; + delayed_node->ins_root = RB_ROOT; + delayed_node->del_root = RB_ROOT; + mutex_init(&delayed_node->mutex); + delayed_node->index_cnt = 0; + INIT_LIST_HEAD(&delayed_node->n_list); + INIT_LIST_HEAD(&delayed_node->p_list); + delayed_node->bytes_reserved = 0; + memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item)); +} + +static inline int btrfs_is_continuous_delayed_item( + struct btrfs_delayed_item *item1, + struct btrfs_delayed_item *item2) +{ + if (item1->key.type == BTRFS_DIR_INDEX_KEY && + item1->key.objectid == item2->key.objectid && + item1->key.type == item2->key.type && + item1->key.offset + 1 == item2->key.offset) + return 1; + return 0; +} + +static inline struct btrfs_delayed_root *btrfs_get_delayed_root( + struct btrfs_root *root) +{ + return root->fs_info->delayed_root; +} + +static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) +{ + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + struct btrfs_delayed_node *node; + + node = ACCESS_ONCE(btrfs_inode->delayed_node); + if (node) { + atomic_inc(&node->refs); + return node; + } + + spin_lock(&root->inode_lock); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + if (node) { + if (btrfs_inode->delayed_node) { + atomic_inc(&node->refs); /* can be accessed */ + BUG_ON(btrfs_inode->delayed_node != node); + spin_unlock(&root->inode_lock); + return node; + } + btrfs_inode->delayed_node = node; + /* can be accessed and cached in the inode */ + atomic_add(2, &node->refs); + spin_unlock(&root->inode_lock); + return node; + } + spin_unlock(&root->inode_lock); + + return NULL; +} + +/* Will return either the node or PTR_ERR(-ENOMEM) */ +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( + struct inode *inode) +{ + struct btrfs_delayed_node *node; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + int ret; + +again: + node = btrfs_get_delayed_node(inode); + if (node) + return node; + + node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); + + /* cached in the btrfs inode and can be accessed */ + atomic_add(2, &node->refs); + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + radix_tree_preload_end(); + goto again; + } + btrfs_inode->delayed_node = node; + spin_unlock(&root->inode_lock); + radix_tree_preload_end(); + + return node; +} + +/* + * Call it when holding delayed_node->mutex + * + * If mod = 1, add this node into the prepared list. + */ +static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node, + int mod) +{ + spin_lock(&root->lock); + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + if (!list_empty(&node->p_list)) + list_move_tail(&node->p_list, &root->prepare_list); + else if (mod) + list_add_tail(&node->p_list, &root->prepare_list); + } else { + list_add_tail(&node->n_list, &root->node_list); + list_add_tail(&node->p_list, &root->prepare_list); + atomic_inc(&node->refs); /* inserted into list */ + root->nodes++; + set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); + } + spin_unlock(&root->lock); +} + +/* Call it when holding delayed_node->mutex */ +static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node) +{ + spin_lock(&root->lock); + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + root->nodes--; + atomic_dec(&node->refs); /* not in the list */ + list_del_init(&node->n_list); + if (!list_empty(&node->p_list)) + list_del_init(&node->p_list); + clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); + } + spin_unlock(&root->lock); +} + +static struct btrfs_delayed_node *btrfs_first_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->node_list)) + goto out; + + p = delayed_root->node_list.next; + node = list_entry(p, struct btrfs_delayed_node, n_list); + atomic_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +static struct btrfs_delayed_node *btrfs_next_delayed_node( + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_root *delayed_root; + struct list_head *p; + struct btrfs_delayed_node *next = NULL; + + delayed_root = node->root->fs_info->delayed_root; + spin_lock(&delayed_root->lock); + if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + /* not in the list */ + if (list_empty(&delayed_root->node_list)) + goto out; + p = delayed_root->node_list.next; + } else if (list_is_last(&node->n_list, &delayed_root->node_list)) + goto out; + else + p = node->n_list.next; + + next = list_entry(p, struct btrfs_delayed_node, n_list); + atomic_inc(&next->refs); +out: + spin_unlock(&delayed_root->lock); + + return next; +} + +static void __btrfs_release_delayed_node( + struct btrfs_delayed_node *delayed_node, + int mod) +{ + struct btrfs_delayed_root *delayed_root; + + if (!delayed_node) + return; + + delayed_root = delayed_node->root->fs_info->delayed_root; + + mutex_lock(&delayed_node->mutex); + if (delayed_node->count) + btrfs_queue_delayed_node(delayed_root, delayed_node, mod); + else + btrfs_dequeue_delayed_node(delayed_root, delayed_node); + mutex_unlock(&delayed_node->mutex); + + if (atomic_dec_and_test(&delayed_node->refs)) { + bool free = false; + struct btrfs_root *root = delayed_node->root; + spin_lock(&root->inode_lock); + if (atomic_read(&delayed_node->refs) == 0) { + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); + free = true; + } + spin_unlock(&root->inode_lock); + if (free) + kmem_cache_free(delayed_node_cache, delayed_node); + } +} + +static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 0); +} + +static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->prepare_list)) + goto out; + + p = delayed_root->prepare_list.next; + list_del_init(p); + node = list_entry(p, struct btrfs_delayed_node, p_list); + atomic_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +static inline void btrfs_release_prepared_delayed_node( + struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 1); +} + +static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) +{ + struct btrfs_delayed_item *item; + item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); + if (item) { + item->data_len = data_len; + item->ins_or_del = 0; + item->bytes_reserved = 0; + item->delayed_node = NULL; + atomic_set(&item->refs, 1); + } + return item; +} + +/* + * __btrfs_lookup_delayed_item - look up the delayed item by key + * @delayed_node: pointer to the delayed node + * @key: the key to look up + * @prev: used to store the prev item if the right item isn't found + * @next: used to store the next item if the right item isn't found + * + * Note: if we don't find the right item, we will return the prev item and + * the next item. + */ +static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( + struct rb_root *root, + struct btrfs_key *key, + struct btrfs_delayed_item **prev, + struct btrfs_delayed_item **next) +{ + struct rb_node *node, *prev_node = NULL; + struct btrfs_delayed_item *delayed_item = NULL; + int ret = 0; + + node = root->rb_node; + + while (node) { + delayed_item = rb_entry(node, struct btrfs_delayed_item, + rb_node); + prev_node = node; + ret = btrfs_comp_cpu_keys(&delayed_item->key, key); + if (ret < 0) + node = node->rb_right; + else if (ret > 0) + node = node->rb_left; + else + return delayed_item; + } + + if (prev) { + if (!prev_node) + *prev = NULL; + else if (ret < 0) + *prev = delayed_item; + else if ((node = rb_prev(prev_node)) != NULL) { + *prev = rb_entry(node, struct btrfs_delayed_item, + rb_node); + } else + *prev = NULL; + } + + if (next) { + if (!prev_node) + *next = NULL; + else if (ret > 0) + *next = delayed_item; + else if ((node = rb_next(prev_node)) != NULL) { + *next = rb_entry(node, struct btrfs_delayed_item, + rb_node); + } else + *next = NULL; + } + return NULL; +} + +static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, + NULL, NULL); + return item; +} + +static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, + struct btrfs_delayed_item *ins, + int action) +{ + struct rb_node **p, *node; + struct rb_node *parent_node = NULL; + struct rb_root *root; + struct btrfs_delayed_item *item; + int cmp; + + if (action == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_node->ins_root; + else if (action == BTRFS_DELAYED_DELETION_ITEM) + root = &delayed_node->del_root; + else + BUG(); + p = &root->rb_node; + node = &ins->rb_node; + + while (*p) { + parent_node = *p; + item = rb_entry(parent_node, struct btrfs_delayed_item, + rb_node); + + cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); + if (cmp < 0) + p = &(*p)->rb_right; + else if (cmp > 0) + p = &(*p)->rb_left; + else + return -EEXIST; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + ins->delayed_node = delayed_node; + ins->ins_or_del = action; + + if (ins->key.type == BTRFS_DIR_INDEX_KEY && + action == BTRFS_DELAYED_INSERTION_ITEM && + ins->key.offset >= delayed_node->index_cnt) + delayed_node->index_cnt = ins->key.offset + 1; + + delayed_node->count++; + atomic_inc(&delayed_node->root->fs_info->delayed_root->items); + return 0; +} + +static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node, + struct btrfs_delayed_item *item) +{ + return __btrfs_add_delayed_item(node, item, + BTRFS_DELAYED_INSERTION_ITEM); +} + +static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, + struct btrfs_delayed_item *item) +{ + return __btrfs_add_delayed_item(node, item, + BTRFS_DELAYED_DELETION_ITEM); +} + +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(&delayed_root->items_seq); + if ((atomic_dec_return(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && + waitqueue_active(&delayed_root->wait)) + wake_up(&delayed_root->wait); +} + +static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) +{ + struct rb_root *root; + struct btrfs_delayed_root *delayed_root; + + delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; + + BUG_ON(!delayed_root); + BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM && + delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM); + + if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_item->delayed_node->ins_root; + else + root = &delayed_item->delayed_node->del_root; + + rb_erase(&delayed_item->rb_node, root); + delayed_item->delayed_node->count--; + + finish_one_item(delayed_root); +} + +static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) +{ + if (item) { + __btrfs_remove_delayed_item(item); + if (atomic_dec_and_test(&item->refs)) + kfree(item); + } +} + +static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first(&delayed_node->ins_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first(&delayed_node->del_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +static struct btrfs_delayed_item *__btrfs_next_delayed_item( + struct btrfs_delayed_item *item) +{ + struct rb_node *p; + struct btrfs_delayed_item *next = NULL; + + p = rb_next(&item->rb_node); + if (p) + next = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return next; +} + +static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + u64 num_bytes; + int ret; + + if (!trans->bytes_reserved) + return 0; + + src_rsv = trans->block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, + num_bytes, 1); + item->bytes_reserved = num_bytes; + } + + return ret; +} + +static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *rsv; + + if (!item->bytes_reserved) + return; + + rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, item->bytes_reserved, + 0); + btrfs_block_rsv_release(root, rsv, + item->bytes_reserved); +} + +static int btrfs_delayed_inode_reserve_metadata( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_delayed_node *node) +{ + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + u64 num_bytes; + int ret; + bool release = false; + + src_rsv = trans->block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + + /* + * btrfs_dirty_inode will update the inode under btrfs_join_transaction + * which doesn't reserve space for speed. This is a problem since we + * still need to reserve space for this update, so try to reserve the + * space. + * + * Now if src_rsv == delalloc_block_rsv we'll let it just steal since + * we're accounted for. + */ + if (!src_rsv || (!trans->bytes_reserved && + src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); + /* + * Since we're under a transaction reserve_metadata_bytes could + * try to commit the transaction which will make it return + * EAGAIN to make us stop the transaction we have, so return + * ENOSPC instead so that btrfs_dirty_inode knows what to do. + */ + if (ret == -EAGAIN) + ret = -ENOSPC; + if (!ret) { + node->bytes_reserved = num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "delayed_inode", + btrfs_ino(inode), + num_bytes, 1); + } + return ret; + } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { + spin_lock(&BTRFS_I(inode)->lock); + if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!WARN_ON(ret)) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; + } + +migrate: + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + btrfs_ino(inode), num_bytes, 1); + node->bytes_reserved = num_bytes; + } + + if (release) { + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 0); + btrfs_block_rsv_release(root, src_rsv, num_bytes); + } + + return ret; +} + +static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_block_rsv *rsv; + + if (!node->bytes_reserved) + return; + + rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + node->inode_id, node->bytes_reserved, 0); + btrfs_block_rsv_release(root, rsv, + node->bytes_reserved); + node->bytes_reserved = 0; +} + +/* + * This helper will insert some continuous items into the same leaf according + * to the free space of the leaf. + */ +static int btrfs_batch_insert_items(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) +{ + struct btrfs_delayed_item *curr, *next; + int free_space; + int total_data_size = 0, total_size = 0; + struct extent_buffer *leaf; + char *data_ptr; + struct btrfs_key *keys; + u32 *data_size; + struct list_head head; + int slot; + int nitems; + int i; + int ret = 0; + + BUG_ON(!path->nodes[0]); + + leaf = path->nodes[0]; + free_space = btrfs_leaf_free_space(root, leaf); + INIT_LIST_HEAD(&head); + + next = item; + nitems = 0; + + /* + * count the number of the continuous items that we can insert in batch + */ + while (total_size + next->data_len + sizeof(struct btrfs_item) <= + free_space) { + total_data_size += next->data_len; + total_size += next->data_len + sizeof(struct btrfs_item); + list_add_tail(&next->tree_list, &head); + nitems++; + + curr = next; + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + if (!btrfs_is_continuous_delayed_item(curr, next)) + break; + } + + if (!nitems) { + ret = 0; + goto out; + } + + /* + * we need allocate some memory space, but it might cause the task + * to sleep, so we set all locked nodes in the path to blocking locks + * first. + */ + btrfs_set_path_blocking(path); + + keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS); + if (!keys) { + ret = -ENOMEM; + goto out; + } + + data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS); + if (!data_size) { + ret = -ENOMEM; + goto error; + } + + /* get keys of all the delayed items */ + i = 0; + list_for_each_entry(next, &head, tree_list) { + keys[i] = next->key; + data_size[i] = next->data_len; + i++; + } + + /* reset all the locked nodes in the patch to spinning locks. */ + btrfs_clear_path_blocking(path, NULL, 0); + + /* insert the keys of the items */ + setup_items_for_insert(root, path, keys, data_size, + total_data_size, total_size, nitems); + + /* insert the dir index items */ + slot = path->slots[0]; + list_for_each_entry_safe(curr, next, &head, tree_list) { + data_ptr = btrfs_item_ptr(leaf, slot, char); + write_extent_buffer(leaf, &curr->data, + (unsigned long)data_ptr, + curr->data_len); + slot++; + + btrfs_delayed_item_release_metadata(root, curr); + + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } + +error: + kfree(data_size); + kfree(keys); +out: + return ret; +} + +/* + * This helper can just do simple insertion that needn't extend item for new + * data, such as directory name index insertion, inode insertion. + */ +static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *delayed_item) +{ + struct extent_buffer *leaf; + char *ptr; + int ret; + + ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, + delayed_item->data_len); + if (ret < 0 && ret != -EEXIST) + return ret; + + leaf = path->nodes[0]; + + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + + write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, + delayed_item->data_len); + btrfs_mark_buffer_dirty(leaf); + + btrfs_delayed_item_release_metadata(root, delayed_item); + return 0; +} + +/* + * we insert an item first, then if there are some continuous items, we try + * to insert those items into the same leaf. + */ +static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_item *curr, *prev; + int ret = 0; + +do_again: + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_insertion_item(node); + if (!curr) + goto insert_end; + + ret = btrfs_insert_delayed_item(trans, root, path, curr); + if (ret < 0) { + btrfs_release_path(path); + goto insert_end; + } + + prev = curr; + curr = __btrfs_next_delayed_item(prev); + if (curr && btrfs_is_continuous_delayed_item(prev, curr)) { + /* insert the continuous items into the same leaf */ + path->slots[0]++; + btrfs_batch_insert_items(root, path, curr); + } + btrfs_release_delayed_item(prev); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_release_path(path); + mutex_unlock(&node->mutex); + goto do_again; + +insert_end: + mutex_unlock(&node->mutex); + return ret; +} + +static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) +{ + struct btrfs_delayed_item *curr, *next; + struct extent_buffer *leaf; + struct btrfs_key key; + struct list_head head; + int nitems, i, last_item; + int ret = 0; + + BUG_ON(!path->nodes[0]); + + leaf = path->nodes[0]; + + i = path->slots[0]; + last_item = btrfs_header_nritems(leaf) - 1; + if (i > last_item) + return -ENOENT; /* FIXME: Is errno suitable? */ + + next = item; + INIT_LIST_HEAD(&head); + btrfs_item_key_to_cpu(leaf, &key, i); + nitems = 0; + /* + * count the number of the dir index items that we can delete in batch + */ + while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { + list_add_tail(&next->tree_list, &head); + nitems++; + + curr = next; + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + if (!btrfs_is_continuous_delayed_item(curr, next)) + break; + + i++; + if (i > last_item) + break; + btrfs_item_key_to_cpu(leaf, &key, i); + } + + if (!nitems) + return 0; + + ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); + if (ret) + goto out; + + list_for_each_entry_safe(curr, next, &head, tree_list) { + btrfs_delayed_item_release_metadata(root, curr); + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } + +out: + return ret; +} + +static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_item *curr, *prev; + int ret = 0; + +do_again: + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_deletion_item(node); + if (!curr) + goto delete_fail; + + ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); + if (ret < 0) + goto delete_fail; + else if (ret > 0) { + /* + * can't find the item which the node points to, so this node + * is invalid, just drop it. + */ + prev = curr; + curr = __btrfs_next_delayed_item(prev); + btrfs_release_delayed_item(prev); + ret = 0; + btrfs_release_path(path); + if (curr) { + mutex_unlock(&node->mutex); + goto do_again; + } else + goto delete_fail; + } + + btrfs_batch_delete_items(trans, root, path, curr); + btrfs_release_path(path); + mutex_unlock(&node->mutex); + goto do_again; + +delete_fail: + btrfs_release_path(path); + mutex_unlock(&node->mutex); + return ret; +} + +static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + if (delayed_node && + test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + BUG_ON(!delayed_node->root); + clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); + } +} + +static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + ASSERT(delayed_node->root); + clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); +} + +static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + struct btrfs_key key; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + int mod; + int ret; + + key.objectid = node->inode_id; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + mod = -1; + else + mod = 1; + + ret = btrfs_lookup_inode(trans, root, path, &key, mod); + if (ret > 0) { + btrfs_release_path(path); + return -ENOENT; + } else if (ret < 0) { + return ret; + } + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item)); + btrfs_mark_buffer_dirty(leaf); + + if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + goto no_iref; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) + goto search; +again: + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != node->inode_id) + goto out; + + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + goto out; + + /* + * Delayed iref deletion is for the inode who has only one link, + * so there is only one iref. The case that several irefs are + * in the same item doesn't exist. + */ + btrfs_del_item(trans, root, path); +out: + btrfs_release_delayed_iref(node); +no_iref: + btrfs_release_path(path); +err_out: + btrfs_delayed_inode_release_metadata(root, node); + btrfs_release_delayed_inode(node); + + return ret; + +search: + btrfs_release_path(path); + + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = -1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret); + + ret = 0; + leaf = path->nodes[0]; + path->slots[0]--; + goto again; +} + +static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + mutex_lock(&node->mutex); + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) { + mutex_unlock(&node->mutex); + return 0; + } + + ret = __btrfs_update_delayed_inode(trans, root, path, node); + mutex_unlock(&node->mutex); + return ret; +} + +static inline int +__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + ret = btrfs_insert_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_delete_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_update_delayed_inode(trans, node->root, path, node); + return ret; +} + +/* + * Called when committing the transaction. + * Returns 0 on success. + * Returns < 0 on error and returns with an aborted transaction with any + * outstanding delayed items cleaned up. + */ +static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret = 0; + bool count = (nr > 0); + + if (trans->aborted) + return -EIO; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; + + delayed_root = btrfs_get_delayed_root(root); + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node && (!count || (count && nr--))) { + ret = __btrfs_commit_inode_delayed_items(trans, path, + curr_node); + if (ret) { + btrfs_release_delayed_node(curr_node); + curr_node = NULL; + btrfs_abort_transaction(trans, root, ret); + break; + } + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } + + if (curr_node) + btrfs_release_delayed_node(curr_node); + btrfs_free_path(path); + trans->block_rsv = block_rsv; + + return ret; +} + +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_run_delayed_items(trans, root, -1); +} + +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr) +{ + return __btrfs_run_delayed_items(trans, root, nr); +} + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret; + + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->count) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + + path = btrfs_alloc_path(); + if (!path) { + btrfs_release_delayed_node(delayed_node); + return -ENOMEM; + } + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + + ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + + btrfs_release_delayed_node(delayed_node); + btrfs_free_path(path); + trans->block_rsv = block_rsv; + + return ret; +} + +int btrfs_commit_inode_delayed_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret; + + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + + trans = btrfs_join_transaction(delayed_node->root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto trans_out; + } + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) + ret = __btrfs_update_delayed_inode(trans, delayed_node->root, + path, delayed_node); + else + ret = 0; + mutex_unlock(&delayed_node->mutex); + + btrfs_free_path(path); + trans->block_rsv = block_rsv; +trans_out: + btrfs_end_transaction(trans, delayed_node->root); + btrfs_btree_balance_dirty(delayed_node->root); +out: + btrfs_release_delayed_node(delayed_node); + + return ret; +} + +void btrfs_remove_delayed_node(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node); + if (!delayed_node) + return; + + BTRFS_I(inode)->delayed_node = NULL; + btrfs_release_delayed_node(delayed_node); +} + +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; + struct btrfs_work work; +}; + +static void btrfs_async_run_delayed_root(struct btrfs_work *work) +{ + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_delayed_node *delayed_node = NULL; + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + int total_done = 0; + + async_work = container_of(work, struct btrfs_async_delayed_work, work); + delayed_root = async_work->delayed_root; + + path = btrfs_alloc_path(); + if (!path) + goto out; + +again: + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) + goto free_path; + + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + goto free_path; + + path->leave_spinning = 1; + root = delayed_node->root; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + goto release_path; + + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; + + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + + trans->block_rsv = block_rsv; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty_nodelay(root); + +release_path: + btrfs_release_path(path); + total_done++; + + btrfs_release_prepared_delayed_node(delayed_node); + if (async_work->nr == 0 || total_done < async_work->nr) + goto again; + +free_path: + btrfs_free_path(path); +out: + wake_up(&delayed_root->wait); + kfree(async_work); +} + + +static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, + struct btrfs_root *root, int nr) +{ + struct btrfs_async_delayed_work *async_work; + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return 0; + + async_work = kmalloc(sizeof(*async_work), GFP_NOFS); + if (!async_work) + return -ENOMEM; + + async_work->delayed_root = delayed_root; + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, + NULL, NULL); + async_work->nr = nr; + + btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); + return 0; +} + +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + +static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) +{ + int val = atomic_read(&delayed_root->items_seq); + + if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) + return 1; + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return 1; + + return 0; +} + +void btrfs_balance_delayed_items(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + + delayed_root = btrfs_get_delayed_root(root); + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return; + + if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { + int seq; + int ret; + + seq = atomic_read(&delayed_root->items_seq); + + ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); + if (ret) + return; + + wait_event_interruptible(delayed_root->wait, + could_end_wait(delayed_root, seq)); + return; + } + + btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); +} + +/* Will return 0 or -ENOMEM */ +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_disk_key *disk_key, u8 type, + u64 index) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *delayed_item; + struct btrfs_dir_item *dir_item; + int ret; + + delayed_node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len); + if (!delayed_item) { + ret = -ENOMEM; + goto release_node; + } + + delayed_item->key.objectid = btrfs_ino(dir); + btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); + delayed_item->key.offset = index; + + dir_item = (struct btrfs_dir_item *)delayed_item->data; + dir_item->location = *disk_key; + btrfs_set_stack_dir_transid(dir_item, trans->transid); + btrfs_set_stack_dir_data_len(dir_item, 0); + btrfs_set_stack_dir_name_len(dir_item, name_len); + btrfs_set_stack_dir_type(dir_item, type); + memcpy((char *)(dir_item + 1), name, name_len); + + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + + mutex_lock(&delayed_node->mutex); + ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); + if (unlikely(ret)) { + btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) " + "into the insertion tree of the delayed node" + "(root id: %llu, inode id: %llu, errno: %d)", + name_len, name, delayed_node->root->objectid, + delayed_node->inode_id, ret); + BUG(); + } + mutex_unlock(&delayed_node->mutex); + +release_node: + btrfs_release_delayed_node(delayed_node); + return ret; +} + +static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root, + struct btrfs_delayed_node *node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_lookup_delayed_insertion_item(node, key); + if (!item) { + mutex_unlock(&node->mutex); + return 1; + } + + btrfs_delayed_item_release_metadata(root, item); + btrfs_release_delayed_item(item); + mutex_unlock(&node->mutex); + return 0; +} + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *dir, + u64 index) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + struct btrfs_key item_key; + int ret; + + node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(node)) + return PTR_ERR(node); + + item_key.objectid = btrfs_ino(dir); + btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); + item_key.offset = index; + + ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); + if (!ret) + goto end; + + item = btrfs_alloc_delayed_item(0); + if (!item) { + ret = -ENOMEM; + goto end; + } + + item->key = item_key; + + ret = btrfs_delayed_item_reserve_metadata(trans, root, item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible. + */ + BUG_ON(ret); + + mutex_lock(&node->mutex); + ret = __btrfs_add_delayed_deletion_item(node, item); + if (unlikely(ret)) { + btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) " + "into the deletion tree of the delayed node" + "(root id: %llu, inode id: %llu, errno: %d)", + index, node->root->objectid, node->inode_id, + ret); + BUG(); + } + mutex_unlock(&node->mutex); +end: + btrfs_release_delayed_node(node); + return ret; +} + +int btrfs_inode_delayed_dir_index_count(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + + if (!delayed_node) + return -ENOENT; + + /* + * Since we have held i_mutex of this directory, it is impossible that + * a new directory index is added into the delayed node and index_cnt + * is updated now. So we needn't lock the delayed node. + */ + if (!delayed_node->index_cnt) { + btrfs_release_delayed_node(delayed_node); + return -EINVAL; + } + + BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; + btrfs_release_delayed_node(delayed_node); + return 0; +} + +void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *item; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return; + + mutex_lock(&delayed_node->mutex); + item = __btrfs_first_delayed_insertion_item(delayed_node); + while (item) { + atomic_inc(&item->refs); + list_add_tail(&item->readdir_list, ins_list); + item = __btrfs_next_delayed_item(item); + } + + item = __btrfs_first_delayed_deletion_item(delayed_node); + while (item) { + atomic_inc(&item->refs); + list_add_tail(&item->readdir_list, del_list); + item = __btrfs_next_delayed_item(item); + } + mutex_unlock(&delayed_node->mutex); + /* + * This delayed node is still cached in the btrfs inode, so refs + * must be > 1 now, and we needn't check it is going to be freed + * or not. + * + * Besides that, this function is used to read dir, we do not + * insert/delete delayed items in this period. So we also needn't + * requeue or dequeue this delayed node. + */ + atomic_dec(&delayed_node->refs); +} + +void btrfs_put_delayed_items(struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_item *curr, *next; + + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + } + + list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_del(&curr->readdir_list); + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + } +} + +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index) +{ + struct btrfs_delayed_item *curr, *next; + int ret; + + if (list_empty(del_list)) + return 0; + + list_for_each_entry_safe(curr, next, del_list, readdir_list) { + if (curr->key.offset > index) + break; + + list_del(&curr->readdir_list); + ret = (curr->key.offset == index); + + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + + if (ret) + return 1; + else + continue; + } + return 0; +} + +/* + * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree + * + */ +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + struct list_head *ins_list) +{ + struct btrfs_dir_item *di; + struct btrfs_delayed_item *curr, *next; + struct btrfs_key location; + char *name; + int name_len; + int over = 0; + unsigned char d_type; + + if (list_empty(ins_list)) + return 0; + + /* + * Changing the data of the delayed item is impossible. So + * we needn't lock them. And we have held i_mutex of the + * directory, nobody can delete any directory indexes now. + */ + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + + if (curr->key.offset < ctx->pos) { + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + continue; + } + + ctx->pos = curr->key.offset; + + di = (struct btrfs_dir_item *)curr->data; + name = (char *)(di + 1); + name_len = btrfs_stack_dir_name_len(di); + + d_type = btrfs_filetype_table[di->type]; + btrfs_disk_key_to_cpu(&location, &di->location); + + over = !dir_emit(ctx, name, name_len, + location.objectid, d_type); + + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + + if (over) + return 1; + } + return 0; +} + +static void fill_stack_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_inode_item *inode_item, + struct inode *inode) +{ + btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); + btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); + btrfs_set_stack_inode_generation(inode_item, + BTRFS_I(inode)->generation); + btrfs_set_stack_inode_sequence(inode_item, inode->i_version); + btrfs_set_stack_inode_transid(inode_item, trans->transid); + btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); + btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); + btrfs_set_stack_inode_block_group(inode_item, 0); + + btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), + inode->i_atime.tv_sec); + btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), + inode->i_atime.tv_nsec); + + btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), + inode->i_mtime.tv_sec); + btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), + inode->i_mtime.tv_nsec); + + btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), + inode->i_ctime.tv_sec); + btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), + inode->i_ctime.tv_nsec); +} + +int btrfs_fill_inode(struct inode *inode, u32 *rdev) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return -ENOENT; + + mutex_lock(&delayed_node->mutex); + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return -ENOENT; + } + + inode_item = &delayed_node->inode_item; + + i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + inode->i_mode = btrfs_stack_inode_mode(inode_item); + set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); + inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); + BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + inode->i_version = btrfs_stack_inode_sequence(inode_item); + inode->i_rdev = 0; + *rdev = btrfs_stack_inode_rdev(inode_item); + BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + inode->i_generation = BTRFS_I(inode)->generation; + BTRFS_I(inode)->index_cnt = (u64)-1; + + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + int ret = 0; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + goto release_node; + } + + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); + if (ret) + goto release_node; + + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return ret; +} + +int btrfs_delayed_delete_inode_ref(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + /* + * We don't reserve space for inode ref deletion is because: + * - We ONLY do async inode ref deletion for the inode who has only + * one link(i_nlink == 1), it means there is only one inode ref. + * And in most case, the inode ref and the inode item are in the + * same leaf, and we will deal with them at the same time. + * Since we are sure we will reserve the space for the inode item, + * it is unnecessary to reserve space for inode ref deletion. + * - If the inode ref and the inode item are not in the same leaf, + * We also needn't worry about enospc problem, because we reserve + * much more space for the inode update than it needs. + * - At the worst, we can steal some space from the global reservation. + * It is very rare. + */ + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + goto release_node; + + set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + +static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_root *root = delayed_node->root; + struct btrfs_delayed_item *curr_item, *prev_item; + + mutex_lock(&delayed_node->mutex); + curr_item = __btrfs_first_delayed_insertion_item(delayed_node); + while (curr_item) { + btrfs_delayed_item_release_metadata(root, curr_item); + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); + while (curr_item) { + btrfs_delayed_item_release_metadata(root, curr_item); + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + btrfs_release_delayed_iref(delayed_node); + + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { + btrfs_delayed_inode_release_metadata(root, delayed_node); + btrfs_release_delayed_inode(delayed_node); + } + mutex_unlock(&delayed_node->mutex); +} + +void btrfs_kill_delayed_inode_items(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return; + + __btrfs_kill_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node); +} + +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) +{ + u64 inode_id = 0; + struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; + + while (1) { + spin_lock(&root->inode_lock); + n = radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { + spin_unlock(&root->inode_lock); + break; + } + + inode_id = delayed_nodes[n - 1]->inode_id + 1; + + for (i = 0; i < n; i++) + atomic_inc(&delayed_nodes[i]->refs); + spin_unlock(&root->inode_lock); + + for (i = 0; i < n; i++) { + __btrfs_kill_delayed_node(delayed_nodes[i]); + btrfs_release_delayed_node(delayed_nodes[i]); + } + } +} + +void btrfs_destroy_delayed_inodes(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + + delayed_root = btrfs_get_delayed_root(root); + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node) { + __btrfs_kill_delayed_node(curr_node); + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } +} + diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h new file mode 100644 index 00000000000..f70119f2542 --- /dev/null +++ b/fs/btrfs/delayed-inode.h @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2011 Fujitsu. All rights reserved. + * Written by Miao Xie <miaox@cn.fujitsu.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DELAYED_TREE_OPERATION_H +#define __DELAYED_TREE_OPERATION_H + +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/wait.h> +#include <linux/atomic.h> + +#include "ctree.h" + +/* types of the delayed item */ +#define BTRFS_DELAYED_INSERTION_ITEM 1 +#define BTRFS_DELAYED_DELETION_ITEM 2 + +struct btrfs_delayed_root { + spinlock_t lock; + struct list_head node_list; + /* + * Used for delayed nodes which is waiting to be dealt with by the + * worker. If the delayed node is inserted into the work queue, we + * drop it from this list. + */ + struct list_head prepare_list; + atomic_t items; /* for delayed items */ + atomic_t items_seq; /* for delayed items */ + int nodes; /* for delayed nodes */ + wait_queue_head_t wait; +}; + +#define BTRFS_DELAYED_NODE_IN_LIST 0 +#define BTRFS_DELAYED_NODE_INODE_DIRTY 1 +#define BTRFS_DELAYED_NODE_DEL_IREF 2 + +struct btrfs_delayed_node { + u64 inode_id; + u64 bytes_reserved; + struct btrfs_root *root; + /* Used to add the node into the delayed root's node list. */ + struct list_head n_list; + /* + * Used to add the node into the prepare list, the nodes in this list + * is waiting to be dealt with by the async worker. + */ + struct list_head p_list; + struct rb_root ins_root; + struct rb_root del_root; + struct mutex mutex; + struct btrfs_inode_item inode_item; + atomic_t refs; + u64 index_cnt; + unsigned long flags; + int count; +}; + +struct btrfs_delayed_item { + struct rb_node rb_node; + struct btrfs_key key; + struct list_head tree_list; /* used for batch insert/delete items */ + struct list_head readdir_list; /* used for readdir items */ + u64 bytes_reserved; + struct btrfs_delayed_node *delayed_node; + atomic_t refs; + int ins_or_del; + u32 data_len; + char data[0]; +}; + +static inline void btrfs_init_delayed_root( + struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_disk_key *disk_key, u8 type, + u64 index); + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *dir, + u64 index); + +int btrfs_inode_delayed_dir_index_count(struct inode *inode); + +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr); + +void btrfs_balance_delayed_items(struct btrfs_root *root); + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode); +/* Used for evicting the inode. */ +void btrfs_remove_delayed_node(struct inode *inode); +void btrfs_kill_delayed_inode_items(struct inode *inode); +int btrfs_commit_inode_delayed_inode(struct inode *inode); + + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); +int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_delayed_delete_inode_ref(struct inode *inode); + +/* Used for drop dead root */ +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); + +/* Used for clean the transaction */ +void btrfs_destroy_delayed_inodes(struct btrfs_root *root); + +/* Used for readdir() */ +void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, + struct list_head *del_list); +void btrfs_put_delayed_items(struct list_head *ins_list, + struct list_head *del_list); +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index); +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + struct list_head *ins_list); + +/* for init */ +int __init btrfs_delayed_inode_init(void); +void btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 902ce507c4e..6d16bea94e1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -23,6 +23,10 @@ #include "delayed-ref.h" #include "transaction.h" +struct kmem_cache *btrfs_delayed_ref_head_cachep; +struct kmem_cache *btrfs_delayed_tree_ref_cachep; +struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees * we queue up extent allocations and backref maintenance for @@ -36,9 +40,9 @@ * compare two delayed tree backrefs with same bytenr and type */ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, - struct btrfs_delayed_tree_ref *ref1) + struct btrfs_delayed_tree_ref *ref1, int type) { - if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { + if (type == BTRFS_TREE_BLOCK_REF_KEY) { if (ref1->root < ref2->root) return -1; if (ref1->root > ref2->root) @@ -85,7 +89,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, * type of the delayed backrefs and content of delayed backrefs. */ static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1) + struct btrfs_delayed_ref_node *ref1, + bool compare_seq) { if (ref1->bytenr < ref2->bytenr) return -1; @@ -101,10 +106,22 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; + if (ref1->no_quota > ref2->no_quota) + return 1; + if (ref1->no_quota < ref2->no_quota) + return -1; + /* merging of sequenced refs is not allowed */ + if (compare_seq) { + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; + } if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1)); + btrfs_delayed_node_to_tree_ref(ref1), + ref1->type); } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || ref1->type == BTRFS_SHARED_DATA_REF_KEY) { return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), @@ -134,7 +151,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, rb_node); - cmp = comp_entry(entry, ins); + cmp = comp_entry(entry, ins, 1); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) @@ -148,40 +165,72 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, return NULL; } +/* insert a new ref to head ref rbtree */ +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_head *entry; + struct btrfs_delayed_ref_head *ins; + u64 bytenr; + + ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); + bytenr = ins->node.bytenr; + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, + href_node); + + if (bytenr < entry->node.bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->node.bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + /* * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot + * head if it was able to find one, or NULL if nothing was in that spot. + * If return_bigger is given, the next bigger entry is returned if no exact + * match is found. */ -static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, - u64 bytenr, - struct btrfs_delayed_ref_node **last) +static struct btrfs_delayed_ref_head * +find_ref_head(struct rb_root *root, u64 bytenr, + int return_bigger) { - struct rb_node *n = root->rb_node; - struct btrfs_delayed_ref_node *entry; - int cmp; + struct rb_node *n; + struct btrfs_delayed_ref_head *entry; + n = root->rb_node; + entry = NULL; while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - WARN_ON(!entry->in_tree); - if (last) - *last = entry; - - if (bytenr < entry->bytenr) - cmp = -1; - else if (bytenr > entry->bytenr) - cmp = 1; - else if (!btrfs_delayed_ref_is_head(entry)) - cmp = 1; - else - cmp = 0; + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (cmp < 0) + if (bytenr < entry->node.bytenr) n = n->rb_left; - else if (cmp > 0) + else if (bytenr > entry->node.bytenr) n = n->rb_right; else return entry; } + if (entry && return_bigger) { + if (bytenr > entry->node.bytenr) { + n = rb_next(&entry->href_node); + if (!n) + n = rb_first(root); + entry = rb_entry(n, struct btrfs_delayed_ref_head, + href_node); + return entry; + } + return entry; + } return NULL; } @@ -209,214 +258,188 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 start) +static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref) { - int count = 0; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; - - delayed_refs = &trans->transaction->delayed_refs; - if (start == 0) { - node = rb_first(&delayed_refs->root); + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + rb_erase(&head->href_node, &delayed_refs->href_root); } else { - ref = NULL; - find_ref_head(&delayed_refs->root, start, &ref); - if (ref) { - struct btrfs_delayed_ref_node *tmp; - - node = rb_prev(&ref->rb_node); - while (node) { - tmp = rb_entry(node, - struct btrfs_delayed_ref_node, - rb_node); - if (tmp->bytenr < start) - break; - ref = tmp; - node = rb_prev(&ref->rb_node); - } - node = &ref->rb_node; - } else - node = rb_first(&delayed_refs->root); + assert_spin_locked(&head->lock); + rb_erase(&ref->rb_node, &head->ref_root); } -again: - while (node && count < 32) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - if (list_empty(&head->cluster)) { - list_add_tail(&head->cluster, cluster); - delayed_refs->run_delayed_start = - head->node.bytenr; - count++; - - WARN_ON(delayed_refs->num_heads_ready == 0); - delayed_refs->num_heads_ready--; - } else if (count) { - /* the goal of the clustering is to find extents - * that are likely to end up in the same extent - * leaf on disk. So, we don't want them spread - * all over the tree. Stop now if we've hit - * a head that was already in use - */ - break; + ref->in_tree = 0; + btrfs_put_delayed_ref(ref); + atomic_dec(&delayed_refs->num_entries); + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; +} + +static int merge_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, u64 seq) +{ + struct rb_node *node; + int mod = 0; + int done = 0; + + node = rb_next(&ref->rb_node); + while (!done && node) { + struct btrfs_delayed_ref_node *next; + + next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + if (seq && next->seq >= seq) + break; + if (comp_entry(ref, next, 0)) + continue; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + struct btrfs_delayed_ref_node *tmp; + + tmp = ref; + ref = next; + next = tmp; + done = 1; } + mod = -next->ref_mod; + } + + drop_delayed_ref(trans, delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(trans, delayed_refs, head, ref); + done = 1; + } else { + /* + * You can't have multiples of the same ref on a tree + * block. + */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } - node = rb_next(node); - } - if (count) { - return 0; - } else if (start) { - /* - * we've gone to the end of the rbtree without finding any - * clusters. start from the beginning and try again - */ - start = 0; - node = rb_first(&delayed_refs->root); - goto again; } - return 1; + return done; } -/* - * This checks to see if there are any delayed refs in the - * btree for a given bytenr. It returns one if it finds any - * and zero otherwise. - * - * If it only finds a head node, it returns 0. - * - * The idea is to use this when deciding if you can safely delete an - * extent from the extent allocation tree. There may be a pending - * ref in the rbtree that adds or removes references, so as long as this - * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent - * allocation tree. - */ -int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *prev_node; - int ret = 0; + struct rb_node *node; + u64 seq = 0; - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); + assert_spin_locked(&head->lock); + /* + * We don't have too much refs to merge in the case of delayed data + * refs. + */ + if (head->is_data) + return; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); - if (ref) { - prev_node = rb_prev(&ref->rb_node); - if (!prev_node) - goto out; - ref = rb_entry(prev_node, struct btrfs_delayed_ref_node, + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + seq = elem->seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + node = rb_first(&head->ref_root); + while (node) { + struct btrfs_delayed_ref_node *ref; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr) + /* We can't merge refs that are outside of our seq count */ + if (seq && ref->seq >= seq) + break; + if (merge_ref(trans, delayed_refs, head, ref, seq)) + node = rb_first(&head->ref_root); + else + node = rb_next(&ref->rb_node); + } +} + +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 seq) +{ + struct seq_list *elem; + int ret = 0; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + if (seq >= elem->seq) { + pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n", + (u32)(seq >> 32), (u32)seq, + (u32)(elem->seq >> 32), (u32)elem->seq, + delayed_refs); ret = 1; + } } -out: - spin_unlock(&delayed_refs->lock); + + spin_unlock(&fs_info->tree_mod_seq_lock); return ret; } -/* - * helper function to lookup reference count and flags of extent. - * - * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. the head - * node may also store the extent flags to set. This way you can check - * to see what the reference count and extent flags would be if all of - * the delayed refs are not processed. - */ -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags) +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans) { - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; - struct btrfs_key key; - u32 item_size; - u64 num_refs; - u64 extent_flags; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + struct btrfs_delayed_ref_head *head; + u64 start; + bool loop = false; - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; delayed_refs = &trans->transaction->delayed_refs; + again: - ret = btrfs_search_slot(trans, root->fs_info->extent_root, - &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - num_refs = btrfs_extent_refs_v0(leaf, ei0); - /* FIXME: this isn't correct for data */ - extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; -#else - BUG(); -#endif - } - BUG_ON(num_refs == 0); - } else { - num_refs = 0; - extent_flags = 0; - ret = 0; + start = delayed_refs->run_delayed_start; + head = find_ref_head(&delayed_refs->href_root, start, 1); + if (!head && !loop) { + delayed_refs->run_delayed_start = 0; + start = 0; + loop = true; + head = find_ref_head(&delayed_refs->href_root, start, 1); + if (!head) + return NULL; + } else if (!head && loop) { + return NULL; } - spin_lock(&delayed_refs->lock); - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); - if (ref) { - head = btrfs_delayed_node_to_head(ref); - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(root->fs_info->extent_root, path); + while (head->processing) { + struct rb_node *node; - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); + node = rb_next(&head->href_node); + if (!node) { + if (loop) + return NULL; + delayed_refs->run_delayed_start = 0; + start = 0; + loop = true; goto again; } - if (head->extent_op && head->extent_op->update_flags) - extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); - - num_refs += ref->ref_mod; - mutex_unlock(&head->mutex); + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); } - WARN_ON(num_refs == 0); - if (refs) - *refs = num_refs; - if (flags) - *flags = extent_flags; -out: - spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); - return ret; + + head->processing = 1; + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + delayed_refs->run_delayed_start = head->node.bytenr + + head->node.num_bytes; + return head; } /* @@ -430,6 +453,7 @@ out: static noinline void update_existing_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { @@ -441,18 +465,11 @@ update_existing_ref(struct btrfs_trans_handle *trans, * every changing the extent allocation tree. */ existing->ref_mod--; - if (existing->ref_mod == 0) { - rb_erase(&existing->rb_node, - &delayed_refs->root); - existing->in_tree = 0; - btrfs_put_delayed_ref(existing); - delayed_refs->num_entries--; - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; - } else { + if (existing->ref_mod == 0) + drop_delayed_ref(trans, delayed_refs, head, existing); + else WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - } } else { WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); @@ -482,6 +499,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref = btrfs_delayed_node_to_head(update); BUG_ON(existing_ref->is_data != ref->is_data); + spin_lock(&existing_ref->lock); if (ref->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref @@ -515,13 +533,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref->extent_op->flags_to_set; existing_ref->extent_op->update_flags = 1; } - kfree(ref->extent_op); + btrfs_free_delayed_extent_op(ref->extent_op); } } /* - * update the reference mod on the head to reflect this new operation + * update the reference mod on the head to reflect this new operation, + * only need the lock for this case cause we could be processing it + * currently, for refs we just added we know we're a-ok. */ existing->ref_mod += update->ref_mod; + spin_unlock(&existing_ref->lock); } /* @@ -529,12 +550,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, - int action, int is_data) +static noinline struct btrfs_delayed_ref_head * +add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, int action, int is_data) { - struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; @@ -576,47 +598,59 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, ref->action = 0; ref->is_head = 1; ref->in_tree = 1; + ref->seq = 0; head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; + head_ref->ref_root = RB_ROOT; + head_ref->processing = 0; - INIT_LIST_HEAD(&head_ref->cluster); + spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); + trace_add_delayed_ref_head(ref, head_ref, action); + existing = htree_insert(&delayed_refs->href_root, + &head_ref->href_node); if (existing) { - update_existing_head_ref(existing, ref); + update_existing_head_ref(&existing->node, ref); /* * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + head_ref = existing; } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } - return 0; + return head_ref; } /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action) +static noinline void +add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, int level, + int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ @@ -627,51 +661,61 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref->action = action; ref->is_head = 0; ref->in_tree = 1; + ref->no_quota = no_quota; + ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_TREE_BLOCK_REF_KEY; - } full_ref->level = level; - existing = tree_insert(&delayed_refs->root, &ref->rb_node); + trace_add_delayed_tree_ref(ref, full_ref, action); + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } - return 0; + spin_unlock(&head_ref->lock); } /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, u64 owner, u64 offset, - int action) +static noinline void +add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, u64 owner, + u64 offset, int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; delayed_refs = &trans->transaction->delayed_refs; + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); + /* first set the basic ref node struct up */ atomic_set(&ref->refs, 1); ref->bytenr = bytenr; @@ -680,32 +724,37 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, ref->action = action; ref->is_head = 0; ref->in_tree = 1; + ref->no_quota = no_quota; + ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_DATA_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_EXTENT_DATA_REF_KEY; - } + full_ref->objectid = owner; full_ref->offset = offset; - existing = tree_insert(&delayed_refs->root, &ref->rb_node); + trace_add_delayed_data_ref(ref, full_ref, action); + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } - return 0; + spin_unlock(&head_ref->lock); } /* @@ -713,24 +762,28 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, * to make sure the delayed ref is eventually processed before this * transaction commits. */ -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int no_quota) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; BUG_ON(extent_op && extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); return -ENOMEM; } @@ -743,39 +796,43 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 0); - BUG_ON(ret); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 0); - ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, level, action); - BUG_ON(ret); + add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, + num_bytes, parent, ref_root, level, action, + no_quota); spin_unlock(&delayed_refs->lock); + return 0; } /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int no_quota) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; BUG_ON(extent_op && !extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); return -ENOMEM; } @@ -788,26 +845,26 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 1); - BUG_ON(ret); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 1); - ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, owner, offset, action); - BUG_ON(ret); + add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, + num_bytes, parent, ref_root, owner, offset, + action, no_quota); spin_unlock(&delayed_refs->lock); + return 0; } -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; @@ -816,10 +873,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); - BUG_ON(ret); spin_unlock(&delayed_refs->lock); return 0; @@ -833,88 +889,56 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) { - struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); - if (ref) - return btrfs_delayed_node_to_head(ref); - return NULL; + return find_ref_head(&delayed_refs->href_root, bytenr, 0); } -/* - * add a delayed ref to the tree. This does all of the accounting required - * to make sure the delayed ref is eventually processed before this - * transaction commits. - * - * The main point of this call is to add and remove a backreference in a single - * shot, taking the lock only once, and only searching for the head node once. - * - * It is the same as doing a ref add and delete in two separate calls. - */ -#if 0 -int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 orig_parent, - u64 parent, u64 orig_ref_root, u64 ref_root, - u64 orig_ref_generation, u64 ref_generation, - u64 owner_objectid, int pin) +void btrfs_delayed_ref_exit(void) { - struct btrfs_delayed_ref *ref; - struct btrfs_delayed_ref *old_ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - int ret; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS); - if (!old_ref) { - kfree(ref); - return -ENOMEM; - } - - /* - * the parent = 0 case comes from cases where we don't actually - * know the parent yet. It will get updated later via a add/drop - * pair. - */ - if (parent == 0) - parent = bytenr; - if (orig_parent == 0) - orig_parent = bytenr; + if (btrfs_delayed_ref_head_cachep) + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); + if (btrfs_delayed_tree_ref_cachep) + kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); + if (btrfs_delayed_data_ref_cachep) + kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + if (btrfs_delayed_extent_op_cachep) + kmem_cache_destroy(btrfs_delayed_extent_op_cachep); +} - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); - if (!head_ref) { - kfree(ref); - kfree(old_ref); - return -ENOMEM; - } - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); +int btrfs_delayed_ref_init(void) +{ + btrfs_delayed_ref_head_cachep = kmem_cache_create( + "btrfs_delayed_ref_head", + sizeof(struct btrfs_delayed_ref_head), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_ref_head_cachep) + goto fail; + + btrfs_delayed_tree_ref_cachep = kmem_cache_create( + "btrfs_delayed_tree_ref", + sizeof(struct btrfs_delayed_tree_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_tree_ref_cachep) + goto fail; + + btrfs_delayed_data_ref_cachep = kmem_cache_create( + "btrfs_delayed_data_ref", + sizeof(struct btrfs_delayed_data_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_data_ref_cachep) + goto fail; + + btrfs_delayed_extent_op_cachep = kmem_cache_create( + "btrfs_delayed_extent_op", + sizeof(struct btrfs_delayed_extent_op), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_extent_op_cachep) + goto fail; - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, - (u64)-1, 0, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, 0); - BUG_ON(ret); - - ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, ref_generation, - owner_objectid, BTRFS_ADD_DELAYED_REF, 0); - BUG_ON(ret); - - ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes, - orig_parent, orig_ref_root, - orig_ref_generation, owner_objectid, - BTRFS_DROP_DELAYED_REF, pin); - BUG_ON(ret); - spin_unlock(&delayed_refs->lock); return 0; +fail: + btrfs_delayed_ref_exit(); + return -ENOMEM; } -#endif diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f6fc67ddad3..a764e2340d4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -18,7 +18,7 @@ #ifndef __DELAYED_REF__ #define __DELAYED_REF__ -/* these are the possible values of struct btrfs_delayed_ref->action */ +/* these are the possible values of struct btrfs_delayed_ref_node->action */ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ @@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node { /* the size of the extent */ u64 num_bytes; + /* seq number to keep track of insertion order */ + u64 seq; + /* ref count on this data structure */ atomic_t refs; @@ -49,6 +52,7 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; + unsigned int no_quota:1; /* is this node still in the rbtree? */ unsigned int is_head:1; unsigned int in_tree:1; @@ -57,6 +61,7 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; u64 flags_to_set; + int level; unsigned int update_key:1; unsigned int update_flags:1; unsigned int is_data:1; @@ -77,7 +82,10 @@ struct btrfs_delayed_ref_head { */ struct mutex mutex; - struct list_head cluster; + spinlock_t lock; + struct rb_root ref_root; + + struct rb_node href_node; struct btrfs_delayed_extent_op *extent_op; /* @@ -94,29 +102,27 @@ struct btrfs_delayed_ref_head { */ unsigned int must_insert_reserved:1; unsigned int is_data:1; + unsigned int processing:1; }; struct btrfs_delayed_tree_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; int level; }; struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; u64 objectid; u64 offset; }; struct btrfs_delayed_ref_root { - struct rb_root root; + /* head ref rbtree */ + struct rb_root href_root; /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -124,7 +130,7 @@ struct btrfs_delayed_ref_root { /* how many delayed ref updates we've queued, used by the * throttling code */ - unsigned long num_entries; + atomic_t num_entries; /* total number of head nodes in tree */ unsigned long num_heads; @@ -142,43 +148,89 @@ struct btrfs_delayed_ref_root { u64 run_delayed_start; }; +extern struct kmem_cache *btrfs_delayed_ref_head_cachep; +extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; +extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + +int btrfs_delayed_ref_init(void); +void btrfs_delayed_ref_exit(void); + +static inline struct btrfs_delayed_extent_op * +btrfs_alloc_delayed_extent_op(void) +{ + return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); +} + +static inline void +btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) +{ + if (op) + kmem_cache_free(btrfs_delayed_extent_op_cachep, op); +} + static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { WARN_ON(atomic_read(&ref->refs) == 0); if (atomic_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); - kfree(ref); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + case 0: + kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); + break; + default: + BUG(); + } } } -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op, + int no_quota); +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op, + int no_quota); +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags); -int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 orig_parent, - u64 parent, u64 orig_ref_root, u64 ref_root, - u64 orig_ref_generation, u64 ref_generation, - u64 owner_objectid, int pin); int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 search_start); +static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) +{ + mutex_unlock(&head->mutex); +} + + +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans); + +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 seq); + /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c new file mode 100644 index 00000000000..eea26e1b2fd --- /dev/null +++ b/fs/btrfs/dev-replace.c @@ -0,0 +1,932 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/kthread.h> +#include <linux/math64.h> +#include <asm/div64.h> +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" +#include "sysfs.h" + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret); +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev); +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device); +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +static int btrfs_dev_replace_kthread(void *data); +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); + + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_path *path = NULL; + int item_size; + struct btrfs_dev_replace_item *ptr; + u64 src_devid; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { +no_valid_dev_replace_entry_found: + ret = 0; + dev_replace->replace_state = + BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; + dev_replace->cont_reading_from_srcdev_mode = + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; + dev_replace->replace_state = 0; + dev_replace->time_started = 0; + dev_replace->time_stopped = 0; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + dev_replace->is_valid = 0; + dev_replace->item_needs_writeback = 0; + goto out; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + + if (item_size != sizeof(struct btrfs_dev_replace_item)) { + btrfs_warn(fs_info, + "dev_replace entry found has unexpected size, ignore entry"); + goto no_valid_dev_replace_entry_found; + } + + src_devid = btrfs_dev_replace_src_devid(eb, ptr); + dev_replace->cont_reading_from_srcdev_mode = + btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); + dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); + dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); + dev_replace->time_stopped = + btrfs_dev_replace_time_stopped(eb, ptr); + atomic64_set(&dev_replace->num_write_errors, + btrfs_dev_replace_num_write_errors(eb, ptr)); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, + btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); + dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); + dev_replace->committed_cursor_left = dev_replace->cursor_left; + dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; + dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); + dev_replace->is_valid = 1; + + dev_replace->item_needs_writeback = 0; + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, + NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info, + BTRFS_DEV_REPLACE_DEVID, + NULL, NULL); + /* + * allow 'btrfs dev replace_cancel' if src/tgt device is + * missing + */ + if (!dev_replace->srcdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + src_devid); + } + if (!dev_replace->tgtdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + BTRFS_DEV_REPLACE_DEVID); + } + if (dev_replace->tgtdev) { + if (dev_replace->srcdev) { + dev_replace->tgtdev->total_bytes = + dev_replace->srcdev->total_bytes; + dev_replace->tgtdev->disk_total_bytes = + dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->bytes_used = + dev_replace->srcdev->bytes_used; + } + dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; + btrfs_init_dev_replace_tgtdev_for_resume(fs_info, + dev_replace->tgtdev); + } + break; + } + +out: + if (path) + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes changed device replace state to + * disk. + */ +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_replace_item *ptr; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + if (!dev_replace->is_valid || + !dev_replace->item_needs_writeback) { + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + btrfs_warn(fs_info, "error %d while searching for dev_replace item!", + ret); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* + * need to delete old one and insert a new one. + * Since no attempt is made to recover any old state, if the + * dev_replace state is 'running', the data on the target + * drive is lost. + * It would be possible to recover the state: just make sure + * that the beginning of the item is never changed and always + * contains all the essential information. Then read this + * minimal set of information and use it as a base for the + * new state. + */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", + ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + btrfs_warn(fs_info, "insert dev_replace item failed %d!", + ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_dev_replace_item); + + btrfs_dev_replace_lock(dev_replace); + if (dev_replace->srcdev) + btrfs_set_dev_replace_src_devid(eb, ptr, + dev_replace->srcdev->devid); + else + btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); + btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, + dev_replace->cont_reading_from_srcdev_mode); + btrfs_set_dev_replace_replace_state(eb, ptr, + dev_replace->replace_state); + btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); + btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); + btrfs_set_dev_replace_num_write_errors(eb, ptr, + atomic64_read(&dev_replace->num_write_errors)); + btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, + atomic64_read(&dev_replace->num_uncorrectable_read_errors)); + dev_replace->cursor_left_last_write_of_item = + dev_replace->cursor_left; + btrfs_set_dev_replace_cursor_left(eb, ptr, + dev_replace->cursor_left_last_write_of_item); + btrfs_set_dev_replace_cursor_right(eb, ptr, + dev_replace->cursor_right); + dev_replace->item_needs_writeback = 0; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + + return ret; +} + +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + dev_replace->committed_cursor_left = + dev_replace->cursor_left_last_write_of_item; +} + +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; + + if (btrfs_fs_incompat(fs_info, RAID56)) { + btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); + return -EOPNOTSUPP; + } + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + mutex_lock(&fs_info->volume_mutex); + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + &tgt_device); + if (ret) { + btrfs_err(fs_info, "target device %s is invalid!", + args->start.tgtdev_name); + mutex_unlock(&fs_info->volume_mutex); + return -EINVAL; + } + + ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, + args->start.srcdev_name, + &src_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) { + ret = -EINVAL; + goto leave_no_lock; + } + + if (tgt_device->total_bytes < src_device->total_bytes) { + btrfs_err(fs_info, "target device is smaller than source device!"); + ret = -EINVAL; + goto leave_no_lock; + } + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + goto leave; + } + + dev_replace->cont_reading_from_srcdev_mode = + args->start.cont_reading_from_srcdev_mode; + WARN_ON(!src_device); + dev_replace->srcdev = src_device; + WARN_ON(!tgt_device); + dev_replace->tgtdev = tgt_device; + + printk_in_rcu(KERN_INFO + "BTRFS: dev_replace from %s (devid %llu) to %s started\n", + src_device->missing ? "<missing disk>" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + + /* + * from now on, the writes to the srcdev are all duplicated to + * go to the tgtdev as well (refer to btrfs_map_block()). + */ + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + dev_replace->time_started = get_seconds(); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->is_valid = 1; + dev_replace->item_needs_writeback = 1; + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_wait_ordered_roots(root->fs_info, -1); + + /* force writing the updated state information to disk */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_dev_replace_lock(dev_replace); + goto leave; + } + + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* the disk copy procedure reuses the scrub code */ + ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, + src_device->total_bytes, + &dev_replace->scrub_progress, 0, 1); + + ret = btrfs_dev_replace_finishing(root->fs_info, ret); + WARN_ON(ret); + + return 0; + +leave: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + btrfs_dev_replace_unlock(dev_replace); +leave_no_lock: + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + return ret; +} + +/* + * blocked until all flighting bios are finished. + */ +static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) +{ + s64 writers; + DEFINE_WAIT(wait); + + set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + do { + prepare_to_wait(&fs_info->replace_wait, &wait, + TASK_UNINTERRUPTIBLE); + writers = percpu_counter_sum(&fs_info->bio_counter); + if (writers) + schedule(); + finish_wait(&fs_info->replace_wait, &wait); + } while (writers); +} + +/* + * we have removed target device, it is safe to allow new bios request. + */ +static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device; + struct btrfs_device *src_device; + struct btrfs_root *root = fs_info->tree_root; + u8 uuid_tmp[BTRFS_UUID_SIZE]; + struct btrfs_trans_handle *trans; + int ret = 0; + + /* don't allow cancel or unmount to disturb the finishing procedure */ + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + + btrfs_dev_replace_lock(dev_replace); + /* was the operation canceled, or is it finished? */ + if (dev_replace->replace_state != + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return 0; + } + + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_unlock(dev_replace); + + /* + * flush all outstanding I/O and inode extent mappings before the + * copy operation is declared as being finished + */ + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); + if (ret) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return ret; + } + btrfs_wait_ordered_roots(root->fs_info, -1); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->chunk_mutex); + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + btrfs_dev_replace_lock(dev_replace); + dev_replace->replace_state = + scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED + : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + dev_replace->time_stopped = get_seconds(); + dev_replace->item_needs_writeback = 1; + + /* replace old device with new one in mapping tree */ + if (!scrub_ret) { + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + } else { + printk_in_rcu(KERN_ERR + "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + src_device->missing ? "<missing disk>" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; + } + + printk_in_rcu(KERN_INFO + "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", + src_device->missing ? "<missing disk>" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + tgt_device->is_tgtdev_for_dev_replace = 0; + tgt_device->devid = src_device->devid; + src_device->devid = BTRFS_DEV_REPLACE_DEVID; + tgt_device->bytes_used = src_device->bytes_used; + memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); + memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); + memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + if (fs_info->sb->s_bdev == src_device->bdev) + fs_info->sb->s_bdev = tgt_device->bdev; + if (fs_info->fs_devices->latest_bdev == src_device->bdev) + fs_info->fs_devices->latest_bdev = tgt_device->bdev; + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + + btrfs_rm_dev_replace_blocked(fs_info); + + btrfs_rm_dev_replace_srcdev(fs_info, src_device); + + btrfs_rm_dev_replace_unblocked(fs_info); + + /* + * this is again a consistent state where no dev_replace procedure + * is running, the target device is part of the filesystem, the + * source device is not part of the filesystem anymore and its 1st + * superblock is scratched out so that it is no longer marked to + * belong to this filesystem. + */ + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); + + /* write back the superblocks */ + trans = btrfs_start_transaction(root, 0); + if (!IS_ERR(trans)) + btrfs_commit_transaction(trans, root); + + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device) +{ + int ret; + + if (srcdevid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, srcdevid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, + device); + } + return ret; +} + +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + /* even if !dev_replace_is_valid, the values are good enough for + * the replace_status ioctl */ + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + args->status.replace_state = dev_replace->replace_state; + args->status.time_started = dev_replace->time_started; + args->status.time_stopped = dev_replace->time_stopped; + args->status.num_write_errors = + atomic64_read(&dev_replace->num_write_errors); + args->status.num_uncorrectable_read_errors = + atomic64_read(&dev_replace->num_uncorrectable_read_errors); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + args->status.progress_1000 = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + args->status.progress_1000 = 1000; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, + div64_u64(dev_replace->srcdev->total_bytes, 1000)); + break; + } + btrfs_dev_replace_unlock(dev_replace); +} + +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + args->result = __btrfs_dev_replace_cancel(fs_info); + return 0; +} + +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->tree_root; + u64 result; + int ret; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + btrfs_dev_replace_unlock(dev_replace); + goto leave; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + break; + } + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = get_seconds(); + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + +leave: + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return result; +} + +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->time_stopped = get_seconds(); + dev_replace->item_needs_writeback = 1; + btrfs_info(fs_info, "suspending dev_replace for unmount"); + break; + } + + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +} + +/* resume dev_replace procedure that was interrupted by unmount */ +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + btrfs_dev_replace_unlock(dev_replace); + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + break; + } + if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { + btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); + btrfs_info(fs_info, + "you may cancel the operation after 'mount -o degraded'"); + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + WARN_ON(atomic_xchg( + &fs_info->mutually_exclusive_operation_running, 1)); + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); + return PTR_ERR_OR_ZERO(task); +} + +static int btrfs_dev_replace_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_ioctl_dev_replace_args *status_args; + u64 progress; + + status_args = kzalloc(sizeof(*status_args), GFP_NOFS); + if (status_args) { + btrfs_dev_replace_status(fs_info, status_args); + progress = status_args->status.progress_1000; + kfree(status_args); + do_div(progress, 10); + printk_in_rcu(KERN_INFO + "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? "<missing disk>" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? + rcu_str_deref(dev_replace->tgtdev->name) : + "<missing target disk>", + (unsigned int)progress); + } + btrfs_dev_replace_continue_on_mount(fs_info); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + + return 0; +} + +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + + ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, + dev_replace->committed_cursor_left, + dev_replace->srcdev->total_bytes, + &dev_replace->scrub_progress, 0, 1); + ret = btrfs_dev_replace_finishing(fs_info, ret); + WARN_ON(ret); + return 0; +} + +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +{ + if (!dev_replace->is_valid) + return 0; + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * return true even if tgtdev is missing (this is + * something that can happen if the dev_replace + * procedure is suspended by an umount and then + * the tgtdev is missing (or "btrfs dev scan") was + * not called and the the filesystem is remounted + * in degraded state. This does not stop the + * dev_replace procedure. It needs to be canceled + * manually if the cancelation is wanted. + */ + break; + } + return 1; +} + +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +{ + /* the beginning is just an optimization for the typical case */ + if (atomic_read(&dev_replace->nesting_level) == 0) { +acquire_lock: + /* this is not a nested case where the same thread + * is trying to acqurire the same lock twice */ + mutex_lock(&dev_replace->lock); + mutex_lock(&dev_replace->lock_management_lock); + dev_replace->lock_owner = current->pid; + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_lock(&dev_replace->lock_management_lock); + if (atomic_read(&dev_replace->nesting_level) > 0 && + dev_replace->lock_owner == current->pid) { + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_unlock(&dev_replace->lock_management_lock); + goto acquire_lock; +} + +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +{ + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + mutex_lock(&dev_replace->lock_management_lock); + WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); + WARN_ON(dev_replace->lock_owner != current->pid); + atomic_dec(&dev_replace->nesting_level); + if (atomic_read(&dev_replace->nesting_level) == 0) { + dev_replace->lock_owner = 0; + mutex_unlock(&dev_replace->lock_management_lock); + mutex_unlock(&dev_replace->lock); + } else { + mutex_unlock(&dev_replace->lock_management_lock); + } +} + +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) +{ + percpu_counter_inc(&fs_info->bio_counter); +} + +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + percpu_counter_dec(&fs_info->bio_counter); + + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) +{ + DEFINE_WAIT(wait); +again: + percpu_counter_inc(&fs_info->bio_counter); + if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { + btrfs_bio_counter_dec(fs_info); + wait_event(fs_info->replace_wait, + !test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state)); + goto again; + } + +} diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h new file mode 100644 index 00000000000..20035cbbf02 --- /dev/null +++ b/fs/btrfs/dev-replace.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_DEV_REPLACE__) +#define __BTRFS_DEV_REPLACE__ + +struct btrfs_ioctl_dev_replace_args; + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); + +static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) +{ + atomic64_inc(stat_value); +} +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index e9103b3baa4..a0691df5dce 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,6 +21,10 @@ #include "hash.h" #include "transaction.h" +static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); + /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On @@ -49,14 +53,12 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(root, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - ret = btrfs_extend_item(trans, root, path, data_size); - WARN_ON(ret > 0); - } - if (ret < 0) + btrfs_extend_item(root, path, data_size); + } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); + item = btrfs_item_nr(path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); BUG_ON(data_size > btrfs_item_size(leaf, item)); ptr += btrfs_item_size(leaf, item) - data_size; @@ -90,13 +92,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); - /* - * FIXME: at some point we should handle xattr's that are larger than - * what we can fit in our leaf. We set location to NULL b/c we arent - * pointing at anything else, that will change if we store the xattr - * data in a separate inode. - */ - BUG_ON(IS_ERR(dir_item)); + if (IS_ERR(dir_item)) + return PTR_ERR(dir_item); memset(&location, 0, sizeof(location)); leaf = path->nodes[0]; @@ -122,10 +119,12 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * 'location' is the key to stuff into the directory item, 'type' is the * type of the inode we're pointing to, and 'index' is the sequence number * to use for the second index (if one is created). + * Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, const char *name, int name_len, u64 dir, - struct btrfs_key *location, u8 type, u64 index) + *root, const char *name, int name_len, + struct inode *dir, struct btrfs_key *location, + u8 type, u64 index) { int ret = 0; int ret2 = 0; @@ -137,13 +136,17 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_disk_key disk_key; u32 data_size; - key.objectid = dir; + key.objectid = btrfs_ino(dir); btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); key.offset = btrfs_name_hash(name, name_len); path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; path->leave_spinning = 1; + btrfs_cpu_key_to_disk(&disk_key, location); + data_size = sizeof(*dir_item) + name_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); @@ -151,11 +154,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root ret = PTR_ERR(dir_item); if (ret == -EEXIST) goto second_insert; - goto out; + goto out_free; } leaf = path->nodes[0]; - btrfs_cpu_key_to_disk(&disk_key, location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_type(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); @@ -170,29 +172,13 @@ second_insert: /* FIXME, use some real flag for selecting the extra index */ if (root == root->fs_info->tree_root) { ret = 0; - goto out; + goto out_free; } - btrfs_release_path(root, path); + btrfs_release_path(path); - btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); - key.offset = index; - dir_item = insert_with_overflow(trans, root, path, &key, data_size, - name, name_len); - if (IS_ERR(dir_item)) { - ret2 = PTR_ERR(dir_item); - goto out; - } - leaf = path->nodes[0]; - btrfs_cpu_key_to_disk(&disk_key, location); - btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, type); - btrfs_set_dir_data_len(leaf, dir_item, 0); - btrfs_set_dir_name_len(leaf, dir_item, name_len); - btrfs_set_dir_transid(leaf, dir_item, trans->transid); - name_ptr = (unsigned long)(dir_item + 1); - write_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_mark_buffer_dirty(leaf); -out: + ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir, + &disk_key, type, index); +out_free: btrfs_free_path(path); if (ret) return ret; @@ -216,8 +202,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_key key; int ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; key.objectid = dir; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); @@ -227,21 +211,69 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); + if (ret > 0) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_item *di; + int data_size; + struct extent_buffer *leaf; + int slot; + struct btrfs_path *path; + + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + /* return back any errors */ + if (ret < 0) + goto out; + + /* nothing found, we're safe */ if (ret > 0) { - if (path->slots[0] == 0) - return NULL; - path->slots[0]--; + ret = 0; + goto out; } - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != dir || - btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || - found_key.offset != key.offset) - return NULL; + /* we found an item, look for our name in the item */ + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) { + /* our exact name was found */ + ret = -EEXIST; + goto out; + } - return btrfs_match_dir_item_name(root, path, name, name_len); + /* + * see if there is room in the item to insert this + * name + */ + data_size = sizeof(*di) + name_len; + leaf = path->nodes[0]; + slot = path->slots[0]; + if (data_size + btrfs_item_size_nr(leaf, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { + ret = -EOVERFLOW; + } else { + /* plenty of insertion room */ + ret = 0; + } +out: + btrfs_free_path(path); + return ret; } /* @@ -333,8 +365,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_key key; int ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; key.objectid = dir; btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); @@ -342,18 +372,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); - if (ret > 0) { - if (path->slots[0] == 0) - return NULL; - path->slots[0]--; - } - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != dir || - btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || - found_key.offset != key.offset) + if (ret > 0) return NULL; return btrfs_match_dir_item_name(root, path, name, name_len); @@ -364,7 +383,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, +static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len) { @@ -377,6 +396,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + if (verify_dir_item(root, leaf, dir_item)) + return NULL; + total_len = btrfs_item_size_nr(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + @@ -424,8 +446,41 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - ret = btrfs_truncate_item(trans, root, path, - item_len - sub_item_len, 1); + btrfs_truncate_item(root, path, item_len - sub_item_len, 1); } + return ret; +} + +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item) +{ + u16 namelen = BTRFS_NAME_LEN; + u8 type = btrfs_dir_type(leaf, dir_item); + + if (type >= BTRFS_FT_MAX) { + btrfs_crit(root->fs_info, "invalid dir item type: %d", + (int)type); + return 1; + } + + if (type == BTRFS_FT_XATTR) + namelen = XATTR_NAME_MAX; + + if (btrfs_dir_name_len(leaf, dir_item) > namelen) { + btrfs_crit(root->fs_info, "invalid dir item name len: %u", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ + if ((btrfs_dir_data_len(leaf, dir_item) + + btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { + btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u", + (unsigned)btrfs_dir_name_len(leaf, dir_item), + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e7b8f2c89cc..08e65e9cf2a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -26,11 +26,15 @@ #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/freezer.h> -#include <linux/crc32c.h> #include <linux/slab.h> -#include "compat.h" +#include <linux/migrate.h> +#include <linux/ratelimit.h> +#include <linux/uuid.h> +#include <linux/semaphore.h> +#include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" @@ -39,12 +43,36 @@ #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" +#include "inode-map.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" +#include "raid56.h" +#include "sysfs.h" +#include "qgroup.h" + +#ifdef CONFIG_X86 +#include <asm/cpufeature.h> +#endif static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); - -static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only); +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root); +static void btrfs_destroy_ordered_extents(struct btrfs_root *root); +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root); +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark); +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); +static void btrfs_error_commit_super(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -76,41 +104,93 @@ struct async_submit_bio { int rw; int mirror_num; unsigned long bio_flags; + /* + * bio_offset is optional, can be used if the pages in the bio + * can't tell us where in the file the bio should go + */ + u64 bio_offset; struct btrfs_work work; + int error; }; -/* These are used to set the lockdep class on the extent buffer locks. - * The class is set by the readpage_end_io_hook after the buffer has - * passed csum validation but before the pages are unlocked. +/* + * Lockdep class keys for extent_buffer->lock's in this root. For a given + * eb, the lockdep key is determined by the btrfs_root it belongs to and + * the level the eb occupies in the tree. + * + * Different roots are used for different purposes and may nest inside each + * other and they require separate keysets. As lockdep keys should be + * static, assign keysets according to the purpose of the root as indicated + * by btrfs_root->objectid. This ensures that all special purpose roots + * have separate keysets. * - * The lockdep class is also set by btrfs_init_new_buffer on freshly - * allocated blocks. + * Lock-nesting across peer nodes is always done with the immediate parent + * node locked thus preventing deadlock. As lockdep doesn't know this, use + * subclass to avoid triggering lockdep warning in such cases. * - * The class is based on the level in the tree block, which allows lockdep - * to know that lower nodes nest inside the locks of higher nodes. + * The key is set by the readpage_end_io_hook after the buffer has passed + * csum validation but before the pages are unlocked. It is also set by + * btrfs_init_new_buffer on freshly allocated blocks. * - * We also add a check to make sure the highest level of the tree is - * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this - * code needs update as well. + * We also add a check to make sure the highest level of the tree is the + * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code + * needs update as well. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC # if BTRFS_MAX_LEVEL != 8 # error # endif -static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; -static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { - /* leaf */ - "btrfs-extent-00", - "btrfs-extent-01", - "btrfs-extent-02", - "btrfs-extent-03", - "btrfs-extent-04", - "btrfs-extent-05", - "btrfs-extent-06", - "btrfs-extent-07", - /* highest possible level */ - "btrfs-extent-08", + +static struct btrfs_lockdep_keyset { + u64 id; /* root objectid */ + const char *name_stem; /* lock name stem */ + char names[BTRFS_MAX_LEVEL + 1][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; +} btrfs_lockdep_keysets[] = { + { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, + { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, + { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, + { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, + { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, + { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, + { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, + { .id = 0, .name_stem = "tree" }, }; + +void __init btrfs_init_lockdep(void) +{ + int i, j; + + /* initialize lockdep class names */ + for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { + struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; + + for (j = 0; j < ARRAY_SIZE(ks->names); j++) + snprintf(ks->names[j], sizeof(ks->names[j]), + "btrfs-%s-%02d", ks->name_stem, j); + } +} + +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, + int level) +{ + struct btrfs_lockdep_keyset *ks; + + BUG_ON(level >= ARRAY_SIZE(ks->keys)); + + /* find the matching keyset, id 0 is the default entry */ + for (ks = btrfs_lockdep_keysets; ks->id; ks++) + if (ks->id == objectid) + break; + + lockdep_set_class_and_name(&eb->lock, + &ks->keys[level], ks->names[level]); +} + #endif /* @@ -118,7 +198,7 @@ static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { * that covers the entire device */ static struct extent_map *btree_get_extent(struct inode *inode, - struct page *page, size_t page_offset, u64 start, u64 len, + struct page *page, size_t pg_offset, u64 start, u64 len, int create) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; @@ -135,7 +215,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, } read_unlock(&em_tree->lock); - em = alloc_extent_map(GFP_NOFS); + em = alloc_extent_map(); if (!em) { em = ERR_PTR(-ENOMEM); goto out; @@ -147,40 +227,30 @@ static struct extent_map *btree_get_extent(struct inode *inode, em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); if (ret == -EEXIST) { - u64 failed_start = em->start; - u64 failed_len = em->len; - free_extent_map(em); em = lookup_extent_mapping(em_tree, start, len); - if (em) { - ret = 0; - } else { - em = lookup_extent_mapping(em_tree, failed_start, - failed_len); - ret = -EIO; - } + if (!em) + em = ERR_PTR(-EIO); } else if (ret) { free_extent_map(em); - em = NULL; + em = ERR_PTR(ret); } write_unlock(&em_tree->lock); - if (ret) - em = ERR_PTR(ret); out: return em; } -u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +u32 btrfs_csum_data(char *data, u32 seed, size_t len) { - return crc32c(seed, data, len); + return btrfs_crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, char *result) { - *(__le32 *)result = ~cpu_to_le32(crc); + put_unaligned_le32(~crc, result); } /* @@ -190,13 +260,11 @@ void btrfs_csum_final(u32 crc, char *result) static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; - char *map_token = NULL; char *kaddr; unsigned long map_start; unsigned long map_len; @@ -207,16 +275,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, len = buf->len - offset; while (len > 0) { err = map_private_extent_buffer(buf, offset, 32, - &map_token, &kaddr, - &map_start, &map_len, KM_USER0); + &kaddr, &map_start, &map_len); if (err) return 1; cur_len = min(len, map_len - (offset - map_start)); - crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc = btrfs_csum_data(kaddr + offset - map_start, crc, cur_len); len -= cur_len; offset += cur_len; - unmap_extent_buffer(buf, map_token, KM_USER0); } if (csum_size > sizeof(inline_result)) { result = kzalloc(csum_size * sizeof(char), GFP_NOFS); @@ -235,14 +301,11 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - if (printk_ratelimit()) { - printk(KERN_INFO "btrfs: %s checksum verify " - "failed on %llu wanted %X found %X " - "level %d\n", - root->fs_info->sb->s_id, - (unsigned long long)buf->start, val, found, - btrfs_header_level(buf)); - } + printk_ratelimited(KERN_INFO + "BTRFS: %s checksum verify failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, buf->start, + val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); return 1; @@ -262,33 +325,96 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, * in the wrong place. */ static int verify_parent_transid(struct extent_io_tree *io_tree, - struct extent_buffer *eb, u64 parent_transid) + struct extent_buffer *eb, u64 parent_transid, + int atomic) { struct extent_state *cached_state = NULL; int ret; + bool need_lock = (current->journal_info == + (void *)BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; + if (atomic) + return -EAGAIN; + + if (need_lock) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - 0, &cached_state, GFP_NOFS); - if (extent_buffer_uptodate(io_tree, eb, cached_state) && + 0, &cached_state); + if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; } - if (printk_ratelimit()) { - printk("parent transid verify failed on %llu wanted %llu " + printk_ratelimited("parent transid verify failed on %llu wanted %llu " "found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); - } + eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(io_tree, eb, &cached_state); + + /* + * Things reading via commit roots that don't have normal protection, + * like send, can have a really old block in cache that may point at a + * block that has been free'd and re-allocated. So don't clear uptodate + * if we find an eb that is under IO (dirty/writeback) because we could + * end up reading in the stale data and then writing it back out and + * making everybody very sad. + */ + if (!extent_buffer_under_io(eb)) + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); + if (need_lock) + btrfs_tree_read_unlock_blocking(eb); + return ret; +} + +/* + * Return 0 if the superblock checksum type matches the checksum value of that + * algorithm. Pass the raw disk superblock data. + */ +static int btrfs_check_super_csum(char *raw_disk_sb) +{ + struct btrfs_super_block *disk_sb = + (struct btrfs_super_block *)raw_disk_sb; + u16 csum_type = btrfs_super_csum_type(disk_sb); + int ret = 0; + + if (csum_type == BTRFS_CSUM_TYPE_CRC32) { + u32 crc = ~(u32)0; + const int csum_size = sizeof(crc); + char result[csum_size]; + + /* + * The super_block structure does not span the whole + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space + * is filled with zeros and is included in the checkum. + */ + crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, + crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, result); + + if (memcmp(raw_disk_sb, result, csum_size)) + ret = 1; + + if (ret && btrfs_super_generation(disk_sb) < 10) { + printk(KERN_WARNING + "BTRFS: super block crcs don't match, older mkfs detected\n"); + ret = 0; + } + } + + if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { + printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n", + csum_type); + ret = 1; + } + return ret; } @@ -301,28 +427,56 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, u64 start, u64 parent_transid) { struct extent_io_tree *io_tree; + int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; + int failed_mirror = 0; + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, 1, + ret = read_extent_buffer_pages(io_tree, eb, start, + WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && - !verify_parent_transid(io_tree, eb, parent_transid)) - return ret; + if (!ret) { + if (!verify_parent_transid(io_tree, eb, + parent_transid, 0)) + break; + else + ret = -EIO; + } - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + /* + * This buffer's crc is fine, but its contents are corrupted, so + * there is no reason to read the other copies, they won't be + * any less wrong. + */ + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + break; + + num_copies = btrfs_num_copies(root->fs_info, eb->start, eb->len); if (num_copies == 1) - return ret; + break; + + if (!failed_mirror) { + failed = 1; + failed_mirror = eb->read_mirror; + } mirror_num++; + if (mirror_num == failed_mirror) + mirror_num++; + if (mirror_num > num_copies) - return ret; + break; } - return -EIO; + + if (failed && !ret && failed_mirror) + repair_eb_io_failure(root, eb, failed_mirror); + + return ret; } /* @@ -332,46 +486,17 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { - struct extent_io_tree *tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 found_start; - int found_level; - unsigned long len; struct extent_buffer *eb; - int ret; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; - if (!page->private) - goto out; - len = page->private >> 2; - WARN_ON(len == 0); - - eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); - ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, - btrfs_header_generation(eb)); - BUG_ON(ret); + eb = (struct extent_buffer *)page->private; + if (page != eb->pages[0]) + return 0; found_start = btrfs_header_bytenr(eb); - if (found_start != start) { - WARN_ON(1); - goto err; - } - if (eb->first_page != page) { - WARN_ON(1); - goto err; - } - if (!PageUptodate(page)) { - WARN_ON(1); - goto err; - } - found_level = btrfs_header_level(eb); - + if (WARN_ON(found_start != start || !PageUptodate(page))) + return 0; csum_tree_block(root, eb, 0); -err: - free_extent_buffer(eb); -out: return 0; } @@ -382,8 +507,7 @@ static int check_tree_block_fsid(struct btrfs_root *root, u8 fsid[BTRFS_UUID_SIZE]; int ret = 1; - read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), - BTRFS_FSID_SIZE); + read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); while (fs_devices) { if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { ret = 0; @@ -394,79 +518,179 @@ static int check_tree_block_fsid(struct btrfs_root *root, return ret; } -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) +#define CORRUPT(reason, eb, root, slot) \ + btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d", reason, \ + btrfs_header_bytenr(eb), root->objectid, slot) + +static noinline int check_leaf(struct btrfs_root *root, + struct extent_buffer *leaf) { - lockdep_set_class_and_name(&eb->lock, - &btrfs_eb_class[level], - btrfs_eb_name[level]); + struct btrfs_key key; + struct btrfs_key leaf_key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + if (nritems == 0) + return 0; + + /* Check the 0 item */ + if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("invalid item offset size pair", leaf, root, 0); + return -EIO; + } + + /* + * Check to make sure each items keys are in the correct order and their + * offsets make sense. We only have to loop through nritems-1 because + * we check the current slot against the next slot, which verifies the + * next slot's offset+size makes sense and that the current's slot + * offset is correct. + */ + for (slot = 0; slot < nritems - 1; slot++) { + btrfs_item_key_to_cpu(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &key, slot + 1); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { + CORRUPT("bad key order", leaf, root, slot); + return -EIO; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + CORRUPT("slot offset bad", leaf, root, slot); + return -EIO; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just incase all the items are consistent to eachother, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("slot end outside of leaf", leaf, root, slot); + return -EIO; + } + } + + return 0; } -#endif -static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state) +static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + u64 phy_offset, struct page *page, + u64 start, u64 end, int mirror) { - struct extent_io_tree *tree; u64 found_start; int found_level; - unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; int ret = 0; + int reads_done; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; if (!page->private) goto out; - len = page->private >> 2; - WARN_ON(len == 0); + eb = (struct extent_buffer *)page->private; - eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + /* the pending IO might have been the only thing that kept this buffer + * in memory. Make sure we have a ref for all this other checks + */ + extent_buffer_get(eb); - found_start = btrfs_header_bytenr(eb); - if (found_start != start) { - if (printk_ratelimit()) { - printk(KERN_INFO "btrfs bad tree block start " - "%llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); - } + reads_done = atomic_dec_and_test(&eb->io_pages); + if (!reads_done) + goto err; + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { ret = -EIO; goto err; } - if (eb->first_page != page) { - printk(KERN_INFO "btrfs bad first page %lu %lu\n", - eb->first_page->index, page->index); - WARN_ON(1); + + found_start = btrfs_header_bytenr(eb); + if (found_start != eb->start) { + printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " + "%llu %llu\n", + found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - if (printk_ratelimit()) { - printk(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); - } + printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", + eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); + if (found_level >= BTRFS_MAX_LEVEL) { + btrfs_info(root->fs_info, "bad tree block level %d", + (int)btrfs_header_level(eb)); + ret = -EIO; + goto err; + } - btrfs_set_buffer_lockdep_class(eb, found_level); + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), + eb, found_level); ret = csum_tree_block(root, eb, 1); - if (ret) + if (ret) { ret = -EIO; + goto err; + } - end = min_t(u64, eb->len, PAGE_CACHE_SIZE); - end = eb->start + end - 1; + /* + * If this is a leaf block and it is corrupt, set the corrupt bit so + * that we don't try and read the other copies of this block, just + * return -EIO. + */ + if (found_level == 0 && check_leaf(root, eb)) { + set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + ret = -EIO; + } + + if (!ret) + set_extent_buffer_uptodate(eb); err: + if (reads_done && + test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + btree_readahead_hook(root, eb, eb->start, ret); + + if (ret) { + /* + * our io error hook is going to dec the io pages + * again, we have to make sure it has something + * to decrement + */ + atomic_inc(&eb->io_pages); + clear_extent_buffer_uptodate(eb); + } free_extent_buffer(eb); out: return ret; } +static int btree_io_failed_hook(struct page *page, int failed_mirror) +{ + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->read_mirror = failed_mirror; + atomic_dec(&eb->io_pages); + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + btree_readahead_hook(root, eb, eb->start, -EIO); + return -EIO; /* we fixed nothing */ +} + static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; @@ -474,26 +698,42 @@ static void end_workqueue_bio(struct bio *bio, int err) fs_info = end_io_wq->info; end_io_wq->error = err; - end_io_wq->work.func = end_workqueue_fn; - end_io_wq->work.flags = 0; - - if (bio->bi_rw & (1 << BIO_RW)) { - if (end_io_wq->metadata) - btrfs_queue_worker(&fs_info->endio_meta_write_workers, - &end_io_wq->work); + btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); + + if (bio->bi_rw & REQ_WRITE) { + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) + btrfs_queue_work(fs_info->endio_meta_write_workers, + &end_io_wq->work); + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) + btrfs_queue_work(fs_info->endio_freespace_worker, + &end_io_wq->work); + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_write_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_write_workers, + &end_io_wq->work); } else { - if (end_io_wq->metadata) - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); + else if (end_io_wq->metadata) + btrfs_queue_work(fs_info->endio_meta_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_workers, + &end_io_wq->work); } } +/* + * For the metadata arg you want + * + * 0 - if data + * 1 - if normal metadta + * 2 - if writing to the free space cache area + * 3 - raid parity work + */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata) { @@ -517,26 +757,22 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) { unsigned long limit = min_t(unsigned long, - info->workers.max_workers, + info->thread_pool_size, info->fs_devices->open_devices); return 256 * limit; } -int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) -{ - return atomic_read(&info->nr_async_bios) > - btrfs_async_submit_limit(info); -} - static void run_one_async_start(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; struct async_submit_bio *async; + int ret; async = container_of(work, struct async_submit_bio, work); - fs_info = BTRFS_I(async->inode)->root->fs_info; - async->submit_bio_start(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + ret = async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); + if (ret) + async->error = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -551,14 +787,19 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; - atomic_dec(&fs_info->nr_async_submits); - - if (atomic_read(&fs_info->nr_async_submits) < limit && + if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); + /* If an error occured we just want to clean up the bio and move on */ + if (async->error) { + bio_endio(async->bio, async->error); + return; + } + async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + async->mirror_num, async->bio_flags, + async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -572,6 +813,7 @@ static void run_one_async_free(struct btrfs_work *work) int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done) { @@ -588,19 +830,20 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_start = submit_bio_start; async->submit_bio_done = submit_bio_done; - async->work.func = run_one_async_start; - async->work.ordered_func = run_one_async_done; - async->work.ordered_free = run_one_async_free; + btrfs_init_work(&async->work, run_one_async_start, + run_one_async_done, run_one_async_free); - async->work.flags = 0; async->bio_flags = bio_flags; + async->bio_offset = bio_offset; + + async->error = 0; atomic_inc(&fs_info->nr_async_submits); - if (rw & (1 << BIO_RW_SYNCIO)) - btrfs_set_work_high_prio(&async->work); + if (rw & REQ_SYNC) + btrfs_set_work_high_priority(&async->work); - btrfs_queue_worker(&fs_info->workers, &async->work); + btrfs_queue_work(fs_info->workers, &async->work); while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { @@ -613,176 +856,202 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, static int btree_csum_one_bio(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; + struct bio_vec *bvec; struct btrfs_root *root; + int i, ret = 0; - WARN_ON(bio->bi_vcnt <= 0); - while (bio_index < bio->bi_vcnt) { + bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - csum_dirty_buffer(root, bvec->bv_page); - bio_index++; - bvec++; + ret = csum_dirty_buffer(root, bvec->bv_page); + if (ret) + break; } - return 0; + + return ret; } static int __btree_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - btree_csum_one_bio(bio); - return 0; + return btree_csum_one_bio(bio); } static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + int ret; + /* * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; +} + +static int check_async_write(struct inode *inode, unsigned long bio_flags) +{ + if (bio_flags & EXTENT_BIO_TREE_LOG) + return 0; +#ifdef CONFIG_X86 + if (cpu_has_xmm4_2) + return 0; +#endif + return 1; } static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + int async = check_async_write(inode, bio_flags); int ret; - ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); - BUG_ON(ret); - - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads */ - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + if (ret) + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } else if (!async) { + ret = btree_csum_one_bio(bio); + if (ret) + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } else { + /* + * kthread helpers are used to submit writes so that + * checksumming can happen in parallel across all CPUs + */ + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + bio_offset, + __btree_submit_bio_start, + __btree_submit_bio_done); } - /* - * kthread helpers are used to submit writes so that checksumming - * can happen in parallel across all CPUs - */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, - __btree_submit_bio_start, - __btree_submit_bio_done); + if (ret) { +out_w_error: + bio_endio(bio, ret); + } + return ret; } -static int btree_writepage(struct page *page, struct writeback_control *wbc) +#ifdef CONFIG_MIGRATION +static int btree_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) { - struct extent_io_tree *tree; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct extent_buffer *eb; - int was_dirty; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (!(current->flags & PF_MEMALLOC)) { - return extent_write_full_page(tree, page, - btree_get_extent, wbc); - } - - redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), - PAGE_CACHE_SIZE); - WARN_ON(!eb); - - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; - spin_unlock(&root->fs_info->delalloc_lock); - } - free_extent_buffer(eb); - - unlock_page(page); - return 0; + /* + * we can't safely write a btree page from here, + * we haven't done the locking hook + */ + if (PageDirty(page)) + return -EAGAIN; + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + return migrate_page(mapping, newpage, page, mode); } +#endif + static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree; - tree = &BTRFS_I(mapping->host)->io_tree; + struct btrfs_fs_info *fs_info; + int ret; + if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; + fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; - if (num_dirty < thresh) + ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret < 0) return 0; } - return extent_writepages(tree, mapping, btree_get_extent, wbc); + return btree_write_cache_pages(mapping, wbc); } static int btree_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent); + return extent_read_full_page(tree, page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_io_tree *tree; - struct extent_map_tree *map; - int ret; - if (PageWriteback(page) || PageDirty(page)) return 0; - tree = &BTRFS_I(page->mapping->host)->io_tree; - map = &BTRFS_I(page->mapping->host)->extent_tree; - - ret = try_release_extent_state(map, tree, page, gfp_flags); - if (!ret) - return 0; - - ret = try_release_extent_buffer(tree, page); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); - } - - return ret; + return try_release_extent_buffer(page); } -static void btree_invalidatepage(struct page *page, unsigned long offset) +static void btree_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; extent_invalidatepage(tree, page, offset); btree_releasepage(page, GFP_NOFS); if (PagePrivate(page)) { - printk(KERN_WARNING "btrfs warning page private not zero " - "on page %llu\n", (unsigned long long)page_offset(page)); + btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, + "page private not zero on page %llu", + (unsigned long long)page_offset(page)); ClearPagePrivate(page); set_page_private(page, 0); page_cache_release(page); } } +static int btree_set_page_dirty(struct page *page) +{ +#ifdef DEBUG + struct extent_buffer *eb; + + BUG_ON(!PagePrivate(page)); + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); +#endif + return __set_page_dirty_nobuffers(page); +} + static const struct address_space_operations btree_aops = { .readpage = btree_readpage, - .writepage = btree_writepage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, - .sync_page = block_sync_page, +#ifdef CONFIG_MIGRATION + .migratepage = btree_migratepage, +#endif + .set_page_dirty = btree_set_page_dirty, }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -796,42 +1065,70 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (!buf) return 0; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 0, btree_get_extent, 0); + buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); return ret; } +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, + btree_get_extent, mirror_num); + if (ret) { + free_extent_buffer(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer(buf); + return -EIO; + } else if (extent_buffer_uptodate(buf)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize, GFP_NOFS); - return eb; + return find_extent_buffer(root->fs_info, bytenr); } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - - eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize, NULL, GFP_NOFS); - return eb; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return alloc_test_extent_buffer(root->fs_info, bytenr, + blocksize); +#endif + return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } int btrfs_write_tree_block(struct extent_buffer *buf) { - return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return filemap_fdatawait_range(buf->first_page->mapping, + return filemap_fdatawait_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } @@ -839,53 +1136,71 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid) { struct extent_buffer *buf = NULL; - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_io_tree *io_tree; int ret; - io_tree = &BTRFS_I(btree_inode)->io_tree; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) return NULL; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); + if (ret) { + free_extent_buffer(buf); + return NULL; + } return buf; } -int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf) +void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) { - struct inode *btree_inode = root->fs_info->btree_inode; + struct btrfs_fs_info *fs_info = root->fs_info; + if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) { + fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(buf); } + } +} + +static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) +{ + struct btrfs_subvolume_writers *writers; + int ret; - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); + writers = kmalloc(sizeof(*writers), GFP_NOFS); + if (!writers) + return ERR_PTR(-ENOMEM); + + ret = percpu_counter_init(&writers->counter, 0); + if (ret < 0) { + kfree(writers); + return ERR_PTR(ret); } - return 0; + + init_waitqueue_head(&writers->wait); + return writers; +} + +static void +btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) +{ + percpu_counter_destroy(&writers->counter); + kfree(writers); } -static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, - u32 stripesize, struct btrfs_root *root, - struct btrfs_fs_info *fs_info, - u64 objectid) +static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid) { root->node = NULL; root->commit_root = NULL; @@ -893,117 +1208,178 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->nodesize = nodesize; root->leafsize = leafsize; root->stripesize = stripesize; - root->ref_cows = 0; - root->track_dirty = 0; - root->in_radix = 0; - root->clean_orphans = 0; + root->state = 0; + root->orphan_cleanup_state = 0; - root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; + root->nr_delalloc_inodes = 0; + root->nr_ordered_extents = 0; root->name = NULL; - root->in_sysfs = 0; root->inode_tree = RB_ROOT; + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + root->block_rsv = NULL; + root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); - INIT_LIST_HEAD(&root->orphan_list); INIT_LIST_HEAD(&root->root_list); - spin_lock_init(&root->node_lock); - spin_lock_init(&root->list_lock); + INIT_LIST_HEAD(&root->delalloc_inodes); + INIT_LIST_HEAD(&root->delalloc_root); + INIT_LIST_HEAD(&root->ordered_extents); + INIT_LIST_HEAD(&root->ordered_root); + INIT_LIST_HEAD(&root->logged_list[0]); + INIT_LIST_HEAD(&root->logged_list[1]); + spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); + spin_lock_init(&root->delalloc_lock); + spin_lock_init(&root->ordered_extent_lock); + spin_lock_init(&root->accounting_lock); + spin_lock_init(&root->log_extents_lock[0]); + spin_lock_init(&root->log_extents_lock[1]); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); + mutex_init(&root->ordered_extent_mutex); + mutex_init(&root->delalloc_mutex); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); + INIT_LIST_HEAD(&root->log_ctxs[0]); + INIT_LIST_HEAD(&root->log_ctxs[1]); atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); - root->log_batch = 0; + atomic_set(&root->log_batch, 0); + atomic_set(&root->orphan_inodes, 0); + atomic_set(&root->refs, 1); + atomic_set(&root->will_be_snapshoted, 0); root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; - extent_io_tree_init(&root->dirty_log_pages, - fs_info->btree_inode->i_mapping, GFP_NOFS); + if (fs_info) + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); memset(&root->root_kobj, 0, sizeof(root->root_kobj)); - root->defrag_trans_start = fs_info->generation; + if (fs_info) + root->defrag_trans_start = fs_info->generation; + else + root->defrag_trans_start = 0; init_completion(&root->kobj_unregister); - root->defrag_running = 0; root->root_key.objectid = objectid; - root->anon_super.s_root = NULL; - root->anon_super.s_dev = 0; - INIT_LIST_HEAD(&root->anon_super.s_list); - INIT_LIST_HEAD(&root->anon_super.s_instances); - init_rwsem(&root->anon_super.s_umount); + root->anon_dev = 0; - return 0; + spin_lock_init(&root->root_item_lock); } -static int find_and_setup_root(struct btrfs_root *tree_root, - struct btrfs_fs_info *fs_info, - u64 objectid, - struct btrfs_root *root) +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) { - int ret; - u32 blocksize; - u64 generation; + struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); + if (root) + root->fs_info = fs_info; + return root; +} - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, objectid); - ret = btrfs_find_last_root(tree_root, objectid, - &root->root_item, &root->root_key); - if (ret > 0) - return -ENOENT; - BUG_ON(ret); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +/* Should only be used by the testing infrastructure */ +struct btrfs_root *btrfs_alloc_dummy_root(void) +{ + struct btrfs_root *root; - generation = btrfs_root_generation(&root->root_item); - blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); - root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize, generation); - BUG_ON(!root->node); - root->commit_root = btrfs_root_node(root); - return 0; + root = btrfs_alloc_root(NULL); + if (!root) + return ERR_PTR(-ENOMEM); + __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); + set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); + root->alloc_bytenr = 0; + + return root; } +#endif -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid) { - struct extent_buffer *eb; - struct btrfs_root *log_root_tree = fs_info->log_root_tree; - u64 start = 0; - u64 end = 0; - int ret; + struct extent_buffer *leaf; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root; + struct btrfs_key key; + int ret = 0; + uuid_le uuid; - if (!log_root_tree) - return 0; + root = btrfs_alloc_root(fs_info); + if (!root) + return ERR_PTR(-ENOMEM); - while (1) { - ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); - if (ret) - break; + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + root->root_key.objectid = objectid; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; - clear_extent_bits(&log_root_tree->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + leaf = NULL; + goto fail; } - eb = fs_info->log_root_tree->node; - WARN_ON(btrfs_header_level(eb) != 0); - WARN_ON(btrfs_header_nritems(eb) != 0); + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, objectid); + root->node = leaf; - ret = btrfs_free_reserved_extent(fs_info->tree_root, - eb->start, eb->len); - BUG_ON(ret); + write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(), + BTRFS_FSID_SIZE); + write_extent_buffer(leaf, fs_info->chunk_tree_uuid, + btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); - free_extent_buffer(eb); - kfree(fs_info->log_root_tree); - fs_info->log_root_tree = NULL; - return 0; + root->commit_root = btrfs_root_node(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + + root->root_item.flags = 0; + root->root_item.byte_limit = 0; + btrfs_set_root_bytenr(&root->root_item, leaf->start); + btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_set_root_level(&root->root_item, 0); + btrfs_set_root_refs(&root->root_item, 1); + btrfs_set_root_used(&root->root_item, leaf->len); + btrfs_set_root_last_snapshot(&root->root_item, 0); + btrfs_set_root_dirid(&root->root_item, 0); + uuid_le_gen(&uuid); + memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); + root->root_item.drop_level = 0; + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); + if (ret) + goto fail; + + btrfs_tree_unlock(leaf); + + return root; + +fail: + if (leaf) { + btrfs_tree_unlock(leaf); + free_extent_buffer(root->commit_root); + free_extent_buffer(leaf); + } + kfree(root); + + return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, @@ -1013,7 +1389,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - root = kzalloc(sizeof(*root), GFP_NOFS); + root = btrfs_alloc_root(fs_info); if (!root) return ERR_PTR(-ENOMEM); @@ -1024,16 +1400,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * DON'T set REF_COWS for log trees + * * log trees do not get reference counted because they go away * before a real commit is actually done. They do store pointers * to file data extents, and those reference counts still get * updated (along with back refs to the log tree). */ - root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); + BTRFS_TREE_LOG_OBJECTID, NULL, + 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); @@ -1047,8 +1426,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->node = leaf; write_extent_buffer(root->node, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(root->node), - BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); return root; @@ -1081,96 +1459,169 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, log_root->root_key.offset = root->root_key.objectid; inode_item = &log_root->root_item.inode; - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_node(&log_root->root_item, log_root->node); WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; return 0; } -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location) +static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; - struct extent_buffer *l; u64 generation; u32 blocksize; - int ret = 0; + int ret; - root = kzalloc(sizeof(*root), GFP_NOFS); - if (!root) + path = btrfs_alloc_path(); + if (!path) return ERR_PTR(-ENOMEM); - if (location->offset == (u64)-1) { - ret = find_and_setup_root(tree_root, fs_info, - location->objectid, root); - if (ret) { - kfree(root); - return ERR_PTR(ret); - } - goto out; + + root = btrfs_alloc_root(fs_info); + if (!root) { + ret = -ENOMEM; + goto alloc_fail; } __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, - root, fs_info, location->objectid); + root, fs_info, key->objectid); - path = btrfs_alloc_path(); - BUG_ON(!path); - ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); - if (ret == 0) { - l = path->nodes[0]; - read_extent_buffer(l, &root->root_item, - btrfs_item_ptr_offset(l, path->slots[0]), - sizeof(root->root_item)); - memcpy(&root->root_key, location, sizeof(*location)); - } - btrfs_free_path(path); + ret = btrfs_find_root(tree_root, key, path, + &root->root_item, &root->root_key); if (ret) { if (ret > 0) ret = -ENOENT; - return ERR_PTR(ret); + goto find_fail; } generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); + if (!root->node) { + ret = -ENOMEM; + goto find_fail; + } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + ret = -EIO; + goto read_fail; + } root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); out: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) - root->ref_cows = 1; - + btrfs_free_path(path); return root; + +read_fail: + free_extent_buffer(root->node); +find_fail: + kfree(root); +alloc_fail: + root = ERR_PTR(ret); + goto out; } -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_objectid) +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location) { struct btrfs_root *root; - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) - return fs_info->tree_root; - if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) - return fs_info->extent_root; + root = btrfs_read_tree_root(tree_root, location); + if (IS_ERR(root)) + return root; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + set_bit(BTRFS_ROOT_REF_COWS, &root->state); + btrfs_check_and_init_root_item(&root->root_item); + } + + return root; +} +int btrfs_init_fs_root(struct btrfs_root *root) +{ + int ret; + struct btrfs_subvolume_writers *writers; + + root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); + root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), + GFP_NOFS); + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; + goto fail; + } + + writers = btrfs_alloc_subvolume_writers(); + if (IS_ERR(writers)) { + ret = PTR_ERR(writers); + goto fail; + } + root->subv_writers = writers; + + btrfs_init_free_ino_ctl(root); + spin_lock_init(&root->cache_lock); + init_waitqueue_head(&root->cache_wait); + + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto free_writers; + return 0; + +free_writers: + btrfs_free_subvolume_writers(root->subv_writers); +fail: + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); + return ret; +} + +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) +{ + struct btrfs_root *root; + + spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)root_objectid); + (unsigned long)root_id); + spin_unlock(&fs_info->fs_roots_radix_lock); return root; } -struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location) +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + int ret; + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret == 0) + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + + return ret; +} + +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + bool check_ref) { struct btrfs_root *root; int ret; @@ -1185,41 +1636,41 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->dev_root; if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) return fs_info->csum_root; + if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) + return fs_info->quota_root ? fs_info->quota_root : + ERR_PTR(-ENOENT); + if (location->objectid == BTRFS_UUID_TREE_OBJECTID) + return fs_info->uuid_root ? fs_info->uuid_root : + ERR_PTR(-ENOENT); again: - spin_lock(&fs_info->fs_roots_radix_lock); - root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)location->objectid); - spin_unlock(&fs_info->fs_roots_radix_lock); - if (root) + root = btrfs_lookup_fs_root(fs_info, location->objectid); + if (root) { + if (check_ref && btrfs_root_refs(&root->root_item) == 0) + return ERR_PTR(-ENOENT); return root; + } - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); - if (ret == 0) - ret = -ENOENT; - if (ret < 0) - return ERR_PTR(ret); - - root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + root = btrfs_read_fs_root(fs_info->tree_root, location); if (IS_ERR(root)) return root; - WARN_ON(btrfs_root_refs(&root->root_item) == 0); - set_anon_super(&root->anon_super, NULL); + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; + goto fail; + } - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = btrfs_init_fs_root(root); if (ret) goto fail; - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - root); - if (ret == 0) { - root->in_radix = 1; - root->clean_orphans = 1; - } - spin_unlock(&fs_info->fs_roots_radix_lock); - radix_tree_preload_end(); + ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, + location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); + if (ret < 0) + goto fail; + if (ret == 0) + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); + + ret = btrfs_insert_fs_root(fs_info, root); if (ret) { if (ret == -EEXIST) { free_fs_root(root); @@ -1227,51 +1678,12 @@ again: } goto fail; } - - ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid); - WARN_ON(ret); return root; fail: free_fs_root(root); return ERR_PTR(ret); } -struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *location, - const char *name, int namelen) -{ - return btrfs_read_fs_root_no_name(fs_info, location); -#if 0 - struct btrfs_root *root; - int ret; - - root = btrfs_read_fs_root_no_name(fs_info, location); - if (!root) - return NULL; - - if (root->in_sysfs) - return root; - - ret = btrfs_set_root_name(root, name, namelen); - if (ret) { - free_extent_buffer(root->node); - kfree(root); - return ERR_PTR(ret); - } - - ret = btrfs_sysfs_add_root(root); - if (ret) { - free_extent_buffer(root->node); - kfree(root->name); - kfree(root); - return ERR_PTR(ret); - } - root->in_sysfs = 1; - return root; -#endif -} - static int btrfs_congested_fn(void *congested_data, int bdi_bits) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; @@ -1279,7 +1691,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) struct btrfs_device *device; struct backing_dev_info *bdi; - list_for_each_entry(device, &info->fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); @@ -1288,86 +1701,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) break; } } + rcu_read_unlock(); return ret; } /* - * this unplugs every device on the box, and it is only used when page - * is null - */ -static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ - struct btrfs_device *device; - struct btrfs_fs_info *info; - - info = (struct btrfs_fs_info *)bdi->unplug_io_data; - list_for_each_entry(device, &info->fs_devices->devices, dev_list) { - if (!device->bdev) - continue; - - bdi = blk_get_backing_dev_info(device->bdev); - if (bdi->unplug_io_fn) - bdi->unplug_io_fn(bdi, page); - } -} - -static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ - struct inode *inode; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct address_space *mapping; - u64 offset; - - /* the generic O_DIRECT read code does this */ - if (1 || !page) { - __unplug_io_fn(bdi, page); - return; - } - - /* - * page->mapping may change at any time. Get a consistent copy - * and use that for everything below - */ - smp_mb(); - mapping = page->mapping; - if (!mapping) - return; - - inode = mapping->host; - - /* - * don't do the expensive searching for a small number of - * devices - */ - if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) { - __unplug_io_fn(bdi, page); - return; - } - - offset = page_offset(page); - - em_tree = &BTRFS_I(inode)->extent_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); - read_unlock(&em_tree->lock); - if (!em) { - __unplug_io_fn(bdi, page); - return; - } - - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - __unplug_io_fn(bdi, page); - return; - } - offset = offset - em->start; - btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, - em->block_start + offset, page); - free_extent_map(em); -} - -/* * If this fails, caller must call bdi_destroy() to get rid of the * bdi again. */ @@ -1375,64 +1713,17 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; - bdi->name = "btrfs"; bdi->capabilities = BDI_CAP_MAP_COPY; - err = bdi_init(bdi); + err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); if (err) return err; - err = bdi_register(bdi, NULL, "btrfs-%d", - atomic_inc_return(&btrfs_bdi_num)); - if (err) { - bdi_destroy(bdi); - return err; - } - bdi->ra_pages = default_backing_dev_info.ra_pages; - bdi->unplug_io_fn = btrfs_unplug_io_fn; - bdi->unplug_io_data = info; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; return 0; } -static int bio_ready_for_csum(struct bio *bio) -{ - u64 length = 0; - u64 buf_len = 0; - u64 start = 0; - struct page *page; - struct extent_io_tree *io_tree = NULL; - struct btrfs_fs_info *info = NULL; - struct bio_vec *bvec; - int i; - int ret; - - bio_for_each_segment(bvec, bio, i) { - page = bvec->bv_page; - if (page->private == EXTENT_PAGE_PRIVATE) { - length += bvec->bv_len; - continue; - } - if (!page->private) { - length += bvec->bv_len; - continue; - } - length = bvec->bv_len; - buf_len = page->private >> 2; - start = page_offset(page) + bvec->bv_offset; - io_tree = &BTRFS_I(page->mapping->host)->io_tree; - info = BTRFS_I(page->mapping->host)->root->fs_info; - } - /* are we fully contained in this bio? */ - if (buf_len <= length) - return 1; - - ret = extent_range_uptodate(io_tree, start + length, - start + buf_len - 1); - return ret; -} - /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -1441,57 +1732,56 @@ static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct end_io_wq *end_io_wq; - struct btrfs_fs_info *fs_info; int error; end_io_wq = container_of(work, struct end_io_wq, work); bio = end_io_wq->bio; - fs_info = end_io_wq->info; - /* metadata bio reads are special because the whole tree block must - * be checksummed at once. This makes sure the entire block is in - * ram and up to date before trying to verify things. For - * blocksize <= pagesize, it is basically a noop - */ - if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && - !bio_ready_for_csum(bio)) { - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); - return; - } error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kfree(end_io_wq); - bio_endio(bio, error); + bio_endio_nodec(bio, error); } static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; + int again; do { - smp_mb(); - if (root->fs_info->closing) - break; + again = 0; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + /* Make the cleaner go to sleep early. */ + if (btrfs_need_cleaner_sleep(root)) + goto sleep; + + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) + goto sleep; - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && - mutex_trylock(&root->fs_info->cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); + /* + * Avoid the problem that we change the status of the fs + * during the above check and trylock. + */ + if (btrfs_need_cleaner_sleep(root)) { mutex_unlock(&root->fs_info->cleaner_mutex); + goto sleep; } - if (freezing(current)) { - refrigerator(); - } else { - smp_mb(); - if (root->fs_info->closing) - break; + btrfs_run_delayed_iputs(root); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + /* + * The defragger has dealt with the R/O remount and umount, + * needn't do anything special here. + */ + btrfs_run_defrag_inodes(root->fs_info); +sleep: + if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); - schedule(); + if (!kthread_should_stop()) + schedule(); __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); @@ -1503,56 +1793,350 @@ static int transaction_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; + u64 transid; unsigned long now; unsigned long delay; - int ret; + bool cannot_commit; do { - smp_mb(); - if (root->fs_info->closing) - break; - - delay = HZ * 30; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + cannot_commit = false; + delay = HZ * root->fs_info->commit_interval; mutex_lock(&root->fs_info->transaction_kthread_mutex); - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); cur = root->fs_info->running_transaction; if (!cur) { - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); goto sleep; } now = get_seconds(); - if (now < cur->start_time || now - cur->start_time < 30) { - mutex_unlock(&root->fs_info->trans_mutex); + if (cur->state < TRANS_STATE_BLOCKED && + (now < cur->start_time || + now - cur->start_time < root->fs_info->commit_interval)) { + spin_unlock(&root->fs_info->trans_lock); delay = HZ * 5; goto sleep; } - mutex_unlock(&root->fs_info->trans_mutex); - trans = btrfs_start_transaction(root, 1); - ret = btrfs_commit_transaction(trans, root); - + transid = cur->transid; + spin_unlock(&root->fs_info->trans_lock); + + /* If the file system is aborted, this will always fail. */ + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + cannot_commit = true; + goto sleep; + } + if (transid == trans->transid) { + btrfs_commit_transaction(trans, root); + } else { + btrfs_end_transaction(trans, root); + } sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); - if (freezing(current)) { - refrigerator(); - } else { - if (root->fs_info->closing) - break; + if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, + &root->fs_info->fs_state))) + btrfs_cleanup_transaction(root); + if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(delay); + if (!kthread_should_stop() && + (!btrfs_transaction_blocked(root->fs_info) || + cannot_commit)) + schedule_timeout(delay); __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); return 0; } -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +/* + * this will find the highest generation in the array of + * root backups. The index of the highest array is returned, + * or -1 if we can't find anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest root in the array with the generation + * in the super block. If they don't match we pitch it. + */ +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +{ + u64 cur; + int newest_index = -1; + struct btrfs_root_backup *root_backup; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + root_backup = info->super_copy->super_roots + i; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = i; + } + + /* check to see if we actually wrapped around */ + if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { + root_backup = info->super_copy->super_roots; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = 0; + } + return newest_index; +} + + +/* + * find the oldest backup so we know where to store new entries + * in the backup array. This will set the backup_root_index + * field in the fs_info struct + */ +static void find_oldest_super_backup(struct btrfs_fs_info *info, + u64 newest_gen) +{ + int newest_index = -1; + + newest_index = find_newest_super_backup(info, newest_gen); + /* if there was garbage in there, just move along */ + if (newest_index == -1) { + info->backup_root_index = 0; + } else { + info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; + } +} + +/* + * copy all the root pointers into the super backup array. + * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ + int next_backup; + struct btrfs_root_backup *root_backup; + int last_backup; + + next_backup = info->backup_root_index; + last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + + /* + * just overwrite the last backup if we're at the same generation + * this happens only at umount + */ + root_backup = info->super_for_commit->super_roots + last_backup; + if (btrfs_backup_tree_root_gen(root_backup) == + btrfs_header_generation(info->tree_root->node)) + next_backup = last_backup; + + root_backup = info->super_for_commit->super_roots + next_backup; + + /* + * make sure all of our padding and empty slots get zero filled + * regardless of which ones we use today + */ + memset(root_backup, 0, sizeof(*root_backup)); + + info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + + btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); + btrfs_set_backup_tree_root_gen(root_backup, + btrfs_header_generation(info->tree_root->node)); + + btrfs_set_backup_tree_root_level(root_backup, + btrfs_header_level(info->tree_root->node)); + + btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); + btrfs_set_backup_chunk_root_gen(root_backup, + btrfs_header_generation(info->chunk_root->node)); + btrfs_set_backup_chunk_root_level(root_backup, + btrfs_header_level(info->chunk_root->node)); + + btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(info->extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(info->extent_root->node)); + + /* + * we might commit during log recovery, which happens before we set + * the fs_root. Make sure it is valid before we fill it in. + */ + if (info->fs_root && info->fs_root->node) { + btrfs_set_backup_fs_root(root_backup, + info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, + btrfs_header_generation(info->fs_root->node)); + btrfs_set_backup_fs_root_level(root_backup, + btrfs_header_level(info->fs_root->node)); + } + + btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); + btrfs_set_backup_dev_root_gen(root_backup, + btrfs_header_generation(info->dev_root->node)); + btrfs_set_backup_dev_root_level(root_backup, + btrfs_header_level(info->dev_root->node)); + + btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(info->csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(info->csum_root->node)); + + btrfs_set_backup_total_bytes(root_backup, + btrfs_super_total_bytes(info->super_copy)); + btrfs_set_backup_bytes_used(root_backup, + btrfs_super_bytes_used(info->super_copy)); + btrfs_set_backup_num_devices(root_backup, + btrfs_super_num_devices(info->super_copy)); + + /* + * if we don't copy this out to the super_copy, it won't get remembered + * for the next commit + */ + memcpy(&info->super_copy->super_roots, + &info->super_for_commit->super_roots, + sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * this copies info out of the root backup array and back into + * the in-memory super block. It is meant to help iterate through + * the array, so you send it the number of backups you've already + * tried and the last backup index you used. + * + * this returns -1 when it has tried all the backups + */ +static noinline int next_root_backup(struct btrfs_fs_info *info, + struct btrfs_super_block *super, + int *num_backups_tried, int *backup_index) +{ + struct btrfs_root_backup *root_backup; + int newest = *backup_index; + + if (*num_backups_tried == 0) { + u64 gen = btrfs_super_generation(super); + + newest = find_newest_super_backup(info, gen); + if (newest == -1) + return -1; + + *backup_index = newest; + *num_backups_tried = 1; + } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { + /* we've tried all the backups, all done */ + return -1; + } else { + /* jump to the next oldest backup */ + newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + *backup_index = newest; + *num_backups_tried += 1; + } + root_backup = super->super_roots + newest; + + btrfs_set_super_generation(super, + btrfs_backup_tree_root_gen(root_backup)); + btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); + btrfs_set_super_root_level(super, + btrfs_backup_tree_root_level(root_backup)); + btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + + /* + * fixme: the total bytes and num_devices need to match or we should + * need a fsck + */ + btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); + btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); + return 0; +} + +/* helper to cleanup workers */ +static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) +{ + btrfs_destroy_workqueue(fs_info->fixup_workers); + btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->workers); + btrfs_destroy_workqueue(fs_info->endio_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_workers); + btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->rmw_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + btrfs_destroy_workqueue(fs_info->endio_write_workers); + btrfs_destroy_workqueue(fs_info->endio_freespace_worker); + btrfs_destroy_workqueue(fs_info->submit_workers); + btrfs_destroy_workqueue(fs_info->delayed_workers); + btrfs_destroy_workqueue(fs_info->caching_workers); + btrfs_destroy_workqueue(fs_info->readahead_workers); + btrfs_destroy_workqueue(fs_info->flush_workers); + btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->extent_workers); +} + +static void free_root_extent_buffers(struct btrfs_root *root) +{ + if (root) { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + root->node = NULL; + root->commit_root = NULL; + } +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +{ + free_root_extent_buffers(info->tree_root); + + free_root_extent_buffers(info->dev_root); + free_root_extent_buffers(info->extent_root); + free_root_extent_buffers(info->csum_root); + free_root_extent_buffers(info->quota_root); + free_root_extent_buffers(info->uuid_root); + if (chunk_root) + free_root_extent_buffers(info->chunk_root); +} + +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { + btrfs_drop_and_free_fs_root(fs_info, gang[0]); + } else { + free_extent_buffer(gang[0]->node); + free_extent_buffer(gang[0]->commit_root); + btrfs_put_fs_root(gang[0]); + } + } + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_drop_and_free_fs_root(fs_info, gang[i]); + } + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + btrfs_free_log_root_tree(NULL, fs_info); + btrfs_destroy_pinned_extent(fs_info->tree_root, + fs_info->pinned_extents); + } +} + +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; @@ -1563,27 +2147,28 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), - GFP_NOFS); - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; - - struct btrfs_super_block *disk_super; - - if (!extent_root || !tree_root || !fs_info || - !chunk_root || !dev_root || !csum_root) { + int num_backups_tried = 0; + int backup_index = 0; + int max_active; + int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + bool create_uuid_tree; + bool check_uuid_tree; + + tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); + chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); + if (!tree_root || !chunk_root) { err = -ENOMEM; goto fail; } @@ -1600,56 +2185,129 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_srcu; } + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + if (ret) { + err = ret; + goto fail_bdi; + } + fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + if (ret) { + err = ret; + goto fail_dirty_metadata_bytes; + } + + ret = percpu_counter_init(&fs_info->bio_counter, 0); + if (ret) { + err = ret; + goto fail_delalloc_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_bdi; + goto fail_bio_counter; } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->hashers); - INIT_LIST_HEAD(&fs_info->delalloc_inodes); - INIT_LIST_HEAD(&fs_info->ordered_operations); + INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); - spin_lock_init(&fs_info->delalloc_lock); - spin_lock_init(&fs_info->new_trans_lock); - spin_lock_init(&fs_info->ref_cache_lock); + spin_lock_init(&fs_info->delalloc_root_lock); + spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); + spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->free_chunk_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->qgroup_op_lock); + spin_lock_init(&fs_info->buffer_lock); + rwlock_init(&fs_info->tree_mod_log_lock); + mutex_init(&fs_info->reloc_mutex); + mutex_init(&fs_info->delalloc_root_mutex); + seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); btrfs_mapping_init(&fs_info->mapping_tree); + btrfs_init_block_rsv(&fs_info->global_block_rsv, + BTRFS_BLOCK_RSV_GLOBAL); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); + btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv, + BTRFS_BLOCK_RSV_DELOPS); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); + atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->qgroup_op_seq, 0); + atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; + fs_info->defrag_inodes = RB_ROOT; + fs_info->free_chunk_space = 0; + fs_info->tree_mod_log = RB_ROOT; + fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); + /* readahead state */ + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); - INIT_LIST_HEAD(&fs_info->ordered_extents); - spin_lock_init(&fs_info->ordered_extent_lock); + INIT_LIST_HEAD(&fs_info->ordered_roots); + spin_lock_init(&fs_info->ordered_root_lock); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_NOFS); + if (!fs_info->delayed_root) { + err = -ENOMEM; + goto fail_iput; + } + btrfs_init_delayed_root(fs_info->delayed_root); + + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->replace_wait); + init_waitqueue_head(&fs_info->scrub_pause_wait); + fs_info->scrub_workers_refcnt = 0; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + fs_info->check_integrity_print_mask = 0; +#endif + + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); + btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); sb->s_bdi = &fs_info->bdi; fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - fs_info->btree_inode->i_nlink = 1; + set_nlink(fs_info->btree_inode, 1); /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of @@ -1661,71 +2319,141 @@ struct btrfs_root *open_ctree(struct super_block *sb, RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, - fs_info->btree_inode->i_mapping, - GFP_NOFS); - extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, - GFP_NOFS); + fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; BTRFS_I(fs_info->btree_inode)->root = tree_root; memset(&BTRFS_I(fs_info->btree_inode)->location, 0, sizeof(struct btrfs_key)); - BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; - insert_inode_hash(fs_info->btree_inode); + set_bit(BTRFS_INODE_DUMMY, + &BTRFS_I(fs_info->btree_inode)->runtime_flags); + btrfs_insert_inode_hash(fs_info->btree_inode); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; + fs_info->first_logical_byte = (u64)-1; extent_io_tree_init(&fs_info->freed_extents[0], - fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->btree_inode->i_mapping); extent_io_tree_init(&fs_info->freed_extents[1], - fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->btree_inode->i_mapping); fs_info->pinned_extents = &fs_info->freed_extents[0]; fs_info->do_barriers = 1; - mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->ordered_extent_flush_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); - init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); + sema_init(&fs_info->uuid_tree_rescan_sem, 1); + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); + + spin_lock_init(&fs_info->qgroup_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + fs_info->qgroup_ulist = NULL; + mutex_init(&fs_info->qgroup_rescan_lock); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + ret = btrfs_alloc_stripe_hash_table(fs_info); + if (ret) { + err = ret; + goto fail_alloc; + } + __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); + invalidate_bdev(fs_devices->latest_bdev); + /* + * Read super block and check the signature bytes only + */ bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) - goto fail_iput; + if (!bh) { + err = -EINVAL; + goto fail_alloc; + } - memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); - memcpy(&fs_info->super_for_commit, &fs_info->super_copy, - sizeof(fs_info->super_for_commit)); + /* + * We want to check superblock checksum, the type is stored inside. + * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). + */ + if (btrfs_check_super_csum(bh->b_data)) { + printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); + err = -EINVAL; + goto fail_alloc; + } + + /* + * super_copy is zeroed at allocation time and we never touch the + * following bytes up to INFO_SIZE, the checksum is calculated from + * the whole block of INFO_SIZE + */ + memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); + + ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + if (ret) { + printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); + err = -EINVAL; + goto fail_alloc; + } - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) - goto fail_iput; + goto fail_alloc; + + /* check FS state, whether FS is broken. */ + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + + /* + * run through our array of backup supers and setup + * our ring pointer to the oldest one + */ + generation = btrfs_super_generation(disk_super); + find_oldest_super_backup(fs_info, generation); + + /* + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; ret = btrfs_parse_options(tree_root, options); if (ret) { err = ret; - goto fail_iput; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super) & @@ -1733,102 +2461,156 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - (unsigned long long)features); + features); err = -EINVAL; - goto fail_iput; + goto fail_alloc; + } + + if (btrfs_super_leafsize(disk_super) != + btrfs_super_nodesize(disk_super)) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksizes don't match. node %d leaf %d\n", + btrfs_super_nodesize(disk_super), + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksize (%d) was too large\n", + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super); - if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - btrfs_set_super_incompat_flags(disk_super, features); + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + + if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) + printk(KERN_ERR "BTRFS: has skinny extents\n"); + + /* + * flag our filesystem as having big metadata blocks if + * they are bigger than the page size + */ + if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); + + /* + * mixed block groups end up with duplicate but slightly offset + * extent buffers for the same range. It leads to corruptions + */ + if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (sectorsize != leafsize)) { + printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " + "are not allowed for mixed block groups on %s\n", + sb->s_id); + goto fail_alloc; } + /* + * Needn't use the lock because there is no other task which will + * update the flag. + */ + btrfs_set_super_incompat_flags(disk_super, features); + features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - (unsigned long long)features); + features); err = -EINVAL; - goto fail_iput; + goto fail_alloc; } - btrfs_init_workers(&fs_info->generic_worker, - "genwork", 1, NULL); + max_active = fs_info->thread_pool_size; + + fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); - btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); - btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); - btrfs_init_workers(&fs_info->submit_workers, "submit", - min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size), - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->enospc_workers, "enospc", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); - /* a higher idle thresh on the submit workers makes it much more + /* + * a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the * devices */ - fs_info->submit_workers.idle_thresh = 64; - - fs_info->workers.idle_thresh = 16; - fs_info->workers.ordered = 1; - - fs_info->delalloc_workers.idle_thresh = 2; - fs_info->delalloc_workers.ordered = 1; - - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); + + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ - fs_info->endio_workers.idle_thresh = 4; - fs_info->endio_meta_workers.idle_thresh = 4; - - fs_info->endio_write_workers.idle_thresh = 2; - fs_info->endio_meta_write_workers.idle_thresh = 2; - - btrfs_start_workers(&fs_info->workers, 1); - btrfs_start_workers(&fs_info->generic_worker, 1); - btrfs_start_workers(&fs_info->submit_workers, 1); - btrfs_start_workers(&fs_info->delalloc_workers, 1); - btrfs_start_workers(&fs_info->fixup_workers, 1); - btrfs_start_workers(&fs_info->endio_workers, 1); - btrfs_start_workers(&fs_info->endio_meta_workers, 1); - btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); - btrfs_start_workers(&fs_info->endio_write_workers, 1); - btrfs_start_workers(&fs_info->enospc_workers, 1); + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + fs_info->readahead_workers = + btrfs_alloc_workqueue("readahead", flags, max_active, 2); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); + + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->fixup_workers && fs_info->extent_workers && + fs_info->qgroup_rescan_workers)) { + err = -ENOMEM; + goto fail_sb_buffer; + } fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 4 * 1024 * 1024 / PAGE_CACHE_SIZE); - nodesize = btrfs_super_nodesize(disk_super); - leafsize = btrfs_super_leafsize(disk_super); - sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); tree_root->nodesize = nodesize; tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; @@ -1837,9 +2619,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, - sizeof(disk_super->magic))) { - printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + if (sectorsize != PAGE_SIZE) { + printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " + "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -1847,7 +2634,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "btrfs: failed to read the system " + printk(KERN_WARNING "BTRFS: failed to read the system " "array on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -1862,30 +2649,38 @@ struct btrfs_root *open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), blocksize, generation); - BUG_ON(!chunk_root->node); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + if (!chunk_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), - BTRFS_UUID_SIZE); + btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); - mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } - btrfs_close_extra_devices(fs_devices); + /* + * keep the device that is marked to be the target device for the + * dev_replace procedure + */ + btrfs_close_extra_devices(fs_info, fs_devices, 0); + + if (!fs_devices->latest_bdev) { + printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", + sb->s_id); + goto fail_tree_roots; + } +retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); generation = btrfs_super_generation(disk_super); @@ -1893,50 +2688,129 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), blocksize, generation); - if (!tree_root->node) - goto fail_chunk_root; - if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + if (!tree_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); - goto fail_tree_root; + + goto recovery_tree_root; } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); + btrfs_set_root_refs(&tree_root->root_item, 1); - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_EXTENT_TREE_OBJECTID, extent_root); - if (ret) - goto fail_tree_root; - extent_root->track_dirty = 1; + location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_DEV_TREE_OBJECTID, dev_root); - if (ret) - goto fail_extent_root; - dev_root->track_dirty = 1; + extent_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(extent_root)) { + ret = PTR_ERR(extent_root); + goto recovery_tree_root; + } + set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state); + fs_info->extent_root = extent_root; - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_CSUM_TREE_OBJECTID, csum_root); - if (ret) - goto fail_dev_root; + location.objectid = BTRFS_DEV_TREE_OBJECTID; + dev_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(dev_root)) { + ret = PTR_ERR(dev_root); + goto recovery_tree_root; + } + set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state); + fs_info->dev_root = dev_root; + btrfs_init_devices_late(fs_info); - csum_root->track_dirty = 1; + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + csum_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(csum_root)) { + ret = PTR_ERR(csum_root); + goto recovery_tree_root; + } + set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state); + fs_info->csum_root = csum_root; - ret = btrfs_read_block_groups(extent_root); - if (ret) { - printk(KERN_ERR "Failed to read block groups: %d\n", ret); - goto fail_block_groups; + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + quota_root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(quota_root)) { + set_bit(BTRFS_ROOT_TRACK_DIRTY, "a_root->state); + fs_info->quota_enabled = 1; + fs_info->pending_quota_state = 1; + fs_info->quota_root = quota_root; + } + + location.objectid = BTRFS_UUID_TREE_OBJECTID; + uuid_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + if (ret != -ENOENT) + goto recovery_tree_root; + create_uuid_tree = true; + check_uuid_tree = false; + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state); + fs_info->uuid_root = uuid_root; + create_uuid_tree = false; + check_uuid_tree = + generation != btrfs_super_uuid_tree_generation(disk_super); } fs_info->generation = generation; fs_info->last_trans_committed = generation; - fs_info->data_alloc_profile = (u64)-1; - fs_info->metadata_alloc_profile = (u64)-1; - fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + + ret = btrfs_recover_balance(fs_info); + if (ret) { + printk(KERN_WARNING "BTRFS: failed to recover balance\n"); + goto fail_block_groups; + } + + ret = btrfs_init_dev_stats(fs_info); + if (ret) { + printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", + ret); + goto fail_block_groups; + } + + ret = btrfs_init_dev_replace(fs_info); + if (ret) { + pr_err("BTRFS: failed to init dev_replace: %d\n", ret); + goto fail_block_groups; + } + + btrfs_close_extra_devices(fs_info, fs_devices, 1); + + ret = btrfs_sysfs_add_one(fs_info); + if (ret) { + pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + goto fail_sysfs; + } + + ret = btrfs_read_block_groups(extent_root); + if (ret) { + printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + goto fail_sysfs; + } + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_WARNING "BTRFS: " + "too many missing devices, writeable mount is not allowed\n"); + goto fail_sysfs; + } + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) - goto fail_block_groups; + goto fail_sysfs; fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, @@ -1947,26 +2821,50 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!btrfs_test_opt(tree_root, SSD) && !btrfs_test_opt(tree_root, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " "mode\n"); btrfs_set_opt(fs_info->mount_opt, SSD); } + /* Set the real inode map cache flag */ + if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) + btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + ret = btrfsic_mount(tree_root, fs_devices, + btrfs_test_opt(tree_root, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? + 1 : 0, + fs_info->check_integrity_print_mask); + if (ret) + printk(KERN_WARNING "BTRFS: failed to initialize" + " integrity check module %s\n", sb->s_id); + } +#endif + ret = btrfs_read_qgroup_config(fs_info); + if (ret) + goto fail_trans_kthread; + + /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "Btrfs log replay required " + printk(KERN_WARNING "BTRFS: log replay required " "on RO media\n"); err = -EIO; - goto fail_trans_kthread; + goto fail_qgroup; } blocksize = btrfs_level_size(tree_root, btrfs_super_log_root_level(disk_super)); - log_tree_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + log_tree_root = btrfs_alloc_root(fs_info); + if (!log_tree_root) { + err = -ENOMEM; + goto fail_qgroup; + } __setup_root(nodesize, leafsize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); @@ -1974,46 +2872,119 @@ struct btrfs_root *open_ctree(struct super_block *sb, log_tree_root->node = read_tree_block(tree_root, bytenr, blocksize, generation + 1); + if (!log_tree_root->node || + !extent_buffer_uptodate(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + goto fail_qgroup; + } + /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); - BUG_ON(ret); + if (ret) { + btrfs_error(tree_root->fs_info, ret, + "Failed to recover log tree"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + goto fail_qgroup; + } if (sb->s_flags & MS_RDONLY) { - ret = btrfs_commit_super(tree_root); - BUG_ON(ret); + ret = btrfs_commit_super(tree_root); + if (ret) + goto fail_qgroup; } } ret = btrfs_find_orphan_roots(tree_root); - BUG_ON(ret); + if (ret) + goto fail_qgroup; if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_fs_roots(fs_info); + if (ret) + goto fail_qgroup; + + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING - "btrfs: failed to recover relocation\n"); + "BTRFS: failed to recover relocation\n"); err = -EINVAL; - goto fail_trans_kthread; + goto fail_qgroup; } } location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; + location.offset = 0; fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (!fs_info->fs_root) - goto fail_trans_kthread; + if (IS_ERR(fs_info->fs_root)) { + err = PTR_ERR(fs_info->fs_root); + goto fail_qgroup; + } - if (!(sb->s_flags & MS_RDONLY)) { - down_read(&fs_info->cleanup_work_sem); - btrfs_orphan_cleanup(fs_info->fs_root); + if (sb->s_flags & MS_RDONLY) + return 0; + + down_read(&fs_info->cleanup_work_sem); + if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || + (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { up_read(&fs_info->cleanup_work_sem); + close_ctree(tree_root); + return ret; + } + up_read(&fs_info->cleanup_work_sem); + + ret = btrfs_resume_balance_async(fs_info); + if (ret) { + printk(KERN_WARNING "BTRFS: failed to resume balance\n"); + close_ctree(tree_root); + return ret; + } + + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("BTRFS: failed to resume dev_replace\n"); + close_ctree(tree_root); + return ret; } - return tree_root; + btrfs_qgroup_rescan_resume(fs_info); + if (create_uuid_tree) { + pr_info("BTRFS: creating UUID tree\n"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to create the UUID tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } else if (check_uuid_tree || + btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { + pr_info("BTRFS: checking UUID tree\n"); + ret = btrfs_check_uuid_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to check the UUID tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } else { + fs_info->update_uuid_tree_gen = 1; + } + + return 0; + +fail_qgroup: + btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); + btrfs_cleanup_transaction(fs_info->tree_root); + btrfs_free_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -2022,71 +2993,75 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_sysfs: + btrfs_sysfs_remove_one(fs_info); fail_block_groups: + btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); - free_extent_buffer(csum_root->node); - free_extent_buffer(csum_root->commit_root); -fail_dev_root: - free_extent_buffer(dev_root->node); - free_extent_buffer(dev_root->commit_root); -fail_extent_root: - free_extent_buffer(extent_root->node); - free_extent_buffer(extent_root->commit_root); -fail_tree_root: - free_extent_buffer(tree_root->node); - free_extent_buffer(tree_root->commit_root); -fail_chunk_root: - free_extent_buffer(chunk_root->node); - free_extent_buffer(chunk_root->commit_root); -fail_sb_buffer: - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->enospc_workers); -fail_iput: + +fail_tree_roots: + free_root_pointers(fs_info, 1); invalidate_inode_pages2(fs_info->btree_inode->i_mapping); - iput(fs_info->btree_inode); - btrfs_close_devices(fs_info->fs_devices); +fail_sb_buffer: + btrfs_stop_all_workers(fs_info); +fail_alloc: +fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); + + iput(fs_info->btree_inode); +fail_bio_counter: + percpu_counter_destroy(&fs_info->bio_counter); +fail_delalloc_bytes: + percpu_counter_destroy(&fs_info->delalloc_bytes); +fail_dirty_metadata_bytes: + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - kfree(extent_root); - kfree(tree_root); - kfree(fs_info); - kfree(chunk_root); - kfree(dev_root); - kfree(csum_root); - return ERR_PTR(err); + btrfs_free_stripe_hash_table(fs_info); + btrfs_close_devices(fs_info->fs_devices); + return err; + +recovery_tree_root: + if (!btrfs_test_opt(tree_root, RECOVERY)) + goto fail_tree_roots; + + free_root_pointers(fs_info, 0); + + /* don't use the log in recovery mode, it won't be valid */ + btrfs_set_super_log_root(disk_super, 0); + + /* we can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = next_root_backup(fs_info, fs_info->super_copy, + &num_backups_tried, &backup_index); + if (ret == -1) + goto fail_block_groups; + goto retry_root_backup; } static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (uptodate) { set_buffer_uptodate(bh); } else { - if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { - printk(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - } + struct btrfs_device *device = (struct btrfs_device *) + bh->b_private; + + printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " + "I/O error on %s\n", + rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); } unlock_buffer(bh); put_bh(bh); @@ -2108,16 +3083,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) */ for (i = 0; i < 1; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) break; - bh = __bread(bdev, bytenr / 4096, 4096); + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); if (!bh) continue; super = (struct btrfs_super_block *)bh->b_data; if (btrfs_super_bytenr(super) != bytenr || - strncmp((char *)(&super->magic), BTRFS_MAGIC, - sizeof(super->magic))) { + btrfs_super_magic(super) != BTRFS_MAGIC) { brelse(bh); continue; } @@ -2154,22 +3130,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2178,7 +3142,10 @@ static int write_dev_supers(struct btrfs_device *device, if (wait) { bh = __find_get_block(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); - BUG_ON(!bh); + if (!bh) { + errors++; + continue; + } wait_on_buffer(bh); if (!buffer_uptodate(bh)) errors++; @@ -2193,7 +3160,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); crc = ~(u32)0; - crc = btrfs_csum_data(NULL, (char *)sb + + crc = btrfs_csum_data((char *)sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -2205,6 +3172,13 @@ static int write_dev_supers(struct btrfs_device *device, */ bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + if (!bh) { + printk(KERN_ERR "BTRFS: couldn't get super " + "buffer head for bytenr %Lu\n", bytenr); + errors++; + continue; + } + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); /* one reference for submit_bh */ @@ -2213,31 +3187,228 @@ static int write_dev_supers(struct btrfs_device *device, set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; + bh->b_private = device; } - if (i == last_barrier && do_barriers && device->barriers) { - ret = submit_bh(WRITE_BARRIER, bh); - if (ret == -EOPNOTSUPP) { - printk("btrfs: disabling barriers on dev %s\n", - device->name); - set_buffer_uptodate(bh); - device->barriers = 0; - /* one reference for submit_bh */ - get_bh(bh); - lock_buffer(bh); - ret = submit_bh(WRITE_SYNC, bh); - } - } else { - ret = submit_bh(WRITE_SYNC, bh); - } - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + if (i == 0) + ret = btrfsic_submit_bh(WRITE_FUA, bh); + else + ret = btrfsic_submit_bh(WRITE_SYNC, bh); if (ret) errors++; } return errors < i ? 0 : -1; } -int write_all_supers(struct btrfs_root *root, int max_mirrors) +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk_in_rcu("BTRFS: disabling barriers on dev %s\n", + rcu_str_deref(device->name)); + device->nobarriers = 1; + } else if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_FLUSH_ERRS); + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL; + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + btrfsic_submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors_send = 0; + int errors_wait = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; + if (!dev->bdev) { + errors_send++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors_send++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; + if (!dev->bdev) { + errors_wait++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors_wait++; + } + if (errors_send > info->num_tolerated_disk_barrier_failures || + errors_wait > info->num_tolerated_disk_barrier_failures) + return -EIO; + return 0; +} + +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ioctl_space_info space; + struct btrfs_space_info *sinfo; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; + int i; + int c; + int num_tolerated_disk_barrier_failures = + (int)fs_info->fs_devices->num_devices; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + sinfo = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { + if (tmp->flags == types[i]) { + sinfo = tmp; + break; + } + } + rcu_read_unlock(); + + if (!sinfo) + continue; + + down_read(&sinfo->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&sinfo->block_groups[c])) { + u64 flags; + + btrfs_get_block_group_info( + &sinfo->block_groups[c], &space); + if (space.total_bytes == 0 || + space.used_bytes == 0) + continue; + flags = space.flags; + /* + * return + * 0: if dup, single or RAID0 is configured for + * any of metadata, system or data, else + * 1: if RAID5 is configured, or if RAID1 or + * RAID10 is configured and only two mirrors + * are used, else + * 2: if RAID6 is configured, else + * num_mirrors - 1: if RAID1 or RAID10 is + * configured and more than + * 2 mirrors are used. + */ + if (num_tolerated_disk_barrier_failures > 0 && + ((flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID0)) || + ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) + == 0))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1) { + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID10)) { + num_tolerated_disk_barrier_failures = 1; + } else if (flags & + BTRFS_BLOCK_GROUP_RAID6) { + num_tolerated_disk_barrier_failures = 2; + } + } + } + } + up_read(&sinfo->groups_sem); + } + + return num_tolerated_disk_barrier_failures; +} + +static int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; struct btrfs_device *dev; @@ -2249,15 +3420,28 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); + backup_super_roots(root->fs_info); - sb = &root->fs_info->super_for_commit; + sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; - list_for_each_entry(dev, head, dev_list) { + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + + if (do_barriers) { + ret = barrier_all_devices(root->fs_info); + if (ret) { + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); + btrfs_error(root->fs_info, ret, + "errors while submitting device barriers."); + return ret; + } + } + + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; @@ -2284,13 +3468,18 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) total_errors++; } if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", + btrfs_err(root->fs_info, "%d errors while writing supers", total_errors); - BUG(); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + /* FUA is masked off if unsupported and can't be the reason */ + btrfs_error(root->fs_info, -EIO, + "%d errors while writing supers", total_errors); + return -EIO; } total_errors = 0; - list_for_each_entry(dev, head, dev_list) { + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) continue; if (!dev->in_fs_metadata || !dev->writeable) @@ -2302,9 +3491,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", - total_errors); - BUG(); + btrfs_error(root->fs_info, -EIO, + "%d errors while writing supers", total_errors); + return -EIO; } return 0; } @@ -2312,13 +3501,12 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors) { - int ret; - - ret = write_all_supers(root, max_mirrors); - return ret; + return write_all_supers(root, max_mirrors); } -int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +/* Drop a fs root from the radix tree and free it. */ +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, @@ -2328,104 +3516,107 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + btrfs_free_log(NULL, root); + + if (root->free_ino_pinned) + __btrfs_remove_free_space_cache(root->free_ino_pinned); + if (root->free_ino_ctl) + __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); - return 0; } static void free_fs_root(struct btrfs_root *root) { + iput(root->cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - if (root->anon_super.s_dev) { - down_write(&root->anon_super.s_umount); - kill_anon_super(&root->anon_super); - } + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + if (root->anon_dev) + free_anon_bdev(root->anon_dev); + if (root->subv_writers) + btrfs_free_subvolume_writers(root->subv_writers); free_extent_buffer(root->node); free_extent_buffer(root->commit_root); + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); kfree(root->name); - kfree(root); + btrfs_put_fs_root(root); } -static int del_fs_roots(struct btrfs_fs_info *fs_info) +void btrfs_free_fs_root(struct btrfs_root *root) { - int ret; - struct btrfs_root *gang[8]; - int i; - - while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&gang[0]->root_list); - - if (gang[0]->in_radix) { - btrfs_free_fs_root(fs_info, gang[0]); - } else { - free_extent_buffer(gang[0]->node); - free_extent_buffer(gang[0]->commit_root); - kfree(gang[0]); - } - } - - while (1) { - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang)); - if (!ret) - break; - for (i = 0; i < ret; i++) - btrfs_free_fs_root(fs_info, gang[i]); - } - return 0; + free_fs_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; - int i; - int ret; + int i = 0; + int err = 0; + unsigned int ret = 0; + int index; while (1) { + index = srcu_read_lock(&fs_info->subvol_srcu); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); - if (!ret) + if (!ret) { + srcu_read_unlock(&fs_info->subvol_srcu, index); break; - + } root_objectid = gang[ret - 1]->root_key.objectid + 1; + + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_fs_root(gang[i]); + } + srcu_read_unlock(&fs_info->subvol_srcu, index); + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; root_objectid = gang[i]->root_key.objectid; - btrfs_orphan_cleanup(gang[i]); + err = btrfs_orphan_cleanup(gang[i]); + if (err) + break; + btrfs_put_fs_root(gang[i]); } root_objectid++; } - return 0; + + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_fs_root(gang[i]); + } + return err; } int btrfs_commit_super(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - int ret; mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); + wake_up_process(root->fs_info->cleaner_kthread); /* wait until ongoing cleanup work done */ down_write(&root->fs_info->cleanup_work_sem); up_write(&root->fs_info->cleanup_work_sem); - trans = btrfs_start_transaction(root, 1); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - /* run commit again to drop the original snapshot */ - trans = btrfs_start_transaction(root, 1); - btrfs_commit_transaction(trans, root); - ret = btrfs_write_and_wait_transaction(NULL, root); - BUG_ON(ret); - - ret = write_ctree_super(NULL, root, 0); - return ret; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + return btrfs_commit_transaction(trans, root); } int close_ctree(struct btrfs_root *root) @@ -2436,186 +3627,555 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); - kthread_stop(root->fs_info->transaction_kthread); - kthread_stop(root->fs_info->cleaner_kthread); + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al., set sem back to initial state */ + up(&fs_info->uuid_tree_rescan_sem); + + /* pause restriper - we want to resume on mount */ + btrfs_pause_balance(fs_info); + + btrfs_dev_replace_suspend_for_unmount(fs_info); + + btrfs_scrub_cancel(fs_info); + + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + + /* clear out the rbtree of defraggable inodes */ + btrfs_cleanup_defrag_inodes(fs_info); + + cancel_work_sync(&fs_info->async_reclaim_work); if (!(fs_info->sb->s_flags & MS_RDONLY)) { - ret = btrfs_commit_super(root); + ret = btrfs_commit_super(root); if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + btrfs_err(root->fs_info, "commit super ret %d", ret); } + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + btrfs_error_commit_super(root); + + kthread_stop(fs_info->transaction_kthread); + kthread_stop(fs_info->cleaner_kthread); + fs_info->closing = 2; smp_mb(); - if (fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - (unsigned long long)fs_info->delalloc_bytes); - } - if (fs_info->total_ref_cache_size) { - printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", - (unsigned long long)fs_info->total_ref_cache_size); + btrfs_free_qgroup_config(root->fs_info); + + if (percpu_counter_sum(&fs_info->delalloc_bytes)) { + btrfs_info(root->fs_info, "at unmount delalloc count %lld", + percpu_counter_sum(&fs_info->delalloc_bytes)); } - free_extent_buffer(fs_info->extent_root->node); - free_extent_buffer(fs_info->extent_root->commit_root); - free_extent_buffer(fs_info->tree_root->node); - free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - free_extent_buffer(root->fs_info->dev_root->node); - free_extent_buffer(root->fs_info->dev_root->commit_root); - free_extent_buffer(root->fs_info->csum_root->node); - free_extent_buffer(root->fs_info->csum_root->commit_root); + btrfs_sysfs_remove_one(fs_info); + + btrfs_free_fs_roots(fs_info); - btrfs_free_block_groups(root->fs_info); + btrfs_put_block_group_cache(fs_info); - del_fs_roots(fs_info); + btrfs_free_block_groups(fs_info); + + /* + * we must make sure there is not any read request to + * submit after we stopping all workers. + */ + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + btrfs_stop_all_workers(fs_info); + + free_root_pointers(fs_info, 1); iput(fs_info->btree_inode); - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->enospc_workers); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY)) + btrfsic_unmount(root, fs_info->fs_devices); +#endif btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->bio_counter); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); + btrfs_free_stripe_hash_table(fs_info); + + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + return 0; } -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int atomic) { int ret; - struct inode *btree_inode = buf->first_page->mapping->host; + struct inode *btree_inode = buf->pages[0]->mapping->host; - ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, - NULL); + ret = extent_buffer_uptodate(buf); if (!ret) return ret; ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, - parent_transid); + parent_transid, atomic); + if (ret == -EAGAIN) + return ret; return !ret; } int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { - struct inode *btree_inode = buf->first_page->mapping->host; - return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, - buf); + return set_extent_buffer_uptodate(buf); } void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + struct btrfs_root *root; u64 transid = btrfs_header_generation(buf); - struct inode *btree_inode = root->fs_info->btree_inode; int was_dirty; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + /* + * This is a fast path so only do this check if we have sanity tests + * enabled. Normal people shouldn't be marking dummy buffers as dirty + * outside of the sanity tests. + */ + if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags))) + return; +#endif + root = BTRFS_I(buf->pages[0]->mapping->host)->root; btrfs_assert_tree_locked(buf); - if (transid != root->fs_info->generation) { - printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + if (transid != root->fs_info->generation) + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", - (unsigned long long)buf->start, - (unsigned long long)transid, - (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } - was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += buf->len; - spin_unlock(&root->fs_info->delalloc_lock); + buf->start, transid, root->fs_info->generation); + was_dirty = set_extent_buffer_dirty(buf); + if (!was_dirty) + __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, + buf->len, + root->fs_info->dirty_metadata_batch); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + btrfs_print_leaf(root, buf); + ASSERT(0); } +#endif } -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +static void __btrfs_btree_balance_dirty(struct btrfs_root *root, + int flush_delayed) { /* * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; + int ret; if (current->flags & PF_MEMALLOC) return; - num_dirty = root->fs_info->dirty_metadata_bytes; + if (flush_delayed) + btrfs_balance_delayed_items(root); - if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); + ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret > 0) { + balance_dirty_pages_ratelimited( + root->fs_info->btree_inode->i_mapping); } return; } +void btrfs_btree_balance_dirty(struct btrfs_root *root) +{ + __btrfs_btree_balance_dirty(root, 1); +} + +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) +{ + __btrfs_btree_balance_dirty(root, 0); +} + int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - int ret; - ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; + return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); +} + +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only) +{ + /* + * Placeholder for checks + */ + return 0; +} + +static void btrfs_error_commit_super(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(root); +} + +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_root_lock); + + list_splice_init(&t->ordered_operations, &splice); + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + list_del_init(&btrfs_inode->ordered_operations); + spin_unlock(&root->fs_info->ordered_root_lock); + + btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->fs_info->ordered_root_lock); + } + + spin_unlock(&root->fs_info->ordered_root_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); +} + +static void btrfs_destroy_ordered_extents(struct btrfs_root *root) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&root->ordered_extent_lock); + /* + * This will just short circuit the ordered completion stuff which will + * make sure the ordered extent gets properly cleaned up. + */ + list_for_each_entry(ordered, &root->ordered_extents, + root_extent_list) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + spin_unlock(&root->ordered_extent_lock); +} + +static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + + spin_unlock(&fs_info->ordered_root_lock); + btrfs_destroy_ordered_extents(root); + + cond_resched(); + spin_lock(&fs_info->ordered_root_lock); + } + spin_unlock(&fs_info->ordered_root_lock); +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + int ret = 0; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (atomic_read(&delayed_refs->num_entries) == 0) { + spin_unlock(&delayed_refs->lock); + btrfs_info(root->fs_info, "delayed_refs has NO entry"); + return ret; + } + + while ((node = rb_first(&delayed_refs->href_root)) != NULL) { + struct btrfs_delayed_ref_head *head; + bool pin_bytes = false; + + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + spin_lock(&delayed_refs->lock); + continue; + } + spin_lock(&head->lock); + while ((node = rb_first(&head->ref_root)) != NULL) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + ref->in_tree = 0; + rb_erase(&ref->rb_node, &head->ref_root); + atomic_dec(&delayed_refs->num_entries); + btrfs_put_delayed_ref(ref); + } + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; + atomic_dec(&delayed_refs->num_entries); + head->node.in_tree = 0; + rb_erase(&head->href_node, &delayed_refs->href_root); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + + if (pin_bytes) + btrfs_pin_extent(root, head->node.bytenr, + head->node.num_bytes, 1); + btrfs_put_delayed_ref(&head->node); + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + spin_unlock(&delayed_refs->lock); + return ret; } -int btree_lock_page_hook(struct page *page) +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_first_entry(&splice, struct btrfs_inode, + delalloc_inodes); + + list_del_init(&btrfs_inode->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &btrfs_inode->runtime_flags); + spin_unlock(&root->delalloc_lock); + + btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->delalloc_lock); + } + + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + list_del_init(&root->delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + spin_unlock(&fs_info->delalloc_root_lock); + + btrfs_destroy_delalloc_inodes(root); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->delalloc_root_lock); + } + spin_unlock(&fs_info->delalloc_root_lock); +} + +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark) { - struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret; struct extent_buffer *eb; - unsigned long len; - u64 bytenr = page_offset(page); + u64 start = 0; + u64 end; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL); + if (ret) + break; - len = page->private >> 2; - eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); - if (!eb) - goto out; + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + eb = btrfs_find_tree_block(root, start, + root->leafsize); + start += root->leafsize; + if (!eb) + continue; + wait_on_extent_buffer_writeback(eb); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags)) + clear_extent_buffer_dirty(eb); + free_extent_buffer_stale(eb); + } + } - btrfs_tree_lock(eb); - btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + return ret; +} - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= eb->len) - root->fs_info->dirty_metadata_bytes -= eb->len; +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents) +{ + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + bool loop = true; + + unpin = pinned_extents; +again: + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, NULL); + if (ret) + break; + + /* opt_discard */ + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_error_discard_extent(root, start, + end + 1 - start, + NULL); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); + } + + if (loop) { + if (unpin == &root->fs_info->freed_extents[0]) + unpin = &root->fs_info->freed_extents[1]; else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); + unpin = &root->fs_info->freed_extents[0]; + loop = false; + goto again; } - btrfs_tree_unlock(eb); - free_extent_buffer(eb); -out: - lock_page(page); + return 0; +} + +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, + struct btrfs_root *root) +{ + btrfs_destroy_ordered_operations(cur_trans, root); + + btrfs_destroy_delayed_refs(cur_trans, root); + + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&root->fs_info->transaction_blocked_wait); + + cur_trans->state = TRANS_STATE_UNBLOCKED; + wake_up(&root->fs_info->transaction_wait); + + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + + btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, + EXTENT_DIRTY); + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); + + cur_trans->state =TRANS_STATE_COMPLETED; + wake_up(&cur_trans->commit_wait); + + /* + memset(cur_trans, 0, sizeof(*cur_trans)); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + */ +} + +static int btrfs_cleanup_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *t; + + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + spin_lock(&root->fs_info->trans_lock); + while (!list_empty(&root->fs_info->trans_list)) { + t = list_first_entry(&root->fs_info->trans_list, + struct btrfs_transaction, list); + if (t->state >= TRANS_STATE_COMMIT_START) { + atomic_inc(&t->use_count); + spin_unlock(&root->fs_info->trans_lock); + btrfs_wait_for_commit(root, t->transid); + btrfs_put_transaction(t); + spin_lock(&root->fs_info->trans_lock); + continue; + } + if (t == root->fs_info->running_transaction) { + t->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&root->fs_info->trans_lock); + /* + * We wait for 0 num_writers since we don't hold a trans + * handle open currently for this transaction. + */ + wait_event(t->writer_wait, + atomic_read(&t->num_writers) == 0); + } else { + spin_unlock(&root->fs_info->trans_lock); + } + btrfs_cleanup_one_transaction(t, root); + + spin_lock(&root->fs_info->trans_lock); + if (t == root->fs_info->running_transaction) + root->fs_info->running_transaction = NULL; + list_del_init(&t->list); + spin_unlock(&root->fs_info->trans_lock); + + btrfs_put_transaction(t); + trace_btrfs_transaction_commit(root); + spin_lock(&root->fs_info->trans_lock); + } + spin_unlock(&root->fs_info->trans_lock); + btrfs_destroy_all_ordered_extents(root->fs_info); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + btrfs_destroy_all_delalloc_inodes(root->fs_info); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + return 0; } static struct extent_io_ops btree_extent_io_ops = { - .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c958ecbc191..23ce3ceba0a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,13 @@ #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 +enum { + BTRFS_WQ_ENDIO_DATA = 0, + BTRFS_WQ_ENDIO_METADATA = 1, + BTRFS_WQ_ENDIO_FREE_SPACE = 2, + BTRFS_WQ_ENDIO_RAID56 = 3, +}; + static inline u64 btrfs_sb_offset(int mirror) { u64 start = 16 * 1024; @@ -40,13 +47,15 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -int clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf); -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); int close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); @@ -54,61 +63,94 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_objectid); -struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *location, - const char *name, int namelen); -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location); -struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location); +int btrfs_init_fs_root(struct btrfs_root *root); +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); + +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key, + bool check_ref); +static inline struct btrfs_root * +btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + return btrfs_get_fs_root(fs_info, location, true); +} + int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); -int btrfs_insert_dev_radix(struct btrfs_root *root, - struct block_device *bdev, - u64 device_id, - u64 block_start, - u64 num_blocks); -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_btree_balance_dirty(struct btrfs_root *root); +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +void btrfs_free_fs_root(struct btrfs_root *root); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_root *btrfs_alloc_dummy_root(void); +#endif + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info->subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(&root->refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); -void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf); -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int atomic); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); -int wait_on_tree_block_writeback(struct btrfs_root *root, - struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); -u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); -int btrfs_open_device(struct btrfs_device *dev); -int btrfs_verify_block_csum(struct btrfs_root *root, - struct extent_buffer *buf); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags, + unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); - -int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btree_lock_page_hook(struct page *page); - +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, + struct btrfs_root *root); +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid); +int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)); +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +void btrfs_init_lockdep(void); +void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level); #else -static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, - int level) +static inline void btrfs_init_lockdep(void) +{ } +static inline void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level) { } #endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 951ef09b82f..41422a3de8e 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -5,7 +5,6 @@ #include "btrfs_inode.h" #include "print-tree.h" #include "export.h" -#include "compat.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) @@ -13,38 +12,35 @@ parent_root_objectid) / 4) #define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) -static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, - int connectable) +static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) { struct btrfs_fid *fid = (struct btrfs_fid *)fh; - struct inode *inode = dentry->d_inode; int len = *max_len; int type; - if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || - (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) - return 255; + if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; + return FILEID_INVALID; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return FILEID_INVALID; + } len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; - fid->objectid = inode->i_ino; + fid->objectid = btrfs_ino(inode); fid->root_objectid = BTRFS_I(inode)->root->objectid; fid->gen = inode->i_generation; - if (connectable && !S_ISDIR(inode->i_mode)) { - struct inode *parent; + if (parent) { u64 parent_root_id; - spin_lock(&dentry->d_lock); - - parent = dentry->d_parent->d_inode; fid->parent_objectid = BTRFS_I(parent)->location.objectid; fid->parent_gen = parent->i_generation; parent_root_id = BTRFS_I(parent)->root->objectid; - spin_unlock(&dentry->d_lock); - if (parent_root_id != fid->root_objectid) { fid->parent_root_objectid = parent_root_id; len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; @@ -63,9 +59,8 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, u64 root_objectid, u32 generation, int check_generation) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; - struct dentry *dentry; struct inode *inode; struct btrfs_key key; int index; @@ -86,11 +81,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, goto fail; } - if (btrfs_root_refs(&root->root_item) == 0) { - err = -ENOENT; - goto fail; - } - key.objectid = objectid; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; @@ -108,10 +98,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, return ERR_PTR(-ESTALE); } - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) - dentry->d_op = &btrfs_dentry_operations; - return dentry; + return d_obtain_alias(inode); fail: srcu_read_unlock(&fs_info->subvol_srcu, index); return ERR_PTR(err); @@ -166,7 +153,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; - static struct dentry *dentry; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -176,14 +162,16 @@ static struct dentry *btrfs_get_parent(struct dentry *child) int ret; path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); - if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = root->root_key.objectid; key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = root->fs_info->tree_root; } else { - key.objectid = dir->i_ino; + key.objectid = btrfs_ino(dir); key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; } @@ -192,7 +180,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) if (ret < 0) goto fail; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Key with offset of -1 found */ if (path->slots[0] == 0) { ret = -ENOENT; goto fail; @@ -223,18 +211,94 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); - if (!IS_ERR(dentry)) - dentry->d_op = &btrfs_dentry_operations; - return dentry; + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); fail: btrfs_free_path(path); return ERR_PTR(ret); } +static int btrfs_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = child->d_inode; + struct inode *dir = parent->d_inode; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + int name_len; + int ret; + u64 ino; + + if (!dir || !inode) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode)) + return -EINVAL; + + ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = ino; + key.offset = btrfs_ino(dir); + key.type = BTRFS_INODE_REF_KEY; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } else if (ret > 0) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + path->slots[0]--; + } else { + btrfs_free_path(path); + return -ENOENT; + } + } + leaf = path->nodes[0]; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + rref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + name_ptr = (unsigned long)(rref + 1); + name_len = btrfs_root_ref_name_len(leaf, rref); + } else { + iref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_ref); + name_ptr = (unsigned long)(iref + 1); + name_len = btrfs_inode_ref_name_len(leaf, iref); + } + + read_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_free_path(path); + + /* + * have to add the null termination to make sure that reconnect_path + * gets the right len for strlen + */ + name[name_len] = '\0'; + + return 0; +} + const struct export_operations btrfs_export_ops = { .encode_fh = btrfs_encode_fh, .fh_to_dentry = btrfs_fh_to_dentry, .fh_to_parent = btrfs_fh_to_parent, .get_parent = btrfs_get_parent, + .get_name = btrfs_get_name, }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9e23ffea7f5..813537f362f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,28 +23,66 @@ #include <linux/rcupdate.h> #include <linux/kthread.h> #include <linux/slab.h> -#include "compat.h" +#include <linux/ratelimit.h> +#include <linux/percpu_counter.h> #include "hash.h" -#include "ctree.h" +#include "tree-log.h" #include "disk-io.h" #include "print-tree.h" -#include "transaction.h" #include "volumes.h" +#include "raid56.h" #include "locking.h" #include "free-space-cache.h" +#include "math.h" +#include "sysfs.h" +#include "qgroup.h" -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free); -static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); +#undef SCRAMBLE_DELAYED_REFS + +/* + * control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated. This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + */ +enum { + CHUNK_ALLOC_NO_FORCE = 0, + CHUNK_ALLOC_LIMITED = 1, + CHUNK_ALLOC_FORCE = 2, +}; + +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + * ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + * bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { + RESERVE_FREE = 0, + RESERVE_ALLOC = 1, + RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + +static int update_block_group(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op); + struct btrfs_delayed_extent_op *extra_op, + int no_quota); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -57,26 +95,29 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins); + int level, struct btrfs_key *ins, + int no_quota); static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force); -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean); + struct btrfs_root *extent_root, u64 flags, + int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, + int delalloc); +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) { smp_mb(); - return cache->cached == BTRFS_CACHE_FINISHED; + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; } static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) @@ -84,15 +125,19 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) return (cache->flags & bits) == bits; } -void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) { atomic_inc(&cache->count); } void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { - if (atomic_dec_and_test(&cache->count)) + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + kfree(cache->free_space_ctl); kfree(cache); + } } /* @@ -126,6 +171,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, rb_link_node(&block_group->cache_node, parent, p); rb_insert_color(&block_group->cache_node, &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->key.objectid) + info->first_logical_byte = block_group->key.objectid; + spin_unlock(&info->block_group_cache_lock); return 0; @@ -167,8 +216,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, break; } } - if (ret) + if (ret) { btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) + info->first_logical_byte = ret->key.objectid; + } spin_unlock(&info->block_group_cache_lock); return ret; @@ -212,7 +264,8 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); + if (ret) + return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -220,13 +273,35 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); + if (ret) + return ret; while (nr--) { - cache->bytes_super += stripe_len; - ret = add_excluded_extent(root, logical[nr], - stripe_len); - BUG_ON(ret); + u64 start, len; + + if (logical[nr] > cache->key.objectid + + cache->key.offset) + continue; + + if (logical[nr] + stripe_len <= cache->key.objectid) + continue; + + start = logical[nr]; + if (start < cache->key.objectid) { + start = cache->key.objectid; + len = (logical[nr] + stripe_len) - start; + } else { + len = min_t(u64, stripe_len, + cache->key.objectid + + cache->key.offset - start); + } + + cache->bytes_super += len; + ret = add_excluded_extent(root, start, len); + if (ret) { + kfree(logical); + return ret; + } } kfree(logical); @@ -245,6 +320,12 @@ get_caching_control(struct btrfs_block_group_cache *cache) return NULL; } + /* We're loading it the fast way, so we don't have a caching_ctl. */ + if (!cache->caching_ctl) { + spin_unlock(&cache->lock); + return NULL; + } + ctl = cache->caching_ctl; atomic_inc(&ctl->count); spin_unlock(&cache->lock); @@ -271,7 +352,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, while (start < end) { ret = find_first_extent_bit(info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE); + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL); if (ret) break; @@ -282,7 +364,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ start = extent_end + 1; } else { break; @@ -293,34 +375,34 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, size = end - start; total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ } return total_added; } -static int caching_kthread(void *data) +static noinline void caching_thread(struct btrfs_work *work) { - struct btrfs_block_group_cache *block_group = data; - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = 0; + int ret = -ENOMEM; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; path = btrfs_alloc_path(); if (!path) - return -ENOMEM; - - exclude_super_stripes(extent_root, block_group); - spin_lock(&block_group->space_info->lock); - block_group->space_info->bytes_super += block_group->bytes_super; - spin_unlock(&block_group->space_info->lock); + goto out; last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -332,7 +414,7 @@ static int caching_kthread(void *data) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 2; + path->reada = 1; key.objectid = last; key.offset = 0; @@ -340,8 +422,9 @@ static int caching_kthread(void *data) again: mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->extent_commit_sem); + down_read(&fs_info->commit_root_sem); +next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto err; @@ -350,8 +433,7 @@ again: nritems = btrfs_header_nritems(leaf); while (1) { - smp_mb(); - if (fs_info->closing > 1) { + if (btrfs_fs_closing(fs_info) > 1) { last = (u64)-1; break; } @@ -363,15 +445,34 @@ again: if (ret) break; - caching_ctl->progress = last; - btrfs_release_path(extent_root, path); - up_read(&fs_info->extent_commit_sem); - mutex_unlock(&caching_ctl->mutex); - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); - else + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + caching_ctl->progress = last; + btrfs_release_path(path); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&caching_ctl->mutex); cond_resched(); - goto again; + goto again; + } + + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto err; + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + if (key.objectid < last) { + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + + caching_ctl->progress = last; + btrfs_release_path(path); + goto next; } if (key.objectid < block_group->key.objectid) { @@ -383,11 +484,16 @@ again: block_group->key.offset) break; - if (key.type == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { total_found += add_new_free_space(block_group, fs_info, last, key.objectid); - last = key.objectid + key.offset; + if (key.type == BTRFS_METADATA_ITEM_KEY) + last = key.objectid + + fs_info->tree_root->leafsize; + else + last = key.objectid + key.offset; if (total_found > (1024 * 1024 * 2)) { total_found = 0; @@ -410,66 +516,134 @@ again: err: btrfs_free_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); free_excluded_extents(extent_root, block_group); mutex_unlock(&caching_ctl->mutex); +out: + if (ret) { + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_ERROR; + spin_unlock(&block_group->lock); + } wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); - atomic_dec(&block_group->space_info->caching_threads); btrfs_put_block_group(block_group); - - return 0; } -static int cache_block_group(struct btrfs_block_group_cache *cache) +static int cache_block_group(struct btrfs_block_group_cache *cache, + int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; - struct task_struct *tsk; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); - BUG_ON(!caching_ctl); + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + if (!caching_ctl) + return -ENOMEM; INIT_LIST_HEAD(&caching_ctl->list); mutex_init(&caching_ctl->mutex); init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); + atomic_set(&caching_ctl->count, 1); + btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); kfree(caching_ctl); return 0; } + WARN_ON(cache->caching_ctl); cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; + cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - down_write(&fs_info->extent_commit_sem); + if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + ret = load_free_space_cache(fs_info, cache); + + spin_lock(&cache->lock); + if (ret == 1) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_FINISHED; + cache->last_byte_to_unpin = (u64)-1; + } else { + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); + if (ret == 1) { + put_caching_control(caching_ctl); + free_excluded_extents(fs_info->extent_root, cache); + return 0; + } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); + } + + if (load_cache_only) { + put_caching_control(caching_ctl); + return 0; + } + + down_write(&fs_info->commit_root_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); - atomic_inc(&cache->space_info->caching_threads); btrfs_get_block_group(cache); - tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", - cache->key.objectid); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - printk(KERN_ERR "error running thread %d\n", ret); - BUG(); - } + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); return ret; } @@ -507,9 +681,11 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; + rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { - if (found->flags == flags) { + if (found->flags & flags) { rcu_read_unlock(); return found; } @@ -533,64 +709,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - -u64 btrfs_find_block_group(struct btrfs_root *root, - u64 search_start, u64 search_hint, int owner) -{ - struct btrfs_block_group_cache *cache; - u64 used; - u64 last = max(search_hint, search_start); - u64 group_start = 0; - int full_search = 0; - int factor = 9; - int wrapped = 0; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - if (!cache) - break; - - spin_lock(&cache->lock); - last = cache->key.objectid + cache->key.offset; - used = btrfs_block_group_used(&cache->item); - - if ((full_search || !cache->ro) && - block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { - if (used + cache->pinned + cache->reserved < - div_factor(cache->key.offset, factor)) { - group_start = cache->key.objectid; - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - goto found; - } - } - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - cond_resched(); - } - if (!wrapped) { - last = search_start; - wrapped = 1; - goto again; - } - if (!full_search && factor < 10) { - last = search_start; - full_search = 1; - factor = 10; - goto again; - } -found: - return group_start; -} - /* simple helper to search for an existing extent at a given offset */ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) { @@ -599,12 +717,167 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) struct btrfs_path *path; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + key.objectid = start; key.offset = len; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 0, 0); + if (ret > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == start && + key.type == BTRFS_METADATA_ITEM_KEY) + ret = 0; + } + btrfs_free_path(path); + return ret; +} + +/* + * helper function to lookup reference count and flags of a tree block. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. + */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 *flags) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + /* + * If we don't have skinny metadata, don't bother doing anything + * different + */ + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { + offset = root->leafsize; + metadata = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } + +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out_free; + + if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == root->leafsize) + ret = 0; + } + if (ret) { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = root->leafsize; + btrfs_release_path(path); + goto again; + } + } + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + if (!trans) + goto out; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's released and try + * again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto search_again; + } + spin_lock(&head->lock); + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += head->node.ref_mod; + spin_unlock(&head->lock); + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); +out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out_free: btrfs_free_path(path); return ret; } @@ -746,7 +1019,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; - BUG_ON(ret > 0); + BUG_ON(ret > 0); /* Corruption */ leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, @@ -762,7 +1035,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, break; } } - btrfs_release_path(root, path); + btrfs_release_path(path); if (owner < BTRFS_FIRST_FREE_OBJECTID) new_size += sizeof(*bi); @@ -772,10 +1045,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, new_size + extra_size, 1); if (ret < 0) return ret; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ - ret = btrfs_extend_item(trans, root, path, new_size); - BUG_ON(ret); + btrfs_extend_item(root, path, new_size); leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -805,11 +1077,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -870,7 +1142,7 @@ again: return 0; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 key.type = BTRFS_EXTENT_REF_V0_KEY; - btrfs_release_path(root, path); + btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) { err = ret; @@ -908,7 +1180,7 @@ again: if (match_extent_data_ref(leaf, ref, root_objectid, owner, offset)) { if (recow) { - btrfs_release_path(root, path); + btrfs_release_path(path); goto again; } err = 0; @@ -969,7 +1241,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, if (match_extent_data_ref(leaf, ref, root_objectid, owner, offset)) break; - btrfs_release_path(root, path); + btrfs_release_path(path); key.offset++; ret = btrfs_insert_empty_item(trans, root, path, &key, size); @@ -995,14 +1267,14 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); ret = 0; fail: - btrfs_release_path(root, path); + btrfs_release_path(path); return ret; } static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop) + int refs_to_drop, int *last_ref) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -1038,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); + *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1121,7 +1394,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, ret = -ENOENT; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (ret == -ENOENT && parent) { - btrfs_release_path(root, path); + btrfs_release_path(path); key.type = BTRFS_EXTENT_REF_V0_KEY; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) @@ -1150,7 +1423,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, } ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - btrfs_release_path(root, path); + btrfs_release_path(path); return ret; } @@ -1227,6 +1500,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int want; int ret; int err = 0; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1238,12 +1513,54 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, path->keep_locks = 1; } else extra_size = -1; + + /* + * Owner is our parent level, so we can just add one to get the level + * for the block we are interested in. + */ + if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner; + } + +again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); if (ret < 0) { err = ret; goto out; } - BUG_ON(ret); + + /* + * We may be a newly converted file system which still has the old fat + * extent entries for metadata, so try and see if we have one of those. + */ + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + if (ret) { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + goto again; + } + } + + if (ret && !insert) { + err = -ENOENT; + goto out; + } else if (WARN_ON(ret)) { + err = -EIO; + goto out; + } leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -1271,11 +1588,9 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); } err = -ENOENT; @@ -1357,13 +1672,12 @@ out: * helper to add new inline back ref */ static noinline_for_stack -int setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) +void setup_inline_extent_backref(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1373,7 +1687,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, u64 refs; int size; int type; - int ret; leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1382,8 +1695,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - ret = btrfs_extend_item(trans, root, path, size); - BUG_ON(ret); + btrfs_extend_item(root, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1418,7 +1730,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } btrfs_mark_buffer_dirty(leaf); - return 0; } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1436,7 +1747,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, if (ret != -ENOENT) return ret; - btrfs_release_path(root, path); + btrfs_release_path(path); *ref_ret = NULL; if (owner < BTRFS_FIRST_FREE_OBJECTID) { @@ -1453,12 +1764,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, * helper to update/remove inline back ref */ static noinline_for_stack -int update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) +void update_inline_extent_backref(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op, + int *last_ref) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1469,7 +1780,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, u32 item_size; int size; int type; - int ret; u64 refs; leaf = path->nodes[0]; @@ -1503,6 +1813,7 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { + *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1511,11 +1822,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - ret = btrfs_truncate_item(trans, root, path, item_size, 1); - BUG_ON(ret); + btrfs_truncate_item(root, path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); - return 0; } static noinline_for_stack @@ -1535,13 +1844,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, root_objectid, owner, offset, 1); if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - ret = update_inline_extent_backref(trans, root, path, iref, - refs_to_add, extent_op); + update_inline_extent_backref(root, path, iref, + refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { - ret = setup_inline_extent_backref(trans, root, path, iref, - parent, root_objectid, - owner, offset, refs_to_add, - extent_op); + setup_inline_extent_backref(root, path, iref, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + ret = 0; } return ret; } @@ -1569,77 +1878,101 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) + int refs_to_drop, int is_data, int *last_ref) { - int ret; + int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { - ret = update_inline_extent_backref(trans, root, path, iref, - -refs_to_drop, NULL); + update_inline_extent_backref(root, path, iref, + -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, + last_ref); } else { + *last_ref = 1; ret = btrfs_del_item(trans, root, path); } return ret; } -static void btrfs_issue_discard(struct block_device *bdev, +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, - DISCARD_FL_BARRIER); + return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes) + u64 num_bytes, u64 *actual_bytes) { int ret; - u64 map_length = num_bytes; - struct btrfs_multi_bio *multi = NULL; + u64 discarded_bytes = 0; + struct btrfs_bio *bbio = NULL; - if (!btrfs_test_opt(root, DISCARD)) - return 0; /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - bytenr, &map_length, &multi, 0); + ret = btrfs_map_block(root->fs_info, REQ_DISCARD, + bytenr, &num_bytes, &bbio, 0); + /* Error condition is -ENOMEM */ if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; + struct btrfs_bio_stripe *stripe = bbio->stripes; int i; - if (map_length > num_bytes) - map_length = num_bytes; - for (i = 0; i < multi->num_stripes; i++, stripe++) { - btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - map_length); + for (i = 0; i < bbio->num_stripes; i++, stripe++) { + if (!stripe->dev->can_discard) + continue; + + ret = btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + stripe->length); + if (!ret) + discarded_bytes += stripe->length; + else if (ret != -EOPNOTSUPP) + break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ + + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. + */ + ret = 0; } - kfree(multi); + kfree(bbio); } + if (actual_bytes) + *actual_bytes = discarded_bytes; + + + if (ret == -EOPNOTSUPP) + ret = 0; return ret; } +/* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + u64 root_objectid, u64 owner, u64 offset, + int no_quota) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } return ret; } @@ -1649,55 +1982,89 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, + int no_quota, struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_extent_item *item; + struct btrfs_key key; u64 refs; int ret; - int err = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) + no_quota = 1; + path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, num_bytes, parent, + ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, + bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if (ret == 0) + if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) goto out; - - if (ret != -EAGAIN) { - err = ret; + /* + * Ok we were able to insert an inline extent and it appears to be a new + * reference, deal with the qgroup accounting. + */ + if (!ret && !no_quota) { + ASSERT(root->fs_info->quota_enabled); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) + type = BTRFS_QGROUP_OPER_ADD_SHARED; + btrfs_release_path(path); + + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); goto out; } + /* + * Ok we had -EAGAIN which means we didn't have space to insert and + * inline extent ref, so just update the reference count and add a + * normal backref. + */ leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); + if (refs) + type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(root->fs_info->extent_root, path); + btrfs_release_path(path); + + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + if (ret) + goto out; + } path->reada = 1; path->leave_spinning = 1; - /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); out: btrfs_free_path(path); - return err; + return ret; } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, @@ -1718,16 +2085,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); + trace_run_delayed_data_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - if (extent_op) { - BUG_ON(extent_op->update_key); + if (extent_op) flags |= extent_op->flags_to_set; - } ret = alloc_reserved_file_extent(trans, root, parent, ref_root, flags, ref->objectid, ref->offset, @@ -1737,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + node->no_quota, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + extent_op, node->no_quota); } else { BUG(); } @@ -1780,15 +2146,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; + int metadata = !extent_op->is_data; + + if (trans->aborted) + return 0; + + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + metadata = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = node->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; + if (metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = extent_op->level; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + } + +again: path->reada = 1; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, @@ -1798,8 +2178,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, goto out; } if (ret > 0) { - err = -EIO; - goto out; + if (metadata) { + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == node->bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == node->num_bytes) + ret = 0; + } + if (ret > 0) { + btrfs_release_path(path); + metadata = 0; + + key.objectid = node->bytenr; + key.offset = node->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } + } else { + err = -EIO; + goto out; + } } leaf = path->nodes[0]; @@ -1837,41 +2238,50 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_key ins; u64 parent = 0; u64 ref_root = 0; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); + trace_run_delayed_tree_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; + + ins.objectid = node->bytenr; + if (skinny_metadata) { + ins.offset = ref->level; + ins.type = BTRFS_METADATA_ITEM_KEY; + } else { + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + } BUG_ON(node->ref_mod != 1); if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - BUG_ON(!extent_op || !extent_op->update_flags || - !extent_op->update_key); + BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, root, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins); + ref->level, &ins, + node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, node->no_quota, + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, extent_op, + node->no_quota); } else { BUG(); } return ret; } - /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1879,7 +2289,15 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op, int insert_reserved) { - int ret; + int ret = 0; + + if (trans->aborted) { + if (insert_reserved) + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); + return 0; + } + if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; /* @@ -1890,36 +2308,18 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); - if (insert_reserved) { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - - ret = pin_down_bytes(trans, root, NULL, - node->bytenr, node->num_bytes, - head->is_data, 1, &must_clean); - if (ret > 0) - mark_free = 1; + trace_run_delayed_ref_head(node, head, node->action); - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } + if (insert_reserved) { + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); if (head->is_data) { ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); - BUG_ON(ret); - } - if (mark_free) { - ret = btrfs_free_reserved_extent(root, - node->bytenr, - node->num_bytes); - BUG_ON(ret); } } - mutex_unlock(&head->mutex); - return 0; + return ret; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -1939,59 +2339,62 @@ static noinline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: + struct btrfs_delayed_ref_node *ref, *last = NULL;; + /* * select delayed ref of type BTRFS_ADD_DELAYED_REF first. * this prevents ref count from going down to zero when * there still are pending delayed ref. */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; + node = rb_first(&head->ref_root); + while (node) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (ref->action == action) + if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - node = rb_prev(node); - } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; + else if (last == NULL) + last = ref; + node = rb_next(node); } - return NULL; + return last; } -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + unsigned long nr) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; + struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); int ret; - int count = 0; + unsigned long count = 0; + unsigned long actual_count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; while (1) { if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) + if (count >= nr) break; - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); + spin_lock(&delayed_refs->lock); + locked_ref = btrfs_select_ref_head(trans); + if (!locked_ref) { + spin_unlock(&delayed_refs->lock); + break; + } /* grab the lock that says we are going to process * all the refs for this head */ ret = btrfs_delayed_ref_lock(trans, locked_ref); - + spin_unlock(&delayed_refs->lock); /* * we may have dropped the spin lock to get the head * mutex lock, and that might have given someone else @@ -2006,6 +2409,37 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + */ + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); + + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + + if (ref && ref->seq && + btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { + spin_unlock(&locked_ref->lock); + btrfs_delayed_ref_unlock(locked_ref); + spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + locked_ref = NULL; + cond_resched(); + count++; + continue; + } + + /* * record the must insert reserved flag before we * drop the spin lock. */ @@ -2015,12 +2449,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ - ref = select_delayed_ref(locked_ref); if (!ref) { + + /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, * so that any accounting fixes can happen @@ -2028,45 +2459,308 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref = &locked_ref->node; if (extent_op && must_insert_reserved) { - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); extent_op = NULL; } if (extent_op) { - spin_unlock(&delayed_refs->lock); - + spin_unlock(&locked_ref->lock); ret = run_delayed_extent_op(trans, root, ref, extent_op); - BUG_ON(ret); - kfree(extent_op); - - cond_resched(); - spin_lock(&delayed_refs->lock); + btrfs_free_delayed_extent_op(extent_op); + + if (ret) { + /* + * Need to reset must_insert_reserved if + * there was an error so the abort stuff + * can cleanup the reserved space + * properly. + */ + if (must_insert_reserved) + locked_ref->must_insert_reserved = 1; + locked_ref->processing = 0; + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + btrfs_delayed_ref_unlock(locked_ref); + return ret; + } continue; } - list_del_init(&locked_ref->cluster); - locked_ref = NULL; + /* + * Need to drop our head ref lock and re-aqcuire the + * delayed ref lock and then re-check to make sure + * nobody got added. + */ + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&locked_ref->lock); + if (rb_first(&locked_ref->ref_root) || + locked_ref->extent_op) { + spin_unlock(&locked_ref->lock); + spin_unlock(&delayed_refs->lock); + continue; + } + ref->in_tree = 0; + delayed_refs->num_heads--; + rb_erase(&locked_ref->href_node, + &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + } else { + actual_count++; + ref->in_tree = 0; + rb_erase(&ref->rb_node, &locked_ref->ref_root); } + atomic_dec(&delayed_refs->num_entries); - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; - - spin_unlock(&delayed_refs->lock); + if (!btrfs_delayed_ref_is_head(ref)) { + /* + * when we play the delayed ref, also correct the + * ref_mod on head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->node.ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->node.ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); + } + } + spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); - BUG_ON(ret); + btrfs_free_delayed_extent_op(extent_op); + if (ret) { + locked_ref->processing = 0; + btrfs_delayed_ref_unlock(locked_ref); + btrfs_put_delayed_ref(ref); + btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); + return ret; + } + + /* + * If this node is a head, that means all the refs in this head + * have been dealt with, and we will pick the next head to deal + * with, so we must unlock the head and drop it from the cluster + * list before we release it. + */ + if (btrfs_delayed_ref_is_head(ref)) { + btrfs_delayed_ref_unlock(locked_ref); + locked_ref = NULL; + } btrfs_put_delayed_ref(ref); - kfree(extent_op); count++; - cond_resched(); + } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. + */ spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + avg = div64_u64(avg, 4); + fs_info->avg_delayed_ref_runtime = avg; + spin_unlock(&delayed_refs->lock); } - return count; + return 0; +} + +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int alt = 1; + u64 middle; + u64 first = 0, last = 0; + + n = rb_first(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + first = entry->bytenr; + } + n = rb_last(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + last = entry->bytenr; + } + n = root->rb_node; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + + middle = entry->bytenr; + + if (alt) + n = n->rb_left; + else + n = n->rb_right; + + alt = 1 - alt; + } + return middle; +} +#endif + +static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) +{ + u64 num_bytes; + + num_bytes = heads * (sizeof(struct btrfs_extent_item) + + sizeof(struct btrfs_extent_inline_ref)); + if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + num_bytes += heads * sizeof(struct btrfs_tree_block_info); + + /* + * We don't ever fill up leaves all the way so multiply by 2 just to be + * closer to what we're really going to want to ouse. + */ + return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *global_rsv; + u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; + u64 num_bytes; + int ret = 0; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + num_heads = heads_to_leaves(root, num_heads); + if (num_heads > 1) + num_bytes += (num_heads - 1) * root->leafsize; + num_bytes <<= 1; + global_rsv = &root->fs_info->global_block_rsv; + + /* + * If we can't allocate any more chunks lets make sure we have _lots_ of + * wiggle room since running delayed refs can create more delayed refs. + */ + if (global_rsv->space_info->full) + num_bytes <<= 1; + + spin_lock(&global_rsv->lock); + if (global_rsv->reserved <= num_bytes) + ret = 1; + spin_unlock(&global_rsv->lock); + return ret; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + u64 val; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + +struct async_delayed_refs { + struct btrfs_root *root; + int count; + int error; + int sync; + struct completion wait; + struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ + struct async_delayed_refs *async; + struct btrfs_trans_handle *trans; + int ret; + + async = container_of(work, struct async_delayed_refs, work); + + trans = btrfs_join_transaction(async->root); + if (IS_ERR(trans)) { + async->error = PTR_ERR(trans); + goto done; + } + + /* + * trans->sync means that when we call end_transaciton, we won't + * wait on delayed refs + */ + trans->sync = true; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); + if (ret) + async->error = ret; + + ret = btrfs_end_transaction(trans, async->root); + if (ret && !async->error) + async->error = ret; +done: + if (async->sync) + complete(&async->wait); + else + kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait) +{ + struct async_delayed_refs *async; + int ret; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->root = root->fs_info->tree_root; + async->count = count; + async->error = 0; + if (wait) + async->sync = 1; + else + async->sync = 0; + init_completion(&async->wait); + + btrfs_init_work(&async->work, delayed_ref_async_start, + NULL, NULL); + + btrfs_queue_work(root->fs_info->extent_workers, &async->work); + + if (wait) { + wait_for_completion(&async->wait); + ret = async->error; + kfree(async); + return ret; + } + return 0; } /* @@ -2075,97 +2769,101 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * 0, which means to process everything in the tree at the start * of the run (but not newly added entries), or it can be some target * number you'd like to process. + * + * Returns 0 on success or if called with an aborted transaction + * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct list_head cluster; + struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; int run_most = 0; + /* We'll clean this up in btrfs_cleanup_transaction */ + if (trans->aborted) + return 0; + if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); -again: - spin_lock(&delayed_refs->lock); if (count == 0) { - count = delayed_refs->num_entries * 2; + count = atomic_read(&delayed_refs->num_entries) * 2; run_most = 1; } - while (1) { - if (!(run_all || run_most) && - delayed_refs->num_heads_ready < 64) - break; - - /* - * go find something we can process in the rbtree. We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - ret = run_clustered_refs(trans, root, &cluster); - BUG_ON(ret < 0); - - count -= min_t(unsigned long, ret, count); - if (count == 0) - break; +again: +#ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif + ret = __btrfs_run_delayed_refs(trans, root, count); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + return ret; } if (run_all) { - node = rb_first(&delayed_refs->root); - if (!node) + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + spin_lock(&delayed_refs->lock); + node = rb_first(&delayed_refs->href_root); + if (!node) { + spin_unlock(&delayed_refs->lock); goto out; + } count = (unsigned long)-1; while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_is_head(&head->node)) { + struct btrfs_delayed_ref_node *ref; - head = btrfs_delayed_node_to_head(ref); + ref = &head->node; atomic_inc(&ref->refs); spin_unlock(&delayed_refs->lock); + /* + * Mutex was contended, block until it's + * released and try again + */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(ref); cond_resched(); goto again; + } else { + WARN_ON(1); } node = rb_next(node); } spin_unlock(&delayed_refs->lock); - schedule_timeout(1); + cond_resched(); goto again; } out: - spin_unlock(&delayed_refs->lock); + ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); + if (ret) + return ret; + assert_qgroups_uptodate(trans); return 0; } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data) + int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; int ret; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) return -ENOMEM; @@ -2173,10 +2871,12 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->update_key = 0; extent_op->is_data = is_data ? 1 : 0; + extent_op->level = level; - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, + num_bytes, extent_op); if (ret) - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); return ret; } @@ -2192,56 +2892,58 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct rb_node *node; int ret = 0; - ret = -ENOENT; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; + if (!head) { + spin_unlock(&delayed_refs->lock); + return 0; + } if (!mutex_trylock(&head->mutex)) { atomic_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); - btrfs_release_path(root->fs_info->extent_root, path); + btrfs_release_path(path); + /* + * Mutex was contended, block until it's released and let + * caller try again + */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); return -EAGAIN; } + spin_unlock(&delayed_refs->lock); - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; + spin_lock(&head->lock); + node = rb_first(&head->ref_root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } - data_ref = btrfs_delayed_node_to_data_ref(ref); + data_ref = btrfs_delayed_node_to_data_ref(ref); - node = rb_prev(node); - if (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr) - goto out_unlock; + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: + spin_unlock(&head->lock); mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); return ret; } @@ -2266,7 +2968,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = -ENOENT; if (path->slots[0] == 0) @@ -2347,133 +3049,15 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, ret = 0; out: btrfs_free_path(path); + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + WARN_ON(ret > 0); return ret; } -#if 0 -int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, u32 nr_extents) -{ - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - u64 root_gen; - u32 nritems; - int i; - int level; - int ret = 0; - int shared = 0; - - if (!root->ref_cows) - return 0; - - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - shared = 0; - root_gen = root->root_key.offset; - } else { - shared = 1; - root_gen = trans->transid - 1; - } - - level = btrfs_header_level(buf); - nritems = btrfs_header_nritems(buf); - - if (level == 0) { - struct btrfs_leaf_ref *ref; - struct btrfs_extent_info *info; - - ref = btrfs_alloc_leaf_ref(root, nr_extents); - if (!ref) { - ret = -ENOMEM; - goto out; - } - - ref->root_gen = root_gen; - ref->bytenr = buf->start; - ref->owner = btrfs_header_owner(buf); - ref->generation = btrfs_header_generation(buf); - ref->nritems = nr_extents; - info = ref->extents; - - for (i = 0; nr_extents > 0 && i < nritems; i++) { - u64 disk_bytenr; - btrfs_item_key_to_cpu(buf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(buf, i, - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(buf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (disk_bytenr == 0) - continue; - - info->bytenr = disk_bytenr; - info->num_bytes = - btrfs_file_extent_disk_num_bytes(buf, fi); - info->objectid = key.objectid; - info->offset = key.offset; - info++; - } - - ret = btrfs_add_leaf_ref(root, ref, shared); - if (ret == -EEXIST && shared) { - struct btrfs_leaf_ref *old; - old = btrfs_lookup_leaf_ref(root, ref->bytenr); - BUG_ON(!old); - btrfs_remove_leaf_ref(root, old); - btrfs_free_leaf_ref(root, old); - ret = btrfs_add_leaf_ref(root, ref, shared); - } - WARN_ON(ret); - btrfs_free_leaf_ref(root, ref); - } -out: - return ret; -} - -/* when a block goes through cow, we update the reference counts of - * everything that block points to. The internal pointers of the block - * can be in just about any order, and it is likely to have clusters of - * things that are close together and clusters of things that are not. - * - * To help reduce the seeks that come with updating all of these reference - * counts, sort them by byte number before actual updates are done. - * - * struct refsort is used to match byte number to slot in the btree block. - * we sort based on the byte number and then use the slot to actually - * find the item. - * - * struct refsort is smaller than strcut btrfs_item and smaller than - * struct btrfs_key_ptr. Since we're currently limited to the page size - * for a btree block, there's no way for a kmalloc of refsorts for a - * single node to be bigger than a page. - */ -struct refsort { - u64 bytenr; - u32 slot; -}; - -/* - * for passing into sort() - */ -static int refsort_cmp(const void *a_void, const void *b_void) -{ - const struct refsort *a = a_void; - const struct refsort *b = b_void; - - if (a->bytenr < b->bytenr) - return -1; - if (a->bytenr > b->bytenr) - return 1; - return 0; -} -#endif - static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + int full_backref, int inc, int no_quota) { u64 bytenr; u64 num_bytes; @@ -2486,13 +3070,17 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, int); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!root->ref_cows && level == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; if (inc) @@ -2523,34 +3111,34 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset); + key.offset, no_quota); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + parent, ref_root, level - 1, 0, + no_quota); if (ret) goto fail; } } return 0; fail: - BUG(); return ret; } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -2566,16 +3154,18 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); if (ret < 0) goto fail; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(extent_root, path); + btrfs_release_path(path); fail: - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); return ret; + } return 0; } @@ -2598,6 +3188,130 @@ next_block_group(struct btrfs_root *root, return cache; } +static int cache_save_setup(struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_root *root = block_group->fs_info->tree_root; + struct inode *inode = NULL; + u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; + int num_pages = 0; + int retries = 0; + int ret = 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->key.offset < (100 * 1024 * 1024)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + +again: + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(root, trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. + */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + WARN_ON(ret); + + if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(root, + &root->fs_info->global_block_rsv); + if (ret) + goto out_put; + + ret = btrfs_truncate_free_space_cache(root, trans, inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(root, SPACE_CACHE) || + block_group->delalloc_bytes) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option. + */ + dcs = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + /* + * Try to preallocate enough space based on how big the block group is. + * Keep in mind this has to include any pinned space which could end up + * taking up quite a bit since it's not folded into the other space + * cache. + */ + num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); + if (!num_pages) + num_pages = 1; + + num_pages *= 16; + num_pages *= PAGE_CACHE_SIZE; + + ret = btrfs_check_data_free_space(inode, num_pages); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, + num_pages, num_pages, + &alloc_hint); + if (!ret) + dcs = BTRFS_DC_SETUP; + btrfs_free_reserved_data_space(inode, num_pages); + +out_put: + iput(inode); +out_free: + btrfs_release_path(path); +out: + spin_lock(&block_group->lock); + if (!ret && dcs == BTRFS_DC_SETUP) + block_group->cache_generation = trans->transid; + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); + + return ret; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -2610,15 +3324,40 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + err = cache_save_setup(cache, trans, path); + last = cache->key.objectid + cache->key.offset; + btrfs_put_block_group(cache); + } + while (1) { if (last == 0) { err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(err); + if (err) /* File system offline */ + goto out; } cache = btrfs_lookup_first_block_group(root->fs_info, last); while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) { + btrfs_put_block_group(cache); + goto again; + } + if (cache->dirty) break; cache = next_block_group(root, cache); @@ -2630,16 +3369,67 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, continue; } + if (cache->disk_cache_state == BTRFS_DC_SETUP) + cache->disk_cache_state = BTRFS_DC_NEED_WRITE; cache->dirty = 0; last = cache->key.objectid + cache->key.offset; err = write_one_cache_group(trans, root, path, cache); - BUG_ON(err); + btrfs_put_block_group(cache); + if (err) /* File system offline */ + goto out; + } + + while (1) { + /* + * I don't think this is needed since we're just marking our + * preallocated extent as written, but just in case it can't + * hurt. + */ + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + if (err) /* File system offline */ + goto out; + } + + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + /* + * Really this shouldn't happen, but it could if we + * couldn't write the entire preallocated extent and + * splitting the extent resulted in a new block. + */ + if (cache->dirty) { + btrfs_put_block_group(cache); + goto again; + } + if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + + err = btrfs_write_out_cache(root, trans, cache, path); + + /* + * If we didn't have an error then the cache state is still + * NEED_WRITE, so we can set it to WRITTEN. + */ + if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + cache->disk_cache_state = BTRFS_DC_WRITTEN; + last = cache->key.objectid + cache->key.offset; btrfs_put_block_group(cache); } +out: btrfs_free_path(path); - return 0; + return err; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -2655,17 +3445,45 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; + int i; + int factor; + int ret; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; found = __find_space_info(info, flags); if (found) { spin_lock(&found->lock); found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; found->full = 0; spin_unlock(&found->lock); *space_info = found; @@ -2675,812 +3493,1926 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - INIT_LIST_HEAD(&found->block_groups); + ret = percpu_counter_init(&found->total_bytes_pinned, 0); + if (ret) { + kfree(found); + return ret; + } + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); - init_waitqueue_head(&found->flush_wait); - init_waitqueue_head(&found->allocate_wait); spin_lock_init(&found->lock); - found->flags = flags; + found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; found->total_bytes = total_bytes; + found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; + found->disk_used = bytes_used * factor; found->bytes_pinned = 0; found->bytes_reserved = 0; found->bytes_readonly = 0; - found->bytes_delalloc = 0; + found->bytes_may_use = 0; found->full = 0; - found->force_alloc = 0; + found->force_alloc = CHUNK_ALLOC_NO_FORCE; + found->chunk_alloc = 0; + found->flush = 0; + init_waitqueue_head(&found->wait); + + ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(found->flags)); + if (ret) { + kfree(found); + return ret; + } + *space_info = found; list_add_rcu(&found->list, &info->space_info); - atomic_set(&found->caching_threads, 0); - return 0; + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = found; + + return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP); - if (extra_flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; - } + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); } -static void set_block_group_readonly(struct btrfs_block_group_cache *cache) +/* + * returns target flags in extended format or 0 if restripe for this + * chunk_type is not in progress + * + * should be called with either volume_mutex or balance_lock held + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (!cache->ro) { - cache->space_info->bytes_readonly += cache->key.offset - - btrfs_block_group_used(&cache->item); - cache->ro = 1; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + + return target; } -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Returns reduced profile in chunk format. If profile changing is in + * progress (either running or paused) picks the target profile (if it's + * already available), otherwise falls back to plain reducing. + */ +static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { - u64 num_devices = root->fs_info->fs_devices->rw_devices; + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. + */ + u64 num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; + u64 target; + u64 tmp; + /* + * see if restripe for this chunk_type is in progress, if so + * try to reduce to the target profile + */ + spin_lock(&root->fs_info->balance_lock); + target = get_restripe_target(root->fs_info, flags); + if (target) { + /* pick target profile only if it's already available */ + if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { + spin_unlock(&root->fs_info->balance_lock); + return extended_to_chunk(target); + } + } + spin_unlock(&root->fs_info->balance_lock); + + /* First, mask out the RAID levels which aren't possible */ if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5); + if (num_devices < 3) + flags &= ~BTRFS_BLOCK_GROUP_RAID6; if (num_devices < 4) flags &= ~BTRFS_BLOCK_GROUP_RAID10; - if ((flags & BTRFS_BLOCK_GROUP_DUP) && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) { - flags &= ~BTRFS_BLOCK_GROUP_DUP; + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); + flags &= ~tmp; + + if (tmp & BTRFS_BLOCK_GROUP_RAID6) + tmp = BTRFS_BLOCK_GROUP_RAID6; + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) + tmp = BTRFS_BLOCK_GROUP_RAID5; + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) + tmp = BTRFS_BLOCK_GROUP_RAID10; + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) + tmp = BTRFS_BLOCK_GROUP_RAID1; + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) + tmp = BTRFS_BLOCK_GROUP_RAID0; + + return extended_to_chunk(flags | tmp); +} + +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) +{ + unsigned seq; + u64 flags; + + do { + flags = orig_flags; + seq = read_seqbegin(&root->fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); + + return btrfs_reduce_alloc_profile(root, flags); +} + +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +{ + u64 flags; + u64 ret; + + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == root->fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + + ret = get_alloc_profile(root, flags); + return ret; +} + +/* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 used; + int ret = 0, committed = 0, alloc_chunk = 1; + + /* make sure bytes are sectorsize aligned */ + bytes = ALIGN(bytes, root->sectorsize); + + if (btrfs_is_free_space_inode(inode)) { + committed = 1; + ASSERT(current->journal_info); } - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && - (flags & BTRFS_BLOCK_GROUP_RAID10)) { - flags &= ~BTRFS_BLOCK_GROUP_RAID1; + data_sinfo = fs_info->data_sinfo; + if (!data_sinfo) + goto alloc; + +again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + + data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + + data_sinfo->bytes_may_use; + + if (used + bytes > data_sinfo->total_bytes) { + struct btrfs_trans_handle *trans; + + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full && alloc_chunk) { + u64 alloc_target; + + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&data_sinfo->lock); +alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + alloc_target, + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else + goto commit_trans; + } + + if (!data_sinfo) + data_sinfo = fs_info->data_sinfo; + + goto again; + } + + /* + * If we don't have enough pinned space to deal with this + * allocation don't bother committing the transaction. + */ + if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, + bytes) < 0) + committed = 1; + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ +commit_trans: + if (!committed && + !atomic_read(&root->fs_info->open_ioctl_trans)) { + committed = 1; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); + return -ENOSPC; } + data_sinfo->bytes_may_use += bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + data_sinfo->flags, bytes, 1); + spin_unlock(&data_sinfo->lock); - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && - ((flags & BTRFS_BLOCK_GROUP_RAID1) | - (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) - flags &= ~BTRFS_BLOCK_GROUP_RAID0; - return flags; + return 0; } -static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) +/* + * Called if we need to clear a data reservation for this inode. + */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { - struct btrfs_fs_info *info = root->fs_info; - u64 alloc_profile; - - if (data) { - alloc_profile = info->avail_data_alloc_bits & - info->data_alloc_profile; - data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; - } else if (root == root->fs_info->chunk_root) { - alloc_profile = info->avail_system_alloc_bits & - info->system_alloc_profile; - data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; - } else { - alloc_profile = info->avail_metadata_alloc_bits & - info->metadata_alloc_profile; - data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; - } + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_space_info *data_sinfo; - return btrfs_reduce_alloc_profile(root, data); + /* make sure bytes are sectorsize aligned */ + bytes = ALIGN(bytes, root->sectorsize); + + data_sinfo = root->fs_info->data_sinfo; + spin_lock(&data_sinfo->lock); + WARN_ON(data_sinfo->bytes_may_use < bytes); + data_sinfo->bytes_may_use -= bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + data_sinfo->flags, bytes, 0); + spin_unlock(&data_sinfo->lock); } -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) +static void force_metadata_allocation(struct btrfs_fs_info *info) { - u64 alloc_target; + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; - alloc_target = btrfs_get_alloc_profile(root, 1); - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - alloc_target); + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = CHUNK_ALLOC_FORCE; + } + rcu_read_unlock(); } -static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) { - u64 num_bytes; - int level; + return (global->size << 1); +} + +static int should_alloc_chunk(struct btrfs_root *root, + struct btrfs_space_info *sinfo, int force) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; + u64 thresh; + + if (force == CHUNK_ALLOC_FORCE) + return 1; - level = BTRFS_MAX_LEVEL - 2; /* - * NOTE: these calculations are absolutely the worst possible case. - * This assumes that _every_ item we insert will require a new leaf, and - * that the tree has grown to its maximum level size. + * We need to take into account the global rsv because for all intents + * and purposes it's used space. Don't worry about locking the + * global_rsv, it doesn't change except when the transaction commits. */ + if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) + num_allocated += calc_global_rsv_need_space(global_rsv); /* - * for every item we insert we could insert both an extent item and a - * extent ref item. Then for ever item we insert, we will need to cow - * both the original leaf, plus the leaf to the left and right of it. - * - * Unless we are talking about the extent root, then we just want the - * number of items * 2, since we just need the extent item plus its ref. + * in limited mode, we want to have some free space up to + * about 1% of the FS size. */ - if (root == root->fs_info->extent_root) - num_bytes = num_items * 2; + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); + thresh = max_t(u64, 64 * 1024 * 1024, + div_factor_fine(thresh, 1)); + + if (num_bytes - num_allocated < thresh) + return 1; + } + + if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) + return 0; + return 1; +} + +static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +{ + u64 num_dev; + + if (type & (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) + num_dev = root->fs_info->fs_devices->rw_devices; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + num_dev = 2; else - num_bytes = (num_items + (2 * num_items)) * 3; + num_dev = 1; /* DUP or single */ - /* - * num_bytes is total number of leaves we could need times the leaf - * size, and then for every leaf we could end up cow'ing 2 nodes per - * level, down to the leaf level. - */ - num_bytes = (num_bytes * root->leafsize) + - (num_bytes * (level * 2)) * root->nodesize; + /* metadata for updaing devices and chunk tree */ + return btrfs_calc_trans_metadata_size(root, num_dev + 1); +} - return num_bytes; +static void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + struct btrfs_space_info *info; + u64 left; + u64 thresh; + + info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly; + spin_unlock(&info->lock); + + thresh = get_system_chunk_thresh(root, type); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", + left, thresh, type); + dump_space_info(info, 0, 0); + } + + if (left < thresh) { + u64 flags; + + flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); + btrfs_alloc_chunk(trans, root, flags); + } } -/* - * Unreserve metadata space for delalloc. If we have less reserved credits than - * we have extents, this function does nothing. - */ -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 flags, int force) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); - - spin_lock(&meta_sinfo->lock); - spin_lock(&BTRFS_I(inode)->accounting_lock); - if (BTRFS_I(inode)->reserved_extents <= - BTRFS_I(inode)->outstanding_extents) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - spin_unlock(&meta_sinfo->lock); - return 0; + struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; + int wait_for_alloc = 0; + int ret = 0; + + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); /* -ENOMEM */ } - spin_unlock(&BTRFS_I(inode)->accounting_lock); + BUG_ON(!space_info); /* Logic error */ - BTRFS_I(inode)->reserved_extents -= num_items; - BUG_ON(BTRFS_I(inode)->reserved_extents < 0); +again: + spin_lock(&space_info->lock); + if (force < space_info->force_alloc) + force = space_info->force_alloc; + if (space_info->full) { + if (should_alloc_chunk(extent_root, space_info, force)) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&space_info->lock); + return ret; + } - if (meta_sinfo->bytes_delalloc < num_bytes) { - bug = true; - meta_sinfo->bytes_delalloc = 0; + if (!should_alloc_chunk(extent_root, space_info, force)) { + spin_unlock(&space_info->lock); + return 0; + } else if (space_info->chunk_alloc) { + wait_for_alloc = 1; } else { - meta_sinfo->bytes_delalloc -= num_bytes; + space_info->chunk_alloc = 1; } - spin_unlock(&meta_sinfo->lock); - BUG_ON(bug); + spin_unlock(&space_info->lock); - return 0; + mutex_lock(&fs_info->chunk_mutex); + + /* + * The chunk_mutex is held throughout the entirety of a chunk + * allocation, so once we've acquired the chunk_mutex we know that the + * other guy is done and we need to recheck and see if we should + * allocate. + */ + if (wait_for_alloc) { + mutex_unlock(&fs_info->chunk_mutex); + wait_for_alloc = 0; + goto again; + } + + trans->allocating_chunk = true; + + /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + + /* + * Check if we have enough space in SYSTEM chunk because we may need + * to update devices. + */ + check_system_chunk(trans, extent_root, flags); + + ret = btrfs_alloc_chunk(trans, extent_root, flags); + trans->allocating_chunk = false; + + spin_lock(&space_info->lock); + if (ret < 0 && ret != -ENOSPC) + goto out; + if (ret) + space_info->full = 1; + else + ret = 1; + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&fs_info->chunk_mutex); + return ret; } -static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +static int can_overcommit(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) { - u64 thresh; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 space_size; + u64 avail; + u64 used; + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly; + + /* + * We only want to allow over committing if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. + */ + spin_lock(&global_rsv->lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(&global_rsv->lock); + if (used + space_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; - thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use; + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + spin_unlock(&root->fs_info->free_chunk_lock); - thresh = meta_sinfo->total_bytes - thresh; - thresh *= 80; - do_div(thresh, 100); - if (thresh <= meta_sinfo->bytes_delalloc) - meta_sinfo->force_delalloc = 1; + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + avail >>= 3; else - meta_sinfo->force_delalloc = 0; + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; } -struct async_flush { - struct btrfs_root *root; - struct btrfs_space_info *info; - struct btrfs_work work; -}; +static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, + unsigned long nr_pages, int nr_items) +{ + struct super_block *sb = root->fs_info->sb; + + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); + if (!current->journal_info) + btrfs_wait_ordered_roots(root->fs_info, nr_items); + } +} -static noinline void flush_delalloc_async(struct btrfs_work *work) +static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) { - struct async_flush *async; - struct btrfs_root *root; - struct btrfs_space_info *info; + u64 bytes; + int nr; - async = container_of(work, struct async_flush, work); - root = async->root; - info = async->info; + bytes = btrfs_calc_trans_metadata_size(root, 1); + nr = (int)div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} - btrfs_start_delalloc_inodes(root, 0); - wake_up(&info->flush_wait); - btrfs_wait_ordered_extents(root, 0, 0); +#define EXTENT_SIZE_PER_ITEM (256 * 1024) - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); +/* + * shrink metadata reservation for delalloc + */ +static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, + bool wait_ordered) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; + u64 delalloc_bytes; + u64 max_reclaim; + long time_left; + unsigned long nr_pages; + int loops; + int items; + enum btrfs_reserve_flush_enum flush; + + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(root, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + + trans = (struct btrfs_trans_handle *)current->journal_info; + block_rsv = &root->fs_info->delalloc_block_rsv; + space_info = block_rsv->space_info; + + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); + if (delalloc_bytes == 0) { + if (trans) + return; + if (wait_ordered) + btrfs_wait_ordered_roots(root->fs_info, items); + return; + } + + loops = 0; + while (delalloc_bytes && loops < 3) { + max_reclaim = min(delalloc_bytes, to_reclaim); + nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; + btrfs_writeback_inodes_sb_nr(root, nr_pages, items); + /* + * We need to wait for the async pages to actually start before + * we do anything. + */ + max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); + if (!max_reclaim) + goto skip_async; - kfree(async); + if (max_reclaim <= nr_pages) + max_reclaim = 0; + else + max_reclaim -= nr_pages; + + wait_event(root->fs_info->async_submit_wait, + atomic_read(&root->fs_info->async_delalloc_pages) <= + (int)max_reclaim); +skip_async: + if (!trans) + flush = BTRFS_RESERVE_FLUSH_ALL; + else + flush = BTRFS_RESERVE_NO_FLUSH; + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, orig, flush)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(root->fs_info, items); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); + } } -static void wait_on_flush(struct btrfs_space_info *info) +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. + */ +static int may_commit_transaction(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 bytes, int force) { - DEFINE_WAIT(wait); - u64 used; + struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; + struct btrfs_trans_handle *trans; - while (1) { - prepare_to_wait(&info->flush_wait, &wait, - TASK_UNINTERRUPTIBLE); - spin_lock(&info->lock); - if (!info->flushing) { - spin_unlock(&info->lock); + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + if (force) + goto commit; + + /* See if there is enough pinned space to make this reservation */ + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes) >= 0) + goto commit; + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + return -ENOSPC; + + spin_lock(&delayed_rsv->lock); + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes - delayed_rsv->size) >= 0) { + spin_unlock(&delayed_rsv->lock); + return -ENOSPC; + } + spin_unlock(&delayed_rsv->lock); + +commit: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return -ENOSPC; + + return btrfs_commit_transaction(trans, root); +} + +enum flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELALLOC = 3, + FLUSH_DELALLOC_WAIT = 4, + ALLOC_CHUNK = 5, + COMMIT_TRANS = 6, +}; + +static int flush_space(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 num_bytes, + u64 orig_bytes, int state) +{ + struct btrfs_trans_handle *trans; + int nr; + int ret = 0; + + switch (state) { + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(root, num_bytes) * 2; + else + nr = -1; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); break; } - - used = info->bytes_used + info->bytes_reserved + - info->bytes_pinned + info->bytes_readonly + - info->bytes_super + info->bytes_root + - info->bytes_may_use + info->bytes_delalloc; - if (used < info->total_bytes) { - spin_unlock(&info->lock); + ret = btrfs_run_delayed_items_nr(trans, root, nr); + btrfs_end_transaction(trans, root); + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(root, num_bytes * 2, orig_bytes, + state == FLUSH_DELALLOC_WAIT); + break; + case ALLOC_CHUNK: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); break; } - spin_unlock(&info->lock); - schedule(); + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret == -ENOSPC) + ret = 0; + break; + case COMMIT_TRANS: + ret = may_commit_transaction(root, space_info, orig_bytes, 0); + break; + default: + ret = -ENOSPC; + break; } - finish_wait(&info->flush_wait, &wait); + + return ret; } -static void flush_delalloc(struct btrfs_root *root, - struct btrfs_space_info *info) +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, + struct btrfs_space_info *space_info) { - struct async_flush *async; - bool wait = false; + u64 used; + u64 expected; + u64 to_reclaim; - spin_lock(&info->lock); + to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, + 16 * 1024 * 1024); + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) { + to_reclaim = 0; + goto out; + } - if (!info->flushing) - info->flushing = 1; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (can_overcommit(root, space_info, 1024 * 1024, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); else - wait = true; + expected = div_factor_fine(space_info->total_bytes, 90); - spin_unlock(&info->lock); + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); +out: + spin_unlock(&space_info->lock); - if (wait) { - wait_on_flush(info); - return; + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, u64 used) +{ + return (used >= div_factor_fine(space_info->total_bytes, 98) && + !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info) +{ + u64 used; + + spin_lock(&space_info->lock); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (need_do_async_reclaim(space_info, fs_info, used)) { + spin_unlock(&space_info->lock); + return 1; } + spin_unlock(&space_info->lock); - async = kzalloc(sizeof(*async), GFP_NOFS); - if (!async) - goto flush; + return 0; +} - async->root = root; - async->info = info; - async->work.func = flush_delalloc_async; +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; - btrfs_queue_worker(&root->fs_info->enospc_workers, - &async->work); - wait_on_flush(info); - return; + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); -flush: - btrfs_start_delalloc_inodes(root, 0); - btrfs_wait_ordered_extents(root, 0, 0); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) + return; - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + flush_state++; + if (!btrfs_need_do_async_reclaim(space_info, fs_info)) + return; + } while (flush_state <= COMMIT_TRANS); + + if (btrfs_need_do_async_reclaim(space_info, fs_info)) + queue_work(system_unbound_wq, work); } -static int maybe_allocate_chunk(struct btrfs_root *root, - struct btrfs_space_info *info) +void btrfs_init_async_reclaim_work(struct work_struct *work) { - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; - struct btrfs_trans_handle *trans; - bool wait = false; + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 used; + u64 num_bytes = orig_bytes; + int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; - u64 min_metadata; - u64 free_space; + bool flushing = false; - free_space = btrfs_super_total_bytes(disk_super); +again: + ret = 0; + spin_lock(&space_info->lock); /* - * we allow the metadata to grow to a max of either 10gb or 5% of the - * space in the volume. + * We only want to wait if somebody other than us is flushing and we + * are actually allowed to flush all things. */ - min_metadata = min((u64)10 * 1024 * 1024 * 1024, - div64_u64(free_space * 5, 100)); - if (info->total_bytes >= min_metadata) { - spin_unlock(&info->lock); - return 0; - } + while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && + space_info->flush) { + spin_unlock(&space_info->lock); + /* + * If we have a trans handle we can't wait because the flusher + * may have to commit the transaction, which would mean we would + * deadlock since we are waiting for the flusher to finish, but + * hold the current transaction open. + */ + if (current->journal_info) + return -EAGAIN; + ret = wait_event_killable(space_info->wait, !space_info->flush); + /* Must have been killed, return */ + if (ret) + return -EINTR; - if (info->full) { - spin_unlock(&info->lock); - return 0; + spin_lock(&space_info->lock); } - if (!info->allocating_chunk) { - info->force_alloc = 1; - info->allocating_chunk = 1; + ret = -ENOSPC; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + + /* + * The idea here is that we've not already over-reserved the block group + * then we can go ahead and save our reservation first and then start + * flushing if we need to. Otherwise if we've already overcommitted + * lets start flushing stuff first and then come back and try to make + * our reservation. + */ + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", space_info->flags, orig_bytes, 1); + ret = 0; + } else { + /* + * Ok set num_bytes to orig_bytes since we aren't + * overocmmitted, this way we only try and reclaim what + * we need. + */ + num_bytes = orig_bytes; + } } else { - wait = true; + /* + * Ok we're over committed, set num_bytes to the overcommitted + * amount plus the amount of bytes that we need for this + * reservation. + */ + num_bytes = used - space_info->total_bytes + + (orig_bytes * 2); } - spin_unlock(&info->lock); - - if (wait) { - wait_event(info->allocate_wait, - !info->allocating_chunk); - return 1; + if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; } - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; + /* + * Couldn't make our reservation, save our place so while we're trying + * to reclaim space we can actually use it instead of somebody else + * stealing it from us. + * + * We make the other tasks wait for the flush only when we can flush + * all things. + */ + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + flushing = true; + space_info->flush = 1; + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + if (need_do_async_reclaim(space_info, root->fs_info, used) && + !work_busy(&root->fs_info->async_reclaim_work)) + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); } + spin_unlock(&space_info->lock); - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 4096 + 2 * 1024 * 1024, - info->flags, 0); - btrfs_end_transaction(trans, root); - if (ret) + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) goto out; + + ret = flush_space(root, space_info, num_bytes, orig_bytes, + flush_state); + flush_state++; + + /* + * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock + * would happen. So skip delalloc flush. + */ + if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT)) + flush_state = ALLOC_CHUNK; + + if (!ret) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + flush_state < COMMIT_TRANS) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_ALL && + flush_state <= COMMIT_TRANS) + goto again; + out: - spin_lock(&info->lock); - info->allocating_chunk = 0; - spin_unlock(&info->lock); - wake_up(&info->allocate_wait); + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + struct btrfs_block_rsv *global_rsv = + &root->fs_info->global_block_rsv; - if (ret) - return 0; - return 1; + if (block_rsv != global_rsv && + !block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } + if (ret == -ENOSPC) + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + space_info->flags, orig_bytes, 1); + if (flushing) { + spin_lock(&space_info->lock); + space_info->flush = 0; + wake_up_all(&space_info->wait); + spin_unlock(&space_info->lock); + } + return ret; } -/* - * Reserve metadata space for delalloc. - */ -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int flushed = 0; - int force_delalloc; + struct btrfs_block_rsv *block_rsv = NULL; - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + block_rsv = trans->block_rsv; - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); -again: - spin_lock(&meta_sinfo->lock); + if (root == root->fs_info->csum_root && trans->adding_csums) + block_rsv = trans->block_rsv; - force_delalloc = meta_sinfo->force_delalloc; + if (root == root->fs_info->uuid_root) + block_rsv = trans->block_rsv; - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + if (!block_rsv) + block_rsv = root->block_rsv; - if (!flushed) - meta_sinfo->bytes_delalloc += num_bytes; + if (!block_rsv) + block_rsv = &root->fs_info->empty_block_rsv; - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + return block_rsv; +} - if (used > meta_sinfo->total_bytes) { - flushed++; +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret = -ENOSPC; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} - if (flushed == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - flushed++; - } else { - spin_unlock(&meta_sinfo->lock); - } +static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} - if (flushed == 2) { - filemap_flush(inode->i_mapping); - goto again; - } else if (flushed == 3) { - flush_delalloc(root, meta_sinfo); - goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_delalloc -= num_bytes; - spin_unlock(&meta_sinfo->lock); - printk(KERN_ERR "enospc, has %d, reserved %d\n", - BTRFS_I(inode)->outstanding_extents, - BTRFS_I(inode)->reserved_extents); - dump_space_info(meta_sinfo, 0, 0); +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); return -ENOSPC; } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + block_rsv_add_bytes(dest, num_bytes, 1); + return 0; +} - BTRFS_I(inode)->reserved_extents += num_items; - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); +static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; - if (!flushed && force_delalloc) - filemap_flush(inode->i_mapping); + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) + num_bytes = block_rsv->size; + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (num_bytes > 0) { + if (dest) { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) { + spin_lock(&space_info->lock); + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); + } + } +} + +static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes) +{ + int ret; + + ret = block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + block_rsv_add_bytes(dst, num_bytes, 1); return 0; } -/* - * unreserve num_items number of items worth of metadata space. This needs to - * be paired with btrfs_reserve_metadata_space. - * - * NOTE: if you have the option, run this _AFTER_ you do a - * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref - * oprations which will result in more used metadata, so we want to make sure we - * can do that without issue. - */ -int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + rsv->type = type; +} - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; - num_bytes = calculate_bytes_needed(root, num_items); + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; - spin_lock(&meta_sinfo->lock); - if (meta_sinfo->bytes_may_use < num_bytes) { - bug = true; - meta_sinfo->bytes_may_use = 0; - } else { - meta_sinfo->bytes_may_use -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); + btrfs_init_block_rsv(block_rsv, type); + block_rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + return block_rsv; +} - BUG_ON(bug); +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + if (!rsv) + return; + btrfs_block_rsv_release(root, rsv, (u64)-1); + kfree(rsv); +} - return 0; +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + return ret; } -/* - * Reserve some metadata space for use. We'll calculate the worste case number - * of bytes that would be needed to modify num_items number of items. If we - * have space, fantastic, if not, you get -ENOSPC. Please call - * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of - * items you reserved, since whatever metadata you needed should have already - * been allocated. - * - * This will commit the transaction to make more space if we don't have enough - * metadata space. THe only time we don't do this is if we're reserving space - * inside of a transaction, then we will just return -ENOSPC and it is the - * callers responsibility to handle it properly. - */ -int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int retries = 0; + u64 num_bytes = 0; + int ret = -ENOSPC; - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); + if (!block_rsv) + return 0; - num_bytes = calculate_bytes_needed(root, num_items); -again: - spin_lock(&meta_sinfo->lock); + spin_lock(&block_rsv->lock); + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + return ret; +} - if (!retries) - meta_sinfo->bytes_may_use += num_bytes; +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + if (!block_rsv) + return 0; - if (used > meta_sinfo->total_bytes) { - retries++; - if (retries == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - retries++; - } else { - spin_unlock(&meta_sinfo->lock); - } + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) + ret = 0; + else + num_bytes -= block_rsv->reserved; + spin_unlock(&block_rsv->lock); - if (retries == 2) { - flush_delalloc(root, meta_sinfo); - goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_may_use -= num_bytes; - spin_unlock(&meta_sinfo->lock); + if (!ret) + return 0; - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; } - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); + return ret; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes) +{ + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} - return 0; +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + if (global_rsv == block_rsv || + block_rsv->space_info != global_rsv->space_info) + global_rsv = NULL; + block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, + num_bytes); } /* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. + * helper to calculate size of global block reservation. + * the desired value is sum of space used by extent tree, + * checksum tree and root tree */ -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) +static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) { - struct btrfs_space_info *data_sinfo; - int ret = 0, committed = 0; + struct btrfs_space_info *sinfo; + u64 num_bytes; + u64 meta_used; + u64 data_used; + int csum_size = btrfs_super_csum_size(fs_info->super_copy); - /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + spin_lock(&sinfo->lock); + data_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); - data_sinfo = BTRFS_I(inode)->space_info; - if (!data_sinfo) - goto alloc; + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&sinfo->lock); + if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) + data_used = 0; + meta_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); -again: - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - if (data_sinfo->total_bytes - data_sinfo->bytes_used - - data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - - data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { - struct btrfs_trans_handle *trans; + num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * + csum_size * 2; + num_bytes += div64_u64(data_used + meta_used, 50); - /* - * if we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; + if (num_bytes * 3 > meta_used) + num_bytes = div64_u64(meta_used, 3); - data_sinfo->force_alloc = 1; - spin_unlock(&data_sinfo->lock); -alloc: - alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; + return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); +} - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - bytes + 2 * 1024 * 1024, - alloc_target, 0); - btrfs_end_transaction(trans, root); - if (ret) - return ret; +static void update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; - if (!data_sinfo) { - btrfs_set_inode_space_info(root, inode); - data_sinfo = BTRFS_I(inode)->space_info; - } - goto again; - } - spin_unlock(&data_sinfo->lock); + num_bytes = calc_global_metadata_size(fs_info); - /* commit the current transaction and try again */ - if (!committed && !root->fs_info->open_ioctl_trans) { - committed = 1; - trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - goto again; - } + spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); - printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" - ", %llu bytes_used, %llu bytes_reserved, " - "%llu bytes_pinned, %llu bytes_readonly, %llu may use " - "%llu total\n", (unsigned long long)bytes, - (unsigned long long)data_sinfo->bytes_delalloc, - (unsigned long long)data_sinfo->bytes_used, - (unsigned long long)data_sinfo->bytes_reserved, - (unsigned long long)data_sinfo->bytes_pinned, - (unsigned long long)data_sinfo->bytes_readonly, - (unsigned long long)data_sinfo->bytes_may_use, - (unsigned long long)data_sinfo->total_bytes); - return -ENOSPC; + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); + + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + + sinfo->bytes_reserved + sinfo->bytes_readonly + + sinfo->bytes_may_use; + + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + block_rsv->reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + sinfo->flags, num_bytes, 1); } - data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; - spin_unlock(&data_sinfo->lock); - return 0; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + sinfo->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + sinfo->flags, num_bytes, 0); + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } + + spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); } -/* - * if there was an error for whatever reason after calling - * btrfs_check_data_free_space, call this so we can cleanup the counters. - */ -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes) +static void init_global_block_rsv(struct btrfs_fs_info *fs_info) { - struct btrfs_space_info *data_sinfo; + struct btrfs_space_info *space_info; - /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; - data_sinfo = BTRFS_I(inode)->space_info; - spin_lock(&data_sinfo->lock); - data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; - spin_unlock(&data_sinfo->lock); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->delalloc_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; + + fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; + fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + update_global_block_rsv(fs_info); } -/* called when we are adding a delalloc extent to the inode's io_tree */ -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) +static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { - struct btrfs_space_info *data_sinfo; + block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, + (u64)-1); + WARN_ON(fs_info->delalloc_block_rsv.size > 0); + WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); +} - /* get the space info for where this inode will be storing its data */ - data_sinfo = BTRFS_I(inode)->space_info; +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans->block_rsv) + return; - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - data_sinfo->bytes_delalloc += bytes; + if (!trans->bytes_reserved) + return; + + trace_btrfs_space_reservation(root->fs_info, "transaction", + trans->transid, trans->bytes_reserved, 0); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); + trans->bytes_reserved = 0; +} + +/* Can only return 0 or -ENOSPC */ +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; /* - * we are adding a delalloc extent without calling - * btrfs_check_data_free_space first. This happens on a weird - * writepage condition, but shouldn't hurt our accounting + * We need to hold space in order to delete our orphan item once we've + * added it, so this takes the reservation so we can release it later + * when we are truly done with the orphan item. */ - if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { - data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; - BTRFS_I(inode)->reserved_bytes = 0; + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 1); + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_orphan_release_metadata(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 0); + btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); +} + +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int items, + u64 *qgroup_reserved, + bool use_global_rsv) +{ + u64 num_bytes; + int ret; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + + if (root->fs_info->quota_enabled) { + /* One for parent inode, two for dir entries */ + num_bytes = 3 * root->leafsize; + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret) + return ret; } else { - data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; + num_bytes = 0; } - spin_unlock(&data_sinfo->lock); + *qgroup_reserved = num_bytes; + + num_bytes = btrfs_calc_trans_metadata_size(root, items); + rsv->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + + if (ret) { + if (*qgroup_reserved) + btrfs_qgroup_free(root, *qgroup_reserved); + } + + return ret; } -/* called when we are clearing an delalloc extent from the inode's io_tree */ -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved) { - struct btrfs_space_info *info; + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); +} + +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written. This will return the number of + * reserved extents that need to be freed. This must be called with + * BTRFS_I(inode)->lock held. + */ +static unsigned drop_outstanding_extent(struct inode *inode) +{ + unsigned drop_inode_space = 0; + unsigned dropped_extents = 0; - info = BTRFS_I(inode)->space_info; + BUG_ON(!BTRFS_I(inode)->outstanding_extents); + BTRFS_I(inode)->outstanding_extents--; - spin_lock(&info->lock); - info->bytes_delalloc -= bytes; - spin_unlock(&info->lock); -} + if (BTRFS_I(inode)->outstanding_extents == 0 && + test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) + drop_inode_space = 1; -static void force_metadata_allocation(struct btrfs_fs_info *info) + /* + * If we have more or the same amount of outsanding extents than we have + * reserved then we need to leave the reserved extents count alone. + */ + if (BTRFS_I(inode)->outstanding_extents >= + BTRFS_I(inode)->reserved_extents) + return drop_inode_space; + + dropped_extents = BTRFS_I(inode)->reserved_extents - + BTRFS_I(inode)->outstanding_extents; + BTRFS_I(inode)->reserved_extents -= dropped_extents; + return dropped_extents + drop_inode_space; +} + +/** + * calc_csum_metadata_size - return the amount of metada space that must be + * reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed. We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required. If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved. If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. + */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, + int reserve) { - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 csum_size; + int num_csums_per_leaf; + int num_csums; + int old_csums; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - found->force_alloc = 1; - } - rcu_read_unlock(); + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && + BTRFS_I(inode)->csum_bytes == 0) + return 0; + + old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + if (reserve) + BTRFS_I(inode)->csum_bytes += num_bytes; + else + BTRFS_I(inode)->csum_bytes -= num_bytes; + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = (int)div64_u64(csum_size, + sizeof(struct btrfs_csum_item) + + sizeof(struct btrfs_disk_key)); + num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + num_csums = num_csums + num_csums_per_leaf - 1; + num_csums = num_csums / num_csums_per_leaf; + + old_csums = old_csums + num_csums_per_leaf - 1; + old_csums = old_csums / num_csums_per_leaf; + + /* No change, no need to reserve more */ + if (old_csums == num_csums) + return 0; + + if (reserve) + return btrfs_calc_trans_metadata_size(root, + num_csums - old_csums); + + return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); } -static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force) +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) { - struct btrfs_space_info *space_info; - struct btrfs_fs_info *fs_info = extent_root->fs_info; - u64 thresh; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + u64 to_reserve = 0; + u64 csum_bytes; + unsigned nr_extents = 0; + int extra_reserve = 0; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; + bool delalloc_lock = true; + u64 to_free = 0; + unsigned dropped; + + /* If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + */ + if (btrfs_is_free_space_inode(inode)) { + flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } - mutex_lock(&fs_info->chunk_mutex); + if (flush != BTRFS_RESERVE_NO_FLUSH && + btrfs_transaction_in_commit(root->fs_info)) + schedule_timeout(1); - flags = btrfs_reduce_alloc_profile(extent_root, flags); + if (delalloc_lock) + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); - space_info = __find_space_info(extent_root->fs_info, flags); - if (!space_info) { - ret = update_space_info(extent_root->fs_info, flags, - 0, 0, &space_info); - BUG_ON(ret); + num_bytes = ALIGN(num_bytes, root->sectorsize); + + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + + if (BTRFS_I(inode)->outstanding_extents > + BTRFS_I(inode)->reserved_extents) + nr_extents = BTRFS_I(inode)->outstanding_extents - + BTRFS_I(inode)->reserved_extents; + + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { + nr_extents++; + extra_reserve = 1; } - BUG_ON(!space_info); - spin_lock(&space_info->lock); - if (space_info->force_alloc) - force = 1; - if (space_info->full) { - spin_unlock(&space_info->lock); - goto out; + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); + csum_bytes = BTRFS_I(inode)->csum_bytes; + spin_unlock(&BTRFS_I(inode)->lock); + + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, num_bytes + + nr_extents * root->leafsize); + if (ret) + goto out_fail; } - thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 8); - if (!force && - (space_info->bytes_used + space_info->bytes_pinned + - space_info->bytes_reserved + alloc_bytes) < thresh) { - spin_unlock(&space_info->lock); - goto out; + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + if (unlikely(ret)) { + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, num_bytes + + nr_extents * root->leafsize); + goto out_fail; } - spin_unlock(&space_info->lock); + spin_lock(&BTRFS_I(inode)->lock); + if (extra_reserve) { + set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + nr_extents--; + } + BTRFS_I(inode)->reserved_extents += nr_extents; + spin_unlock(&BTRFS_I(inode)->lock); + + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + + if (to_reserve) + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_reserve, 1); + block_rsv_add_bytes(block_rsv, to_reserve, 1); + + return 0; + +out_fail: + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); /* - * if we're doing a data chunk, go ahead and make sure that - * we keep a reasonable number of metadata chunks allocated in the - * FS as well. + * If the inodes csum_bytes is the same as the original + * csum_bytes then we know we haven't raced with any free()ers + * so we can just reduce our inodes csum bytes and carry on. */ - if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { - fs_info->data_chunk_allocations++; - if (!(fs_info->data_chunk_allocations % - fs_info->metadata_ratio)) - force_metadata_allocation(fs_info); + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { + calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; } + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped) + to_free += btrfs_calc_trans_metadata_size(root, dropped); - ret = btrfs_alloc_chunk(trans, extent_root, flags); - spin_lock(&space_info->lock); - if (ret) - space_info->full = 1; - space_info->force_alloc = 0; - spin_unlock(&space_info->lock); -out: - mutex_unlock(&extent_root->fs_info->chunk_mutex); + if (to_free) { + btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); + } + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free) +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. + */ +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { - struct btrfs_block_group_cache *cache; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 to_free = 0; + unsigned dropped; + + num_bytes = ALIGN(num_bytes, root->sectorsize); + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); + + if (num_bytes) + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped > 0) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + dropped * root->leafsize); + } + + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, + to_free); +} + +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + * extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */ +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, num_bytes); + if (ret) + return ret; + + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); + if (ret) { + btrfs_free_reserved_data_space(inode, num_bytes); + return ret; + } + + return 0; +} + +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + */ +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +{ + btrfs_delalloc_release_metadata(inode, num_bytes); + btrfs_free_reserved_data_space(inode, num_bytes); +} + +static int update_block_group(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc) +{ + struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; u64 total = num_bytes; u64 old_val; u64 byte_in_group; + int factor; /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); + spin_lock(&info->delalloc_root_lock); + old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); - spin_unlock(&info->delalloc_lock); + btrfs_set_super_bytes_used(info->super_copy, old_val); + spin_unlock(&info->delalloc_root_lock); while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) - return -1; + return -ENOENT; + if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. + */ + if (!alloc && cache->cached == BTRFS_CACHE_NO) + cache_block_group(cache, 1); + byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); + + if (btrfs_test_opt(root, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); @@ -3488,31 +5420,24 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; cache->space_info->bytes_reserved -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; - cache->space_info->bytes_used -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - if (mark_free) { - int ret; - ret = btrfs_discard_extent(root, bytenr, - num_bytes); - WARN_ON(ret); - - ret = btrfs_add_free_space(cache, bytenr, - num_bytes); - WARN_ON(ret); - } + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); } btrfs_put_block_group(cache); total -= num_bytes; @@ -3526,6 +5451,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) struct btrfs_block_group_cache *cache; u64 bytenr; + spin_lock(&root->fs_info->block_group_cache_lock); + bytenr = root->fs_info->first_logical_byte; + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (bytenr < (u64)-1) + return bytenr; + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); if (!cache) return 0; @@ -3536,18 +5468,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return bytenr; } -/* - * this function must be called within transaction - */ -int btrfs_pin_extent(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int reserved) +static int pin_down_extent(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; @@ -3559,31 +5483,195 @@ int btrfs_pin_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); + set_extent_dirty(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + if (reserved) + trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); + return 0; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); /* Logic error */ + + pin_down_extent(root, cache, bytenr, num_bytes, reserved); + btrfs_put_block_group(cache); + return 0; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + int ret; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!cache) + return -EINVAL; + + /* + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. + */ + cache_block_group(cache, 1); + + pin_down_extent(root, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + ret = btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return ret; +} + +static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + + block_group = btrfs_lookup_block_group(root->fs_info, start); + if (!block_group) + return -EINVAL; + + cache_block_group(block_group, 0); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + /* Logic error */ + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + if (ret) + goto out_lock; + + num_bytes = (start + num_bytes) - + caching_ctl->progress; + start = caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + } +out_lock: + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + btrfs_put_block_group(block_group); + return ret; +} + +int btrfs_exclude_logged_extents(struct btrfs_root *log, + struct extent_buffer *eb) +{ + struct btrfs_file_extent_item *item; + struct btrfs_key key; + int found_type; + int i; + + if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) + return 0; + + for (i = 0; i < btrfs_header_nritems(eb); i++) { + btrfs_item_key_to_cpu(eb, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + if (found_type == BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + continue; + key.objectid = btrfs_file_extent_disk_bytenr(eb, item); + key.offset = btrfs_file_extent_disk_num_bytes(eb, item); + __exclude_logged_extent(log, key.objectid, key.offset); + } - set_extent_dirty(fs_info->pinned_extents, - bytenr, bytenr + num_bytes - 1, GFP_NOFS); return 0; } -static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @reserve: One of the reservation enums + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk. For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. + * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting. For data we handle the reservation through clearing the + * delalloc bits in the io_tree. We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. + */ +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int delalloc) { - spin_lock(&cache->space_info->lock); + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); spin_lock(&cache->lock); - if (reserve) { - cache->reserved += num_bytes; - cache->space_info->bytes_reserved += num_bytes; + if (reserve != RESERVE_FREE) { + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + trace_btrfs_space_reservation(cache->fs_info, + "space_info", space_info->flags, + num_bytes, 0); + space_info->bytes_may_use -= num_bytes; + } + + if (delalloc) + cache->delalloc_bytes += num_bytes; + } } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; } spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - return 0; + spin_unlock(&space_info->lock); + return ret; } -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3591,7 +5679,7 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { @@ -3610,23 +5698,28 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, else fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->extent_commit_sem); - return 0; + up_write(&fs_info->commit_root_sem); + + update_global_block_rsv(fs_info); } static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 len; + bool readonly; while (start <= end) { + readonly = false; if (!cache || start >= cache->key.objectid + cache->key.offset) { if (cache) btrfs_put_block_group(cache); cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ } len = cache->key.objectid + cache->key.offset - start; @@ -3637,14 +5730,32 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) btrfs_add_free_space(cache, start, len); } - spin_lock(&cache->space_info->lock); + start += len; + space_info = cache->space_info; + + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - cache->space_info->bytes_pinned -= len; + space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); + if (cache->ro) { + space_info->bytes_readonly += len; + readonly = true; + } spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - start += len; + if (!readonly && global_rsv->space_info == space_info) { + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + len = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += len; + space_info->bytes_may_use += len; + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + } + spin_unlock(&global_rsv->lock); + } + spin_unlock(&space_info->lock); } if (cache) @@ -3661,6 +5772,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; + if (trans->aborted) + return 0; + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else @@ -3668,79 +5782,50 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) break; - ret = btrfs_discard_extent(root, start, end + 1 - start); + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, + end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); unpin_extent_range(root, start, end); cond_resched(); } - return ret; + return 0; } -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean) +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, + u64 owner, u64 root_objectid) { - int err = 0; - struct extent_buffer *buf; - - if (is_data) - goto pinit; - - /* - * discard is sloooow, and so triggering discards on - * individual btree blocks isn't a good plan. Just - * pin everything in discard mode. - */ - if (btrfs_test_opt(root, DISCARD)) - goto pinit; - - buf = btrfs_find_tree_block(root, bytenr, num_bytes); - if (!buf) - goto pinit; + struct btrfs_space_info *space_info; + u64 flags; - /* we can reuse a block if it hasn't been written - * and it is from this transaction. We can't - * reuse anything from the tree log root because - * it has tiny sub-transactions. - */ - if (btrfs_buffer_uptodate(buf, 0) && - btrfs_try_tree_lock(buf)) { - u64 header_owner = btrfs_header_owner(buf); - u64 header_transid = btrfs_header_generation(buf); - if (header_owner != BTRFS_TREE_LOG_OBJECTID && - header_transid == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - *must_clean = buf; - return 1; - } - btrfs_tree_unlock(buf); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; } - free_extent_buffer(buf); -pinit: - if (path) - btrfs_set_path_blocking(path); - /* unlocks the pinned mutex */ - btrfs_pin_extent(root, bytenr, num_bytes, reserved); - BUG_ON(err < 0); - return 0; + space_info = __find_space_info(fs_info, flags); + BUG_ON(!space_info); /* Logic bug */ + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); } + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int no_quota) { struct btrfs_key key; struct btrfs_path *path; @@ -3756,6 +5841,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int num_to_del = 1; u32 item_size; u64 refs; + int last_ref = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + if (!info->quota_enabled || !is_fstree(root_objectid)) + no_quota = 1; path = btrfs_alloc_path(); if (!path) @@ -3767,6 +5859,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; BUG_ON(!is_data && refs_to_drop != 1); + if (is_data) + skinny_metadata = 0; + ret = lookup_extent_backref(trans, extent_root, path, &iref, bytenr, num_bytes, parent, root_objectid, owner_objectid, @@ -3783,6 +5878,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.offset == owner_objectid) { + found_extent = 1; + break; + } if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -3796,36 +5896,73 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, - is_data); - BUG_ON(ret); - btrfs_release_path(extent_root, path); + is_data, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + btrfs_release_path(path); path->leave_spinning = 1; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; + if (!is_data && skinny_metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner_objectid; + } + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + if (ret > 0 && skinny_metadata && path->slots[0]) { + /* + * Couldn't find our skinny metadata item, + * see if we have ye olde extent item. + */ + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + } + if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); - btrfs_print_leaf(extent_root, path->nodes[0]); + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, bytenr); + if (ret > 0) + btrfs_print_leaf(extent_root, + path->nodes[0]); + } + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } - BUG_ON(ret); extent_slot = path->slots[0]; } - } else { + } else if (WARN_ON(ret == -ENOENT)) { btrfs_print_leaf(extent_root, path->nodes[0]); - WARN_ON(1); - printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu owner %llu offset %llu\n", - (unsigned long long)bytenr, - (unsigned long long)parent, - (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); + btrfs_err(info, + "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", + bytenr, parent, root_objectid, owner_objectid, + owner_offset); + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } else { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } leaf = path->nodes[0]; @@ -3835,9 +5972,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(found_extent || extent_slot != path->slots[0]); ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } - btrfs_release_path(extent_root, path); + btrfs_release_path(path); path->leave_spinning = 1; key.objectid = bytenr; @@ -3847,12 +5987,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } - BUG_ON(ret); + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + extent_slot = path->slots[0]; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -3861,7 +6004,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && + key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); bi = (struct btrfs_tree_block_info *)(ei + 1); @@ -3869,10 +6013,17 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs < refs_to_drop); + if (refs < refs_to_drop) { + btrfs_err(info, "trying to drop %d refs but we only have %Lu " + "for bytenr %Lu", refs_to_drop, refs, bytenr); + ret = -EINVAL; + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } refs -= refs_to_drop; if (refs > 0) { + type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -3888,13 +6039,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, - is_data); - BUG_ON(ret); + is_data, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } + add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, + root_objectid); } else { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - if (found_extent) { BUG_ON(is_data && refs_to_drop != extent_data_ref_count(root, path, iref)); @@ -3907,50 +6060,50 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = pin_down_bytes(trans, root, path, bytenr, - num_bytes, is_data, 0, &must_clean); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); - /* - * it is going to be very rare for someone to be waiting - * on the block we're freeing. del_items might need to - * schedule, so rather than get fancy, just force it - * to blocking here - */ - if (must_clean) - btrfs_set_lock_blocking(must_clean); - + last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - BUG_ON(ret); - btrfs_release_path(extent_root, path); - - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } + btrfs_release_path(path); if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); - BUG_ON(ret); - } else { - invalidate_mapping_pages(info->btree_inode->i_mapping, - bytenr >> PAGE_CACHE_SHIFT, - (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + } + + ret = update_block_group(root, bytenr, num_bytes, 0); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } + } + btrfs_release_path(path); + + /* Deal with the quota accounting */ + if (!ret && last_ref && !no_quota) { + int mod_seq = 0; + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && + type == BTRFS_QGROUP_OPER_SUB_SHARED) + mod_seq = 1; - ret = update_block_group(trans, root, bytenr, num_bytes, 0, - mark_free); - BUG_ON(ret); + ret = btrfs_qgroup_record_ref(trans, info, root_objectid, + bytenr, num_bytes, type, + mod_seq); } +out: btrfs_free_path(path); return ret; } /* - * when we free an extent, it is possible (and likely) that we free the last + * when we free an block, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it * removes it from the tree. @@ -3960,30 +6113,22 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; - int ret; + int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); if (!head) - goto out; + goto out_delayed_unlock; - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) + spin_lock(&head->lock); + if (rb_first(&head->ref_root)) goto out; if (head->extent_op) { if (!head->must_insert_reserved) goto out; - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); head->extent_op = NULL; } @@ -3999,38 +6144,105 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * ahead and process it. */ head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); + rb_erase(&head->href_node, &delayed_refs->href_root); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. */ delayed_refs->num_heads--; - if (list_empty(&head->cluster)) + if (head->processing == 0) delayed_refs->num_heads_ready--; - - list_del_init(&head->cluster); + head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); - ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->extent_op, - head->must_insert_reserved); - BUG_ON(ret); + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; + + mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - return 0; + return ret; out: + spin_unlock(&head->lock); + +out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref) +{ + struct btrfs_block_group_cache *cache = NULL; + int pin = 1; + int ret; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL, 0); + BUG_ON(ret); /* -ENOMEM */ + } + + if (!last_ref) + return; + + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + + if (btrfs_header_generation(buf) == trans->transid) { + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, root, buf->start); + if (!ret) + goto out; + } + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(root, cache, buf->start, buf->len, 1); + goto out; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + trace_btrfs_reserved_extent_free(root, buf->start, buf->len); + pin = 0; + } +out: + if (pin) + add_pinned_bytes(root->fs_info, buf->len, + btrfs_header_level(buf), + root->root_key.objectid); + + /* + * Deleting the buffer, clear the corrupt flag since it doesn't matter + * anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + btrfs_put_block_group(cache); +} + +/* Can return -ENOMEM */ +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int no_quota) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); /* * tree log blocks never actually go into the extent allocation @@ -4042,40 +6254,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); - BUG_ON(ret); - ret = check_ref_cleanup(trans, root, bytenr); - BUG_ON(ret); + BTRFS_DROP_DELAYED_REF, NULL, no_quota); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, NULL); - BUG_ON(ret); + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, + NULL, no_quota); } return ret; } -int btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 parent, u64 root_objectid, int level) -{ - u64 used; - spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) - blocksize; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - - return btrfs_free_extent(trans, root, bytenr, blocksize, - parent, root_objectid, level, 0); -} - -static u64 stripe_align(struct btrfs_root *root, u64 val) +static u64 stripe_align(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 val, u64 num_bytes) { - u64 mask = ((u64)root->stripesize - 1); - u64 ret = (val + mask) & ~mask; + u64 ret = ALIGN(val, root->stripesize); return ret; } @@ -4089,102 +6286,216 @@ static u64 stripe_align(struct btrfs_root *root, u64 val) * for our min num_bytes. Another option is to have it go ahead * and look in the rbtree for a free extent of a given size, but this * is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group. */ -static noinline int +static noinline void wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return; wait_event(caching_ctl->wait, block_group_cache_done(cache) || - (cache->free_space >= num_bytes)); + (cache->free_space_ctl->free_space >= num_bytes)); put_caching_control(caching_ctl); - return 0; } static noinline int wait_block_group_cache_done(struct btrfs_block_group_cache *cache) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); + int ret = 0; caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; wait_event(caching_ctl->wait, block_group_cache_done(cache)); - + if (cache->cached == BTRFS_CACHE_ERROR) + ret = -EIO; put_caching_control(caching_ctl); - return 0; + return ret; +} + +int __get_raid_index(u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID10) + return BTRFS_RAID_RAID10; + else if (flags & BTRFS_BLOCK_GROUP_RAID1) + return BTRFS_RAID_RAID1; + else if (flags & BTRFS_BLOCK_GROUP_DUP) + return BTRFS_RAID_DUP; + else if (flags & BTRFS_BLOCK_GROUP_RAID0) + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; + + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ +} + +int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + return __get_raid_index(cache->flags); +} + +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = "raid10", + [BTRFS_RAID_RAID1] = "raid1", + [BTRFS_RAID_DUP] = "dup", + [BTRFS_RAID_RAID0] = "raid0", + [BTRFS_RAID_SINGLE] = "single", + [BTRFS_RAID_RAID5] = "raid5", + [BTRFS_RAID_RAID6] = "raid6", +}; + +static const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_type_names[type]; } enum btrfs_loop_type { - LOOP_FIND_IDEAL = 0, - LOOP_CACHING_NOWAIT = 1, - LOOP_CACHING_WAIT = 2, - LOOP_ALLOC_CHUNK = 3, - LOOP_NO_EMPTY_SIZE = 4, + LOOP_CACHING_NOWAIT = 0, + LOOP_CACHING_WAIT = 1, + LOOP_ALLOC_CHUNK = 2, + LOOP_NO_EMPTY_SIZE = 3, }; +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + down_read(&cache->data_rwsem); +} + +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + btrfs_get_block_group(cache); + if (delalloc) + down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + int delalloc) +{ + struct btrfs_block_group_cache *used_bg; + bool locked = false; +again: + spin_lock(&cluster->refill_lock); + if (locked) { + if (used_bg == cluster->block_group) + return used_bg; + + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } + + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) + return used_bg; + + btrfs_get_block_group(used_bg); + + if (!delalloc) + return used_bg; + + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; + + spin_unlock(&cluster->refill_lock); + down_read(&used_bg->data_rwsem); + locked = true; + goto again; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + up_read(&cache->data_rwsem); + btrfs_put_block_group(cache); +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: - * ins->objectid == block start + * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY - * ins->offset == number of blocks + * ins->offset == the size of the hole. * Any available blocks before search_start are skipped. + * + * If there is no suitable free space, we will record the max size of + * the free space extent currently. */ -static noinline int find_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *orig_root, +static noinline int find_free_extent(struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, - u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, - u64 exclude_start, u64 exclude_nr, - int data) + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; + u64 search_start = 0; + u64 max_extent_size = 0; int empty_cluster = 2 * 1024 * 1024; - int allowed_chunk_alloc = 0; - int done_chunk_alloc = 0; struct btrfs_space_info *space_info; - int last_ptr_loop = 0; int loop = 0; - bool found_uncached_bg = false; + int index = __get_raid_index(flags); + int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? + RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool failed_cluster_refill = false; bool failed_alloc = false; - u64 ideal_cache_percent = 0; - u64 ideal_cache_offset = 0; + bool use_cluster = true; + bool have_caching_bg = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); ins->objectid = 0; ins->offset = 0; - space_info = __find_space_info(root->fs_info, data); + trace_find_free_extent(orig_root, num_bytes, empty_size, flags); + + space_info = __find_space_info(root->fs_info, flags); if (!space_info) { - printk(KERN_ERR "No space info for %d\n", data); + btrfs_err(root->fs_info, "No space info for %llu", flags); return -ENOSPC; } - if (orig_root->ref_cows || empty_size) - allowed_chunk_alloc = 1; + /* + * If the space info is for both data and metadata it means we have a + * small filesystem and we can't use the clustering stuff. + */ + if (btrfs_mixed_space_info(space_info)) + use_cluster = false; - if (data & BTRFS_BLOCK_GROUP_METADATA) { + if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && + btrfs_test_opt(root, SSD)) { last_ptr = &root->fs_info->data_alloc_cluster; } @@ -4202,7 +6513,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { -ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); /* @@ -4212,9 +6522,8 @@ ideal_cache: * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ - if (block_group && block_group_bits(block_group, data) && - (block_group->cached != BTRFS_CACHE_NO || - search_start == ideal_cache_offset)) { + if (block_group && block_group_bits(block_group, flags) && + block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -4227,6 +6536,8 @@ ideal_cache: btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { + index = get_block_group_index(block_group); + btrfs_lock_block_group(block_group, delalloc); goto have_block_group; } } else if (block_group) { @@ -4234,140 +6545,156 @@ ideal_cache: } } search: + have_caching_bg = false; down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups, list) { + list_for_each_entry(block_group, &space_info->block_groups[index], + list) { u64 offset; int cached; - btrfs_get_block_group(block_group); + btrfs_grab_block_group(block_group, delalloc); search_start = block_group->key.objectid; -have_block_group: - if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { - u64 free_percent; - - free_percent = btrfs_block_group_used(&block_group->item); - free_percent *= 100; - free_percent = div64_u64(free_percent, - block_group->key.offset); - free_percent = 100 - free_percent; - if (free_percent > ideal_cache_percent && - likely(!block_group->ro)) { - ideal_cache_offset = block_group->key.objectid; - ideal_cache_percent = free_percent; - } - - /* - * We only want to start kthread caching if we are at - * the point where we will wait for caching to make - * progress, or if our ideal search is over and we've - * found somebody to start caching. - */ - if (loop > LOOP_CACHING_NOWAIT || - (loop > LOOP_FIND_IDEAL && - atomic_read(&space_info->caching_threads) < 2)) { - ret = cache_block_group(block_group); - BUG_ON(ret); - } - found_uncached_bg = true; + /* + * this can happen if we end up cycling through all the + * raid types, but we want to make sure we only allocate + * for the proper type. + */ + if (!block_group_bits(block_group, flags)) { + u64 extra = BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID10; /* - * If loop is set for cached only, try the next block - * group. + * if they asked for extra copies and this block group + * doesn't provide them, bail. This does allow us to + * fill raid0 from raid1. */ - if (loop == LOOP_FIND_IDEAL) + if ((flags & extra) && !(block_group->flags & extra)) goto loop; } +have_block_group: cached = block_group_cache_done(block_group); - if (unlikely(!cached)) - found_uncached_bg = true; + if (unlikely(!cached)) { + ret = cache_block_group(block_group, 0); + BUG_ON(ret < 0); + ret = 0; + } + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + goto loop; if (unlikely(block_group->ro)) goto loop; /* - * Ok we want to try and use the cluster allocator, so lets look - * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will - * have tried the cluster allocator plenty of times at this - * point and not have found anything, so we are likely way too - * fragmented for the clustering stuff to find anything, so lets - * just skip it and let the allocator find whatever block it can - * find + * Ok we want to try and use the cluster allocator, so + * lets look there */ - if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + if (last_ptr) { + struct btrfs_block_group_cache *used_block_group; + unsigned long aligned_cluster; /* * the refill lock keeps out other * people trying to start a new cluster */ - spin_lock(&last_ptr->refill_lock); - if (last_ptr->block_group && - (last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data))) { - offset = 0; + used_block_group = btrfs_lock_cluster(block_group, + last_ptr, + delalloc); + if (!used_block_group) goto refill_cluster; - } - offset = btrfs_alloc_from_cluster(block_group, last_ptr, - num_bytes, search_start); + if (used_block_group != block_group && + (used_block_group->ro || + !block_group_bits(used_block_group, flags))) + goto release_cluster; + + offset = btrfs_alloc_from_cluster(used_block_group, + last_ptr, + num_bytes, + used_block_group->key.objectid, + &max_extent_size); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + used_block_group, + search_start, num_bytes); + if (used_block_group != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = used_block_group; + } goto checks; } - spin_lock(&last_ptr->lock); - /* - * whoops, this cluster doesn't actually point to - * this block group. Get a ref on the block - * group is does point to and try again - */ - if (!last_ptr_loop && last_ptr->block_group && - last_ptr->block_group != block_group) { - - btrfs_put_block_group(block_group); - block_group = last_ptr->block_group; - btrfs_get_block_group(block_group); - spin_unlock(&last_ptr->lock); + WARN_ON(last_ptr->block_group != used_block_group); +release_cluster: + /* If we are on LOOP_NO_EMPTY_SIZE, we can't + * set up a new clusters, so lets just skip it + * and let the allocator find whatever block + * it can find. If we reach this point, we + * will have tried the cluster allocator + * plenty of times and not have found + * anything, so we are likely way too + * fragmented for the clustering stuff to find + * anything. + * + * However, if the cluster is taken from the + * current block group, release the cluster + * first, so that we stand a better chance of + * succeeding in the unclustered + * allocation. */ + if (loop >= LOOP_NO_EMPTY_SIZE && + used_block_group != block_group) { spin_unlock(&last_ptr->refill_lock); - - last_ptr_loop = 1; - search_start = block_group->key.objectid; - /* - * we know this block group is properly - * in the list because - * btrfs_remove_block_group, drops the - * cluster before it removes the block - * group from the list - */ - goto have_block_group; + btrfs_release_block_group(used_block_group, + delalloc); + goto unclustered_alloc; } - spin_unlock(&last_ptr->lock); -refill_cluster: + /* * this cluster didn't work out, free it and * start over */ btrfs_return_cluster_to_free_space(NULL, last_ptr); - last_ptr_loop = 0; + if (used_block_group != block_group) + btrfs_release_block_group(used_block_group, + delalloc); +refill_cluster: + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + + aligned_cluster = max_t(unsigned long, + empty_cluster + empty_size, + block_group->full_stripe_len); /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, root, - block_group, last_ptr, - offset, num_bytes, - empty_cluster + empty_size); + ret = btrfs_find_space_cluster(root, block_group, + last_ptr, search_start, + num_bytes, + aligned_cluster); if (ret == 0) { /* * now pull our allocation out of this * cluster */ offset = btrfs_alloc_from_cluster(block_group, - last_ptr, num_bytes, - search_start); + last_ptr, + num_bytes, + search_start, + &max_extent_size); if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, + num_bytes); goto checks; } } else if (!cached && loop > LOOP_CACHING_NOWAIT @@ -4391,8 +6718,23 @@ refill_cluster: goto loop; } +unclustered_alloc: + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + if (block_group->free_space_ctl->free_space > + max_extent_size) + max_extent_size = + block_group->free_space_ctl->free_space; + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + offset = btrfs_find_space_for_alloc(block_group, search_start, - num_bytes, empty_size); + num_bytes, empty_size, + &max_extent_size); /* * If we didn't find a chunk, and we haven't failed on this * block group before, and this block group is in the middle of @@ -4409,15 +6751,13 @@ refill_cluster: failed_alloc = true; goto have_block_group; } else if (!offset) { + if (!cached) + have_caching_bg = true; goto loop; } checks: - search_start = stripe_align(root, offset); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(block_group, offset, num_bytes); - goto loop; - } + search_start = stripe_align(root, block_group, + offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > @@ -4426,45 +6766,41 @@ checks: goto loop; } - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type, delalloc); + if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= block_group->key.objectid && - search_start < (block_group->key.objectid + - block_group->key.offset)) - goto have_block_group; goto loop; } + /* we are all good, lets return */ ins->objectid = search_start; ins->offset = num_bytes; - if (offset < search_start) - btrfs_add_free_space(block_group, offset, - search_start - offset); - BUG_ON(offset > search_start); - - update_reserved_extents(block_group, num_bytes, 1); - - /* we are all good, lets return */ + trace_btrfs_reserve_extent(orig_root, block_group, + search_start, num_bytes); + btrfs_release_block_group(block_group, delalloc); break; loop: failed_cluster_refill = false; failed_alloc = false; - btrfs_put_block_group(block_group); + BUG_ON(index != get_block_group_index(block_group)); + btrfs_release_block_group(block_group, delalloc); } up_read(&space_info->groups_sem); - /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for - * for them to make caching progress. Also - * determine the best possible bg to cache + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + goto search; + + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching @@ -4472,84 +6808,55 @@ loop: * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ - if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && - (found_uncached_bg || empty_size || empty_cluster || - allowed_chunk_alloc)) { - if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { - found_uncached_bg = false; - loop++; - if (!ideal_cache_percent && - atomic_read(&space_info->caching_threads)) - goto search; + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { + index = 0; + loop++; + if (loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = do_chunk_alloc(trans, root, flags, + CHUNK_ALLOC_FORCE); /* - * 1 of the following 2 things have happened so far - * - * 1) We found an ideal block group for caching that - * is mostly full and will cache quickly, so we might - * as well wait for it. - * - * 2) We searched for cached only and we didn't find - * anything, and we didn't start any caching kthreads - * either, so chances are we will loop through and - * start a couple caching kthreads, and then come back - * around and just wait for them. This will be slower - * because we will have 2 caching kthreads reading at - * the same time when we could have just started one - * and waited for it to get far enough to give us an - * allocation, so go ahead and go to the wait caching - * loop. - */ - loop = LOOP_CACHING_WAIT; - search_start = ideal_cache_offset; - ideal_cache_percent = 0; - goto ideal_cache; - } else if (loop == LOOP_FIND_IDEAL) { - /* - * Didn't find a uncached bg, wait on anything we find - * next. + * Do not bail out on ENOSPC since we + * can do more things. */ - loop = LOOP_CACHING_WAIT; - goto search; - } - - if (loop < LOOP_CACHING_WAIT) { - loop++; - goto search; + if (ret < 0 && ret != -ENOSPC) + btrfs_abort_transaction(trans, + root, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans, root); + if (ret) + goto out; } - if (loop == LOOP_ALLOC_CHUNK) { + if (loop == LOOP_NO_EMPTY_SIZE) { empty_size = 0; empty_cluster = 0; } - if (allowed_chunk_alloc) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, 1); - allowed_chunk_alloc = 0; - done_chunk_alloc = 1; - } else if (!done_chunk_alloc) { - space_info->force_alloc = 1; - } - - if (loop < LOOP_NO_EMPTY_SIZE) { - loop++; - goto search; - } - ret = -ENOSPC; + goto search; } else if (!ins->objectid) { ret = -ENOSPC; - } - - /* we found what we needed */ - if (ins->objectid) { - if (!(data & BTRFS_BLOCK_GROUP_DATA)) - trans->block_group = block_group->key.objectid; - - btrfs_put_block_group(block_group); + } else if (ins->objectid) { ret = 0; } - +out: + if (ret == -ENOSPC) + ins->offset = max_extent_size; return ret; } @@ -4557,113 +6864,121 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups) { struct btrfs_block_group_cache *cache; + int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info has %llu free, is %sfull\n", - (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved - - info->bytes_super), + printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", + info->flags, + info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" - "\n", - (unsigned long long)info->total_bytes, - (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_delalloc, - (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_root, - (unsigned long long)info->bytes_super, - (unsigned long long)info->bytes_reserved); + printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " + "reserved=%llu, may_use=%llu, readonly=%llu\n", + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); spin_unlock(&info->lock); if (!dump_block_groups) return; down_read(&info->groups_sem); - list_for_each_entry(cache, &info->block_groups, list) { +again: + list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used " - "%llu pinned %llu reserved\n", - (unsigned long long)cache->key.objectid, - (unsigned long long)cache->key.offset, - (unsigned long long)btrfs_block_group_used(&cache->item), - (unsigned long long)cache->pinned, - (unsigned long long)cache->reserved); + printk(KERN_INFO "BTRFS: " + "block group %llu has %llu bytes, " + "%llu used %llu pinned %llu reserved %s\n", + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned, + cache->reserved, cache->ro ? "[readonly]" : ""); btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) + struct btrfs_key *ins, int is_data, int delalloc) { + bool final_tried = false; + u64 flags; int ret; - u64 search_start = 0; - data = btrfs_get_alloc_profile(root, data); + flags = btrfs_get_alloc_profile(root, is_data); again: - /* - * the only place that sets empty_size is btrfs_realloc_node, which - * is not called recursively on allocations - */ - if (empty_size || root->ref_cows) - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, data, 0); - WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, ins, - trans->alloc_exclude_start, - trans->alloc_exclude_nr, data); - - if (ret == -ENOSPC && num_bytes > min_alloc_size) { - num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); - num_bytes = max(num_bytes, min_alloc_size); - do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, 1); - goto again; - } + ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, + flags, delalloc); + if (ret == -ENOSPC) { - struct btrfs_space_info *sinfo; + if (!final_tried && ins->offset) { + num_bytes = min(num_bytes >> 1, ins->offset); + num_bytes = round_down(num_bytes, root->sectorsize); + num_bytes = max(num_bytes, min_alloc_size); + if (num_bytes == min_alloc_size) + final_tried = true; + goto again; + } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + struct btrfs_space_info *sinfo; - sinfo = __find_space_info(root->fs_info, data); - printk(KERN_ERR "btrfs allocation failed flags %llu, " - "wanted %llu\n", (unsigned long long)data, - (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes, 1); + sinfo = __find_space_info(root->fs_info, flags); + btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", + flags, num_bytes); + if (sinfo) + dump_space_info(sinfo, num_bytes, 1); + } } return ret; } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, + int pin, int delalloc) { struct btrfs_block_group_cache *cache; int ret = 0; cache = btrfs_lookup_block_group(root->fs_info, start); if (!cache) { - printk(KERN_ERR "Unable to find block group for %llu\n", - (unsigned long long)start); + btrfs_err(root->fs_info, "Unable to find block group for %llu", + start); return -ENOSPC; } - ret = btrfs_discard_extent(root, start, len); + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); - btrfs_add_free_space(cache, start, len); - update_reserved_extents(cache, len, 0); + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + } btrfs_put_block_group(cache); + trace_btrfs_reserved_extent_free(root, start, len); + return ret; } +int btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, int delalloc) +{ + return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 1, 0); +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -4687,12 +7002,16 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -4721,14 +7040,20 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); - if (ret) { - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + /* Always set parent to 0 here since its exclusive anyway. */ + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, ins->offset, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + + ret = update_block_group(root, ins->objectid, ins->offset, 1); + if (ret) { /* -ENOENT, logic error */ + btrfs_err(fs_info, "update block group failed for %llu %llu", + ins->objectid, ins->offset); BUG(); } + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); return ret; } @@ -4736,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) + int level, struct btrfs_key *ins, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4745,15 +7071,30 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; - u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); + u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 num_bytes = ins->offset; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + if (!skinny_metadata) + size += sizeof(*block_info); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->leafsize); + return -ENOMEM; + } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->leafsize); + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -4762,12 +7103,17 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_generation(leaf, extent_item, trans->transid); btrfs_set_extent_flags(leaf, extent_item, flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); - block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - btrfs_set_tree_block_key(leaf, block_info, key); - btrfs_set_tree_block_level(leaf, block_info, level); + if (skinny_metadata) { + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + num_bytes = root->leafsize; + } else { + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + } - iref = (struct btrfs_extent_inline_ref *)(block_info + 1); if (parent > 0) { BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, @@ -4782,14 +7128,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); - if (ret) { - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, num_bytes, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + } + + ret = update_block_group(root, ins->objectid, root->leafsize, 1); + if (ret) { /* -ENOENT, logic error */ + btrfs_err(fs_info, "update block group failed for %llu %llu", + ins->objectid, ins->offset); BUG(); } + + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); return ret; } @@ -4802,9 +7156,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, - 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL); + ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, + ins->offset, 0, + root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL, 0); return ret; } @@ -4820,116 +7175,33 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; - struct btrfs_caching_control *caching_ctl; - u64 start = ins->objectid; - u64 num_bytes = ins->offset; - - block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group); - caching_ctl = get_caching_control(block_group); - - if (!caching_ctl) { - BUG_ON(!block_group_cache_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); - } else { - mutex_lock(&caching_ctl->mutex); - if (start >= caching_ctl->progress) { - ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - BUG_ON(ret); - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - BUG_ON(ret); - - start = caching_ctl->progress; - num_bytes = ins->objectid + ins->offset - - caching_ctl->progress; - ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); - } - - mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); + /* + * Mixed block groups will exclude before processing the log so we only + * need to do the exlude dance if this fs isn't mixed. + */ + if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(root, ins->objectid, ins->offset); + if (ret) + return ret; } - update_reserved_extents(block_group, ins->offset, 1); - btrfs_put_block_group(block_group); + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + if (!block_group) + return -EINVAL; + + ret = btrfs_update_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT, 0); + BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); + btrfs_put_block_group(block_group); return ret; } -/* - * finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * - * returns 0 if everything worked, non-zero otherwise. - */ -static int alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 empty_size, u64 hint_byte, u64 search_end, - struct btrfs_key *ins) -{ - int ret; - u64 flags = 0; - - ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); - if (ret) - return ret; - - if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent == 0) - parent = ins->objectid; - flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else - BUG_ON(parent > 0); - - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; - - ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); - BUG_ON(ret); - } - - if (root_objectid == root->root_key.objectid) { - u64 used; - spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) + num_bytes; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - } - return ret; -} - -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - int level) +static struct extent_buffer * +btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u32 blocksize, int level) { struct extent_buffer *buf; @@ -4937,9 +7209,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); - btrfs_set_buffer_lockdep_class(buf, level); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); @@ -4964,8 +7237,72 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, return buf; } +static struct btrfs_block_rsv * +use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + int ret; + bool global_updated = false; + + block_rsv = get_block_rsv(trans, root); + + if (unlikely(block_rsv->size == 0)) + goto try_reserve; +again: + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + update_global_block_rsv(root->fs_info); + goto again; + } + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "BTRFS: block rsv returned %d\n", ret); + } +try_reserve: + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve if its space type is the same as the global + * reservation. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + return ERR_PTR(ret); +} + +static void unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u32 blocksize) +{ + block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); +} + /* - * helper function to allocate a block for a given tree + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * * returns the tree buffer or NULL. */ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, @@ -4975,18 +7312,68 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 hint, u64 empty_size) { struct btrfs_key ins; - int ret; + struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + u64 flags = 0; + int ret; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + blocksize, level); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; + } +#endif + block_rsv = use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); - ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, - key, level, empty_size, hint, (u64)-1, &ins); + ret = btrfs_reserve_extent(root, blocksize, blocksize, + empty_size, hint, &ins, 0, 0); if (ret) { - BUG_ON(ret > 0); + unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); } buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); + BUG_ON(IS_ERR(buf)); /* -ENOMEM */ + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = btrfs_alloc_delayed_extent_op(); + BUG_ON(!extent_op); /* -ENOMEM */ + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + if (skinny_metadata) + extent_op->update_key = 0; + else + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + extent_op->level = level; + + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + ins.objectid, + ins.offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op, 0); + BUG_ON(ret); /* -ENOMEM */ + } return buf; } @@ -5001,6 +7388,7 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; + int for_reloc; }; #define DROP_REFERENCE 1 @@ -5015,7 +7403,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 generation; u64 refs; u64 flags; - u64 last = 0; u32 nritems; u32 blocksize; struct btrfs_key key; @@ -5053,9 +7440,12 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* We don't lock the tree block, it's OK to be racy here */ - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &refs, &flags); - BUG_ON(ret); + ret = btrfs_lookup_extent_info(trans, root, bytenr, + wc->level - 1, 1, &refs, + &flags); + /* We don't care about errors in readahead. */ + if (ret < 0) + continue; BUG_ON(refs == 0); if (wc->stage == DROP_REFERENCE) { @@ -5083,14 +7473,13 @@ reada: generation); if (ret) break; - last = bytenr + blocksize; nread++; } wc->reada_slot = slot; } /* - * hepler to process tree block while walking down the tree. + * helper to process tree block while walking down the tree. * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. @@ -5120,10 +7509,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, - eb->start, eb->len, + eb->start, level, 1, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + BUG_ON(ret == -ENOMEM); + if (ret) + return ret; BUG_ON(wc->refs[level] == 0); } @@ -5132,7 +7523,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, return 1; if (path->locks[level] && !wc->keep_locks) { - btrfs_tree_unlock(eb); + btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; @@ -5141,13 +7532,14 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1); - BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, - eb->len, flag, 0); - BUG_ON(ret); + eb->len, flag, + btrfs_header_level(eb), 0); + BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -5156,14 +7548,14 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * keep the tree lock */ if (path->locks[level] && level > 0) { - btrfs_tree_unlock(eb); + btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; } /* - * hepler to process tree block pointer. + * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks * reference count of the block pointed to. if the block @@ -5211,16 +7603,25 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!next) return -ENOMEM; + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, + level - 1); reada = 1; } btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, &wc->refs[level - 1], &wc->flags[level - 1]); - BUG_ON(ret); - BUG_ON(wc->refs[level - 1] == 0); + if (ret < 0) { + btrfs_tree_unlock(next); + return ret; + } + + if (unlikely(wc->refs[level - 1] == 0)) { + btrfs_err(root->fs_info, "Missing references."); + BUG(); + } *lookup_info = 0; if (wc->stage == DROP_REFERENCE) { @@ -5248,7 +7649,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, goto skip; } - if (!btrfs_buffer_uptodate(next, generation)) { + if (!btrfs_buffer_uptodate(next, generation, 0)) { btrfs_tree_unlock(next); free_extent_buffer(next); next = NULL; @@ -5259,6 +7660,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, blocksize, generation); + if (!next || !extent_buffer_uptodate(next)) { + free_extent_buffer(next); + return -EIO; + } btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -5267,7 +7672,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, BUG_ON(level != btrfs_header_level(next)); path->nodes[level] = next; path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; @@ -5285,8 +7690,8 @@ skip: } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0); - BUG_ON(ret); + root->root_key.objectid, level - 1, 0, 0); + BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); free_extent_buffer(next); @@ -5295,7 +7700,7 @@ skip: } /* - * hepler to process tree block while walking up the tree. + * helper to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops * reference count on the block. @@ -5311,7 +7716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc) { - int ret = 0; + int ret; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; @@ -5338,16 +7743,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(level == 0); btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, - eb->start, eb->len, + eb->start, level, 1, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + return ret; + } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { - btrfs_tree_unlock(eb); + btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; return 1; } @@ -5360,17 +7769,19 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1); + ret = btrfs_dec_ref(trans, root, eb, 1, + wc->for_reloc); else - ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, eb, 0, + wc->for_reloc); + BUG_ON(ret); /* -ENOMEM */ } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } clean_tree_block(trans, root, eb); } @@ -5389,13 +7800,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, - root->root_key.objectid, level, 0); - BUG_ON(ret); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; - return ret; + return 0; } static noinline int walk_down_tree(struct btrfs_trans_handle *trans, @@ -5451,7 +7860,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, return 0; if (path->locks[level]) { - btrfs_tree_unlock(path->nodes[level]); + btrfs_tree_unlock_rw(path->nodes[level], + path->locks[level]); path->locks[level] = 0; } free_extent_buffer(path->nodes[level]); @@ -5472,8 +7882,12 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * reference count by one. if update_ref is true, this function * also make sure backrefs for the shared block and all lower level * blocks are properly updated. + * + * If called with for_reloc == 0, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5484,21 +7898,36 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) int err = 0; int ret; int level; + bool root_dropped = false; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + goto out; + } wc = kzalloc(sizeof(*wc), GFP_NOFS); - BUG_ON(!wc); + if (!wc) { + btrfs_free_path(path); + err = -ENOMEM; + goto out; + } + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } - trans = btrfs_start_transaction(tree_root, 1); + if (block_rsv) + trans->block_rsv = block_rsv; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); btrfs_set_lock_blocking(path->nodes[level]); path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { @@ -5513,7 +7942,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) path->lowest_level = 0; if (ret < 0) { err = ret; - goto out; + goto out_end_trans; } WARN_ON(ret > 0); @@ -5527,19 +7956,23 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) while (1) { btrfs_tree_lock(path->nodes[level]); btrfs_set_lock_blocking(path->nodes[level]); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, path->nodes[level]->start, - path->nodes[level]->len, - &wc->refs[level], + level, 1, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + err = ret; + goto out_end_trans; + } BUG_ON(wc->refs[level] == 0); if (level == root_item->drop_level) break; btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; WARN_ON(wc->refs[level] != 1); level--; } @@ -5550,9 +7983,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; + wc->for_reloc = for_reloc; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { + ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { err = ret; @@ -5579,52 +8014,86 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) } BUG_ON(wc->level == 0); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) { + if (btrfs_should_end_transaction(trans, tree_root) || + (!for_reloc && btrfs_need_cleaner_sleep(root))) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } - btrfs_end_transaction(trans, tree_root); - trans = btrfs_start_transaction(tree_root, 1); - } else { - unsigned long update; - update = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (update) - btrfs_run_delayed_refs(trans, tree_root, - update); + btrfs_end_transaction_throttle(trans, tree_root); + if (!for_reloc && btrfs_need_cleaner_sleep(root)) { + pr_debug("BTRFS: drop snapshot early exit\n"); + err = -EAGAIN; + goto out_free; + } + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + if (block_rsv) + trans->block_rsv = block_rsv; } } - btrfs_release_path(root, path); - BUG_ON(err); + btrfs_release_path(path); + if (err) + goto out_end_trans; ret = btrfs_del_root(trans, tree_root, &root->root_key); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + goto out_end_trans; + } if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_find_last_root(tree_root, root->root_key.objectid, - NULL, NULL); - BUG_ON(ret < 0); - if (ret > 0) { - ret = btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); - BUG_ON(ret); + ret = btrfs_find_root(tree_root, &root->root_key, path, + NULL, NULL); + if (ret < 0) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } else if (ret > 0) { + /* if we fail to delete the orphan item this time + * around, it'll get picked up the next time. + * + * The most common failure here is just -ENOENT. + */ + btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); } } - if (root->in_radix) { - btrfs_free_fs_root(tree_root->fs_info, root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + btrfs_drop_and_free_fs_root(tree_root->fs_info, root); } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); - kfree(root); + btrfs_put_fs_root(root); } -out: - btrfs_end_transaction(trans, tree_root); + root_dropped = true; +out_end_trans: + btrfs_end_transaction_throttle(trans, tree_root); +out_free: kfree(wc); btrfs_free_path(path); +out: + /* + * So if we need to stop dropping the snapshot for whatever reason we + * need to make sure to add it back to the dead root list so that we + * keep trying to do the work later. This also cleans up roots if we + * don't have it in the radix (like when we recover after a power fail + * or unmount) so we don't leak memory. + */ + if (!for_reloc && root_dropped == false) + btrfs_add_dead_root(root); + if (err && err != -EAGAIN) + btrfs_std_error(root->fs_info, err); return err; } @@ -5632,6 +8101,7 @@ out: * drop subtree rooted at tree block 'node'. * * NOTE: this function will unlock and release tree block 'node' + * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5648,10 +8118,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - BUG_ON(!wc); + if (!wc) { + btrfs_free_path(path); + return -ENOMEM; + } btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); @@ -5663,7 +8137,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -5672,6 +8146,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; + wc->for_reloc = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -5693,1574 +8168,221 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -#if 0 -static unsigned long calc_ra(unsigned long start, unsigned long last, - unsigned long nr) -{ - return min(last, start + nr - 1); -} - -static noinline int relocate_inode_pages(struct inode *inode, u64 start, - u64 len) +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { - u64 page_start; - u64 page_end; - unsigned long first_index; - unsigned long last_index; - unsigned long i; - struct page *page; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct file_ra_state *ra; - struct btrfs_ordered_extent *ordered; - unsigned int total_read = 0; - unsigned int total_dirty = 0; - int ret = 0; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); + u64 num_devices; + u64 stripped; - mutex_lock(&inode->i_mutex); - first_index = start >> PAGE_CACHE_SHIFT; - last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + /* + * if restripe for this chunk_type is on pick target profile and + * return, otherwise do the usual balance + */ + stripped = get_restripe_target(root->fs_info, flags); + if (stripped) + return extended_to_chunk(stripped); - /* make sure the dirty trick played by the caller work */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); - if (ret) - goto out_unlock; + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. + */ + num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; - file_ra_state_init(ra, inode->i_mapping); + stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - for (i = first_index ; i <= last_index; i++) { - if (total_read % ra->ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, ra, NULL, i, - calc_ra(i, last_index, ra->ra_pages)); - } - total_read++; -again: - if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) - BUG_ON(1); - page = grab_cache_page(inode->i_mapping, i); - if (!page) { - ret = -ENOMEM; - goto out_unlock; - } - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - ret = -EIO; - goto out_unlock; - } - } - wait_on_page_writeback(page); + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; - page_start = (u64)page->index << PAGE_CACHE_SHIFT; - page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } - set_page_extent_mapped(page); + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; - if (i == first_index) - set_extent_bits(io_tree, page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); - btrfs_set_extent_delalloc(inode, page_start, page_end); + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; - set_page_dirty(page); - total_dirty++; + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); + /* this is drive concat, leave it alone */ } -out_unlock: - kfree(ra); - mutex_unlock(&inode->i_mutex); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); - return ret; + return flags; } -static noinline int relocate_data_extent(struct inode *reloc_inode, - struct btrfs_key *extent_key, - u64 offset) +static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) { - struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; - struct extent_map *em; - u64 start = extent_key->objectid - offset; - u64 end = start + extent_key->offset - 1; - - em = alloc_extent_map(GFP_NOFS); - BUG_ON(!em || IS_ERR(em)); - - em->start = start; - em->len = extent_key->offset; - em->block_len = extent_key->offset; - em->block_start = extent_key->objectid; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - /* setup extent map to cheat btrfs_readpage */ - lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); - while (1) { - int ret; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(reloc_inode, start, end, 0); - } - unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); - - return relocate_inode_pages(reloc_inode, start, extent_key->offset); -} - -struct btrfs_ref_path { - u64 extent_start; - u64 nodes[BTRFS_MAX_LEVEL]; - u64 root_objectid; - u64 root_generation; - u64 owner_objectid; - u32 num_refs; - int lowest_level; - int current_level; - int shared_level; - - struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; - u64 new_nodes[BTRFS_MAX_LEVEL]; -}; - -struct disk_extent { - u64 ram_bytes; - u64 disk_bytenr; - u64 disk_num_bytes; - u64 offset; + struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; - u8 compression; - u8 encryption; - u16 other_encoding; -}; - -static int is_cowonly_root(u64 root_objectid) -{ - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || - root_objectid == BTRFS_EXTENT_TREE_OBJECTID || - root_objectid == BTRFS_CHUNK_TREE_OBJECTID || - root_objectid == BTRFS_DEV_TREE_OBJECTID || - root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) - return 1; - return 0; -} - -static noinline int __next_ref_path(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_ref_path *ref_path, - int first_time) -{ - struct extent_buffer *leaf; - struct btrfs_path *path; - struct btrfs_extent_ref *ref; - struct btrfs_key key; - struct btrfs_key found_key; - u64 bytenr; - u32 nritems; - int level; - int ret = 1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - if (first_time) { - ref_path->lowest_level = -1; - ref_path->current_level = -1; - ref_path->shared_level = -1; - goto walk_up; - } -walk_down: - level = ref_path->current_level - 1; - while (level >= -1) { - u64 parent; - if (level < ref_path->lowest_level) - break; - - if (level >= 0) - bytenr = ref_path->nodes[level]; - else - bytenr = ref_path->extent_start; - BUG_ON(bytenr == 0); + u64 min_allocable_bytes; + int ret = -ENOSPC; - parent = ref_path->nodes[level + 1]; - ref_path->nodes[level + 1] = 0; - ref_path->current_level = level; - BUG_ON(parent == 0); - key.objectid = bytenr; - key.offset = parent + 1; - key.type = BTRFS_EXTENT_REF_KEY; - - ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret == 0); + /* + * We need some metadata space and system metadata space for + * allocating chunks in some corner cases until we force to set + * it to be readonly. + */ + if ((sinfo->flags & + (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && + !force) + min_allocable_bytes = 1 * 1024 * 1024; + else + min_allocable_bytes = 0; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret > 0) - goto next; - leaf = path->nodes[0]; - } + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid == bytenr && - found_key.type == BTRFS_EXTENT_REF_KEY) { - if (level < ref_path->shared_level) - ref_path->shared_level = level; - goto found; - } -next: - level--; - btrfs_release_path(extent_root, path); - cond_resched(); + if (cache->ro) { + ret = 0; + goto out; } - /* reached lowest level */ - ret = 1; - goto out; -walk_up: - level = ref_path->current_level; - while (level < BTRFS_MAX_LEVEL - 1) { - u64 ref_objectid; - - if (level >= 0) - bytenr = ref_path->nodes[level]; - else - bytenr = ref_path->extent_start; - - BUG_ON(bytenr == 0); - - key.objectid = bytenr; - key.offset = 0; - key.type = BTRFS_EXTENT_REF_KEY; - - ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret > 0) { - /* the extent was freed by someone */ - if (ref_path->lowest_level == level) - goto out; - btrfs_release_path(extent_root, path); - goto walk_down; - } - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != bytenr || - found_key.type != BTRFS_EXTENT_REF_KEY) { - /* the extent was freed by someone */ - if (ref_path->lowest_level == level) { - ret = 1; - goto out; - } - btrfs_release_path(extent_root, path); - goto walk_down; - } -found: - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - ref_objectid = btrfs_ref_objectid(leaf, ref); - if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { - if (first_time) { - level = (int)ref_objectid; - BUG_ON(level >= BTRFS_MAX_LEVEL); - ref_path->lowest_level = level; - ref_path->current_level = level; - ref_path->nodes[level] = bytenr; - } else { - WARN_ON(ref_objectid != level); - } - } else { - WARN_ON(level != -1); - } - first_time = 0; - - if (ref_path->lowest_level == level) { - ref_path->owner_objectid = ref_objectid; - ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); - } - - /* - * the block is tree root or the block isn't in reference - * counted tree. - */ - if (found_key.objectid == found_key.offset || - is_cowonly_root(btrfs_ref_root(leaf, ref))) { - ref_path->root_objectid = btrfs_ref_root(leaf, ref); - ref_path->root_generation = - btrfs_ref_generation(leaf, ref); - if (level < 0) { - /* special reference from the tree log */ - ref_path->nodes[0] = found_key.offset; - ref_path->current_level = 0; - } - ret = 0; - goto out; - } - - level++; - BUG_ON(ref_path->nodes[level] != 0); - ref_path->nodes[level] = found_key.offset; - ref_path->current_level = level; - /* - * the reference was created in the running transaction, - * no need to continue walking up. - */ - if (btrfs_ref_generation(leaf, ref) == trans->transid) { - ref_path->root_objectid = btrfs_ref_root(leaf, ref); - ref_path->root_generation = - btrfs_ref_generation(leaf, ref); - ret = 0; - goto out; - } + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); - btrfs_release_path(extent_root, path); - cond_resched(); + if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + cache->ro = 1; + ret = 0; } - /* reached max tree level, but no tree root found. */ - BUG(); out: - btrfs_free_path(path); + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); return ret; } -static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_ref_path *ref_path, - u64 extent_start) -{ - memset(ref_path, 0, sizeof(*ref_path)); - ref_path->extent_start = extent_start; - - return __next_ref_path(trans, extent_root, ref_path, 1); -} - -static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_ref_path *ref_path) -{ - return __next_ref_path(trans, extent_root, ref_path, 0); -} +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) -static noinline int get_new_locations(struct inode *reloc_inode, - struct btrfs_key *extent_key, - u64 offset, int no_fragment, - struct disk_extent **extents, - int *nr_extents) { - struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - struct disk_extent *exts = *extents; - struct btrfs_key found_key; - u64 cur_pos; - u64 last_byte; - u32 nritems; - int nr = 0; - int max = *nr_extents; + struct btrfs_trans_handle *trans; + u64 alloc_flags; int ret; - WARN_ON(!no_fragment && *extents); - if (!exts) { - max = 1; - exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); - if (!exts) - return -ENOMEM; - } - - path = btrfs_alloc_path(); - BUG_ON(!path); - - cur_pos = extent_key->objectid - offset; - last_byte = extent_key->objectid + extent_key->offset; - ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, - cur_pos, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.offset != cur_pos || - found_key.type != BTRFS_EXTENT_DATA_KEY || - found_key.objectid != reloc_inode->i_ino) - break; - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) != - BTRFS_FILE_EXTENT_REG || - btrfs_file_extent_disk_bytenr(leaf, fi) == 0) - break; + BUG_ON(cache->ro); - if (nr == max) { - struct disk_extent *old = exts; - max *= 2; - exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); - memcpy(exts, old, sizeof(*exts) * nr); - if (old != *extents) - kfree(old); - } - - exts[nr].disk_bytenr = - btrfs_file_extent_disk_bytenr(leaf, fi); - exts[nr].disk_num_bytes = - btrfs_file_extent_disk_num_bytes(leaf, fi); - exts[nr].offset = btrfs_file_extent_offset(leaf, fi); - exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - exts[nr].compression = btrfs_file_extent_compression(leaf, fi); - exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); - exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, - fi); - BUG_ON(exts[nr].offset > 0); - BUG_ON(exts[nr].compression || exts[nr].encryption); - BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); - - cur_pos += exts[nr].num_bytes; - nr++; - - if (cur_pos + offset >= last_byte) - break; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); - if (no_fragment) { - ret = 1; + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); + if (ret < 0) goto out; - } - path->slots[0]++; } - BUG_ON(cur_pos + offset > last_byte); - if (cur_pos + offset < last_byte) { - ret = -ENOENT; + ret = set_block_group_ro(cache, 0); + if (!ret) goto out; - } - ret = 0; + alloc_flags = get_alloc_profile(root, cache->space_info->flags); + ret = do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + ret = set_block_group_ro(cache, 0); out: - btrfs_free_path(path); - if (ret) { - if (exts != *extents) - kfree(exts); - } else { - *extents = exts; - *nr_extents = nr; - } + btrfs_end_transaction(trans, root); return ret; } -static noinline int replace_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *extent_key, - struct btrfs_key *leaf_key, - struct btrfs_ref_path *ref_path, - struct disk_extent *new_extents, - int nr_extents) +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) { - struct extent_buffer *leaf; - struct btrfs_file_extent_item *fi; - struct inode *inode = NULL; - struct btrfs_key key; - u64 lock_start = 0; - u64 lock_end = 0; - u64 num_bytes; - u64 ext_offset; - u64 search_end = (u64)-1; - u32 nritems; - int nr_scaned = 0; - int extent_locked = 0; - int extent_type; - int ret; - - memcpy(&key, leaf_key, sizeof(key)); - if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { - if (key.objectid < ref_path->owner_objectid || - (key.objectid == ref_path->owner_objectid && - key.type < BTRFS_EXTENT_DATA_KEY)) { - key.objectid = ref_path->owner_objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; - } - } - - while (1) { - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); -next: - if (extent_locked && ret > 0) { - /* - * the file extent item was modified by someone - * before the extent got locked. - */ - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, - lock_end, GFP_NOFS); - extent_locked = 0; - } - - if (path->slots[0] >= nritems) { - if (++nr_scaned > 2) - break; - - BUG_ON(extent_locked); - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { - if ((key.objectid > ref_path->owner_objectid) || - (key.objectid == ref_path->owner_objectid && - key.type > BTRFS_EXTENT_DATA_KEY) || - key.offset >= search_end) - break; - } - - if (inode && key.objectid != inode->i_ino) { - BUG_ON(extent_locked); - btrfs_release_path(root, path); - mutex_unlock(&inode->i_mutex); - iput(inode); - inode = NULL; - continue; - } - - if (key.type != BTRFS_EXTENT_DATA_KEY) { - path->slots[0]++; - ret = 1; - goto next; - } - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - if ((extent_type != BTRFS_FILE_EXTENT_REG && - extent_type != BTRFS_FILE_EXTENT_PREALLOC) || - (btrfs_file_extent_disk_bytenr(leaf, fi) != - extent_key->objectid)) { - path->slots[0]++; - ret = 1; - goto next; - } - - num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - ext_offset = btrfs_file_extent_offset(leaf, fi); - - if (search_end == (u64)-1) { - search_end = key.offset - ext_offset + - btrfs_file_extent_ram_bytes(leaf, fi); - } - - if (!extent_locked) { - lock_start = key.offset; - lock_end = lock_start + num_bytes - 1; - } else { - if (lock_start > key.offset || - lock_end + 1 < key.offset + num_bytes) { - unlock_extent(&BTRFS_I(inode)->io_tree, - lock_start, lock_end, GFP_NOFS); - extent_locked = 0; - } - } - - if (!inode) { - btrfs_release_path(root, path); - - inode = btrfs_iget_locked(root->fs_info->sb, - key.objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = - key.objectid; - BTRFS_I(inode)->location.type = - BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - /* - * some code call btrfs_commit_transaction while - * holding the i_mutex, so we can't use mutex_lock - * here. - */ - if (is_bad_inode(inode) || - !mutex_trylock(&inode->i_mutex)) { - iput(inode); - inode = NULL; - key.offset = (u64)-1; - goto skip; - } - } - - if (!extent_locked) { - struct btrfs_ordered_extent *ordered; - - btrfs_release_path(root, path); - - lock_extent(&BTRFS_I(inode)->io_tree, lock_start, - lock_end, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - lock_end); - if (ordered && - ordered->file_offset <= lock_end && - ordered->file_offset + ordered->len > lock_start) { - unlock_extent(&BTRFS_I(inode)->io_tree, - lock_start, lock_end, GFP_NOFS); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - key.offset += num_bytes; - goto skip; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - - extent_locked = 1; - continue; - } - - if (nr_extents == 1) { - /* update extent pointer in place */ - btrfs_set_file_extent_disk_bytenr(leaf, fi, - new_extents[0].disk_bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, - new_extents[0].disk_num_bytes); - btrfs_mark_buffer_dirty(leaf); - - btrfs_drop_extent_cache(inode, key.offset, - key.offset + num_bytes - 1, 0); - - ret = btrfs_inc_extent_ref(trans, root, - new_extents[0].disk_bytenr, - new_extents[0].disk_num_bytes, - leaf->start, - root->root_key.objectid, - trans->transid, - key.objectid); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, root, - extent_key->objectid, - extent_key->offset, - leaf->start, - btrfs_header_owner(leaf), - btrfs_header_generation(leaf), - key.objectid, 0); - BUG_ON(ret); - - btrfs_release_path(root, path); - key.offset += num_bytes; - } else { - BUG_ON(1); -#if 0 - u64 alloc_hint; - u64 extent_len; - int i; - /* - * drop old extent pointer at first, then insert the - * new pointers one bye one - */ - btrfs_release_path(root, path); - ret = btrfs_drop_extents(trans, root, inode, key.offset, - key.offset + num_bytes, - key.offset, &alloc_hint); - BUG_ON(ret); - - for (i = 0; i < nr_extents; i++) { - if (ext_offset >= new_extents[i].num_bytes) { - ext_offset -= new_extents[i].num_bytes; - continue; - } - extent_len = min(new_extents[i].num_bytes - - ext_offset, num_bytes); - - ret = btrfs_insert_empty_item(trans, root, - path, &key, - sizeof(*fi)); - BUG_ON(ret); - - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, - trans->transid); - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_disk_bytenr(leaf, fi, - new_extents[i].disk_bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, - new_extents[i].disk_num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, fi, - new_extents[i].ram_bytes); - - btrfs_set_file_extent_compression(leaf, fi, - new_extents[i].compression); - btrfs_set_file_extent_encryption(leaf, fi, - new_extents[i].encryption); - btrfs_set_file_extent_other_encoding(leaf, fi, - new_extents[i].other_encoding); - - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_len); - ext_offset += new_extents[i].offset; - btrfs_set_file_extent_offset(leaf, fi, - ext_offset); - btrfs_mark_buffer_dirty(leaf); - - btrfs_drop_extent_cache(inode, key.offset, - key.offset + extent_len - 1, 0); - - ret = btrfs_inc_extent_ref(trans, root, - new_extents[i].disk_bytenr, - new_extents[i].disk_num_bytes, - leaf->start, - root->root_key.objectid, - trans->transid, key.objectid); - BUG_ON(ret); - btrfs_release_path(root, path); - - inode_add_bytes(inode, extent_len); - - ext_offset = 0; - num_bytes -= extent_len; - key.offset += extent_len; - - if (num_bytes == 0) - break; - } - BUG_ON(i >= nr_extents); -#endif - } - - if (extent_locked) { - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, - lock_end, GFP_NOFS); - extent_locked = 0; - } -skip: - if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && - key.offset >= search_end) - break; - - cond_resched(); - } - ret = 0; -out: - btrfs_release_path(root, path); - if (inode) { - mutex_unlock(&inode->i_mutex); - if (extent_locked) { - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, - lock_end, GFP_NOFS); - } - iput(inode); - } - return ret; + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); } -int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, u64 orig_start) -{ - int level; - int ret; - - BUG_ON(btrfs_header_generation(buf) != trans->transid); - BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - - level = btrfs_header_level(buf); - if (level == 0) { - struct btrfs_leaf_ref *ref; - struct btrfs_leaf_ref *orig_ref; - - orig_ref = btrfs_lookup_leaf_ref(root, orig_start); - if (!orig_ref) - return -ENOENT; - - ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); - if (!ref) { - btrfs_free_leaf_ref(root, orig_ref); - return -ENOMEM; - } - - ref->nritems = orig_ref->nritems; - memcpy(ref->extents, orig_ref->extents, - sizeof(ref->extents[0]) * ref->nritems); - - btrfs_free_leaf_ref(root, orig_ref); - - ref->root_gen = trans->transid; - ref->bytenr = buf->start; - ref->owner = btrfs_header_owner(buf); - ref->generation = btrfs_header_generation(buf); - - ret = btrfs_add_leaf_ref(root, ref, 0); - WARN_ON(ret); - btrfs_free_leaf_ref(root, ref); - } - return 0; -} - -static noinline int invalidate_extent_cache(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_block_group_cache *group, - struct btrfs_root *target_root) -{ - struct btrfs_key key; - struct inode *inode = NULL; - struct btrfs_file_extent_item *fi; - struct extent_state *cached_state = NULL; - u64 num_bytes; - u64 skip_objectid = 0; - u32 nritems; - u32 i; - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - btrfs_item_key_to_cpu(leaf, &key, i); - if (key.objectid == skip_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) - continue; - if (!inode || inode->i_ino != key.objectid) { - iput(inode); - inode = btrfs_ilookup(target_root->fs_info->sb, - key.objectid, target_root, 1); - } - if (!inode) { - skip_objectid = key.objectid; - continue; - } - num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset, - key.offset + num_bytes - 1, 0, &cached_state, - GFP_NOFS); - btrfs_drop_extent_cache(inode, key.offset, - key.offset + num_bytes - 1, 1); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset, - key.offset + num_bytes - 1, &cached_state, - GFP_NOFS); - cond_resched(); - } - iput(inode); - return 0; -} - -static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_block_group_cache *group, - struct inode *reloc_inode) +/* + * helper to account the unused space of all the readonly block group in the + * list. takes mirrors into account. + */ +static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) { - struct btrfs_key key; - struct btrfs_key extent_key; - struct btrfs_file_extent_item *fi; - struct btrfs_leaf_ref *ref; - struct disk_extent *new_extent; - u64 bytenr; - u64 num_bytes; - u32 nritems; - u32 i; - int ext_index; - int nr_extent; - int ret; - - new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); - BUG_ON(!new_extent); - - ref = btrfs_lookup_leaf_ref(root, leaf->start); - BUG_ON(!ref); - - ext_index = -1; - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - btrfs_item_key_to_cpu(leaf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - if (bytenr == 0) - continue; + struct btrfs_block_group_cache *block_group; + u64 free_bytes = 0; + int factor; - ext_index++; - if (bytenr >= group->key.objectid + group->key.offset || - bytenr + num_bytes <= group->key.objectid) - continue; + list_for_each_entry(block_group, groups_list, list) { + spin_lock(&block_group->lock); - extent_key.objectid = bytenr; - extent_key.offset = num_bytes; - extent_key.type = BTRFS_EXTENT_ITEM_KEY; - nr_extent = 1; - ret = get_new_locations(reloc_inode, &extent_key, - group->key.objectid, 1, - &new_extent, &nr_extent); - if (ret > 0) + if (!block_group->ro) { + spin_unlock(&block_group->lock); continue; - BUG_ON(ret < 0); - - BUG_ON(ref->extents[ext_index].bytenr != bytenr); - BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); - ref->extents[ext_index].bytenr = new_extent->disk_bytenr; - ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; - - btrfs_set_file_extent_disk_bytenr(leaf, fi, - new_extent->disk_bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, - new_extent->disk_num_bytes); - btrfs_mark_buffer_dirty(leaf); - - ret = btrfs_inc_extent_ref(trans, root, - new_extent->disk_bytenr, - new_extent->disk_num_bytes, - leaf->start, - root->root_key.objectid, - trans->transid, key.objectid); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, root, - bytenr, num_bytes, leaf->start, - btrfs_header_owner(leaf), - btrfs_header_generation(leaf), - key.objectid, 0); - BUG_ON(ret); - cond_resched(); - } - kfree(new_extent); - BUG_ON(ext_index + 1 != ref->nritems); - btrfs_free_leaf_ref(root, ref); - return 0; -} - -int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - int ret; - - if (root->reloc_root) { - reloc_root = root->reloc_root; - root->reloc_root = NULL; - list_add(&reloc_root->dead_list, - &root->fs_info->dead_reloc_roots); - - btrfs_set_root_bytenr(&reloc_root->root_item, - reloc_root->node->start); - btrfs_set_root_level(&root->root_item, - btrfs_header_level(reloc_root->node)); - memset(&reloc_root->root_item.drop_progress, 0, - sizeof(struct btrfs_disk_key)); - reloc_root->root_item.drop_level = 0; - - ret = btrfs_update_root(trans, root->fs_info->tree_root, - &reloc_root->root_key, - &reloc_root->root_item); - BUG_ON(ret); - } - return 0; -} - -int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *reloc_root; - struct btrfs_root *prev_root = NULL; - struct list_head dead_roots; - int ret; - unsigned long nr; - - INIT_LIST_HEAD(&dead_roots); - list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); - - while (!list_empty(&dead_roots)) { - reloc_root = list_entry(dead_roots.prev, - struct btrfs_root, dead_list); - list_del_init(&reloc_root->dead_list); - - BUG_ON(reloc_root->commit_root != NULL); - while (1) { - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - - mutex_lock(&root->fs_info->drop_mutex); - ret = btrfs_drop_snapshot(trans, reloc_root); - if (ret != -EAGAIN) - break; - mutex_unlock(&root->fs_info->drop_mutex); - - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); - btrfs_btree_balance_dirty(root, nr); } - free_extent_buffer(reloc_root->node); - - ret = btrfs_del_root(trans, root->fs_info->tree_root, - &reloc_root->root_key); - BUG_ON(ret); - mutex_unlock(&root->fs_info->drop_mutex); - - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); - btrfs_btree_balance_dirty(root, nr); - - kfree(prev_root); - prev_root = reloc_root; - } - if (prev_root) { - btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); - kfree(prev_root); - } - return 0; -} - -int btrfs_add_dead_reloc_root(struct btrfs_root *root) -{ - list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); - return 0; -} - -int btrfs_cleanup_reloc_trees(struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct btrfs_trans_handle *trans; - struct btrfs_key location; - int found; - int ret; + if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) + factor = 2; + else + factor = 1; - mutex_lock(&root->fs_info->tree_reloc_mutex); - ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); - BUG_ON(ret); - found = !list_empty(&root->fs_info->dead_reloc_roots); - mutex_unlock(&root->fs_info->tree_reloc_mutex); + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; - if (found) { - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + spin_unlock(&block_group->lock); } - location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - location.offset = (u64)-1; - location.type = BTRFS_ROOT_ITEM_KEY; - - reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); - BUG_ON(!reloc_root); - btrfs_orphan_cleanup(reloc_root); - return 0; -} - -static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct extent_buffer *eb; - struct btrfs_root_item *root_item; - struct btrfs_key root_key; - int ret; - - BUG_ON(!root->ref_cows); - if (root->reloc_root) - return 0; - - root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); - - ret = btrfs_copy_root(trans, root, root->commit_root, - &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); - - root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; - root_key.offset = root->root_key.objectid; - root_key.type = BTRFS_ROOT_ITEM_KEY; - - memcpy(root_item, &root->root_item, sizeof(root_item)); - btrfs_set_root_refs(root_item, 0); - btrfs_set_root_bytenr(root_item, eb->start); - btrfs_set_root_level(root_item, btrfs_header_level(eb)); - btrfs_set_root_generation(root_item, trans->transid); - - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - - ret = btrfs_insert_root(trans, root->fs_info->tree_root, - &root_key, root_item); - BUG_ON(ret); - kfree(root_item); - - reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &root_key); - BUG_ON(!reloc_root); - reloc_root->last_trans = trans->transid; - reloc_root->commit_root = NULL; - reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; - - root->reloc_root = reloc_root; - return 0; + return free_bytes; } /* - * Core function of space balance. - * - * The idea is using reloc trees to relocate tree blocks in reference - * counted roots. There is one reloc tree for each subvol, and all - * reloc trees share same root key objectid. Reloc trees are snapshots - * of the latest committed roots of subvols (root->commit_root). - * - * To relocate a tree block referenced by a subvol, there are two steps. - * COW the block through subvol's reloc tree, then update block pointer - * in the subvol to point to the new block. Since all reloc trees share - * same root key objectid, doing special handing for tree blocks owned - * by them is easy. Once a tree block has been COWed in one reloc tree, - * we can use the resulting new block directly when the same block is - * required to COW again through other reloc trees. By this way, relocated - * tree blocks are shared between reloc trees, so they are also shared - * between subvols. + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. */ -static noinline int relocate_one_path(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *first_key, - struct btrfs_ref_path *ref_path, - struct btrfs_block_group_cache *group, - struct inode *reloc_inode) -{ - struct btrfs_root *reloc_root; - struct extent_buffer *eb = NULL; - struct btrfs_key *keys; - u64 *nodes; - int level; - int shared_level; - int lowest_level = 0; - int ret; - - if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) - lowest_level = ref_path->owner_objectid; - - if (!root->ref_cows) { - path->lowest_level = lowest_level; - ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); - BUG_ON(ret < 0); - path->lowest_level = 0; - btrfs_release_path(root, path); - return 0; - } - - mutex_lock(&root->fs_info->tree_reloc_mutex); - ret = init_reloc_tree(trans, root); - BUG_ON(ret); - reloc_root = root->reloc_root; - - shared_level = ref_path->shared_level; - ref_path->shared_level = BTRFS_MAX_LEVEL - 1; - - keys = ref_path->node_keys; - nodes = ref_path->new_nodes; - memset(&keys[shared_level + 1], 0, - sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); - memset(&nodes[shared_level + 1], 0, - sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); - - if (nodes[lowest_level] == 0) { - path->lowest_level = lowest_level; - ret = btrfs_search_slot(trans, reloc_root, first_key, path, - 0, 1); - BUG_ON(ret); - for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { - eb = path->nodes[level]; - if (!eb || eb == reloc_root->node) - break; - nodes[level] = eb->start; - if (level == 0) - btrfs_item_key_to_cpu(eb, &keys[level], 0); - else - btrfs_node_key_to_cpu(eb, &keys[level], 0); - } - if (nodes[0] && - ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { - eb = path->nodes[0]; - ret = replace_extents_in_leaf(trans, reloc_root, eb, - group, reloc_inode); - BUG_ON(ret); - } - btrfs_release_path(reloc_root, path); - } else { - ret = btrfs_merge_path(trans, reloc_root, keys, nodes, - lowest_level); - BUG_ON(ret); - } - - /* - * replace tree blocks in the fs tree with tree blocks in - * the reloc tree. - */ - ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); - BUG_ON(ret < 0); - - if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_search_slot(trans, reloc_root, first_key, path, - 0, 0); - BUG_ON(ret); - extent_buffer_get(path->nodes[0]); - eb = path->nodes[0]; - btrfs_release_path(reloc_root, path); - ret = invalidate_extent_cache(reloc_root, eb, group, root); - BUG_ON(ret); - free_extent_buffer(eb); - } - - mutex_unlock(&root->fs_info->tree_reloc_mutex); - path->lowest_level = 0; - return 0; -} - -static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *first_key, - struct btrfs_ref_path *ref_path) -{ - int ret; - - ret = relocate_one_path(trans, root, path, first_key, - ref_path, NULL, NULL); - BUG_ON(ret); - - return 0; -} - -static noinline int del_extent_zero(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_path *path, - struct btrfs_key *extent_key) -{ - int ret; - - ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); - if (ret) - goto out; - ret = btrfs_del_item(trans, extent_root, path); -out: - btrfs_release_path(extent_root, path); - return ret; -} - -static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, - struct btrfs_ref_path *ref_path) -{ - struct btrfs_key root_key; - - root_key.objectid = ref_path->root_objectid; - root_key.type = BTRFS_ROOT_ITEM_KEY; - if (is_cowonly_root(ref_path->root_objectid)) - root_key.offset = 0; - else - root_key.offset = (u64)-1; - - return btrfs_read_fs_root_no_name(fs_info, &root_key); -} - -static noinline int relocate_one_extent(struct btrfs_root *extent_root, - struct btrfs_path *path, - struct btrfs_key *extent_key, - struct btrfs_block_group_cache *group, - struct inode *reloc_inode, int pass) +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { - struct btrfs_trans_handle *trans; - struct btrfs_root *found_root; - struct btrfs_ref_path *ref_path = NULL; - struct disk_extent *new_extents = NULL; - int nr_extents = 0; - int loops; - int ret; - int level; - struct btrfs_key first_key; - u64 prev_block = 0; - - - trans = btrfs_start_transaction(extent_root, 1); - BUG_ON(!trans); - - if (extent_key->objectid == 0) { - ret = del_extent_zero(trans, extent_root, path, extent_key); - goto out; - } - - ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); - if (!ref_path) { - ret = -ENOMEM; - goto out; - } - - for (loops = 0; ; loops++) { - if (loops == 0) { - ret = btrfs_first_ref_path(trans, extent_root, ref_path, - extent_key->objectid); - } else { - ret = btrfs_next_ref_path(trans, extent_root, ref_path); - } - if (ret < 0) - goto out; - if (ret > 0) - break; - - if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || - ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) - continue; - - found_root = read_ref_root(extent_root->fs_info, ref_path); - BUG_ON(!found_root); - /* - * for reference counted tree, only process reference paths - * rooted at the latest committed root. - */ - if (found_root->ref_cows && - ref_path->root_generation != found_root->root_key.offset) - continue; - - if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { - if (pass == 0) { - /* - * copy data extents to new locations - */ - u64 group_start = group->key.objectid; - ret = relocate_data_extent(reloc_inode, - extent_key, - group_start); - if (ret < 0) - goto out; - break; - } - level = 0; - } else { - level = ref_path->owner_objectid; - } - - if (prev_block != ref_path->nodes[level]) { - struct extent_buffer *eb; - u64 block_start = ref_path->nodes[level]; - u64 block_size = btrfs_level_size(found_root, level); - - eb = read_tree_block(found_root, block_start, - block_size, 0); - btrfs_tree_lock(eb); - BUG_ON(level != btrfs_header_level(eb)); - - if (level == 0) - btrfs_item_key_to_cpu(eb, &first_key, 0); - else - btrfs_node_key_to_cpu(eb, &first_key, 0); - - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - prev_block = block_start; - } - - mutex_lock(&extent_root->fs_info->trans_mutex); - btrfs_record_root_in_trans(found_root); - mutex_unlock(&extent_root->fs_info->trans_mutex); - if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { - /* - * try to update data extent references while - * keeping metadata shared between snapshots. - */ - if (pass == 1) { - ret = relocate_one_path(trans, found_root, - path, &first_key, ref_path, - group, reloc_inode); - if (ret < 0) - goto out; - continue; - } - /* - * use fallback method to process the remaining - * references. - */ - if (!new_extents) { - u64 group_start = group->key.objectid; - new_extents = kmalloc(sizeof(*new_extents), - GFP_NOFS); - nr_extents = 1; - ret = get_new_locations(reloc_inode, - extent_key, - group_start, 1, - &new_extents, - &nr_extents); - if (ret) - goto out; - } - ret = replace_one_extent(trans, found_root, - path, extent_key, - &first_key, ref_path, - new_extents, nr_extents); - } else { - ret = relocate_tree_block(trans, found_root, path, - &first_key, ref_path); - } - if (ret < 0) - goto out; - } - ret = 0; -out: - btrfs_end_transaction(trans, extent_root); - kfree(new_extents); - kfree(ref_path); - return ret; -} -#endif - -static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) -{ - u64 num_devices; - u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - - num_devices = root->fs_info->fs_devices->rw_devices; - if (num_devices == 1) { - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* turn raid0 into single device chunks */ - if (flags & BTRFS_BLOCK_GROUP_RAID0) - return stripped; + int i; + u64 free_bytes = 0; - /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - return stripped | BTRFS_BLOCK_GROUP_DUP; - return flags; - } else { - /* they already had raid on here, just return */ - if (flags & stripped) - return flags; + spin_lock(&sinfo->lock); - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + if (!list_empty(&sinfo->block_groups[i])) + free_bytes += __btrfs_get_ro_block_group_free_space( + &sinfo->block_groups[i]); - /* switch duplicated blocks with raid1 */ - if (flags & BTRFS_BLOCK_GROUP_DUP) - return stripped | BTRFS_BLOCK_GROUP_RAID1; + spin_unlock(&sinfo->lock); - /* turn single device chunks into raid0 */ - return stripped | BTRFS_BLOCK_GROUP_RAID0; - } - return flags; + return free_bytes; } -static int __alloc_chunk_for_shrink(struct btrfs_root *root, - struct btrfs_block_group_cache *shrink_block_group, - int force) +void btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) { - struct btrfs_trans_handle *trans; - u64 new_alloc_flags; - u64 calc; - - spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) + - shrink_block_group->reserved > 0) { - spin_unlock(&shrink_block_group->lock); - - trans = btrfs_start_transaction(root, 1); - spin_lock(&shrink_block_group->lock); - - new_alloc_flags = update_block_group_flags(root, - shrink_block_group->flags); - if (new_alloc_flags != shrink_block_group->flags) { - calc = - btrfs_block_group_used(&shrink_block_group->item); - } else { - calc = shrink_block_group->key.offset; - } - spin_unlock(&shrink_block_group->lock); - - do_chunk_alloc(trans, root->fs_info->extent_root, - calc + 2 * 1024 * 1024, new_alloc_flags, force); - - btrfs_end_transaction(trans, root); - } else - spin_unlock(&shrink_block_group->lock); - return 0; -} - + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group) + BUG_ON(!cache->ro); -{ - __alloc_chunk_for_shrink(root, group, 1); - set_block_group_readonly(group); - return 0; + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + cache->ro = 0; + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); } /* @@ -7275,6 +8397,12 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; + struct btrfs_trans_handle *trans; + u64 min_free; + u64 dev_min = 1; + u64 dev_nr = 0; + u64 target; + int index; int full = 0; int ret = 0; @@ -7284,8 +8412,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) if (!block_group) return -1; + min_free = btrfs_block_group_used(&block_group->item); + /* no bytes used, we're good */ - if (!btrfs_block_group_used(&block_group->item)) + if (!min_free) goto out; space_info = block_group->space_info; @@ -7301,10 +8431,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * all of the extents from this block group. If we can, we're good */ if ((space_info->total_bytes != block_group->key.offset) && - (space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - btrfs_block_group_used(&block_group->item) < - space_info->total_bytes)) { + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + min_free < space_info->total_bytes)) { spin_unlock(&space_info->lock); goto out; } @@ -7313,32 +8442,78 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) /* * ok we don't have enough space, but maybe we have free space on our * devices to allocate new chunks for relocation, so loop through our - * alloc devices and guess if we have enough space. However, if we - * were marked as full, then we know there aren't enough chunks, and we - * can just return. + * alloc devices and guess if we have enough space. if this block + * group is going to be restriped, run checks against the target + * profile instead of the current one. */ ret = -1; - if (full) + + /* + * index: + * 0: raid10 + * 1: raid1 + * 2: dup + * 3: raid0 + * 4: single + */ + target = get_restripe_target(root->fs_info, block_group->flags); + if (target) { + index = __get_raid_index(extended_to_chunk(target)); + } else { + /* + * this is just a balance, so if we were marked as full + * we know there is no space for a new chunk + */ + if (full) + goto out; + + index = get_block_group_index(block_group); + } + + if (index == BTRFS_RAID_RAID10) { + dev_min = 4; + /* Divide by 2 */ + min_free >>= 1; + } else if (index == BTRFS_RAID_RAID1) { + dev_min = 2; + } else if (index == BTRFS_RAID_DUP) { + /* Multiply by 2 */ + min_free <<= 1; + } else if (index == BTRFS_RAID_RAID0) { + dev_min = fs_devices->rw_devices; + do_div(min_free, dev_min); + } + + /* We need to do this so that we can look at pending chunks */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto out; + } mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 min_free = btrfs_block_group_used(&block_group->item); - u64 dev_offset, max_avail; + u64 dev_offset; /* * check to make sure we can actually find a chunk with enough * space to fit our block group in. */ - if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(NULL, device, min_free, - &dev_offset, &max_avail); + if (device->total_bytes > device->bytes_used + min_free && + !device->is_tgtdev_for_dev_replace) { + ret = find_free_dev_extent(trans, device, min_free, + &dev_offset, NULL); if (!ret) + dev_nr++; + + if (dev_nr >= dev_min) break; + ret = -1; } } mutex_unlock(&root->fs_info->chunk_mutex); + btrfs_end_transaction(trans, root); out: btrfs_put_block_group(block_group); return ret; @@ -7380,6 +8555,40 @@ out: return ret; } +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + u64 last = 0; + + while (1) { + struct inode *inode; + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { + spin_lock(&block_group->lock); + if (block_group->iref) + break; + spin_unlock(&block_group->lock); + block_group = next_block_group(info->tree_root, + block_group); + } + if (!block_group) { + if (last == 0) + break; + last = 0; + continue; + } + + inode = block_group->inode; + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + last = block_group->key.objectid + block_group->key.offset; + btrfs_put_block_group(block_group); + } +} + int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; @@ -7387,14 +8596,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - down_write(&info->extent_commit_sem); + down_write(&info->commit_root_sem); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); put_caching_control(caching_ctl); } - up_write(&info->extent_commit_sem); + up_write(&info->commit_root_sem); spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { @@ -7411,6 +8620,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. + */ + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) + free_excluded_extents(info->extent_root, block_group); + btrfs_remove_free_space_cache(block_group); btrfs_put_block_group(block_group); @@ -7426,17 +8643,108 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) */ synchronize_rcu(); - while(!list_empty(&info->space_info)) { + release_global_block_rsv(info); + + while (!list_empty(&info->space_info)) { + int i; + space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - + if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { + if (WARN_ON(space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0)) { + dump_space_info(space_info, 0, 0); + } + } list_del(&space_info->list); - kfree(space_info); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); } return 0; } +static void __link_block_group(struct btrfs_space_info *space_info, + struct btrfs_block_group_cache *cache) +{ + int index = get_block_group_index(cache); + bool first = false; + + down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) + first = true; + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); + + if (first) { + struct raid_kobject *rkobj; + int ret; + + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) + goto out_err; + rkobj->raid_type = index; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); + if (ret) { + kobject_put(&rkobj->kobj); + goto out_err; + } + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + + return; +out_err: + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); +} + +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + start); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + btrfs_init_free_space_ctl(cache); + + return cache; +} + int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -7447,6 +8755,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; + int need_clear = 0; + u64 cache_gen; root = info->extent_root; key.objectid = 0; @@ -7455,48 +8765,71 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; + + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (btrfs_test_opt(root, SPACE_CACHE) && + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(root, CLEAR_CACHE)) + need_clear = 1; while (1) { ret = find_first_block_group(root, path, &key); - if (ret > 0) { - ret = 0; - goto error; - } + if (ret > 0) + break; if (ret != 0) goto error; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + + cache = btrfs_create_block_group_cache(root, found_key.objectid, + found_key.offset); if (!cache) { ret = -ENOMEM; - break; + goto error; } - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - spin_lock_init(&cache->tree_lock); - cache->fs_info = info; - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - - /* - * we only want to have 32k of ram per block group for keeping - * track of free space, and if we pass 1/2 of that we want to - * start converting things over to using bitmaps - */ - cache->extents_thresh = ((1024 * 32) / 2) / - sizeof(struct btrfs_free_space); + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ + cache->disk_cache_state = BTRFS_DC_CLEAR; + if (btrfs_test_opt(root, SPACE_CACHE)) + cache->dirty = 1; + } read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); - memcpy(&cache->key, &found_key, sizeof(found_key)); + cache->flags = btrfs_block_group_flags(&cache->item); key.objectid = found_key.objectid + found_key.offset; - btrfs_release_path(root, path); - cache->flags = btrfs_block_group_flags(&cache->item); - cache->sectorsize = root->sectorsize; + btrfs_release_path(path); + + /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. + */ + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + btrfs_put_block_group(cache); + goto error; + } /* * check for two cases, either we are full, and therefore @@ -7506,12 +8839,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; free_excluded_extents(root, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, root->fs_info, @@ -7521,32 +8852,99 @@ int btrfs_read_block_groups(struct btrfs_root *root) free_excluded_extents(root, cache); } + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + goto error; + } + ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), &space_info); - BUG_ON(ret); + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + btrfs_put_block_group(cache); + goto error; + } + cache->space_info = space_info; spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&space_info->groups_sem); - list_add_tail(&cache->list, &space_info->block_groups); - up_write(&space_info->groups_sem); - - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); + __link_block_group(space_info, cache); set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_readonly(cache); + set_block_group_ro(cache, 1); } + + list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { + if (!(get_alloc_profile(root, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) + set_block_group_ro(cache, 1); + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) + set_block_group_ro(cache, 1); + } + + init_global_block_rsv(info); ret = 0; error: btrfs_free_path(path); return ret; } +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, + new_bg_list) { + list_del_init(&block_group->new_bg_list); + + if (ret) + continue; + + spin_lock(&block_group->lock); + memcpy(&item, &block_group->item, sizeof(item)); + memcpy(&key, &block_group->key, sizeof(key)); + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, extent_root, &key, &item, + sizeof(item)); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); + ret = btrfs_finish_chunk_alloc(trans, extent_root, + key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); + } +} + int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, @@ -7558,76 +8956,96 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) return -ENOMEM; - cache->key.objectid = chunk_offset; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - - /* - * we only want to have 32k of ram per block group for keeping track - * of free space, and if we pass 1/2 of that we want to start - * converting things over to using bitmaps - */ - cache->extents_thresh = ((1024 * 32) / 2) / - sizeof(struct btrfs_free_space); - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - spin_lock_init(&cache->tree_lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); - cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + btrfs_put_block_group(cache); + return ret; + } add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); free_excluded_extents(root, cache); + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); - BUG_ON(ret); + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + return ret; + } + update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&cache->space_info->groups_sem); - list_add_tail(&cache->list, &cache->space_info->block_groups); - up_write(&cache->space_info->groups_sem); + __link_block_group(cache->space_info, cache); - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); - - ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, - sizeof(cache->item)); - BUG_ON(ret); + list_add_tail(&cache->new_bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); return 0; } +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start) { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; struct btrfs_free_cluster *cluster; + struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_key key; + struct inode *inode; + struct kobject *kobj = NULL; int ret; + int index; + int factor; root = root->fs_info->extent_root; @@ -7635,7 +9053,20 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, BUG_ON(!block_group); BUG_ON(!block_group->ro); + /* + * Free the reserved super bytes from this block group before + * remove it. + */ + free_excluded_extents(root, block_group); + memcpy(&key, &block_group->key, sizeof(key)); + index = get_block_group_index(block_group); + if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; /* make sure this block group isn't part of an allocation cluster */ cluster = &root->fs_info->data_alloc_cluster; @@ -7653,11 +9084,55 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cluster->refill_lock); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto out; + } + + inode = lookup_free_space_inode(tree_root, block_group, path); + if (!IS_ERR(inode)) { + ret = btrfs_orphan_add(trans, inode); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for our lookup ref */ + btrfs_add_delayed_iput(inode); + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) + btrfs_release_path(path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + btrfs_release_path(path); + } spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + + if (root->fs_info->first_logical_byte == block_group->key.objectid) + root->fs_info->first_logical_byte = (u64)-1; spin_unlock(&root->fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); @@ -7666,7 +9141,16 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) { + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; + clear_avail_alloc_bits(root->fs_info, block_group->flags); + } up_write(&block_group->space_info->groups_sem); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); @@ -7676,8 +9160,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; + block_group->space_info->disk_total -= block_group->key.offset * factor; spin_unlock(&block_group->space_info->lock); + memcpy(&key, &block_group->key, sizeof(key)); + btrfs_clear_space_info_full(root->fs_info); btrfs_put_block_group(block_group); @@ -7694,3 +9181,149 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; + int ret; + + disk_super = fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return 1; + + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + if (ret) + goto out; + + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } +out: + return ret; +} + +int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + return unpin_extent_range(root, start, end); +} + +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) +{ + return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); +} + +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 group_trimmed; + u64 start; + u64 end; + u64 trimmed = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + int ret = 0; + + /* + * try to trim all FS space, our block group may start from non-zero. + */ + if (range->len == total_bytes) + cache = btrfs_lookup_first_block_group(fs_info, range->start); + else + cache = btrfs_lookup_block_group(fs_info, range->start); + + while (cache) { + if (cache->key.objectid >= (range->start + range->len)) { + btrfs_put_block_group(cache); + break; + } + + start = max(range->start, cache->key.objectid); + end = min(range->start + range->len, + cache->key.objectid + cache->key.offset); + + if (end - start >= range->minlen) { + if (!block_group_cache_done(cache)) { + ret = cache_block_group(cache, 0); + if (ret) { + btrfs_put_block_group(cache); + break; + } + ret = wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } + } + ret = btrfs_trim_block_group(cache, + &group_trimmed, + start, + end, + range->minlen); + + trimmed += group_trimmed; + if (ret) { + btrfs_put_block_group(cache); + break; + } + } + + cache = next_block_group(fs_info->tree_root, cache); + } + + range->len = trimmed; + return ret; +} + +/* + * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), + * they are used to prevent the some tasks writing data into the page cache + * by nocow before the subvolume is snapshoted, but flush the data into + * the disk after the snapshot creation. + */ +void btrfs_end_nocow_write(struct btrfs_root *root) +{ + percpu_counter_dec(&root->subv_writers->counter); + /* + * Make sure counter is updated before we wake up + * waiters. + */ + smp_mb(); + if (waitqueue_active(&root->subv_writers->wait)) + wake_up(&root->subv_writers->wait); +} + +int btrfs_start_nocow_write(struct btrfs_root *root) +{ + if (unlikely(atomic_read(&root->will_be_snapshoted))) + return 0; + + percpu_counter_inc(&root->subv_writers->counter); + /* + * Make sure counter is updated before we check for snapshot creation. + */ + smp_mb(); + if (unlikely(atomic_read(&root->will_be_snapshoted))) { + btrfs_end_nocow_write(root); + return 0; + } + return 1; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2d03684fab..a389820d158 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4,27 +4,103 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> -#include <linux/module.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/prefetch.h> +#include <linux/cleancache.h> #include "extent_io.h" #include "extent_map.h" -#include "compat.h" #include "ctree.h" #include "btrfs_inode.h" +#include "volumes.h" +#include "check-integrity.h" +#include "locking.h" +#include "rcu-string.h" +#include "backref.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; +static struct bio_set *btrfs_bioset; +#ifdef CONFIG_BTRFS_DEBUG static LIST_HEAD(buffers); static LIST_HEAD(states); -#define LEAK_DEBUG 0 -#if LEAK_DEBUG static DEFINE_SPINLOCK(leak_lock); + +static inline +void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_add(new, head); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_del(struct list_head *entry) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_del(entry); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_check(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "BTRFS: state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + state->start, state->end, state->state, state->tree, + atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu " + "refs %d\n", + eb->start, eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } +} + +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct extent_io_tree *tree, u64 start, u64 end) +{ + struct inode *inode; + u64 isize; + + if (!tree->mapping) + return; + + inode = tree->mapping->host; + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + printk_ratelimited(KERN_DEBUG + "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + caller, btrfs_ino(inode), isize, start, end); + } +} +#else +#define btrfs_leak_debug_add(new, head) do {} while (0) +#define btrfs_leak_debug_del(entry) do {} while (0) +#define btrfs_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif #define BUFFER_LRU_MAX 64 @@ -39,6 +115,7 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; + unsigned long bio_flags; /* tells writepage not to lock the state bits for this range * it still does the unlocking @@ -49,75 +126,83 @@ struct extent_page_data { unsigned int sync_io:1; }; +static noinline void flush_write_bio(void *data); +static inline struct btrfs_fs_info * +tree_fs_info(struct extent_io_tree *tree) +{ + if (!tree->mapping) + return NULL; + return btrfs_sb(tree->mapping->host->i_sb); +} + int __init extent_io_init(void) { - extent_state_cache = kmem_cache_create("extent_state", + extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = kmem_cache_create("extent_buffers", + extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; + + btrfs_bioset = bioset_create(BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio)); + if (!btrfs_bioset) + goto free_buffer_cache; + + if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + goto free_bioset; + return 0; +free_bioset: + bioset_free(btrfs_bioset); + btrfs_bioset = NULL; + +free_buffer_cache: + kmem_cache_destroy(extent_buffer_cache); + extent_buffer_cache = NULL; + free_state_cache: kmem_cache_destroy(extent_state_cache); + extent_state_cache = NULL; return -ENOMEM; } void extent_io_exit(void) { - struct extent_state *state; - struct extent_buffer *eb; + btrfs_leak_debug_check(); - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "btrfs state leak: start %llu end %llu " - "state %lu in tree %p refs %d\n", - (unsigned long long)state->start, - (unsigned long long)state->end, - state->state, state->tree, atomic_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - - } - - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk(KERN_ERR "btrfs buffer leak start %llu len %lu " - "refs %d\n", (unsigned long long)eb->start, - eb->len, atomic_read(&eb->refs)); - list_del(&eb->leak_list); - kmem_cache_free(extent_buffer_cache, eb); - } + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); if (extent_state_cache) kmem_cache_destroy(extent_state_cache); if (extent_buffer_cache) kmem_cache_destroy(extent_buffer_cache); + if (btrfs_bioset) + bioset_free(btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping, gfp_t mask) + struct address_space *mapping) { tree->state = RB_ROOT; - tree->buffer = RB_ROOT; tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - spin_lock_init(&tree->buffer_lock); tree->mapping = mapping; } static struct extent_state *alloc_extent_state(gfp_t mask) { struct extent_state *state; -#if LEAK_DEBUG - unsigned long flags; -#endif state = kmem_cache_alloc(extent_state_cache, mask); if (!state) @@ -125,41 +210,43 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->private = 0; state->tree = NULL; -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&state->leak_list, &states); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_add(&state->leak_list, &states); atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); return state; } -static void free_extent_state(struct extent_state *state) +void free_extent_state(struct extent_state *state) { if (!state) return; if (atomic_dec_and_test(&state->refs)) { -#if LEAK_DEBUG - unsigned long flags; -#endif WARN_ON(state->tree); -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_del(&state->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_del(&state->leak_list); + trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) +static struct rb_node *tree_insert(struct rb_root *root, + struct rb_node *search_start, + u64 offset, + struct rb_node *node, + struct rb_node ***p_in, + struct rb_node **parent_in) { - struct rb_node **p = &root->rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct tree_entry *entry; + if (p_in && parent_in) { + p = *p_in; + parent = *parent_in; + goto do_insert; + } + + p = search_start ? &search_start : &root->rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -172,36 +259,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return parent; } - entry = rb_entry(node, struct tree_entry, rb_node); +do_insert: rb_link_node(node, parent, p); rb_insert_color(node, root); return NULL; } static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) + struct rb_node **prev_ret, + struct rb_node **next_ret, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node *n = root->rb_node; + struct rb_node **n = &root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; struct tree_entry *entry; struct tree_entry *prev_entry = NULL; - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - prev = n; + while (*n) { + prev = *n; + entry = rb_entry(prev, struct tree_entry, rb_node); prev_entry = entry; if (offset < entry->start) - n = n->rb_left; + n = &(*n)->rb_left; else if (offset > entry->end) - n = n->rb_right; + n = &(*n)->rb_right; else - return n; + return *n; } + if (p_ret) + *p_ret = n; + if (parent_ret) + *parent_ret = prev; + if (prev_ret) { orig_prev = prev; while (prev && offset > prev_entry->end) { @@ -223,60 +317,25 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, return NULL; } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +static inline struct rb_node * +tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_node *prev = NULL; struct rb_node *ret; - ret = __etree_search(tree, offset, &prev, NULL); + ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); if (!ret) return prev; return ret; } -static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, - u64 offset, struct rb_node *node) +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) { - struct rb_root *root = &tree->buffer; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct extent_buffer *eb; - - while (*p) { - parent = *p; - eb = rb_entry(parent, struct extent_buffer, rb_node); - - if (offset < eb->start) - p = &(*p)->rb_left; - else if (offset > eb->start) - p = &(*p)->rb_right; - else - return eb; - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct extent_buffer *buffer_search(struct extent_io_tree *tree, - u64 offset) -{ - struct rb_root *root = &tree->buffer; - struct rb_node *n = root->rb_node; - struct extent_buffer *eb; - - while (n) { - eb = rb_entry(n, struct extent_buffer, rb_node); - if (offset < eb->start) - n = n->rb_left; - else if (offset > eb->start) - n = n->rb_right; - else - return eb; - } - return NULL; + return tree_search_for_insert(tree, offset, NULL, NULL); } static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, @@ -296,14 +355,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, * * This should be called with the tree lock held. */ -static int merge_state(struct extent_io_tree *tree, - struct extent_state *state) +static void merge_state(struct extent_io_tree *tree, + struct extent_state *state) { struct extent_state *other; struct rb_node *other_node; if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) - return 0; + return; other_node = rb_prev(&state->rb_node); if (other_node) { @@ -323,38 +382,31 @@ static int merge_state(struct extent_io_tree *tree, if (other->start == state->end + 1 && other->state == state->state) { merge_cb(tree, state, other); - other->start = state->start; - state->tree = NULL; - rb_erase(&state->rb_node, &tree->state); - free_extent_state(state); - state = NULL; + state->end = other->end; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); } } - - return 0; } -static int set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) +static void set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, unsigned long *bits) { - if (tree->ops && tree->ops->set_bit_hook) { - return tree->ops->set_bit_hook(tree->mapping->host, - state->start, state->end, - state->state, bits); - } - - return 0; + if (tree->ops && tree->ops->set_bit_hook) + tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, unsigned long *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, unsigned long *bits); + /* * insert an extent_state struct into the tree. 'bits' are set on the * struct before it is inserted. @@ -367,35 +419,27 @@ static void clear_state_cb(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - int bits) + struct rb_node ***p, + struct rb_node **parent, + unsigned long *bits) { struct rb_node *node; - int ret; - if (end < start) { - printk(KERN_ERR "btrfs end < start %llu %llu\n", - (unsigned long long)end, - (unsigned long long)start); - WARN_ON(1); - } + if (end < start) + WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", + end, start); state->start = start; state->end = end; - ret = set_state_cb(tree, state, bits); - if (ret) - return ret; - if (bits & EXTENT_DIRTY) - tree->dirty_bytes += end - start + 1; - state->state |= bits; - node = tree_insert(&tree->state, end, &state->rb_node); + set_state_bits(tree, state, bits); + + node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - printk(KERN_ERR "btrfs found node %llu %llu on insert of " - "%llu %llu\n", (unsigned long long)found->start, - (unsigned long long)found->end, - (unsigned long long)start, (unsigned long long)end); - free_extent_state(state); + printk(KERN_ERR "BTRFS: found node %llu %llu on insert of " + "%llu %llu\n", + found->start, found->end, start, end); return -EEXIST; } state->tree = tree; @@ -403,13 +447,11 @@ static int insert_state(struct extent_io_tree *tree, return 0; } -static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, +static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, u64 split) { if (tree->ops && tree->ops->split_extent_hook) - return tree->ops->split_extent_hook(tree->mapping->host, - orig, split); - return 0; + tree->ops->split_extent_hook(tree->mapping->host, orig, split); } /* @@ -438,7 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, + &prealloc->rb_node, NULL, NULL); if (node) { free_extent_state(prealloc); return -EEXIST; @@ -447,22 +490,30 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, return 0; } +static struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1), or - * forcibly remove the state from the tree (delete == 1). + * it will optionally wake up any one waiting on this state (wake == 1). * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree */ -static int clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, int bits, int wake, - int delete) +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long *bits, int wake) { - int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; - int ret = state->state & bits_to_clear; + struct extent_state *next; + unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; - if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; @@ -471,9 +522,9 @@ static int clear_state_bit(struct extent_io_tree *tree, state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); - if (delete || state->state == 0) { + if (state->state == 0) { + next = next_state(state); if (state->tree) { - clear_state_cb(tree, state, state->state); rb_erase(&state->rb_node, &tree->state); state->tree = NULL; free_extent_state(state); @@ -482,8 +533,25 @@ static int clear_state_bit(struct extent_io_tree *tree, } } else { merge_state(tree, state); + next = next_state(state); } - return ret; + return next; +} + +static struct extent_state * +alloc_extent_state_atomic(struct extent_state *prealloc) +{ + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + + return prealloc; +} + +static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree_fs_info(tree), err, "Locking error: " + "Extent tree was modified by another " + "thread while locked."); } /* @@ -496,24 +564,30 @@ static int clear_state_bit(struct extent_io_tree *tree, * * the range [start, end] is inclusive. * - * This takes the tree lock, and returns < 0 on error, > 0 if any of the - * bits were already set, or zero if none of the bits were already set. + * This takes the tree lock, and returns 0 on success and < 0 on error. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, + unsigned long bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *cached; struct extent_state *prealloc = NULL; - struct rb_node *next_node; struct rb_node *node; u64 last_end; int err; - int set = 0; int clear = 0; + btrfs_debug_check_extent_io_range(tree, start, end); + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + + if (delete) + bits |= ~EXTENT_CTLBITS; + bits |= EXTENT_FIRST_DELALLOC; + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: @@ -532,7 +606,8 @@ again: cached_state = NULL; } - if (cached && cached->tree && cached->start == start) { + if (cached && cached->tree && cached->start <= start && + cached->end > start) { if (clear) atomic_dec(&cached->refs); state = cached; @@ -555,6 +630,12 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; + /* the state doesn't have the wanted bits, go ahead */ + if (!(state->state & bits)) { + state = next_state(state); + goto next; + } + /* * | ---- desired range ---- | * | state | or @@ -572,19 +653,18 @@ hit_next: */ if (state->start < start) { - if (!prealloc) - prealloc = alloc_extent_state(GFP_ATOMIC); + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, wake, - delete); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; + state = clear_state_bit(tree, state, &bits, wake); + goto next; } goto search_again; } @@ -595,34 +675,28 @@ hit_next: * on the first half */ if (state->start <= end && state->end > end) { - if (!prealloc) - prealloc = alloc_extent_state(GFP_ATOMIC); + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, wake, delete); + clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; } - if (state->end < end && prealloc && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - - set |= clear_state_bit(tree, state, bits, wake, delete); + state = clear_state_bit(tree, state, &bits, wake); +next: if (last_end == (u64)-1) goto out; start = last_end + 1; - if (start <= end && next_node) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + if (start <= end && state && !need_resched()) + goto hit_next; goto search_again; out: @@ -630,7 +704,7 @@ out: if (prealloc) free_extent_state(prealloc); - return set; + return 0; search_again: if (start > end) @@ -641,8 +715,8 @@ search_again: goto again; } -static int wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) __releases(tree->lock) __acquires(tree->lock) { @@ -652,7 +726,6 @@ static int wait_on_state(struct extent_io_tree *tree, schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - return 0; } /* @@ -660,11 +733,14 @@ static int wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits) { struct extent_state *state; struct rb_node *node; + btrfs_debug_check_extent_io_range(tree, start, end); + spin_lock(&tree->lock); again: while (1) { @@ -673,6 +749,7 @@ again: * our range starts */ node = tree_search(tree, start); +process_node: if (!node) break; @@ -693,34 +770,27 @@ again: if (start > end) break; - if (need_resched()) { - spin_unlock(&tree->lock); - cond_resched(); - spin_lock(&tree->lock); + if (!cond_resched_lock(&tree->lock)) { + node = rb_next(node); + goto process_node; } } out: spin_unlock(&tree->lock); - return 0; } -static int set_state_bits(struct extent_io_tree *tree, +static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - int bits) + unsigned long *bits) { - int ret; - - ret = set_state_cb(tree, state, bits); - if (ret) - return ret; + unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; - if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + set_state_cb(tree, state, bits); + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - state->state |= bits; - - return 0; + state->state |= bits_to_set; } static void cache_state(struct extent_state *state, @@ -745,29 +815,35 @@ static void cache_state(struct extent_state *state, * [start, end] is inclusive This takes the tree lock. */ -static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, - gfp_t mask) +static int __must_check +__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits, unsigned long exclusive_bits, + u64 *failed_start, struct extent_state **cached_state, + gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; + btrfs_debug_check_extent_io_range(tree, start, end); + + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; + BUG_ON(!prealloc); } spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; - if (state->start == start && state->tree) { + if (state->start <= start && state->end > start && + state->tree) { node = &state->rb_node; goto hit_next; } @@ -776,11 +852,17 @@ again: * this search will find all the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { - err = insert_state(tree, prealloc, start, end, bits); + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); + if (err) + extent_io_tree_panic(tree, err); + + cache_state(prealloc, cached_state); prealloc = NULL; - BUG_ON(err == -EEXIST); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -795,32 +877,22 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - struct rb_node *next_node; if (state->state & exclusive_bits) { *failed_start = state->start; err = -EEXIST; goto out; } - err = set_state_bits(tree, state, bits); - if (err) - goto out; - + set_state_bits(tree, state, &bits); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; - start = last_end + 1; - if (start < end && prealloc && !need_resched()) { - next_node = rb_next(node); - if (next_node) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } - } + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; goto search_again; } @@ -846,20 +918,27 @@ hit_next: err = -EEXIST; goto out; } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; if (state->end <= end) { - err = set_state_bits(tree, state, bits); - if (err) - goto out; + set_state_bits(tree, state, &bits); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; } goto search_again; } @@ -876,13 +955,19 @@ hit_next: this_end = end; else this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. + */ err = insert_state(tree, prealloc, start, this_end, - bits); - BUG_ON(err == -EEXIST); - if (err) { - prealloc = NULL; - goto out; - } + NULL, NULL, &bits); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; @@ -900,16 +985,233 @@ hit_next: err = -EEXIST; goto out; } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); - err = set_state_bits(tree, prealloc, bits); - if (err) { - prealloc = NULL; + set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits, u64 * failed_start, + struct extent_state **cached_state, gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, 0, failed_start, + cached_state, mask); +} + + +/** + * convert_extent_bit - convert all bits in a given range from one bit to + * another + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache + * @mask: the allocation mask + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits, unsigned long clear_bits, + struct extent_state **cached_state, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; + int err = 0; + u64 last_start; + u64 last_end; + + btrfs_debug_check_extent_io_range(tree, start, end); + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + state->tree) { + node = &state->rb_node; + goto hit_next; + } + } + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search_for_insert(tree, start, &p, &parent); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; goto out; } + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + set_state_bits(tree, state, &bits); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, &clear_bits, 0); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, &bits); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, &clear_bits, 0); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. + */ + err = insert_state(tree, prealloc, start, this_end, + NULL, NULL, &bits); + if (err) + extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); - merge_state(tree, prealloc); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); + clear_state_bit(tree, prealloc, &clear_bits, 0); prealloc = NULL; goto out; } @@ -936,19 +1238,19 @@ search_again: int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, NULL, mask); } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) + unsigned long bits, gfp_t mask) { - return set_extent_bit(tree, start, end, bits, 0, NULL, + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) + unsigned long bits, gfp_t mask) { return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } @@ -957,8 +1259,16 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, - 0, NULL, cached_state, mask); + EXTENT_DELALLOC | EXTENT_UPTODATE, + NULL, cached_state, mask); +} + +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, @@ -966,85 +1276,69 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, - NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); } int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); } -static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, - NULL, mask); -} - int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) + struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + cached_state, mask); } -static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, - gfp_t mask) +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) { return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, cached_state, mask); } -int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) -{ - return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); -} - /* * either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached_state, gfp_t mask) + unsigned long bits, struct extent_state **cached_state) { int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, - EXTENT_LOCKED, &failed_start, - cached_state, mask); - if (err == -EEXIST && (mask & __GFP_WAIT)) { + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS); + if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; - } else { + } else break; - } WARN_ON(start > end); } return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { - return lock_extent_bits(tree, start, end, 0, NULL, mask); + return lock_extent_bits(tree, start, end, 0, NULL); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; u64 failed_start; - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, mask); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, mask); + EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); return 0; } return 1; @@ -1057,25 +1351,38 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, mask); } -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) { return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - mask); + GFP_NOFS); } -/* - * helper function to set pages and extents in the tree dirty - */ -int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; struct page *page; while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); __set_page_dirty_nobuffers(page); page_cache_release(page); index++; @@ -1094,7 +1401,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) while (index <= end_index) { page = find_get_page(tree->mapping, index); - BUG_ON(!page); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ set_page_writeback(page); page_cache_release(page); index++; @@ -1102,21 +1409,17 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) return 0; } -/* - * find the first offset in the io tree with 'bits' set. zero is - * returned if we find something, and *start_ret and *end_ret are - * set to reflect the state struct that was found. - * - * If nothing was found, 1 is returned, < 0 on error +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) +static struct extent_state * +find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, unsigned long bits) { struct rb_node *node; struct extent_state *state; - int ret = 1; - spin_lock(&tree->lock); /* * this search will find all the extents that end after * our range starts. @@ -1127,50 +1430,63 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, while (1) { state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) { - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - break; - } + if (state->end >= start && (state->state & bits)) + return state; + node = rb_next(node); if (!node) break; } out: - spin_unlock(&tree->lock); - return ret; + return NULL; } -/* find the first state struct with 'bits' set after 'start', and - * return it. tree->lock must be held. NULL will returned if - * nothing was found after 'start' +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned. If found something, return 0. */ -struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, int bits) +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned long bits, + struct extent_state **cached_state) { - struct rb_node *node; struct extent_state *state; + struct rb_node *n; + int ret = 1; - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) - return state; + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && state->tree) { + n = rb_next(&state->rb_node); + while (n) { + state = rb_entry(n, struct extent_state, + rb_node); + if (state->state & bits) + goto got_it; + n = rb_next(n); + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } - node = rb_next(node); - if (!node) - break; + state = find_first_extent_bit_state(tree, start, bits); +got_it: + if (state) { + cache_state(state, cached_state); + *start_ret = state->start; + *end_ret = state->end; + ret = 0; } out: - return NULL; + spin_unlock(&tree->lock); + return ret; } /* @@ -1222,20 +1538,20 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *end = state->end; cur_start = state->end + 1; node = rb_next(node); - if (!node) - break; total_bytes += state->end - state->start + 1; if (total_bytes >= max_bytes) break; + if (!node) + break; } out: spin_unlock(&tree->lock); return found; } -static noinline int __unlock_for_delalloc(struct inode *inode, - struct page *locked_page, - u64 start, u64 end) +static noinline void __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) { int ret; struct page *pages[16]; @@ -1245,7 +1561,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode, int i; if (index == locked_page->index && end_index == index) - return 0; + return; while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -1260,7 +1576,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode, index += ret; cond_resched(); } - return 0; } static noinline int lock_delalloc_pages(struct inode *inode, @@ -1331,11 +1646,10 @@ done: * * 1 is returned if we find something, 0 if nothing was in the tree */ -static noinline u64 find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, - u64 *start, u64 *end, - u64 max_bytes) +STATIC u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes) { u64 delalloc_start; u64 delalloc_end; @@ -1354,7 +1668,7 @@ again: *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return found; + return 0; } /* @@ -1367,10 +1681,9 @@ again: /* * make sure to limit the number of pages we try to lock down - * if we're looping. */ - if (delalloc_end + 1 - delalloc_start > max_bytes && loops) - delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + if (delalloc_end + 1 - delalloc_start > max_bytes) + delalloc_end = delalloc_start + max_bytes - 1; /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, @@ -1380,9 +1693,9 @@ again: * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); + cached_state = NULL; if (!loops) { - unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); - max_bytes = PAGE_CACHE_SIZE - offset; + max_bytes = PAGE_CACHE_SIZE; loops = 1; goto again; } else { @@ -1390,11 +1703,10 @@ again: goto out_failed; } } - BUG_ON(ret); + BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, - 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -1414,34 +1726,21 @@ out_failed: return found; } -int extent_clear_unlock_delalloc(struct inode *inode, - struct extent_io_tree *tree, - u64 start, u64 end, struct page *locked_page, - unsigned long op) +int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + struct page *locked_page, + unsigned long clear_bits, + unsigned long page_ops) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; int ret; struct page *pages[16]; unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; - int clear_bits = 0; - - if (op & EXTENT_CLEAR_UNLOCK) - clear_bits |= EXTENT_LOCKED; - if (op & EXTENT_CLEAR_DIRTY) - clear_bits |= EXTENT_DIRTY; - - if (op & EXTENT_CLEAR_DELALLOC) - clear_bits |= EXTENT_DELALLOC; - - if (op & EXTENT_CLEAR_ACCOUNTING) - clear_bits |= EXTENT_DO_ACCOUNTING; clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | - EXTENT_SET_PRIVATE2))) + if (page_ops == 0) return 0; while (nr_pages > 0) { @@ -1450,20 +1749,20 @@ int extent_clear_unlock_delalloc(struct inode *inode, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { - if (op & EXTENT_SET_PRIVATE2) + if (page_ops & PAGE_SET_PRIVATE2) SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (op & EXTENT_CLEAR_DIRTY) + if (page_ops & PAGE_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (op & EXTENT_SET_WRITEBACK) + if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); - if (op & EXTENT_END_WRITEBACK) + if (page_ops & PAGE_END_WRITEBACK) end_page_writeback(pages[i]); - if (op & EXTENT_CLEAR_UNLOCK_PAGE) + if (page_ops & PAGE_UNLOCK) unlock_page(pages[i]); page_cache_release(pages[i]); } @@ -1481,18 +1780,17 @@ int extent_clear_unlock_delalloc(struct inode *inode, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits) + unsigned long bits, int contig) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; u64 total_bytes = 0; + u64 last = 0; int found = 0; - if (search_end <= cur_start) { - WARN_ON(1); + if (WARN_ON(search_end <= cur_start)) return 0; - } spin_lock(&tree->lock); if (cur_start == 0 && bits == EXTENT_DIRTY) { @@ -1511,15 +1809,20 @@ u64 count_range_bits(struct extent_io_tree *tree, state = rb_entry(node, struct extent_state, rb_node); if (state->start > search_end) break; - if (state->end >= cur_start && (state->state & bits)) { + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { total_bytes += min(search_end, state->end) + 1 - max(cur_start, state->start); if (total_bytes >= max_bytes) break; if (!found) { - *start = state->start; + *start = max(cur_start, state->start); found = 1; } + last = state->end; + } else if (contig && found) { + break; } node = rb_next(node); if (!node) @@ -1534,7 +1837,7 @@ out: * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. */ -int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) { struct rb_node *node; struct extent_state *state; @@ -1595,14 +1898,15 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled, struct extent_state *cached) + unsigned long bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; int bitset = 0; spin_lock(&tree->lock); - if (cached && cached->tree && cached->start == start) + if (cached && cached->tree && cached->start <= start && + cached->end > start) node = &cached->rb_node; else node = tree_search(tree, start); @@ -1647,43 +1951,429 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, * helper function to set a given page up to date if all the * extents in the tree for that page are up to date */ -static int check_page_uptodate(struct extent_io_tree *tree, - struct page *page) +static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); - return 0; } /* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. */ -static int check_page_locked(struct extent_io_tree *tree, - struct page *page) +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +static int free_io_failure(struct inode *inode, struct io_failure_record *rec, + int did_repair) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) - unlock_page(page); - return 0; + int ret; + int err = 0; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + + set_state_private(failure_tree, rec->start, 0); + ret = clear_extent_bits(failure_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret) + err = ret; + + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; + + kfree(rec); + return err; } /* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback + * this bypasses the standard btrfs submit functions deliberately, as + * the standard behavior is to write all copies in a raid setup. here we only + * want to write the one bad copy. so we do the mapping for ourselves and issue + * submit_bio directly. + * to avoid any synchronization issues, wait for the data after writing, which + * actually prevents the read that triggered the error from finishing. + * currently, there can be no more than two copies of every data bit. thus, + * exactly one rewrite is required. */ -static int check_page_writeback(struct extent_io_tree *tree, - struct page *page) +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num) { - end_page_writeback(page); + struct bio *bio; + struct btrfs_device *dev; + u64 map_length = 0; + u64 sector; + struct btrfs_bio *bbio = NULL; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + int ret; + + ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); + BUG_ON(!mirror_num); + + /* we can't repair anything in raid56 yet */ + if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) + return 0; + + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_iter.bi_size = 0; + map_length = length; + + ret = btrfs_map_block(fs_info, WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); + sector = bbio->stripes[mirror_num-1].physical >> 9; + bio->bi_iter.bi_sector = sector; + dev = bbio->stripes[mirror_num-1].dev; + kfree(bbio); + if (!dev || !dev->bdev || !dev->writeable) { + bio_put(bio); + return -EIO; + } + bio->bi_bdev = dev->bdev; + bio_add_page(bio, page, length, start - page_offset(page)); + + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { + /* try to remap that extent elsewhere? */ + bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + return -EIO; + } + + printk_ratelimited_in_rcu(KERN_INFO + "BTRFS: read error corrected: ino %lu off %llu " + "(dev %s sector %llu)\n", page->mapping->host->i_ino, + start, rcu_str_deref(dev->name), sector); + + bio_put(bio); return 0; } +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num) +{ + u64 start = eb->start; + unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, + start, p, mirror_num); + if (ret) + break; + start += PAGE_CACHE_SIZE; + } + + return ret; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int clean_io_failure(u64 start, struct page *page) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failrec; + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_state *state; + int num_copies; + int did_repair = 0; + int ret; + + private = 0; + ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0); + if (!ret) + return 0; + + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, + &private_failure); + if (ret) + return 0; + + failrec = (struct io_failure_record *)(unsigned long) private_failure; + BUG_ON(!failrec->this_mirror); + + if (failrec->in_validation) { + /* there was no real error, just free the record */ + pr_debug("clean_io_failure: freeing dummy error at %llu\n", + failrec->start); + did_repair = 1; + goto out; + } + if (fs_info->sb->s_flags & MS_RDONLY) + goto out; + + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + + if (state && state->start <= failrec->start && + state->end >= failrec->start + failrec->len - 1) { + num_copies = btrfs_num_copies(fs_info, failrec->logical, + failrec->len); + if (num_copies > 1) { + ret = repair_io_failure(fs_info, start, failrec->len, + failrec->logical, page, + failrec->failed_mirror); + did_repair = !ret; + } + ret = 0; + } + +out: + if (!ret) + ret = free_io_failure(inode, failrec, did_repair); + + return ret; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook). if other copies exist, read those and write back + * good data to the failed position. does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int failed_mirror) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + struct btrfs_io_bio *btrfs_failed_bio; + struct btrfs_io_bio *btrfs_bio; + int num_copies; + int ret; + int read_mode; + u64 logical; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return -EIO; + } + + if (em->start > start || em->start + em->len <= start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (!em) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " + "len=%llu\n", logical, start, failrec->len); + failrec->logical = logical; + free_extent_map(em); + + /* set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret >= 0) + ret = set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + /* set the bits in the inode's tree */ + if (ret >= 0) + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, + GFP_NOFS); + if (ret < 0) { + kfree(failrec); + return ret; + } + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + pr_debug("bio_readpage_error: (found) logical=%llu, " + "start=%llu, len=%llu, validation=%d\n", + failrec->logical, failrec->start, failrec->len, + failrec->in_validation); + /* + * when data can be on disk more than twice, add to failrec here + * (e.g. with a list for failed_mirror) to make + * clean_io_failure() clean all those errors at once. + */ + } + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + /* + * there are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + */ + if (failed_bio->bi_vcnt > 1) { + /* + * to fulfill b), we need to know the exact failing sectors, as + * we don't want to rewrite any more than the failed ones. thus, + * we need separate read requests for the failed bio + * + * if the following BUG_ON triggers, our validation request got + * merged. we need separate requests for our algorithm to work. + */ + BUG_ON(failrec->in_validation); + failrec->in_validation = 1; + failrec->this_mirror = failed_mirror; + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + } else { + /* + * we're ready to fulfill a) and b) alongside. get a good copy + * of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + if (failrec->in_validation) { + BUG_ON(failrec->this_mirror != failed_mirror); + failrec->in_validation = 0; + failrec->this_mirror = 0; + } + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + read_mode = READ_SYNC; + } + + if (failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) { + free_io_failure(inode, failrec, 0); + return -EIO; + } + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_iter.bi_sector = failrec->logical >> 9; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bio->bi_iter.bi_size = 0; + + btrfs_failed_bio = btrfs_io_bio(failed_bio); + if (btrfs_failed_bio->csum) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = btrfs_bio->csum_inline; + phy_offset >>= inode->i_sb->s_blocksize_bits; + phy_offset *= csum_size; + memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset, + csum_size); + } + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + + pr_debug("bio_readpage_error: submitting new read[%#x] to " + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, + failrec->this_mirror, num_copies, failrec->in_validation); + + ret = tree->ops->submit_bio_hook(inode, read_mode, bio, + failrec->this_mirror, + failrec->bio_flags, 0); + return ret; +} + /* lots and lots of room for performance fixes in the end_bio funcs */ +int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +{ + int uptodate = (err == 0); + struct extent_io_tree *tree; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate) { + ClearPageUptodate(page); + SetPageError(page); + ret = ret < 0 ? ret : -EIO; + mapping_set_error(page->mapping, ret); + } + return 0; +} + /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -1695,61 +2385,55 @@ static int check_page_writeback(struct extent_io_tree *tree, */ static void end_bio_extent_writepage(struct bio *bio, int err) { - int uptodate = err == 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_io_tree *tree; + struct bio_vec *bvec; u64 start; u64 end; - int whole_page; - int ret; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page write in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); } - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(bio, page, - start, end, NULL); - if (ret == 0) { - uptodate = (err == 0); - continue; - } - } + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); - ClearPageUptodate(page); - SetPageError(page); - } + if (end_extent_writepage(page, err, start, end)) + continue; - if (whole_page) - end_page_writeback(page); - else - check_page_writeback(tree, page); - } while (bvec >= bio->bi_io_vec); + end_page_writeback(page); + } bio_put(bio); } +static void +endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, + int uptodate) +{ + struct extent_state *cached = NULL; + u64 end = start + len - 1; + + if (uptodate && tree->track_uptodate) + set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); +} + /* * after a readpage IO is done, we need to: * clear the uptodate bits on error @@ -1763,44 +2447,86 @@ static void end_bio_extent_writepage(struct bio *bio, int err) */ static void end_bio_extent_readpage(struct bio *bio, int err) { + struct bio_vec *bvec; int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; + u64 offset = 0; u64 start; u64 end; - int whole_page; + u64 len; + u64 extent_start = 0; + u64 extent_len = 0; + int mirror; int ret; + int i; if (err) uptodate = 0; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; + struct inode *inode = page->mapping->host; + + pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " + "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, + io_bio->mirror_num); + tree = &BTRFS_I(inode)->io_tree; + + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page read in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } - if (++bvec <= bvec_end) - prefetchw(&bvec->bv_page->flags); + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; + len = bvec->bv_len; - if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { - ret = tree->ops->readpage_end_io_hook(page, start, end, - NULL); + mirror = io_bio->mirror_num; + if (likely(uptodate && tree->ops && + tree->ops->readpage_end_io_hook)) { + ret = tree->ops->readpage_end_io_hook(io_bio, offset, + page, start, end, + mirror); if (ret) uptodate = 0; + else + clean_io_failure(start, page); } - if (!uptodate && tree->ops && - tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + + if (likely(uptodate)) + goto readpage_ok; + + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(page, mirror); + if (!ret && !err && + test_bit(BIO_UPTODATE, &bio->bi_flags)) + uptodate = 1; + } else { + /* + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, not + * in the !uptodate case). In that case it returns 0 and + * we just go on with the next page in our bio. If it + * can't handle the error it will return -EIO and we + * remain responsible for that page. + */ + ret = bio_readpage_error(bio, offset, page, start, end, + mirror); if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -1809,122 +2535,149 @@ static void end_bio_extent_readpage(struct bio *bio, int err) continue; } } - - if (uptodate) { - set_extent_uptodate(tree, start, end, - GFP_ATOMIC); +readpage_ok: + if (likely(uptodate)) { + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + /* Zero out the end if this page straddles i_size */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index == end_index && offset) + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); } - unlock_extent(tree, start, end, GFP_ATOMIC); - - if (whole_page) { - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); + unlock_page(page); + offset += len; + + if (unlikely(!uptodate)) { + if (extent_len) { + endio_readpage_release_extent(tree, + extent_start, + extent_len, 1); + extent_start = 0; + extent_len = 0; } - unlock_page(page); + endio_readpage_release_extent(tree, start, + end - start + 1, 0); + } else if (!extent_len) { + extent_start = start; + extent_len = end + 1 - start; + } else if (extent_start + extent_len == start) { + extent_len += end + 1 - start; } else { - if (uptodate) { - check_page_uptodate(tree, page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - check_page_locked(tree, page); + endio_readpage_release_extent(tree, extent_start, + extent_len, uptodate); + extent_start = start; + extent_len = end + 1 - start; } - } while (bvec <= bvec_end); + } + if (extent_len) + endio_readpage_release_extent(tree, extent_start, extent_len, + uptodate); + if (io_bio->end_io) + io_bio->end_io(io_bio, err); bio_put(bio); } /* - * IO done from prepare_write is pretty simple, we just unlock - * the structs in the extent tree when done, and set the uptodate bits - * as appropriate. + * this allocates from the btrfs_bioset. We're returning a bio right now + * but you can call btrfs_io_bio for the appropriate container_of magic */ -static void end_bio_extent_preparewrite(struct bio *bio, int err) +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_io_tree *tree; - u64 start; - u64 end; - - do { - struct page *page = bvec->bv_page; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; + struct btrfs_io_bio *btrfs_bio; + struct bio *bio; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); + bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); - if (uptodate) { - set_extent_uptodate(tree, start, end, GFP_ATOMIC); - } else { - ClearPageUptodate(page); - SetPageError(page); + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) { + bio = bio_alloc_bioset(gfp_flags, + nr_vecs, btrfs_bioset); } + } - unlock_extent(tree, start, end, GFP_ATOMIC); - - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = first_sector; + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return bio; } -static struct bio * -extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) { - struct bio *bio; + return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); +} - bio = bio_alloc(gfp_flags, nr_vecs); - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } +/* this also allocates from the btrfs_bioset */ +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + struct btrfs_io_bio *btrfs_bio; + struct bio *bio; + bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); if (bio) { - bio->bi_size = 0; - bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; } return bio; } -static int submit_one_bio(int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + +static int __must_check submit_one_bio(int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; u64 start; - u64 end; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; - end = start + bvec->bv_len - 1; + start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) - tree->ops->submit_bio_hook(page->mapping->host, rw, bio, - mirror_num, bio_flags); + ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num, bio_flags, start); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; bio_put(bio); return ret; } +static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, + unsigned long offset, size_t size, struct bio *bio, + unsigned long bio_flags) +{ + int ret = 0; + if (tree->ops && tree->ops->merge_bio_hook) + ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, + bio_flags); + BUG_ON(ret < 0); + return ret; + +} + static int submit_extent_page(int rw, struct extent_io_tree *tree, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -1947,18 +2700,17 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret && *bio_ret) { bio = *bio_ret; if (old_compressed) - contig = bio->bi_sector == sector; + contig = bio->bi_iter.bi_sector == sector; else - contig = bio->bi_sector + (bio->bi_size >> 9) == - sector; + contig = bio_end_sector(bio) == sector; if (prev_bio_flags != bio_flags || !contig || - (tree->ops && tree->ops->merge_bio_hook && - tree->ops->merge_bio_hook(page, offset, page_size, bio, - bio_flags)) || + merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); + if (ret < 0) + return ret; bio = NULL; } else { return 0; @@ -1969,7 +2721,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, else nr = bio_get_nr_vecs(bdev); - bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) + return -ENOMEM; bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; @@ -1983,6 +2737,18 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } +static void attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, (unsigned long)eb); + } else { + WARN_ON(page->private != (unsigned long)eb); + } +} + void set_page_extent_mapped(struct page *page) { if (!PagePrivate(page)) { @@ -1992,24 +2758,48 @@ void set_page_extent_mapped(struct page *page) } } -static void set_page_extent_head(struct page *page, unsigned long len) +static struct extent_map * +__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, + u64 start, u64 len, get_extent_t *get_extent, + struct extent_map **em_cached) { - set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); -} + struct extent_map *em; + if (em_cached && *em_cached) { + em = *em_cached; + if (extent_map_in_tree(em) && start >= em->start && + start < extent_map_end(em)) { + atomic_inc(&em->refs); + return em; + } + + free_extent_map(em); + *em_cached = NULL; + } + + em = get_extent(inode, page, pg_offset, start, len, 0); + if (em_cached && !IS_ERR_OR_NULL(em)) { + BUG_ON(*em_cached); + atomic_inc(&em->refs); + *em_cached = em; + } + return em; +} /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io * handlers) + * XXX JDM: This needs looking at to ensure proper page locking */ -static int __extent_read_full_page(struct extent_io_tree *tree, - struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags) +static int __do_readpage(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) { struct inode *inode = page->mapping->host; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; @@ -2022,16 +2812,23 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct block_device *bdev; int ret; int nr = 0; - size_t page_offset = 0; + int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; + size_t pg_offset = 0; size_t iosize; size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = 0; + unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; set_page_extent_mapped(page); end = page_end; - lock_extent(tree, start, end, GFP_NOFS); + if (!PageUptodate(page)) { + if (cleancache_get_page(page) == 0) { + BUG_ON(blocksize != PAGE_SIZE); + unlock_extent(tree, start, end); + goto out; + } + } if (page->index == last_byte >> PAGE_CACHE_SHIFT) { char *userpage; @@ -2039,42 +2836,53 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (zero_offset) { iosize = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); } } while (cur <= end) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + if (cur >= last_byte) { char *userpage; - iosize = PAGE_CACHE_SIZE - page_offset; - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + page_offset, 0, iosize); + struct extent_state *cached = NULL; + + iosize = PAGE_CACHE_SIZE - pg_offset; + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + &cached, GFP_NOFS); + if (!parent_locked) + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); break; } - em = get_extent(inode, page, page_offset, cur, - end - cur + 1, 0); - if (IS_ERR(em) || !em) { + em = __get_extent_map(inode, page, pg_offset, cur, + end - cur + 1, get_extent, em_cached); + if (IS_ERR_OR_NULL(em)) { SetPageError(page); - unlock_extent(tree, cur, end, GFP_NOFS); + if (!parent_locked) + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - this_bio_flag = EXTENT_BIO_COMPRESSED; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + this_bio_flag |= EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&this_bio_flag, + em->compress_type); + } iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; sector = em->block_start >> 9; @@ -2092,25 +2900,29 @@ static int __extent_read_full_page(struct extent_io_tree *tree, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { char *userpage; - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + page_offset, 0, iosize); + struct extent_state *cached = NULL; + + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; - page_offset += iosize; + pg_offset += iosize; continue; } /* the get_extent function already copied into the page */ if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; - page_offset += iosize; + pg_offset += iosize; continue; } /* we have an inline extent but it didn't get marked up @@ -2118,34 +2930,32 @@ static int __extent_read_full_page(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; - page_offset += iosize; + pg_offset += iosize; continue; } - ret = 0; - if (tree->ops && tree->ops->readpage_io_hook) { - ret = tree->ops->readpage_io_hook(page, cur, - cur + iosize - 1); - } - if (!ret) { - unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; - pnr -= page->index; - ret = submit_extent_page(READ, tree, page, - sector, disk_io_size, page_offset, + pnr -= page->index; + ret = submit_extent_page(rw, tree, page, + sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); + if (!ret) { nr++; *bio_flags = this_bio_flag; - } - if (ret) + } else { SetPageError(page); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); + } cur = cur + iosize; - page_offset += iosize; + pg_offset += iosize; } +out: if (!nr) { if (!PageError(page)) SetPageUptodate(page); @@ -2154,17 +2964,129 @@ static int __extent_read_full_page(struct extent_io_tree *tree, return 0; } +static inline void __do_contiguous_readpages(struct extent_io_tree *tree, + struct page *pages[], int nr_pages, + u64 start, u64 end, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode; + struct btrfs_ordered_extent *ordered; + int index; + + inode = pages[0]->mapping->host; + while (1) { + lock_extent(tree, start, end); + ordered = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + for (index = 0; index < nr_pages; index++) { + __do_readpage(tree, pages[index], get_extent, em_cached, bio, + mirror_num, bio_flags, rw); + page_cache_release(pages[index]); + } +} + +static void __extent_readpages(struct extent_io_tree *tree, + struct page *pages[], + int nr_pages, get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + u64 start = 0; + u64 end = 0; + u64 page_start; + int index; + int first_index = 0; + + for (index = 0; index < nr_pages; index++) { + page_start = page_offset(pages[index]); + if (!end) { + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } else if (end + 1 == page_start) { + end += PAGE_CACHE_SIZE; + } else { + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, + bio, mirror_num, bio_flags, + rw); + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } + } + + if (end) + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, bio, + mirror_num, bio_flags, rw); +} + +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode = page->mapping->host; + struct btrfs_ordered_extent *ordered; + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret; + + while (1) { + lock_extent(tree, start, end); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, + bio_flags, rw); + return ret; +} + int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent) + get_extent_t *get_extent, int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, - &bio_flags); + ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, + &bio_flags, READ); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + return ret; +} + +int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num) +{ + struct bio *bio = NULL; + unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; + int ret; + + ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, + &bio_flags, READ); + if (bio) + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; } @@ -2179,136 +3101,143 @@ static noinline void update_nr_written(struct page *page, } /* - * the writepage semantics are similar to regular writepage. extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback. Then the lock bits are removed - * and the end_io handler clears the writeback ranges + * helper for __extent_writepage, doing all of the delayed allocation setup. + * + * This returns 1 if our fill_delalloc function did all the work required + * to write the page (copy into inline extent). In this case the IO has + * been started and the page is already unlocked. + * + * This returns 0 if all went well (page still locked) + * This returns < 0 if there were errors (page still locked) */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) +static noinline_for_stack int writepage_delalloc(struct inode *inode, + struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + u64 delalloc_start, + unsigned long *nr_written) { - struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; struct extent_io_tree *tree = epd->tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 delalloc_start; + u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; + u64 nr_delalloc; + u64 delalloc_to_write = 0; + u64 delalloc_end = 0; + int ret; + int page_started = 0; + + if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) + return 0; + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + nr_written); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + /* fill_delalloc should be return < 0 for error + * but just in case, we use > 0 here meaning the + * IO is started, so we don't want to return > 0 + * unless things are going well. + */ + ret = ret < 0 ? ret : -EIO; + goto done; + } + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= *nr_written; + return 1; + } + + ret = 0; + +done: + return ret; +} + +/* + * helper for __extent_writepage. This calls the writepage start hooks, + * and does the loop to map the page into extents and bios. + * + * We return 1 if the IO is started and the page is unlocked, + * 0 if all went well (page still locked) + * < 0 if there were errors (page still locked) + */ +static noinline_for_stack int __extent_writepage_io(struct inode *inode, + struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd, + loff_t i_size, + unsigned long nr_written, + int write_flags, int *nr_ret) +{ + struct extent_io_tree *tree = epd->tree; + u64 start = page_offset(page); u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; - u64 last_byte = i_size_read(inode); u64 block_start; u64 iosize; - u64 unlock_start; sector_t sector; struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; - int ret; - int nr = 0; size_t pg_offset = 0; size_t blocksize; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - u64 nr_delalloc; - u64 delalloc_end; - int page_started; - int compressed; - int write_flags; - unsigned long nr_written = 0; - - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC_PLUG; - else - write_flags = WRITE; - - WARN_ON(!PageLocked(page)); - pg_offset = i_size & (PAGE_CACHE_SIZE - 1); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0); - unlock_page(page); - return 0; - } - - if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + pg_offset, 0, - PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage, KM_USER0); - flush_dcache_page(page); - } - pg_offset = 0; - - set_page_extent_mapped(page); - - delalloc_start = start; - delalloc_end = 0; - page_started = 0; - if (!epd->extent_locked) { - u64 delalloc_to_write = 0; - /* - * make sure the wbc mapping index is at least updated - * to this page. - */ - update_nr_written(page, wbc, 0); - - while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, - page, - &delalloc_start, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc == 0) { - delalloc_start = delalloc_end + 1; - continue; - } - tree->ops->fill_delalloc(inode, page, delalloc_start, - delalloc_end, &page_started, - &nr_written); - /* - * delalloc_end is already one less than the total - * length, so we don't subtract one from - * PAGE_CACHE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - delalloc_start = delalloc_end + 1; - } - if (wbc->nr_to_write < delalloc_to_write) { - int thresh = 8192; - - if (delalloc_to_write < thresh * 2) - thresh = delalloc_to_write; - wbc->nr_to_write = min_t(u64, delalloc_to_write, - thresh); - } + int ret = 0; + int nr = 0; + bool compressed; - /* did the fill delalloc function already unlock and start - * the IO? - */ - if (page_started) { - ret = 0; - /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write. - */ - wbc->nr_to_write -= nr_written; - goto done_unlocked; - } - } if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); - ret = 0; + ret = 1; goto done_unlocked; } } @@ -2320,36 +3249,37 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, update_nr_written(page, wbc, nr_written + 1); end = page_end; - if (last_byte <= start) { + if (i_size <= start) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); - unlock_start = page_end + 1; goto done; } blocksize = inode->i_sb->s_blocksize; while (cur <= end) { - if (cur >= last_byte) { + u64 em_end; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); - unlock_start = page_end + 1; break; } em = epd->get_extent(inode, page, pg_offset, cur, end - cur + 1, 1); - if (IS_ERR(em) || !em) { + if (IS_ERR_OR_NULL(em)) { SetPageError(page); + ret = PTR_ERR_OR_ZERO(em); break; } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + em_end = extent_map_end(em); + BUG_ON(em_end <= cur); BUG_ON(end < cur); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = min(em_end - cur, end - cur + 1); + iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; block_start = em->block_start; @@ -2382,14 +3312,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, cur += iosize; pg_offset += iosize; - unlock_start = cur; - continue; - } - /* leave this out until we have a page_mkwrite call */ - if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0, NULL)) { - cur = cur + iosize; - pg_offset += iosize; continue; } @@ -2402,14 +3324,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret) { SetPageError(page); } else { - unsigned long max_nr = end_index + 1; + unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - printk(KERN_ERR "btrfs warning page %lu not " - "writeback, cur %llu end %llu\n", - page->index, (unsigned long long)cur, - (unsigned long long)end); + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", + page->index, cur, end); } ret = submit_extent_page(write_flags, tree, page, @@ -2425,20 +3346,391 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, nr++; } done: + *nr_ret = nr; + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + u64 start = page_offset(page); + u64 page_end = start + PAGE_CACHE_SIZE - 1; + int ret; + int nr = 0; + size_t pg_offset = 0; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC; + else + write_flags = WRITE; + + trace___extent_writepage(page, inode, wbc); + + WARN_ON(!PageLocked(page)); + + ClearPageError(page); + + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage); + flush_dcache_page(page); + } + + pg_offset = 0; + + set_page_extent_mapped(page); + + ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + + ret = __extent_writepage_io(inode, page, wbc, epd, + i_size, nr_written, write_flags, &nr); + if (ret == 1) + goto done_unlocked; + +done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ set_page_writeback(page); end_page_writeback(page); } + if (PageError(page)) { + ret = ret < 0 ? ret : -EIO; + end_extent_writepage(page, ret, start, page_end); + } unlock_page(page); + return ret; done_unlocked: + return 0; +} - /* drop our reference on any cached states */ - free_extent_state(cached_state); +static int eb_wait(void *word) +{ + io_schedule(); return 0; } +void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, + TASK_UNINTERRUPTIBLE); +} + +static noinline_for_stack int +lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) +{ + unsigned long i, num_pages; + int flush = 0; + int ret = 0; + + if (!btrfs_try_tree_write_lock(eb)) { + flush = 1; + flush_write_bio(epd); + btrfs_tree_lock(eb); + } + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); + if (!epd->sync_io) + return 0; + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + while (1) { + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) + break; + btrfs_tree_unlock(eb); + } + } + + /* + * We need to do this to prevent races in people who check if the eb is + * under IO since we can end up having no IO bits set for a short period + * of time. + */ + spin_lock(&eb->refs_lock); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + spin_unlock(&eb->refs_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); + ret = 1; + } else { + spin_unlock(&eb->refs_lock); + } + + btrfs_tree_unlock(eb); + + if (!ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + if (!trylock_page(p)) { + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + lock_page(p); + } + } + + return ret; +} + +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + +static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +{ + struct bio_vec *bvec; + struct extent_buffer *eb; + int i, done; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + done = atomic_dec_and_test(&eb->io_pages); + + if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + ClearPageUptodate(page); + SetPageError(page); + } + + end_page_writeback(page); + + if (!done) + continue; + + end_extent_buffer_writeback(eb); + } + + bio_put(bio); +} + +static noinline_for_stack int write_one_eb(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct block_device *bdev = fs_info->fs_devices->latest_bdev; + struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + u64 offset = eb->start; + unsigned long i, num_pages; + unsigned long bio_flags = 0; + int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; + int ret = 0; + + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + atomic_set(&eb->io_pages, num_pages); + if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) + bio_flags = EXTENT_BIO_TREE_LOG; + + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + clear_page_dirty_for_io(p); + set_page_writeback(p); + ret = submit_extent_page(rw, tree, p, offset >> 9, + PAGE_CACHE_SIZE, 0, bdev, &epd->bio, + -1, end_bio_extent_buffer_writepage, + 0, epd->bio_flags, bio_flags); + epd->bio_flags = bio_flags; + if (ret) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + SetPageError(p); + if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) + end_extent_buffer_writeback(eb); + ret = -EIO; + break; + } + offset += PAGE_CACHE_SIZE; + update_nr_written(p, wbc, 1); + unlock_page(p); + } + + if (unlikely(ret)) { + for (; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + unlock_page(p); + } + } + + return ret; +} + +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct extent_buffer *eb, *prev_eb = NULL; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, + }; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int tag; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!PagePrivate(page)) + continue; + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + break; + } + + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + continue; + } + + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON + * but no sense in crashing the users box for something + * we can survive anyway. + */ + if (WARN_ON(!eb)) { + spin_unlock(&mapping->private_lock); + continue; + } + + if (eb == prev_eb) { + spin_unlock(&mapping->private_lock); + continue; + } + + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) + continue; + + prev_eb = eb; + ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + if (!ret) { + free_extent_buffer(eb); + continue; + } + + ret = write_one_eb(eb, fs_info, wbc, &epd); + if (ret) { + done = 1; + free_extent_buffer(eb); + break; + } + free_extent_buffer(eb); + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + flush_write_bio(&epd); + return ret; +} + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -2460,15 +3752,29 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, writepage_t writepage, void *data, void (*flush_fn)(void *)) { + struct inode *inode = mapping->host; int ret = 0; int done = 0; + int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; - int range_whole = 0; + int tag; + + /* + * We have to hold onto the inode so that ordered extents can do their + * work when the IO finishes. The alternative to this is failing to add + * an ordered extent if the igrab() fails there and that is a huge pain + * to deal with, so instead just hold onto the inode throughout the + * writepages operation. If it fails here we are freeing up the inode + * anyway and we'd rather not waste our time writing out stuff that is + * going to be truncated anyway. + */ + if (!igrab(inode)) + return 0; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -2477,15 +3783,18 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; scanned = 1; } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, min(end - index, - (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; scanned = 1; @@ -2499,10 +3808,10 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && tree->ops->write_cache_pages_lock_hook) - tree->ops->write_cache_pages_lock_hook(page); - else + if (!trylock_page(page)) { + flush_fn(data); lock_page(page); + } if (unlikely(page->mapping != mapping)) { unlock_page(page); @@ -2533,8 +3842,8 @@ retry: unlock_page(page); ret = 0; } - if (ret) - done = 1; + if (!err && ret < 0) + err = ret; /* * the filesystem may choose to bump up nr_to_write. @@ -2546,7 +3855,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!scanned && !done && !err) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -2555,16 +3864,21 @@ retry: index = 0; goto retry; } - return ret; + btrfs_add_delayed_iput(inode); + return err; } static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { + int rw = WRITE; + int ret; + if (epd->sync_io) - submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); - else - submit_one_bio(WRITE, epd->bio, 0, 0); + rw = WRITE_SYNC; + + ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); + BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } } @@ -2580,27 +3894,17 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, struct writeback_control *wbc) { int ret; - struct address_space *mapping = page->mapping; struct extent_page_data epd = { .bio = NULL, .tree = tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - struct writeback_control wbc_writepages = { - .bdi = wbc->bdi, - .sync_mode = wbc->sync_mode, - .older_than_this = NULL, - .nr_to_write = 64, - .range_start = page_offset(page) + PAGE_CACHE_SIZE, - .range_end = (loff_t)-1, + .bio_flags = 0, }; ret = __extent_writepage(page, wbc, &epd); - extent_write_cache_pages(tree, mapping, &wbc_writepages, - __extent_writepage, &epd, flush_write_bio); flush_epd_write_bio(&epd); return ret; } @@ -2621,11 +3925,10 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, + .bio_flags = 0, }; struct writeback_control wbc_writepages = { - .bdi = inode->i_mapping->backing_dev_info, .sync_mode = mode, - .older_than_this = NULL, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, @@ -2662,6 +3965,7 @@ int extent_writepages(struct extent_io_tree *tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; ret = extent_write_cache_pages(tree, mapping, wbc, @@ -2679,22 +3983,39 @@ int extent_readpages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned page_idx; unsigned long bio_flags = 0; + struct page *pagepool[16]; + struct page *page; + struct extent_map *em_cached = NULL; + int nr = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); + page = list_entry(pages->prev, struct page, lru); prefetchw(&page->flags); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) { - __extent_read_full_page(tree, page, get_extent, - &bio, 0, &bio_flags); + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_NOFS)) { + page_cache_release(page); + continue; } - page_cache_release(page); + + pagepool[nr++] = page; + if (nr < ARRAY_SIZE(pagepool)) + continue; + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); + nr = 0; } + if (nr) + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); + + if (em_cached) + free_extent_map(em_cached); + BUG_ON(!list_empty(pages)); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + return submit_one_bio(READ, bio, 0, bio_flags); return 0; } @@ -2707,15 +4028,15 @@ int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset) { struct extent_state *cached_state = NULL; - u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; - start += (offset + blocksize - 1) & ~(blocksize - 1); + start += ALIGN(offset, blocksize); if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, start, end, 0, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -2725,132 +4046,15 @@ int extent_invalidatepage(struct extent_io_tree *tree, } /* - * simple commit_write call, set_range_dirty is used to mark both - * the pages and the extent records as dirty - */ -int extent_commit_write(struct extent_io_tree *tree, - struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - - set_page_extent_mapped(page); - set_page_dirty(page); - - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - return 0; -} - -int extent_prepare_write(struct extent_io_tree *tree, - struct inode *inode, struct page *page, - unsigned from, unsigned to, get_extent_t *get_extent) -{ - u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 page_end = page_start + PAGE_CACHE_SIZE - 1; - u64 block_start; - u64 orig_block_start; - u64 block_end; - u64 cur_end; - struct extent_map *em; - unsigned blocksize = 1 << inode->i_blkbits; - size_t page_offset = 0; - size_t block_off_start; - size_t block_off_end; - int err = 0; - int iocount = 0; - int ret = 0; - int isnew; - - set_page_extent_mapped(page); - - block_start = (page_start + from) & ~((u64)blocksize - 1); - block_end = (page_start + to - 1) | (blocksize - 1); - orig_block_start = block_start; - - lock_extent(tree, page_start, page_end, GFP_NOFS); - while (block_start <= block_end) { - em = get_extent(inode, page, page_offset, block_start, - block_end - block_start + 1, 1); - if (IS_ERR(em) || !em) - goto err; - - cur_end = min(block_end, extent_map_end(em) - 1); - block_off_start = block_start & (PAGE_CACHE_SIZE - 1); - block_off_end = block_off_start + blocksize; - isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); - - if (!PageUptodate(page) && isnew && - (block_off_end > to || block_off_start < from)) { - void *kaddr; - - kaddr = kmap_atomic(page, KM_USER0); - if (block_off_end > to) - memset(kaddr + to, 0, block_off_end - to); - if (block_off_start < from) - memset(kaddr + block_off_start, 0, - from - block_off_start); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - } - if ((em->block_start != EXTENT_MAP_HOLE && - em->block_start != EXTENT_MAP_INLINE) && - !isnew && !PageUptodate(page) && - (block_off_end > to || block_off_start < from) && - !test_range_bit(tree, block_start, cur_end, - EXTENT_UPTODATE, 1, NULL)) { - u64 sector; - u64 extent_offset = block_start - em->start; - size_t iosize; - sector = (em->block_start + extent_offset) >> 9; - iosize = (cur_end - block_start + blocksize) & - ~((u64)blocksize - 1); - /* - * we've already got the extent locked, but we - * need to split the state such that our end_bio - * handler can clear the lock. - */ - set_extent_bit(tree, block_start, - block_start + iosize - 1, - EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); - ret = submit_extent_page(READ, tree, page, - sector, iosize, page_offset, em->bdev, - NULL, 1, - end_bio_extent_preparewrite, 0, - 0, 0); - iocount++; - block_start = block_start + iosize; - } else { - set_extent_uptodate(tree, block_start, cur_end, - GFP_NOFS); - unlock_extent(tree, block_start, cur_end, GFP_NOFS); - block_start = cur_end + 1; - } - page_offset = block_start & (PAGE_CACHE_SIZE - 1); - free_extent_map(em); - } - if (iocount) { - wait_extent_bit(tree, orig_block_start, - block_end, EXTENT_LOCKED); - } - check_page_uptodate(tree, page); -err: - /* FIXME, zero out newly allocated blocks on error */ - return err; -} - -/* * a helper for releasepage, this tests for areas of the page that * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ -int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask) +static int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, + struct page *page, gfp_t mask) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; int ret = 1; @@ -2864,9 +4068,17 @@ int try_release_extent_state(struct extent_map_tree *map, * at this point we can safely clear everything except the * locked bit and the nodatasum bit */ - clear_extent_bit(tree, start, end, + ret = clear_extent_bit(tree, start, end, ~(EXTENT_LOCKED | EXTENT_NODATASUM), 0, 0, NULL, mask); + + /* if clear_extent_bit failed for enomem reasons, + * we can't allow the release to continue. + */ + if (ret < 0) + ret = 0; + else + ret = 1; } return ret; } @@ -2881,7 +4093,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, gfp_t mask) { struct extent_map *em; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if ((mask & __GFP_WAIT) && @@ -2891,7 +4103,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); - if (!em || IS_ERR(em)) { + if (!em) { write_unlock(&map->lock); break; } @@ -2919,76 +4131,183 @@ int try_release_extent_mapping(struct extent_map_tree *map, return try_release_extent_state(map, tree, page, mask); } -sector_t extent_bmap(struct address_space *mapping, sector_t iblock, - get_extent_t *get_extent) +/* + * helper function for fiemap, which doesn't want to see any holes. + * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) { - struct inode *inode = mapping->host; - struct extent_state *cached_state = NULL; - u64 start = iblock << inode->i_blkbits; - sector_t sector = 0; - size_t blksize = (1 << inode->i_blkbits); + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; struct extent_map *em; + u64 len; - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, - 0, &cached_state, GFP_NOFS); - em = get_extent(inode, NULL, 0, start, blksize, 0); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, - start + blksize - 1, &cached_state, GFP_NOFS); - if (!em || IS_ERR(em)) - return 0; + if (offset >= last) + return NULL; - if (em->block_start > EXTENT_MAP_LAST_BYTE) - goto out; + while (1) { + len = last - offset; + if (len == 0) + break; + len = ALIGN(len, sectorsize); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (IS_ERR_OR_NULL(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } - sector = (em->block_start + start - em->start) >> inode->i_blkbits; -out: - free_extent_map(em); - return sector; + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + +static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx) +{ + unsigned long cnt = *((unsigned long *)ctx); + + cnt++; + *((unsigned long *)ctx) = cnt; + + /* Now we're sure that the extent is shared. */ + if (cnt > 1) + return 1; + return 0; } int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { - int ret; + int ret = 0; u64 off = start; u64 max = start + len; u32 flags = 0; + u32 found_type; + u64 last; + u64 last_for_get_extent = 0; u64 disko = 0; + u64 isize = i_size_read(inode); + struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; + struct btrfs_path *path; int end = 0; - u64 em_start = 0, em_len = 0; - unsigned long emflags; - ret = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; if (len == 0) return -EINVAL; - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, - &cached_state, GFP_NOFS); - em = get_extent(inode, NULL, 0, off, max - off, 0); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + + /* + * lookup the last file extent. We're not using i_size here + * because there might be preallocation past i_size + */ + ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, + path, btrfs_ino(inode), -1, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + WARN_ON(!ret); + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + + /* No extents, but there might be delalloc bits */ + if (found_key.objectid != btrfs_ino(inode) || + found_type != BTRFS_EXTENT_DATA_KEY) { + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; + } + btrfs_release_path(path); + + /* + * we might have some extents allocated but more delalloc past those + * extents. so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, + &cached_state); + + em = get_extent_skip_holes(inode, start, last_for_get_extent, + get_extent); if (!em) goto out; if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } + while (!end) { - off = em->start + em->len; - if (off >= max) - end = 1; + u64 offset_in_extent = 0; + + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; - em_start = em->start; - em_len = em->len; + /* + * get_extent may return an extent that starts before our + * requested range. We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); + /* + * record the offset from the start of the extent + * for adjusting the disk offset below. Only do this if the + * extent isn't compressed since our in ram offset may be past + * what we have actually allocated on disk. + */ + if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + offset_in_extent = em_start - em->start; + em_end = extent_map_end(em); + em_len = em_end - em_start; disko = 0; flags = 0; + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_HOLE) { - flags |= FIEMAP_EXTENT_UNWRITTEN; } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); @@ -2996,115 +4315,349 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { - disko = em->block_start; + unsigned long ref_cnt = 0; + + disko = em->block_start + offset_in_extent; + + /* + * As btrfs supports shared space, this information + * can be exported to userspace tools via + * flag FIEMAP_EXTENT_SHARED. + */ + ret = iterate_inodes_from_logical( + em->block_start, + BTRFS_I(inode)->root->fs_info, + path, count_ext_ref, &ref_cnt); + if (ret < 0 && ret != -ENOENT) + goto out_free; + + if (ref_cnt > 1) + flags |= FIEMAP_EXTENT_SHARED; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; - emflags = em->flags; free_extent_map(em); em = NULL; + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } - if (!end) { - em = get_extent(inode, NULL, 0, off, max - off, 0); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - emflags = em->flags; + /* now scan forward to see if this is really the last extent. */ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; } - if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + if (!em) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); + em_len, flags); if (ret) goto out_free; } out_free: free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + btrfs_free_path(path); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state, GFP_NOFS); return ret; } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) +static void __free_extent_buffer(struct extent_buffer *eb) { - struct page *p; - struct address_space *mapping; + btrfs_leak_debug_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); +} - if (i == 0) - return eb->first_page; - i += eb->start >> PAGE_CACHE_SHIFT; - mapping = eb->first_page->mapping; - if (!mapping) - return NULL; +int extent_buffer_under_io(struct extent_buffer *eb) +{ + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); +} - /* - * extent_buffer_page is only called after pinning the page - * by increasing the reference count. So we know the page must - * be in the radix tree. - */ - rcu_read_lock(); - p = radix_tree_lookup(&mapping->page_tree, i); - rcu_read_unlock(); +/* + * Helper for releasing extent buffer page. + */ +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, + unsigned long start_idx) +{ + unsigned long index; + unsigned long num_pages; + struct page *page; + int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + BUG_ON(extent_buffer_under_io(eb)); - return p; + num_pages = num_extent_pages(eb->start, eb->len); + index = start_idx + num_pages; + if (start_idx >= index) + return; + + do { + index--; + page = extent_buffer_page(eb, index); + if (page && mapped) { + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + spin_unlock(&page->mapping->private_lock); + + } + if (page) { + /* One for when we alloced the page */ + page_cache_release(page); + } + } while (index != start_idx); } -static inline unsigned long num_extent_pages(u64 start, u64 len) +/* + * Helper for releasing the extent buffer. + */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); + btrfs_release_extent_buffer_page(eb, 0); + __free_extent_buffer(eb); } -static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, - unsigned long len, - gfp_t mask) +static struct extent_buffer * +__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, + unsigned long len, gfp_t mask) { struct extent_buffer *eb = NULL; -#if LEAK_DEBUG - unsigned long flags; -#endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); + if (eb == NULL) + return NULL; eb->start = start; eb->len = len; - spin_lock_init(&eb->lock); - init_waitqueue_head(&eb->lock_wq); - -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&eb->leak_list, &buffers); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + eb->fs_info = fs_info; + eb->bflags = 0; + rwlock_init(&eb->lock); + atomic_set(&eb->write_locks, 0); + atomic_set(&eb->read_locks, 0); + atomic_set(&eb->blocking_readers, 0); + atomic_set(&eb->blocking_writers, 0); + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->spinning_writers, 0); + eb->lock_nested = 0; + init_waitqueue_head(&eb->write_lock_wq); + init_waitqueue_head(&eb->read_lock_wq); + + btrfs_leak_debug_add(&eb->leak_list, &buffers); + + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); + atomic_set(&eb->io_pages, 0); + + /* + * Sanity checks, currently the maximum is 64k covered by 16x 4k pages + */ + BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE + > MAX_INLINE_EXTENT_BUFFER_SIZE); + BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); return eb; } -static void __free_extent_buffer(struct extent_buffer *eb) +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) { -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - kmem_cache_free(extent_buffer_cache, eb); + unsigned long i; + struct page *p; + struct extent_buffer *new; + unsigned long num_pages = num_extent_pages(src->start, src->len); + + new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); + if (new == NULL) + return NULL; + + for (i = 0; i < num_pages; i++) { + p = alloc_page(GFP_NOFS); + if (!p) { + btrfs_release_extent_buffer(new); + return NULL; + } + attach_extent_buffer_page(new, p); + WARN_ON(PageDirty(p)); + SetPageUptodate(p); + new->pages[i] = p; + } + + copy_extent_buffer(new, src, 0, 0, src->len); + set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); + set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); + + return new; +} + +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +{ + struct extent_buffer *eb; + unsigned long num_pages = num_extent_pages(0, len); + unsigned long i; + + eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); + if (!eb) + return NULL; + + for (i = 0; i < num_pages; i++) { + eb->pages[i] = alloc_page(GFP_NOFS); + if (!eb->pages[i]) + goto err; + } + set_extent_buffer_uptodate(eb); + btrfs_set_header_nritems(eb, 0); + set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + return eb; +err: + for (; i > 0; i--) + __free_page(eb->pages[i - 1]); + __free_extent_buffer(eb); + return NULL; } -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0, - gfp_t mask) +static void check_buffer_tree_ref(struct extent_buffer *eb) +{ + int refs; + /* the ref bit is tricky. We have to make sure it is set + * if we have the buffer dirty. Otherwise the + * code to free a buffer can end up dropping a dirty + * page + * + * Once the ref bit is set, it won't go away while the + * buffer is dirty or in writeback, and it also won't + * go away while we have the reference count on the + * eb bumped. + * + * We can't just set the ref bit without bumping the + * ref on the eb because free_extent_buffer might + * see the ref bit and try to clear it. If this happens + * free_extent_buffer might end up dropping our original + * ref by mistake and freeing the page before we are able + * to add one more ref. + * + * So bump the ref count first, then set the bit. If someone + * beat us to it, drop the ref we added. + */ + refs = atomic_read(&eb->refs); + if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + return; + + spin_lock(&eb->refs_lock); + if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_inc(&eb->refs); + spin_unlock(&eb->refs_lock); +} + +static void mark_extent_buffer_accessed(struct extent_buffer *eb, + struct page *accessed) +{ + unsigned long num_pages, i; + + check_buffer_tree_ref(eb); + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + if (p != accessed) + mark_page_accessed(p); + } +} + +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + mark_extent_buffer_accessed(eb, NULL); + return eb; + } + rcu_read_unlock(); + + return NULL; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) +{ + struct extent_buffer *eb, *exists = NULL; + int ret; + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + eb = alloc_dummy_extent_buffer(start, len); + if (!eb) + return NULL; + eb->fs_info = fs_info; +again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * We will free dummy extent buffer's if they come into + * free_extent_buffer with a ref count of 2, but if we are using this we + * want the buffers to stay in memory until we're done with them, so + * bump the ref count again. + */ + atomic_inc(&eb->refs); + return eb; +free_eb: + btrfs_release_extent_buffer(eb); + return exists; +} +#endif + +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); unsigned long i; @@ -3112,110 +4665,198 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *eb; struct extent_buffer *exists = NULL; struct page *p; - struct address_space *mapping = tree->mapping; + struct address_space *mapping = fs_info->btree_inode->i_mapping; int uptodate = 1; + int ret; - spin_lock(&tree->buffer_lock); - eb = buffer_search(tree, start); - if (eb) { - atomic_inc(&eb->refs); - spin_unlock(&tree->buffer_lock); - mark_page_accessed(eb->first_page); + eb = find_extent_buffer(fs_info, start); + if (eb) return eb; - } - spin_unlock(&tree->buffer_lock); - eb = __alloc_extent_buffer(tree, start, len, mask); + eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); if (!eb) return NULL; - if (page0) { - eb->first_page = page0; - i = 1; - index++; - page_cache_get(page0); - mark_page_accessed(page0); - set_page_extent_mapped(page0); - set_page_extent_head(page0, len); - uptodate = PageUptodate(page0); - } else { - i = 0; - } - for (; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); - if (!p) { - WARN_ON(1); + for (i = 0; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, GFP_NOFS); + if (!p) goto free_eb; + + spin_lock(&mapping->private_lock); + if (PagePrivate(p)) { + /* + * We could have already allocated an eb for this page + * and attached one so lets see if we can get a ref on + * the existing eb, and if we can we know it's good and + * we can just return that one, else we know we can just + * overwrite page->private. + */ + exists = (struct extent_buffer *)p->private; + if (atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + page_cache_release(p); + mark_extent_buffer_accessed(exists, p); + goto free_eb; + } + + /* + * Do this so attach doesn't complain and we need to + * drop the ref the old guy had. + */ + ClearPagePrivate(p); + WARN_ON(PageDirty(p)); + page_cache_release(p); } - set_page_extent_mapped(p); - mark_page_accessed(p); - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); - } + attach_extent_buffer_page(eb, p); + spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); + eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; - unlock_page(p); + + /* + * see below about how we avoid a nasty race with release page + * and why we unlock later + */ } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - spin_lock(&tree->buffer_lock); - exists = buffer_tree_insert(tree, start, &eb->rb_node); - if (exists) { - /* add one reference for the caller */ - atomic_inc(&exists->refs); - spin_unlock(&tree->buffer_lock); +again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) goto free_eb; + + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; } /* add one reference for the tree */ - atomic_inc(&eb->refs); - spin_unlock(&tree->buffer_lock); + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * there is a race where release page may have + * tried to find this extent buffer in the radix + * but failed. It will tell the VM it is safe to + * reclaim the, and it will clear the page private bit. + * We must make sure to set the page private bit properly + * after the extent buffer is in the radix tree so + * it doesn't get lost + */ + SetPageChecked(eb->pages[0]); + for (i = 1; i < num_pages; i++) { + p = extent_buffer_page(eb, i); + ClearPageChecked(p); + unlock_page(p); + } + unlock_page(eb->pages[0]); return eb; free_eb: - if (!atomic_dec_and_test(&eb->refs)) - return exists; - for (index = 1; index < i; index++) - page_cache_release(extent_buffer_page(eb, index)); - page_cache_release(extent_buffer_page(eb, 0)); - __free_extent_buffer(eb); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + unlock_page(eb->pages[i]); + } + + WARN_ON(!atomic_dec_and_test(&eb->refs)); + btrfs_release_extent_buffer(eb); return exists; } -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - gfp_t mask) +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) { - struct extent_buffer *eb; + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); - spin_lock(&tree->buffer_lock); - eb = buffer_search(tree, start); - if (eb) - atomic_inc(&eb->refs); - spin_unlock(&tree->buffer_lock); + __free_extent_buffer(eb); +} - if (eb) - mark_page_accessed(eb->first_page); +/* Expects to have eb->eb_lock already held */ +static int release_extent_buffer(struct extent_buffer *eb) +{ + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { + struct btrfs_fs_info *fs_info = eb->fs_info; - return eb; + spin_unlock(&eb->refs_lock); + + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&fs_info->buffer_lock); + } else { + spin_unlock(&eb->refs_lock); + } + + /* Should be safe to release our pages at this point */ + btrfs_release_extent_buffer_page(eb, 0); + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return 1; + } + spin_unlock(&eb->refs_lock); + + return 0; } void free_extent_buffer(struct extent_buffer *eb) { + int refs; + int old; if (!eb) return; - if (!atomic_dec_and_test(&eb->refs)) + while (1) { + refs = atomic_read(&eb->refs); + if (refs <= 3) + break; + old = atomic_cmpxchg(&eb->refs, refs, refs - 1); + if (old == refs) + return; + } + + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) + atomic_dec(&eb->refs); + + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. + */ + release_extent_buffer(eb); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) return; - WARN_ON(1); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb); } -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +void clear_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; @@ -3229,10 +4870,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, continue; lock_page(page); - if (i == 0) - set_page_extent_head(page, eb->len); - else - set_page_private(page, EXTENT_PAGE_PRIVATE); + WARN_ON(!PagePrivate(page)); clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); @@ -3242,45 +4880,39 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&page->mapping->tree_lock); + ClearPageError(page); unlock_page(page); } - return 0; -} - -int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, - struct extent_buffer *eb) -{ - return wait_on_extent_writeback(tree, eb->start, - eb->start + eb->len - 1); + WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; int was_dirty = 0; + check_buffer_tree_ref(eb); + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + for (i = 0; i < num_pages; i++) - __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_page_dirty(extent_buffer_page(eb, i)); return was_dirty; } -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state) +int clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; - num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); + num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3289,88 +4921,28 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, return 0; } -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - GFP_NOFS); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - check_page_uptodate(tree, page); - continue; - } SetPageUptodate(page); } return 0; } -int extent_range_uptodate(struct extent_io_tree *tree, - u64 start, u64 end) +int extent_buffer_uptodate(struct extent_buffer *eb) { - struct page *page; - int ret; - int pg_uptodate = 1; - int uptodate; - unsigned long index; - - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - page = find_get_page(tree->mapping, index); - uptodate = PageUptodate(page); - page_cache_release(page); - if (!uptodate) { - pg_uptodate = 0; - break; - } - start += PAGE_CACHE_SIZE; - } - return pg_uptodate; -} - -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state) -{ - int ret = 0; - unsigned long num_pages; - unsigned long i; - struct page *page; - int pg_uptodate = 1; - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return 1; - - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - pg_uptodate = 0; - break; - } - } - return pg_uptodate; + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, - u64 start, int wait, + struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num) { unsigned long i; @@ -3380,19 +4952,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int ret = 0; int locked_pages = 0; int all_uptodate = 1; - int inc_all_pages = 0; unsigned long num_pages; + unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; - } - if (start) { WARN_ON(start < eb->start); start_i = (start >> PAGE_CACHE_SHIFT) - @@ -3404,15 +4971,17 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!wait) { + if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; } else { lock_page(page); } locked_pages++; - if (!PageUptodate(page)) + if (!PageUptodate(page)) { + num_reads++; all_uptodate = 0; + } } if (all_uptodate) { if (start_i == 0) @@ -3420,17 +4989,17 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->read_mirror = 0; + atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (inc_all_pages) - page_cache_get(page); if (!PageUptodate(page)) { - if (start_i == 0) - inc_all_pages = 1; ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, - mirror_num, &bio_flags); + mirror_num, &bio_flags, + READ | REQ_META); if (err) ret = err; } else { @@ -3438,10 +5007,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } } - if (bio) - submit_one_bio(READ, bio, mirror_num, bio_flags); + if (bio) { + err = submit_one_bio(READ | REQ_META, bio, mirror_num, + bio_flags); + if (err) + return err; + } - if (ret || !wait) + if (ret || wait != WAIT_COMPLETE) return ret; for (i = start_i; i < num_pages; i++) { @@ -3451,8 +5024,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ret = -EIO; } - if (!ret) - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -3481,15 +5052,14 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(dst, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER1); dst += cur; len -= cur; @@ -3498,10 +5068,47 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } } +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char __user *dst = (char __user *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = page_address(page); + if (copy_to_user(dst, kaddr + offset, cur)) { + ret = -EFAULT; + break; + } + + dst += cur; + len -= cur; + offset = 0; + i++; + } + + return ret; +} + int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km) + unsigned long *map_len) { size_t offset = start & (PAGE_CACHE_SIZE - 1); char *kaddr; @@ -3523,49 +5130,19 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } if (start + min_len > eb->len) { - printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " - "wanted %lu %lu\n", (unsigned long long)eb->start, - eb->len, start, min_len); - WARN_ON(1); + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", + eb->start, eb->len, start, min_len); + return -EINVAL; } p = extent_buffer_page(eb, i); - kaddr = kmap_atomic(p, km); - *token = kaddr; + kaddr = page_address(p); *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; return 0; } -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, - char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - int err; - int save = 0; - if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, km); - eb->map_token = NULL; - save = 1; - } - err = map_private_extent_buffer(eb, start, min_len, token, map, - map_start, map_len, km); - if (!err && save) { - eb->map_token = *token; - eb->kaddr = *map; - eb->map_start = *map_start; - eb->map_len = *map_len; - } - return err; -} - -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) -{ - kunmap_atomic(token, km); -} - int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) @@ -3582,16 +5159,15 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); ret = memcmp(ptr, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER0); if (ret) break; @@ -3617,16 +5193,15 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(kaddr + offset, src, cur); - kunmap_atomic(kaddr, KM_USER1); src += cur; len -= cur; @@ -3648,16 +5223,15 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); memset(kaddr + offset, c, cur); - kunmap_atomic(kaddr, KM_USER0); len -= cur; offset = 0; @@ -3680,7 +5254,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, WARN_ON(src->len != dst_len); offset = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(dst, i); @@ -3688,9 +5262,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); read_extent_buffer(src, kaddr + offset, src_offset, cur); - kunmap_atomic(kaddr, KM_USER0); src_offset += cur; len -= cur; @@ -3699,42 +5272,32 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -static void move_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) +static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); - if (dst_page == src_page) { - memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); - } else { - char *src_kaddr = kmap_atomic(src_page, KM_USER1); - char *p = dst_kaddr + dst_off + len; - char *s = src_kaddr + src_off + len; - - while (len--) - *--p = *--s; - - kunmap_atomic(src_kaddr, KM_USER1); - } - kunmap_atomic(dst_kaddr, KM_USER0); + unsigned long distance = (src > dst) ? src - dst : dst - src; + return distance < len; } static void copy_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); char *src_kaddr; + int must_memmove = 0; - if (dst_page != src_page) - src_kaddr = kmap_atomic(src_page, KM_USER1); - else + if (dst_page != src_page) { + src_kaddr = page_address(src_page); + } else { src_kaddr = dst_kaddr; + if (areas_overlap(src_off, dst_off, len)) + must_memmove = 1; + } - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); - kunmap_atomic(dst_kaddr, KM_USER0); - if (dst_page != src_page) - kunmap_atomic(src_kaddr, KM_USER1); + if (must_memmove) + memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); + else + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, @@ -3748,21 +5311,21 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu dst len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu dst len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } while (len > 0) { dst_off_in_page = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); src_off_in_page = (start_offset + src_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; @@ -3795,12 +5358,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } @@ -3813,13 +5376,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; dst_off_in_page = (start_offset + dst_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); src_off_in_page = (start_offset + src_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - move_pages(extent_buffer_page(dst, dst_i), + copy_pages(extent_buffer_page(dst, dst_i), extent_buffer_page(dst, src_i), dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); @@ -3830,34 +5393,44 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +int try_release_extent_buffer(struct page *page) { - u64 start = page_offset(page); struct extent_buffer *eb; - int ret = 1; - unsigned long i; - unsigned long num_pages; - spin_lock(&tree->buffer_lock); - eb = buffer_search(tree, start); - if (!eb) - goto out; + /* + * We need to make sure noboody is attaching this page to an eb right + * now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + return 1; + } - if (atomic_read(&eb->refs) > 1) { - ret = 0; - goto out; + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + + /* + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; + spin_unlock(&page->mapping->private_lock); + + /* + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; } - /* at this point we can safely release the extent buffer */ - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) - page_cache_release(extent_buffer_page(eb, i)); - rb_erase(&eb->rb_node, &tree->buffer); - __free_extent_buffer(eb); -out: - spin_unlock(&tree->buffer_lock); - return ret; + + return release_extent_buffer(eb); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bbab4813646..ccc264e7bde 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -16,25 +16,41 @@ #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) +#define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_NEED_WAIT (1 << 13) +#define EXTENT_DAMAGED (1 << 14) +#define EXTENT_NORESERVE (1 << 15) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) -/* flags for bio submission */ +/* + * flags for bio submission. The high bits indicate the compression + * type for this bio + */ #define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_TREE_LOG 2 +#define EXTENT_BIO_PARENT_LOCKED 4 +#define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 +#define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ +#define EXTENT_BUFFER_TREE_REF 5 +#define EXTENT_BUFFER_STALE 6 +#define EXTENT_BUFFER_WRITEBACK 7 +#define EXTENT_BUFFER_IOERR 8 +#define EXTENT_BUFFER_DUMMY 9 +#define EXTENT_BUFFER_IN_TREE 10 /* these are flags for extent_clear_unlock_delalloc */ -#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 -#define EXTENT_CLEAR_UNLOCK 0x2 -#define EXTENT_CLEAR_DELALLOC 0x4 -#define EXTENT_CLEAR_DIRTY 0x8 -#define EXTENT_SET_WRITEBACK 0x10 -#define EXTENT_END_WRITEBACK 0x20 -#define EXTENT_SET_PRIVATE2 0x40 -#define EXTENT_CLEAR_ACCOUNTING 0x80 +#define PAGE_UNLOCK (1 << 0) +#define PAGE_CLEAR_DIRTY (1 << 1) +#define PAGE_SET_WRITEBACK (1 << 2) +#define PAGE_END_WRITEBACK (1 << 3) +#define PAGE_SET_PRIVATE2 (1 << 4) /* * page->private values. Every page that is controlled by the extent @@ -44,10 +60,12 @@ #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 struct extent_state; +struct btrfs_root; +struct btrfs_io_bio; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags); + unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -55,39 +73,32 @@ struct extent_io_ops { int (*writepage_start_hook)(struct page *page, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; - int (*merge_bio_hook)(struct page *page, unsigned long offset, + int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); - int (*readpage_io_hook)(struct page *page, u64 start, u64 end); - int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state); - int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state); - int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state); + int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); + int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits); - int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long bits); - int (*merge_extent_hook)(struct inode *inode, - struct extent_state *new, - struct extent_state *other); - int (*split_extent_hook)(struct inode *inode, - struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page); + void (*set_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned long *bits); + void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned long *bits); + void (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + void (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); }; struct extent_io_tree { struct rb_root state; - struct rb_root buffer; struct address_space *mapping; u64 dirty_bytes; + int track_uptodate; spinlock_t lock; - spinlock_t buffer_lock; struct extent_io_ops *ops; }; @@ -101,110 +112,135 @@ struct extent_state { wait_queue_head_t wq; atomic_t refs; unsigned long state; - u64 split_start; - u64 split_end; /* for use by the FS */ u64 private; +#ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; +#endif }; +#define INLINE_EXTENT_BUFFER_PAGES 16 +#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) struct extent_buffer { u64 start; unsigned long len; - char *map_token; - char *kaddr; unsigned long map_start; unsigned long map_len; - struct page *first_page; unsigned long bflags; + struct btrfs_fs_info *fs_info; + spinlock_t refs_lock; atomic_t refs; - struct list_head leak_list; - struct rb_node rb_node; + atomic_t io_pages; + int read_mirror; + struct rcu_head rcu_head; + pid_t lock_owner; - /* the spinlock is used to protect most operations */ - spinlock_t lock; + /* count of read lock holders on the extent buffer */ + atomic_t write_locks; + atomic_t read_locks; + atomic_t blocking_writers; + atomic_t blocking_readers; + atomic_t spinning_readers; + atomic_t spinning_writers; + int lock_nested; - /* - * when we keep the lock held while blocking, waiters go onto - * the wq + /* protects write locks */ + rwlock_t lock; + + /* readers use lock_wq while they wait for the write + * lock holders to unlock */ - wait_queue_head_t lock_wq; + wait_queue_head_t write_lock_wq; + + /* writers use read_lock_wq while they wait for readers + * to unlock + */ + wait_queue_head_t read_lock_wq; + struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif }; -struct extent_map_tree; +static inline void extent_set_compress_type(unsigned long *bio_flags, + int compress_type) +{ + *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; +} -static inline struct extent_state *extent_state_next(struct extent_state *state) +static inline int extent_compress_type(unsigned long bio_flags) { - struct rb_node *node; - node = rb_next(&state->rb_node); - if (!node) - return NULL; - return rb_entry(node, struct extent_state, rb_node); + return bio_flags >> EXTENT_BIO_FLAG_SHIFT; } +struct extent_map_tree; + typedef struct extent_map *(get_extent_t)(struct inode *inode, struct page *page, - size_t page_offset, + size_t pg_offset, u64 start, u64 len, int create); void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping, gfp_t mask); + struct address_space *mapping); int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); -int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int try_release_extent_buffer(struct page *page); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached, gfp_t mask); -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); + unsigned long bits, struct extent_state **cached); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent); + get_extent_t *get_extent, int mirror_num); +int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits); + u64 max_bytes, unsigned long bits, int contig); +void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled, struct extent_state *cached_state); + unsigned long bits, int filled, + struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); + unsigned long bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, struct extent_state **cached, - gfp_t mask); + unsigned long bits, int wake, int delete, + struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); + unsigned long bits, gfp_t mask); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); + struct extent_state **cached_state, gfp_t mask); +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); -int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask); +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits, unsigned long clear_bits, + struct extent_state **cached_state, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); -int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits); -struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, int bits); + u64 *start_ret, u64 *end_ret, unsigned long bits, + struct extent_state **cached_state); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); int extent_write_full_page(struct extent_io_tree *tree, struct page *page, @@ -217,36 +253,44 @@ int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, get_extent_t *get_extent, struct writeback_control *wbc); +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc); int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_extent_t get_extent); -int extent_prepare_write(struct extent_io_tree *tree, - struct inode *inode, struct page *page, - unsigned from, unsigned to, get_extent_t *get_extent); -int extent_commit_write(struct extent_io_tree *tree, - struct inode *inode, struct page *page, - unsigned from, unsigned to); -sector_t extent_bmap(struct address_space *mapping, sector_t iblock, - get_extent_t *get_extent); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); -int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); -int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0, - gfp_t mask); -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - gfp_t mask); +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); void free_extent_buffer(struct extent_buffer *eb); +void free_extent_buffer_stale(struct extent_buffer *eb); +#define WAIT_NONE 0 +#define WAIT_COMPLETE 1 +#define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); +void wait_on_extent_buffer_writeback(struct extent_buffer *eb); + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + return eb->pages[i]; +} static inline void extent_buffer_get(struct extent_buffer *eb) { @@ -259,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, void read_extent_buffer(struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst, + unsigned long start, + unsigned long len); void write_extent_buffer(struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, @@ -270,38 +317,42 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); -int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, - struct extent_buffer *eb); -int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end); -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int test_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state); -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state); -int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km); +void clear_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km); -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); -int release_extent_buffer_tail_pages(struct extent_buffer *eb); -int extent_range_uptodate(struct extent_io_tree *tree, - u64 start, u64 end); -int extent_clear_unlock_delalloc(struct inode *inode, - struct extent_io_tree *tree, - u64 start, u64 end, struct page *locked_page, - unsigned long op); + unsigned long *map_len); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + struct page *locked_page, + unsigned long bits_to_clear, + unsigned long page_ops); +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags); +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); + +struct btrfs_fs_info; + +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num); +int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes); +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); +#endif #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 454ca52d645..225302b39af 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,8 +1,8 @@ #include <linux/err.h> #include <linux/slab.h> -#include <linux/module.h> #include <linux/spinlock.h> #include <linux/hardirq.h> +#include "ctree.h" #include "extent_map.h" @@ -10,7 +10,7 @@ static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = kmem_cache_create("extent_map", + extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) @@ -27,34 +27,36 @@ void extent_map_exit(void) /** * extent_map_tree_init - initialize extent map tree * @tree: tree to initialize - * @mask: flags for memory allocations during tree operations * * Initialize the extent tree @tree. Should be called for each new inode * or other user of the extent_map interface. */ -void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) +void extent_map_tree_init(struct extent_map_tree *tree) { tree->map = RB_ROOT; + INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } /** * alloc_extent_map - allocate new extent map structure - * @mask: memory allocation flags * * Allocate a new extent_map structure. The new structure is * returned with a reference count of one and needs to be * freed using free_extent_map() */ -struct extent_map *alloc_extent_map(gfp_t mask) +struct extent_map *alloc_extent_map(void) { struct extent_map *em; - em = kmem_cache_alloc(extent_map_cache, mask); - if (!em || IS_ERR(em)) - return em; - em->in_tree = 0; + em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); + if (!em) + return NULL; + RB_CLEAR_NODE(&em->rb_node); em->flags = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = 0; atomic_set(&em->refs, 1); + INIT_LIST_HEAD(&em->list); return em; } @@ -71,37 +73,64 @@ void free_extent_map(struct extent_map *em) return; WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { - WARN_ON(em->in_tree); + WARN_ON(extent_map_in_tree(em)); + WARN_ON(!list_empty(&em->list)); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) + kfree(em->bdev); kmem_cache_free(extent_map_cache, em); } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +static int tree_insert(struct rb_root *root, struct extent_map *em) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - struct extent_map *entry; + struct extent_map *entry = NULL; + struct rb_node *orig_parent = NULL; + u64 end = range_end(em->start, em->len); while (*p) { parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - WARN_ON(!entry->in_tree); - - if (offset < entry->start) + if (em->start < entry->start) p = &(*p)->rb_left; - else if (offset >= extent_map_end(entry)) + else if (em->start >= extent_map_end(entry)) p = &(*p)->rb_right; else - return parent; + return -EEXIST; } - entry = rb_entry(node, struct extent_map, rb_node); - entry->in_tree = 1; - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; + orig_parent = parent; + while (parent && em->start >= extent_map_end(entry)) { + parent = rb_next(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + parent = orig_parent; + entry = rb_entry(parent, struct extent_map, rb_node); + while (parent && em->start < entry->start) { + parent = rb_prev(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + rb_link_node(&em->rb_node, orig_parent, p); + rb_insert_color(&em->rb_node, root); + return 0; } /* @@ -123,8 +152,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, prev = n; prev_entry = entry; - WARN_ON(!entry->in_tree); - if (offset < entry->start) n = n->rb_left; else if (offset >= extent_map_end(entry)) @@ -167,6 +194,18 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) return 0; + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + + /* + * We don't want to merge stuff that hasn't been written to the log yet + * since it may not reflect exactly what is on disk, and that would be + * bad. + */ + if (!list_empty(&prev->list) || !list_empty(&next->list)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -183,22 +222,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) return 0; } -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) { - int ret = 0; struct extent_map *merge = NULL; struct rb_node *rb; - struct extent_map *em; - - write_lock(&tree->lock); - em = lookup_extent_mapping(tree, start, len); - - WARN_ON(!em || em->start != start); - - if (!em) - goto out; - - clear_bit(EXTENT_FLAG_PINNED, &em->flags); if (em->start != 0) { rb = rb_prev(&em->rb_node); @@ -206,11 +233,16 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(merge, em)) { em->start = merge->start; + em->orig_start = merge->orig_start; em->len += merge->len; em->block_len += merge->block_len; em->block_start = merge->block_start; - merge->in_tree = 0; + em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; + em->mod_start = merge->mod_start; + em->generation = max(em->generation, merge->generation); + rb_erase(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); } } @@ -220,11 +252,59 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(em, merge)) { em->len += merge->len; - em->block_len += merge->len; + em->block_len += merge->block_len; rb_erase(&merge->rb_node, &tree->map); - merge->in_tree = 0; + RB_CLEAR_NODE(&merge->rb_node); + em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; + em->generation = max(em->generation, merge->generation); free_extent_map(merge); } +} + +/** + * unpin_extent_cache - unpin an extent from the cache + * @tree: tree to unpin the extent in + * @start: logical offset in the file + * @len: length of the extent + * @gen: generation that this extent has been modified in + * + * Called after an extent has been written to disk properly. Set the generation + * to the generation that actually added the file item to the inode so we know + * we need to sync this extent when we call fsync(). + */ +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, + u64 gen) +{ + int ret = 0; + struct extent_map *em; + bool prealloc = false; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(!em || em->start != start); + + if (!em) + goto out; + + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_move(&em->list, &tree->modified_extents); + em->generation = gen; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->mod_start = em->start; + em->mod_len = em->len; + + if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { + prealloc = true; + clear_bit(EXTENT_FLAG_FILLING, &em->flags); + } + + try_merge_map(tree, em); + + if (prealloc) { + em->mod_start = em->start; + em->mod_len = em->len; + } free_extent_map(em); out: @@ -233,6 +313,27 @@ out: } +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + if (extent_map_in_tree(em)) + try_merge_map(tree, em); +} + +static inline void setup_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, + int modified) +{ + atomic_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + + if (modified) + list_move(&em->list, &tree->modified_extents); + else + try_merge_map(tree, em); +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in @@ -241,62 +342,49 @@ out: * Insert @em into @tree or perform a simple forward/backward merge with * existing mappings. The extent_map struct passed in will be inserted * into the tree directly, with an additional reference taken, or a - * reference dropped if the merge attempt was successfull. + * reference dropped if the merge attempt was successful. */ int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em) + struct extent_map *em, int modified) { int ret = 0; - struct extent_map *merge = NULL; - struct rb_node *rb; - struct extent_map *exist; - exist = lookup_extent_mapping(tree, em->start, em->len); - if (exist) { - free_extent_map(exist); - ret = -EEXIST; + ret = tree_insert(&tree->map, em); + if (ret) goto out; - } - rb = tree_insert(&tree->map, em->start, &em->rb_node); - if (rb) { - ret = -EEXIST; - goto out; - } - atomic_inc(&em->refs); - if (em->start != 0) { - rb = rb_prev(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(merge, em)) { - em->start = merge->start; - em->len += merge->len; - em->block_len += merge->block_len; - em->block_start = merge->block_start; - merge->in_tree = 0; - rb_erase(&merge->rb_node, &tree->map); - free_extent_map(merge); - } - } - rb = rb_next(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(em, merge)) { - em->len += merge->len; - em->block_len += merge->len; - rb_erase(&merge->rb_node, &tree->map); - merge->in_tree = 0; - free_extent_map(merge); - } + + setup_extent_mapping(tree, em, modified); out: return ret; } -/* simple helper to do math around the end of an extent, handling wrap */ -static u64 range_end(u64 start, u64 len) +static struct extent_map * +__lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) { - if (start + len < start) - return (u64)-1; - return start + len; + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node) { + if (prev) + rb_node = prev; + else if (next) + rb_node = next; + else + return NULL; + } + + em = rb_entry(rb_node, struct extent_map, rb_node); + + if (strict && !(end > em->start && start < extent_map_end(em))) + return NULL; + + atomic_inc(&em->refs); + return em; } /** @@ -313,42 +401,7 @@ static u64 range_end(u64 start, u64 len) struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - struct extent_map *em; - struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; - u64 end = range_end(start, len); - - rb_node = __tree_search(&tree->map, start, &prev, &next); - if (!rb_node && prev) { - em = rb_entry(prev, struct extent_map, rb_node); - if (end > em->start && start < extent_map_end(em)) - goto found; - } - if (!rb_node && next) { - em = rb_entry(next, struct extent_map, rb_node); - if (end > em->start && start < extent_map_end(em)) - goto found; - } - if (!rb_node) { - em = NULL; - goto out; - } - if (IS_ERR(rb_node)) { - em = ERR_PTR(PTR_ERR(rb_node)); - goto out; - } - em = rb_entry(rb_node, struct extent_map, rb_node); - if (end > em->start && start < extent_map_end(em)) - goto found; - - em = NULL; - goto out; - -found: - atomic_inc(&em->refs); -out: - return em; + return __lookup_extent_mapping(tree, start, len, 1); } /** @@ -365,38 +418,7 @@ out: struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - struct extent_map *em; - struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; - - rb_node = __tree_search(&tree->map, start, &prev, &next); - if (!rb_node && prev) { - em = rb_entry(prev, struct extent_map, rb_node); - goto found; - } - if (!rb_node && next) { - em = rb_entry(next, struct extent_map, rb_node); - goto found; - } - if (!rb_node) { - em = NULL; - goto out; - } - if (IS_ERR(rb_node)) { - em = ERR_PTR(PTR_ERR(rb_node)); - goto out; - } - em = rb_entry(rb_node, struct extent_map, rb_node); - goto found; - - em = NULL; - goto out; - -found: - atomic_inc(&em->refs); -out: - return em; + return __lookup_extent_mapping(tree, start, len, 0); } /** @@ -413,6 +435,23 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase(&em->rb_node, &tree->map); - em->in_tree = 0; + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_del_init(&em->list); + RB_CLEAR_NODE(&em->rb_node); return ret; } + +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) +{ + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + ASSERT(extent_map_in_tree(cur)); + if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + list_del_init(&cur->list); + rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map); + RB_CLEAR_NODE(&cur->rb_node); + + setup_extent_mapping(tree, new, modified); +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ab6d74b6e64..b2991fd8583 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,16 +3,19 @@ #include <linux/rbtree.h> -#define EXTENT_MAP_LAST_BYTE (u64)-4 -#define EXTENT_MAP_HOLE (u64)-3 -#define EXTENT_MAP_INLINE (u64)-2 -#define EXTENT_MAP_DELALLOC (u64)-1 +#define EXTENT_MAP_LAST_BYTE ((u64)-4) +#define EXTENT_MAP_HOLE ((u64)-3) +#define EXTENT_MAP_INLINE ((u64)-2) +#define EXTENT_MAP_DELALLOC ((u64)-1) /* bits for the flags field */ #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ #define EXTENT_FLAG_COMPRESSED 1 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ +#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ +#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ +#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ struct extent_map { struct rb_node rb_node; @@ -20,20 +23,32 @@ struct extent_map { /* all of these are in bytes */ u64 start; u64 len; + u64 mod_start; + u64 mod_len; u64 orig_start; + u64 orig_block_len; + u64 ram_bytes; u64 block_start; u64 block_len; + u64 generation; unsigned long flags; struct block_device *bdev; atomic_t refs; - int in_tree; + unsigned int compress_type; + struct list_head list; }; struct extent_map_tree { struct rb_root map; + struct list_head modified_extents; rwlock_t lock; }; +static inline int extent_map_in_tree(const struct extent_map *em) +{ + return !RB_EMPTY_NODE(&em->rb_node); +} + static inline u64 extent_map_end(struct extent_map *em) { if (em->start + em->len < em->start) @@ -48,18 +63,23 @@ static inline u64 extent_map_block_end(struct extent_map *em) return em->block_start + em->block_len; } -void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); +void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em); + struct extent_map *em, int modified); int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified); -struct extent_map *alloc_extent_map(gfp_t mask); +struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54a255065aa..f46cfe45d68 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -23,16 +23,19 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "volumes.h" #include "print-tree.h" -#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ +#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ size) - 1)) +#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ + PAGE_CACHE_SIZE)) + #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)->sectorsize - (r)->sectorsize) + sizeof(u32) * (r)->sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -48,7 +51,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; file_key.objectid = objectid; file_key.offset = pos; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); @@ -58,7 +62,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); + BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -79,10 +83,11 @@ out: return ret; } -struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, int cow) +static struct btrfs_csum_item * +btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) { int ret; struct btrfs_key file_key; @@ -90,8 +95,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -115,9 +119,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -130,7 +136,6 @@ fail: return ERR_PTR(ret); } - int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -148,36 +153,79 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } +static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) +{ + kfree(bio->csum_allocated); +} -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst) +static int __btrfs_lookup_bio_sums(struct btrfs_root *root, + struct inode *inode, struct bio *bio, + u64 logical_offset, u32 *dst, int dio) { - u32 sum; struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; - u64 offset; + struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_path *path; + u8 *csum; + u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); - int ret; - struct btrfs_path *path; - struct btrfs_csum_item *item = NULL; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int nblocks; + int bio_index = 0; + int count; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); - if (bio->bi_size > PAGE_CACHE_SIZE * 8) + if (!path) + return -ENOMEM; + + nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; + if (!dst) { + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, + GFP_NOFS); + if (!btrfs_bio->csum_allocated) { + btrfs_free_path(path); + return -ENOMEM; + } + btrfs_bio->csum = btrfs_bio->csum_allocated; + btrfs_bio->end_io = btrfs_io_bio_endio_readpage; + } else { + btrfs_bio->csum = btrfs_bio->csum_inline; + } + csum = btrfs_bio->csum; + } else { + csum = (u8 *)dst; + } + + if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; WARN_ON(bio->bi_vcnt <= 0); - disk_bytenr = (u64)bio->bi_sector << 9; + /* + * the free space stuff is only read when it hasn't been + * updated in the current transaction. So, we can safely + * read from the commit root and sidestep a nasty deadlock + * between reading the free space cache and updating the csum tree. + */ + if (btrfs_is_free_space_inode(inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } + + disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; + if (dio) + offset = logical_offset; while (bio_index < bio->bi_vcnt) { - offset = page_offset(bvec->bv_page) + bvec->bv_offset; - ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); - if (ret == 0) + if (!dio) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, + (u32 *)csum, nblocks); + if (count) goto found; if (!item || disk_bytenr < item_start_offset || @@ -186,27 +234,24 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, u32 item_size; if (item) - btrfs_release_path(root, path); + btrfs_release_path(path); item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { - ret = PTR_ERR(item); - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - sum = 0; + count = 1; + memset(csum, 0, csum_size); if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, offset + bvec->bv_len - 1, EXTENT_NODATASUM, GFP_NOFS); } else { - printk(KERN_INFO "btrfs no csum found " - "for inode %lu start %llu\n", - inode->i_ino, - (unsigned long long)offset); + btrfs_info(BTRFS_I(inode)->root->fs_info, + "no csum found for inode %llu start %llu", + btrfs_ino(inode), offset); } item = NULL; - btrfs_release_path(root, path); + btrfs_release_path(path); goto found; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, @@ -228,40 +273,74 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, diff = disk_bytenr - item_start_offset; diff = diff / root->sectorsize; diff = diff * csum_size; - - read_extent_buffer(path->nodes[0], &sum, + count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >> + inode->i_sb->s_blocksize_bits); + read_extent_buffer(path->nodes[0], csum, ((unsigned long)item) + diff, - csum_size); + csum_size * count); found: - if (dst) - *dst++ = sum; - else - set_state_private(io_tree, offset, sum); - disk_bytenr += bvec->bv_len; - bio_index++; - bvec++; + csum += count * csum_size; + nblocks -= count; + bio_index += count; + while (count--) { + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } } btrfs_free_path(path); return 0; } +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); +} + +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct btrfs_dio_private *dip, struct bio *bio, + u64 offset) +{ + int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + int ret; + + len >>= inode->i_sb->s_blocksize_bits; + len *= csum_size; + + ret = __btrfs_lookup_bio_sums(root, inode, bio, offset, + (u32 *)(dip->csum + len), 1); + return ret; +} + int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list) + struct list_head *list, int search_commit) { struct btrfs_key key; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; + LIST_HEAD(tmplist); unsigned long offset; int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + + ASSERT(start == ALIGN(start, root->sectorsize) && + (end + 1) == ALIGN(end + 1, root->sectorsize)); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + + if (search_commit) { + path->skip_locking = 1; + path->reada = 2; + path->search_commit_root = 1; + } key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.offset = start; @@ -296,11 +375,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - key.type != BTRFS_EXTENT_CSUM_KEY) - break; - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.offset > end) + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) break; if (key.offset > start) @@ -318,37 +394,41 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start < csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); + MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); - BUG_ON(!sums); + GFP_NOFS); + if (!sums) { + ret = -ENOMEM; + goto fail; + } - sector_sum = sums->sums; sums->bytenr = start; - sums->len = size; + sums->len = (int)size; offset = (start - key.offset) >> root->fs_info->sb->s_blocksize_bits; offset *= csum_size; + size >>= root->fs_info->sb->s_blocksize_bits; - while (size > 0) { - read_extent_buffer(path->nodes[0], - §or_sum->sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum->bytenr = start; - - size -= root->sectorsize; - start += root->sectorsize; - offset += csum_size; - sector_sum++; - } - list_add_tail(&sums->list, list); + read_extent_buffer(path->nodes[0], + sums->sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root->sectorsize * size; + list_add_tail(&sums->list, &tmplist); } path->slots[0]++; } ret = 0; fail: + while (ret < 0 && !list_empty(&tmplist)) { + sums = list_entry(&tmplist, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + list_splice_tail(&tmplist, list); + btrfs_free_path(path); return ret; } @@ -357,24 +437,22 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio->bi_vcnt <= 0); - sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size), + GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums->sums; - disk_bytenr = (u64)bio->bi_sector << 9; - sums->len = bio->bi_size; + sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); if (contig) @@ -383,49 +461,48 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, offset = page_offset(bvec->bv_page) + bvec->bv_offset; ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); - sums->bytenr = ordered->start; + BUG_ON(!ordered); /* Logic error */ + sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; + index = 0; while (bio_index < bio->bi_vcnt) { if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - if (!contig && (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset)) { + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; this_sum_bytes = 0; btrfs_add_ordered_sum(inode, ordered, sums); btrfs_put_ordered_extent(ordered); - bytes_left = bio->bi_size - total_bytes; + bytes_left = bio->bi_iter.bi_size - total_bytes; sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); - BUG_ON(!sums); - sector_sum = sums->sums; + BUG_ON(!sums); /* -ENOMEM */ sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); - sums->bytenr = ordered->start; + BUG_ON(!ordered); /* Logic error */ + sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + + total_bytes; + index = 0; } - data = kmap_atomic(bvec->bv_page, KM_USER0); - sector_sum->sum = ~(u32)0; - sector_sum->sum = btrfs_csum_data(root, - data + bvec->bv_offset, - sector_sum->sum, - bvec->bv_len); - kunmap_atomic(data, KM_USER0); - btrfs_csum_final(sector_sum->sum, - (char *)§or_sum->sum); - sector_sum->bytenr = disk_bytenr; - - sector_sum++; + data = kmap_atomic(bvec->bv_page); + sums->sums[index] = ~(u32)0; + sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset, + sums->sums[index], + bvec->bv_len); + kunmap_atomic(data); + btrfs_csum_final(sums->sums[index], + (char *)(sums->sums + index)); + bio_index++; + index++; total_bytes += bvec->bv_len; this_sum_bytes += bvec->bv_len; - disk_bytenr += bvec->bv_len; offset += bvec->bv_len; bvec++; } @@ -446,19 +523,16 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, * This calls btrfs_truncate_item with the correct args based on the * overlap, and fixes up the key as required. */ -static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 bytenr, u64 len) +static noinline void truncate_one_csum(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; - int ret; leaf = path->nodes[0]; csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; @@ -474,8 +548,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - ret = btrfs_truncate_item(trans, root, path, new_size, 1); - BUG_ON(ret); + btrfs_truncate_item(root, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -487,16 +560,13 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - ret = btrfs_truncate_item(trans, root, path, new_size, 0); - BUG_ON(ret); + btrfs_truncate_item(root, path, new_size, 0); key->offset = end_byte; - ret = btrfs_set_item_key_safe(trans, root, path, key); - BUG_ON(ret); + btrfs_set_item_key_safe(root, path, key); } else { BUG(); } - return 0; } /* @@ -512,13 +582,14 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int blocksize_bits = root->fs_info->sb->s_blocksize_bits; root = root->fs_info->csum_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -529,9 +600,12 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { if (path->slots[0] == 0) - goto out; + break; path->slots[0]--; + } else if (ret < 0) { + break; } + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); @@ -554,7 +628,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, /* delete the entire item, it is inside our range */ if (key.offset >= bytenr && csum_end <= end_byte) { ret = btrfs_del_item(trans, root, path); - BUG_ON(ret); + if (ret) + goto out; if (key.offset == bytenr) break; } else if (key.offset < bytenr && csum_end > end_byte) { @@ -596,67 +671,71 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, * item changed size or key */ ret = btrfs_split_item(trans, root, path, &key, offset); - BUG_ON(ret && ret != -EAGAIN); + if (ret && ret != -EAGAIN) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } key.offset = end_byte - 1; } else { - ret = truncate_one_csum(trans, root, path, - &key, bytenr, len); - BUG_ON(ret); + truncate_one_csum(root, path, &key, bytenr, len); if (key.offset < bytenr) break; } - btrfs_release_path(root, path); + btrfs_release_path(path); } + ret = 0; out: btrfs_free_path(path); - return 0; + return ret; } int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) { - u64 bytenr; - int ret; struct btrfs_key file_key; struct btrfs_key found_key; - u64 next_offset; - u64 total_bytes = 0; - int found_next; struct btrfs_path *path; struct btrfs_csum_item *item; struct btrfs_csum_item *item_end; struct extent_buffer *leaf = NULL; + u64 next_offset; + u64 total_bytes = 0; u64 csum_offset; - struct btrfs_sector_sum *sector_sum; + u64 bytenr; u32 nritems; u32 ins_size; - char *eb_map; - char *eb_token; - unsigned long map_len; - unsigned long map_start; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + int index = 0; + int found_next; + int ret; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); - BUG_ON(!path); - sector_sum = sums->sums; + if (!path) + return -ENOMEM; again: next_offset = (u64)-1; found_next = 0; + bytenr = sums->bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = sector_sum->bytenr; - bytenr = sector_sum->bytenr; + file_key.offset = bytenr; btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); - item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { - leaf = path->nodes[0]; ret = 0; + leaf = path->nodes[0]; + item_end = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); goto found; } ret = PTR_ERR(item); + if (ret != -EFBIG && ret != -ENOENT) + goto fail_unlock; + if (ret == -EFBIG) { u32 item_size; /* we found one, but it isn't big enough yet */ @@ -671,7 +750,7 @@ again: int slot = path->slots[0] + 1; /* we didn't find a csum item, insert one */ nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems - 1) { + if (!nritems || (path->slots[0] >= nritems - 1)) { ret = btrfs_next_leaf(root, path); if (ret == 1) found_next = 1; @@ -694,7 +773,7 @@ again: * at this point, we know the tree has an item, but it isn't big * enough yet to put our csum in. Grow it */ - btrfs_release_path(root, path); + btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &file_key, path, csum_size, 1); if (ret < 0) @@ -717,43 +796,48 @@ again: goto insert; } - if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / csum_size) { - u32 diff = (csum_offset + 1) * csum_size; + int extend_nr; + u64 tmp; + u32 diff; + u32 free_space; - /* - * is the item big enough already? we dropped our lock - * before and need to recheck - */ - if (diff < btrfs_item_size_nr(leaf, path->slots[0])) - goto csum; + if (btrfs_leaf_free_space(root, leaf) < + sizeof(struct btrfs_item) + csum_size * 2) + goto insert; + + free_space = btrfs_leaf_free_space(root, leaf) - + sizeof(struct btrfs_item) - csum_size; + tmp = sums->len - total_bytes; + tmp >>= root->fs_info->sb->s_blocksize_bits; + WARN_ON(tmp < 1); + + extend_nr = max_t(int, 1, (int)tmp); + diff = (csum_offset + extend_nr) * csum_size; + diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size); diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); - if (diff != csum_size) - goto insert; + diff = min(free_space, diff); + diff /= csum_size; + diff *= csum_size; - ret = btrfs_extend_item(trans, root, path, diff); - BUG_ON(ret); + btrfs_extend_item(root, path, diff); + ret = 0; goto csum; } insert: - btrfs_release_path(root, path); + btrfs_release_path(path); csum_offset = 0; if (found_next) { - u64 tmp = total_bytes + root->sectorsize; - u64 next_sector = sector_sum->bytenr; - struct btrfs_sector_sum *next = sector_sum + 1; + u64 tmp; - while (tmp < sums->len) { - if (next_sector + root->sectorsize != next->bytenr) - break; - tmp += root->sectorsize; - next_sector = next->bytenr; - next++; - } - tmp = min(tmp, next_offset - file_key.offset); + tmp = sums->len - total_bytes; tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = min(tmp, (next_offset - file_key.offset) >> + root->fs_info->sb->s_blocksize_bits); + tmp = max((u64)1, tmp); tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); ins_size = csum_size * tmp; @@ -766,63 +850,31 @@ insert: path->leave_spinning = 0; if (ret < 0) goto fail_unlock; - if (ret != 0) { - WARN_ON(1); + if (WARN_ON(ret != 0)) goto fail_unlock; - } -csum: leaf = path->nodes[0]; +csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - ret = 0; + item_end = (struct btrfs_csum_item *)((unsigned char *)item + + btrfs_item_size_nr(leaf, path->slots[0])); item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: - item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + - btrfs_item_size_nr(leaf, path->slots[0])); - eb_token = NULL; -next_sector: - - if (!eb_token || - (unsigned long)item + csum_size >= map_start + map_len) { - int err; - - if (eb_token) - unmap_extent_buffer(leaf, eb_token, KM_USER1); - eb_token = NULL; - err = map_private_extent_buffer(leaf, (unsigned long)item, - csum_size, - &eb_token, &eb_map, - &map_start, &map_len, KM_USER1); - if (err) - eb_token = NULL; - } - if (eb_token) { - memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), - §or_sum->sum, csum_size); - } else { - write_extent_buffer(leaf, §or_sum->sum, - (unsigned long)item, csum_size); - } + ins_size = (u32)(sums->len - total_bytes) >> + root->fs_info->sb->s_blocksize_bits; + ins_size *= csum_size; + ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, + ins_size); + write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, + ins_size); + + ins_size /= csum_size; + total_bytes += ins_size * root->sectorsize; + index += ins_size; - total_bytes += root->sectorsize; - sector_sum++; - if (total_bytes < sums->len) { - item = (struct btrfs_csum_item *)((char *)item + - csum_size); - if (item < item_end && bytenr + PAGE_CACHE_SIZE == - sector_sum->bytenr) { - bytenr = sector_sum->bytenr; - goto next_sector; - } - } - if (eb_token) { - unmap_extent_buffer(leaf, eb_token, KM_USER1); - eb_token = NULL; - } btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { - btrfs_release_path(root, path); + btrfs_release_path(path); cond_resched(); goto again; } @@ -833,3 +885,79 @@ out: fail_unlock: goto out; } + +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + int compress_type = btrfs_file_extent_compression(leaf, fi); + + em->bdev = root->fs_info->fs_devices->latest_bdev; + btrfs_item_key_to_cpu(leaf, &key, slot); + extent_start = key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, slot, fi); + extent_end = ALIGN(extent_start + size, root->sectorsize); + } + + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + return; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + em->block_start = bytenr; + em->block_len = em->orig_block_len; + } else { + bytenr += btrfs_file_extent_offset(leaf, fi); + em->block_start = bytenr; + em->block_len = em->len; + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + em->block_start = EXTENT_MAP_INLINE; + em->start = extent_start; + em->len = extent_end - extent_start; + /* + * Initialize orig_start and block_len with the same values + * as in inode.c:btrfs_get_extent(). + */ + em->orig_start = EXTENT_MAP_HOLE; + em->block_len = (u64)-1; + if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } + } else { + btrfs_err(root->fs_info, + "unknown file extent item type %d, inode %llu, offset %llu, root %llu", + type, btrfs_ino(inode), extent_start, + root->root_key.objectid); + } +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 29ff749ff4c..1f2b99cb55e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,72 +24,459 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/aio.h> +#include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> #include <linux/slab.h> +#include <linux/btrfs.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "compat.h" +#include "volumes.h" +#include "qgroup.h" +static struct kmem_cache *btrfs_inode_defrag_cachep; +/* + * when auto defrag is enabled we + * queue up these defrag structs to remember which + * inodes need defragging passes + */ +struct inode_defrag { + struct rb_node rb_node; + /* objectid */ + u64 ino; + /* + * transid where the defrag was added, we search for + * extents newer than this + */ + u64 transid; + + /* root objectid */ + u64 root; + + /* last offset we were able to defrag */ + u64 last_offset; + + /* if we've wrapped around back to zero once already */ + int cycled; +}; + +static int __compare_inode_defrag(struct inode_defrag *defrag1, + struct inode_defrag *defrag2) +{ + if (defrag1->root > defrag2->root) + return 1; + else if (defrag1->root < defrag2->root) + return -1; + else if (defrag1->ino > defrag2->ino) + return 1; + else if (defrag1->ino < defrag2->ino) + return -1; + else + return 0; +} + +/* pop a record for an inode into the defrag tree. The lock + * must be held already + * + * If you're inserting a record for an older transid than an + * existing record, the transid already in the tree is lowered + * + * If an existing record is found the defrag item you + * pass in is freed + */ +static int __btrfs_add_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode_defrag *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + int ret; + + p = &root->fs_info->defrag_inodes.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(defrag, entry); + if (ret < 0) + p = &parent->rb_left; + else if (ret > 0) + p = &parent->rb_right; + else { + /* if we're reinserting an entry for + * an old defrag run, make sure to + * lower the transid of our existing record + */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + if (defrag->last_offset > entry->last_offset) + entry->last_offset = defrag->last_offset; + return -EEXIST; + } + } + set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + rb_link_node(&defrag->rb_node, parent, p); + rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); + return 0; +} + +static inline int __need_auto_defrag(struct btrfs_root *root) +{ + if (!btrfs_test_opt(root, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(root->fs_info)) + return 0; + + return 1; +} + +/* + * insert a defrag record for this inode if auto defrag is + * enabled + */ +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode_defrag *defrag; + u64 transid; + int ret; + + if (!__need_auto_defrag(root)) + return 0; + + if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) + return 0; + + if (trans) + transid = trans->transid; + else + transid = BTRFS_I(inode)->root->last_trans; + + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); + if (!defrag) + return -ENOMEM; + + defrag->ino = btrfs_ino(inode); + defrag->transid = transid; + defrag->root = root->root_key.objectid; + + spin_lock(&root->fs_info->defrag_inodes_lock); + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { + /* + * If we set IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, this new inode doesn't have + * IN_DEFRAG flag. At the case, we may find the existed defrag. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + spin_unlock(&root->fs_info->defrag_inodes_lock); + return 0; +} + +/* + * Requeue the defrag object. If there is a defrag object that points to + * the same inode in the tree, we will merge them together (by + * __btrfs_add_inode_defrag()) and free the one that we want to requeue. + */ +static void btrfs_requeue_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!__need_auto_defrag(root)) + goto out; + + /* + * Here we don't check the IN_DEFRAG flag, because we need merge + * them together. + */ + spin_lock(&root->fs_info->defrag_inodes_lock); + ret = __btrfs_add_inode_defrag(inode, defrag); + spin_unlock(&root->fs_info->defrag_inodes_lock); + if (ret) + goto out; + return; +out: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +} + +/* + * pick the defragable inode that we want, if it doesn't exist, we will get + * the next one. + */ +static struct inode_defrag * +btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) +{ + struct inode_defrag *entry = NULL; + struct inode_defrag tmp; + struct rb_node *p; + struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; + + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; + while (p) { + parent = p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) + p = parent->rb_left; + else if (ret > 0) + p = parent->rb_right; + else + goto out; + } + + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) + entry = rb_entry(parent, struct inode_defrag, rb_node); + else + entry = NULL; + } +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; +} + +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + if (need_resched()) { + spin_unlock(&fs_info->defrag_inodes_lock); + cond_resched(); + spin_lock(&fs_info->defrag_inodes_lock); + } + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ + struct btrfs_root *inode_root; + struct inode *inode; + struct btrfs_key key; + struct btrfs_ioctl_defrag_range_args range; + int num_defrag; + int index; + int ret; + + /* get the inode */ + key.objectid = defrag->root; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(inode_root)) { + ret = PTR_ERR(inode_root); + goto cleanup; + } + + key.objectid = defrag->ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto cleanup; + } + srcu_read_unlock(&fs_info->subvol_srcu, index); + + /* do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + memset(&range, 0, sizeof(range)); + range.len = (u64)-1; + range.start = defrag->last_offset; + + sb_start_write(fs_info->sb); + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); + /* + * if we filled the whole defrag batch, there + * must be more work to do. Queue this defrag + * again + */ + if (num_defrag == BTRFS_DEFRAG_BATCH) { + defrag->last_offset = range.start; + btrfs_requeue_inode_defrag(inode, defrag); + } else if (defrag->last_offset && !defrag->cycled) { + /* + * we didn't fill our defrag batch, but + * we didn't start at zero. Make sure we loop + * around to the start of the file. + */ + defrag->last_offset = 0; + defrag->cycled = 1; + btrfs_requeue_inode_defrag(inode, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + + iput(inode); + return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; +} + +/* + * run through the list of inodes in the FS that need + * defragging + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; + + atomic_inc(&fs_info->defrag_running); + while (1) { + /* Pause the auto defragger. */ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, + &fs_info->fs_state)) + break; + + if (!__need_auto_defrag(fs_info->tree_root)) + break; + + /* find an inode to defrag */ + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, + first_ino); + if (!defrag) { + if (root_objectid || first_ino) { + root_objectid = 0; + first_ino = 0; + continue; + } else { + break; + } + } + + first_ino = defrag->ino + 1; + root_objectid = defrag->root; + + __btrfs_run_defrag_inode(fs_info, defrag); + } + atomic_dec(&fs_info->defrag_running); + + /* + * during unmount, we use the transaction_wait queue to + * wait for the defragger to stop + */ + wake_up(&fs_info->transaction_wait); + return 0; +} /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - int write_bytes, + size_t write_bytes, struct page **prepared_pages, - const char __user *buf) + struct iov_iter *i) { - long page_fault = 0; - int i; + size_t copied = 0; + size_t total_copied = 0; + int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); - for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[i]; - fault_in_pages_readable(buf, count); + struct page *page = prepared_pages[pg]; + /* + * Copy data from userspace to the current page + */ + copied = iov_iter_copy_from_user_atomic(page, i, offset, count); - /* Copy data from userspace to the current page */ - kmap(page); - page_fault = __copy_from_user(page_address(page) + offset, - buf, count); /* Flush processor's dcache for this page */ flush_dcache_page(page); - kunmap(page); - buf += count; - write_bytes -= count; - if (page_fault) + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. + */ + if (!PageUptodate(page) && copied < count) + copied = 0; + + iov_iter_advance(i, copied); + write_bytes -= copied; + total_copied += copied; + + /* Return to btrfs_file_write_iter to fault page */ + if (unlikely(copied == 0)) break; + + if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + offset += copied; + } else { + pg++; + offset = 0; + } } - return page_fault ? -EFAULT : 0; + return total_copied; } /* * unlocks pages after btrfs_file_write is done with them */ -static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) +static void btrfs_drop_pages(struct page **pages, size_t num_pages) { size_t i; for (i = 0; i < num_pages; i++) { - if (!pages[i]) - break; /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty - * clear it here + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - mark_page_accessed(pages[i]); page_cache_release(pages[i]); } } @@ -102,17 +489,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) * this also makes the decision about creating an inline extent vs * doing real data extents, marking pages dirty and delalloc as required. */ -static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct file *file, - struct page **pages, - size_t num_pages, - loff_t pos, - size_t write_bytes) +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached) { int err = 0; int i; - struct inode *inode = fdentry(file)->d_inode; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -120,12 +503,11 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, loff_t isize = i_size_read(inode); start_pos = pos & ~((u64)root->sectorsize - 1); - num_bytes = (write_bytes + pos - start_pos + - root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - NULL); + cached); if (err) return err; @@ -135,32 +517,35 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, ClearPageChecked(p); set_page_dirty(p); } - if (end_pos > isize) { + + /* + * we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ + if (end_pos > isize) i_size_write(inode, end_pos); - /* we've only changed i_size in ram, and we haven't updated - * the disk i_size. There is no need to log the inode - * at this time. - */ - } - return err; + return 0; } /* * this drops all the extents in the cache that intersect the range * [start, end]. Existing extents are split as required. */ -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned) +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) { struct extent_map *em; struct extent_map *split = NULL; struct extent_map *split2 = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 len = end - start + 1; + u64 gen; int ret; int testend = 1; unsigned long flags; int compressed = 0; + bool modified; WARN_ON(end < start); if (end == (u64)-1) { @@ -168,10 +553,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, testend = 0; } while (1) { + int no_splits = 0; + + modified = false; if (!split) - split = alloc_extent_map(GFP_NOFS); + split = alloc_extent_map(); if (!split2) - split2 = alloc_extent_map(GFP_NOFS); + split2 = alloc_extent_map(); + if (!split || !split2) + no_splits = 1; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -180,6 +570,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, break; } flags = em->flags; + gen = em->generation; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { if (testend && em->start + em->len >= start + len) { free_extent_map(em); @@ -195,52 +586,90 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); - remove_extent_mapping(em_tree, em); + clear_bit(EXTENT_FLAG_LOGGING, &flags); + modified = !list_empty(&em->list); + if (no_splits) + goto next; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - em->start < start) { + if (em->start < start) { split->start = em->start; split->len = start - em->start; - split->orig_start = em->orig_start; - split->block_start = em->block_start; - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + + split->generation = gen; split->bdev = em->bdev; split->flags = flags; - ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); + split->compress_type = em->compress_type; + replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; } - if (em->block_start < EXTENT_MAP_LAST_BYTE && - testend && em->start + em->len > start + len) { + if (testend && em->start + em->len > start + len) { u64 diff = start + len - em->start; split->start = start + len; split->len = em->start + em->len - (start + len); split->bdev = em->bdev; split->flags = flags; - - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; + split->compress_type = em->compress_type; + split->generation = gen; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, + em->orig_block_len); + + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + + diff; + split->orig_start = em->orig_start; + } } else { - split->block_len = split->len; - split->block_start = em->block_start + diff; + split->ram_bytes = split->len; split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; } - ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + ret = add_extent_mapping(em_tree, split, + modified); + ASSERT(ret == 0); /* Logic error */ + } free_extent_map(split); split = NULL; } +next: + if (extent_map_in_tree(em)) + remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); /* once for us */ @@ -252,7 +681,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(split); if (split2) free_extent_map(split2); - return 0; } /* @@ -264,15 +692,19 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. */ -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache) +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted) { - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; - struct btrfs_path *path; struct btrfs_key key; struct btrfs_key new_key; + u64 ino = btrfs_ino(inode); u64 search_start = start; u64 disk_bytenr = 0; u64 num_bytes = 0; @@ -283,28 +715,34 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, int extent_type; int recow; int ret; + int modify_tree = -1; + int update_refs; + int found = 0; + int leafs_visited = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) + modify_tree = 0; + update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root); while (1) { recow = 0; - ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, - search_start, -1); + ret = btrfs_lookup_file_extent(trans, root, path, ino, + search_start, modify_tree); if (ret < 0) break; if (ret > 0 && path->slots[0] > 0 && search_start == start) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); - if (key.objectid == inode->i_ino && + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } ret = 0; + leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -316,12 +754,13 @@ next_slot: ret = 0; break; } + leafs_visited++; leaf = path->nodes[0]; recow = 1; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid > inode->i_ino || + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) break; @@ -338,20 +777,35 @@ next_slot: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); } else { WARN_ON(1); extent_end = search_start; } + /* + * Don't skip extent items representing 0 byte lengths. They + * used to be created (bug) if while punching holes we hit + * -ENOSPC condition. So if we find one here, just ensure we + * delete it, otherwise we would insert a new file extent item + * with the same key (offset) as that 0 bytes length file + * extent item in the call to setup_items_for_insert() later + * in this function. + */ + if (extent_end == key.offset && extent_end >= search_start) + goto delete_extent_item; + if (extent_end <= search_start) { path->slots[0]++; goto next_slot; } + found = 1; search_start = max(key.offset, start); - if (recow) { - btrfs_release_path(root, path); + if (recow || !modify_tree) { + modify_tree = -1; + btrfs_release_path(path); continue; } @@ -361,14 +815,17 @@ next_slot: */ if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = start; ret = btrfs_duplicate_item(trans, root, path, &new_key); if (ret == -EAGAIN) { - btrfs_release_path(root, path); + btrfs_release_path(path); continue; } if (ret < 0) @@ -389,14 +846,13 @@ next_slot: extent_end - start); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) { ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset); - BUG_ON(ret); - *hint_byte = disk_bytenr; + start - extent_offset, 1); + BUG_ON(ret); /* -ENOMEM */ } key.offset = start; } @@ -405,21 +861,22 @@ next_slot: * | -------- extent -------- | */ if (start <= key.offset && end < extent_end) { - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; - btrfs_set_item_key_safe(trans, root, path, &new_key); + btrfs_set_item_key_safe(root, path, &new_key); extent_offset += end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) inode_sub_bytes(inode, end - key.offset); - *hint_byte = disk_bytenr; - } break; } @@ -430,15 +887,16 @@ next_slot: */ if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) inode_sub_bytes(inode, extent_end - start); - *hint_byte = disk_bytenr; - } if (end == extent_end) break; @@ -451,6 +909,7 @@ next_slot: * | ------ extent ------ | */ if (start <= key.offset && end >= extent_end) { +delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; del_nr = 1; @@ -459,21 +918,21 @@ next_slot: del_nr++; } - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + if (update_refs && + extent_type == BTRFS_FILE_EXTENT_INLINE) { inode_sub_bytes(inode, extent_end - key.offset); extent_end = ALIGN(extent_end, root->sectorsize); - } else if (disk_bytenr > 0) { + } else if (update_refs && disk_bytenr > 0) { ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset); - BUG_ON(ret); + extent_offset, 0); + BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); - *hint_byte = disk_bytenr; } if (end == extent_end) @@ -486,23 +945,83 @@ next_slot: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + break; + } del_nr = 0; del_slot = 0; - btrfs_release_path(root, path); + btrfs_release_path(path); continue; } BUG_ON(1); } - if (del_nr > 0) { + if (!ret && del_nr > 0) { + /* + * Set path->slots[0] to first slot, so that after the delete + * if items are move off from our leaf to its immediate left or + * right neighbor leafs, we end up with a correct and adjusted + * path->slots[0] for our insertion (if replace_extent != 0). + */ + path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); } + leaf = path->nodes[0]; + /* + * If btrfs_del_items() was called, it might have deleted a leaf, in + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. + */ + if (!ret && replace_extent && leafs_visited == 1 && + (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || + path->locks[0] == BTRFS_WRITE_LOCK) && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { + struct btrfs_key slot_key; + + btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); + if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) + path->slots[0]++; + } + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; + } + + if (!replace_extent || !(*key_inserted)) + btrfs_release_path(path); + if (drop_end) + *drop_end = found ? min(end, extent_end) : end; + return ret; +} + +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, int drop_cache) +{ + struct btrfs_path *path; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, + drop_cache, 0, 0, NULL); btrfs_free_path(path); return ret; } @@ -567,26 +1086,27 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int del_slot = 0; int recow; int ret; - - btrfs_drop_extent_cache(inode, start, end - 1, 0); + u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; again: recow = 0; split = start; - key.objectid = inode->i_ino; + key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = split; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - BUG_ON(key.objectid != inode->i_ino || - key.type != BTRFS_EXTENT_DATA_KEY); + BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); BUG_ON(btrfs_file_extent_type(leaf, fi) != @@ -603,18 +1123,22 @@ again: other_start = 0; other_end = start; if (extent_mergeable(leaf, path->slots[0] - 1, - inode->i_ino, bytenr, orig_offset, + ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(trans, root, path, &new_key); + btrfs_set_item_key_safe(root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_set_file_extent_offset(leaf, fi, end - orig_offset); fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); btrfs_mark_buffer_dirty(leaf); @@ -626,18 +1150,22 @@ again: other_start = end; other_end = 0; if (extent_mergeable(leaf, path->slots[0] + 1, - inode->i_ino, bytenr, orig_offset, + ino, bytenr, orig_offset, &other_start, &other_end)) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(trans, root, path, &new_key); + btrfs_set_item_key_safe(root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - start); btrfs_set_file_extent_offset(leaf, fi, @@ -654,20 +1182,25 @@ again: new_key.offset = split; ret = btrfs_duplicate_item(trans, root, path, &new_key); if (ret == -EAGAIN) { - btrfs_release_path(root, path); + btrfs_release_path(path); goto again; } - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); @@ -675,8 +1208,8 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); + ino, orig_offset, 1); + BUG_ON(ret); /* -ENOMEM */ if (split == start) { key.offset = start; @@ -691,10 +1224,10 @@ again: other_start = end; other_end = 0; if (extent_mergeable(leaf, path->slots[0] + 1, - inode->i_ino, bytenr, orig_offset, + ino, bytenr, orig_offset, &other_start, &other_end)) { if (recow) { - btrfs_release_path(root, path); + btrfs_release_path(path); goto again; } extent_end = other_end; @@ -702,16 +1235,16 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); + ino, orig_offset, 0); + BUG_ON(ret); /* -ENOMEM */ } other_start = 0; other_end = start; if (extent_mergeable(leaf, path->slots[0] - 1, - inode->i_ino, bytenr, orig_offset, + ino, bytenr, orig_offset, &other_start, &other_end)) { if (recow) { - btrfs_release_path(root, path); + btrfs_release_path(path); goto again; } key.offset = other_start; @@ -719,26 +1252,31 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); + ino, orig_offset, 0); + BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); btrfs_mark_buffer_dirty(leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } out: btrfs_free_path(path); @@ -746,245 +1284,526 @@ out: } /* - * this gets pages into the page cache and locks them down, it also properly - * waits for data=ordered extents to finish before allowing the pages to be - * modified. + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 */ -static noinline int prepare_pages(struct btrfs_root *root, struct file *file, - struct page **pages, size_t num_pages, - loff_t pos, unsigned long first_index, - unsigned long last_index, size_t write_bytes) +static int prepare_uptodate_page(struct page *page, u64 pos, + bool force_uptodate) +{ + int ret = 0; + + if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && + !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + +/* + * this just gets pages into the page cache and locks them down. + */ +static noinline int prepare_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, bool force_uptodate) { - struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = fdentry(file)->d_inode; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; - u64 start_pos; - u64 last_pos; + int faili; - start_pos = pos & ~((u64)root->sectorsize - 1); - last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; - - if (start_pos > inode->i_size) { - err = btrfs_cont_expand(inode, start_pos); - if (err) - return err; - } - - memset(pages, 0, num_pages * sizeof(struct page *)); -again: for (i = 0; i < num_pages; i++) { - pages[i] = grab_cache_page(inode->i_mapping, index + i); + pages[i] = find_or_create_page(inode->i_mapping, index + i, + mask | __GFP_WRITE); if (!pages[i]) { + faili = i - 1; err = -ENOMEM; - BUG_ON(1); + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos, + force_uptodate); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes, false); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; } wait_on_page_writeback(pages[i]); } + + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +/* + * This function locks the extent and properly waits for data=ordered extents + * to finish before allowing the pages to be modified if need. + * + * The return value: + * 1 - the extent is locked + * 0 - the extent is not locked, and everything is OK + * -EAGAIN - need re-prepare the pages + * the other < 0 number - Something wrong happens + */ +static noinline int +lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + u64 *lockstart, u64 *lockend, + struct extent_state **cached_state) +{ + u64 start_pos; + u64 last_pos; + int i; + int ret = 0; + + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); + last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; + if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state, - GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - last_pos - 1); + start_pos, last_pos, 0, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start_pos, + last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->len > start_pos && - ordered->file_offset < last_pos) { - btrfs_put_ordered_extent(ordered); + ordered->file_offset <= last_pos) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, - &cached_state, GFP_NOFS); + start_pos, last_pos, + cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos); - goto again; + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, - GFP_NOFS); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, &cached_state, - GFP_NOFS); + last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached_state, GFP_NOFS); + *lockstart = start_pos; + *lockend = last_pos; + ret = 1; } + for (i = 0; i < num_pages; i++) { - clear_page_dirty_for_io(pages[i]); + if (clear_page_dirty_for_io(pages[i])) + account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } - return 0; + + return ret; } -static ssize_t btrfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static noinline int check_can_nocow(struct inode *inode, loff_t pos, + size_t *write_bytes) { - loff_t pos; - loff_t start_pos; - ssize_t num_written = 0; - ssize_t err = 0; - int ret = 0; - struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct page **pages = NULL; - int nrptrs; - struct page *pinned[2]; - unsigned long first_index; - unsigned long last_index; - int will_write; + struct btrfs_ordered_extent *ordered; + u64 lockstart, lockend; + u64 num_bytes; + int ret; - will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || - (file->f_flags & O_DIRECT)); + ret = btrfs_start_nocow_write(root); + if (!ret) + return -ENOSPC; - nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, - PAGE_CACHE_SIZE / (sizeof(struct page *))); - pinned[0] = NULL; - pinned[1] = NULL; + lockstart = round_down(pos, root->sectorsize); + lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; - pos = *ppos; - start_pos = pos; + while (1) { + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) { + break; + } + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + num_bytes = lockend - lockstart + 1; + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); + if (ret <= 0) { + ret = 0; + btrfs_end_nocow_write(root); + } else { + *write_bytes = min_t(size_t, *write_bytes , + num_bytes - pos + lockstart); + } - /* do the reserve before the mutex lock in case we have to do some - * flushing. We wouldn't deadlock, but this is more polite. - */ - err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (err) - goto out_nolock; + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); - mutex_lock(&inode->i_mutex); + return ret; +} - current->backing_dev_info = inode->i_mapping->backing_dev_info; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) - goto out; +static noinline ssize_t __btrfs_buffered_write(struct file *file, + struct iov_iter *i, + loff_t pos) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page **pages = NULL; + struct extent_state *cached_state = NULL; + u64 release_bytes = 0; + u64 lockstart; + u64 lockend; + unsigned long first_index; + size_t num_written = 0; + int nrptrs; + int ret = 0; + bool only_release_metadata = false; + bool force_page_uptodate = false; + bool need_unlock; + + nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / + (sizeof(struct page *))); + nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); + nrptrs = max(nrptrs, 8); + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; - if (count == 0) - goto out; + first_index = pos >> PAGE_CACHE_SHIFT; - err = file_remove_suid(file); - if (err) - goto out; + while (iov_iter_count(i) > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(iov_iter_count(i), + nrptrs * (size_t)PAGE_CACHE_SIZE - + offset); + size_t num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t reserve_bytes; + size_t dirty_pages; + size_t copied; - file_update_time(file); + WARN_ON(num_pages > nrptrs); - pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + /* + * Fault pages before locking them in prepare_pages + * to avoid recursive lock + */ + if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { + ret = -EFAULT; + break; + } - /* generic_write_checks can change our pos */ - start_pos = pos; + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = btrfs_check_data_free_space(inode, reserve_bytes); + if (ret == -ENOSPC && + (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC))) { + ret = check_can_nocow(inode, pos, &write_bytes); + if (ret > 0) { + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = 0; + } else { + ret = -ENOSPC; + } + } - BTRFS_I(inode)->sequence++; - first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + count) >> PAGE_CACHE_SHIFT; + if (ret) + break; - /* - * there are lots of better ways to do this, but this code - * makes sure the first and last page in the file range are - * up to date and ready for cow - */ - if ((pos & (PAGE_CACHE_SIZE - 1))) { - pinned[0] = grab_cache_page(inode->i_mapping, first_index); - if (!PageUptodate(pinned[0])) { - ret = btrfs_readpage(NULL, pinned[0]); - BUG_ON(ret); - wait_on_page_locked(pinned[0]); - } else { - unlock_page(pinned[0]); + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); + if (ret) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, + reserve_bytes); + else + btrfs_end_nocow_write(root); + break; } - } - if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { - pinned[1] = grab_cache_page(inode->i_mapping, last_index); - if (!PageUptodate(pinned[1])) { - ret = btrfs_readpage(NULL, pinned[1]); - BUG_ON(ret); - wait_on_page_locked(pinned[1]); - } else { - unlock_page(pinned[1]); + + release_bytes = reserve_bytes; + need_unlock = false; +again: + /* + * This is going to setup the pages array with the number of + * pages we want, so we don't really need to worry about the + * contents of pages from loop to loop + */ + ret = prepare_pages(inode, pages, num_pages, + pos, write_bytes, + force_page_uptodate); + if (ret) + break; + + ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, + pos, &lockstart, &lockend, + &cached_state); + if (ret < 0) { + if (ret == -EAGAIN) + goto again; + break; + } else if (ret > 0) { + need_unlock = true; + ret = 0; } - } - while (count > 0) { - size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(count, nrptrs * - (size_t)PAGE_CACHE_SIZE - - offset); - size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + copied = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, i); - WARN_ON(num_pages > nrptrs); - memset(pages, 0, sizeof(struct page *) * nrptrs); + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; - ret = btrfs_check_data_free_space(root, inode, write_bytes); - if (ret) - goto out; + if (copied == 0) { + force_page_uptodate = true; + dirty_pages = 0; + } else { + force_page_uptodate = false; + dirty_pages = (copied + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + } - ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, last_index, - write_bytes); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); - goto out; + /* + * If we had a short copy we need to release the excess delaloc + * bytes we reserved. We need to increment outstanding_extents + * because btrfs_delalloc_release_space will decrement it, but + * we still have an outstanding extent for the chunk we actually + * managed to copy. + */ + if (num_pages > dirty_pages) { + release_bytes = (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT; + if (copied > 0) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, + release_bytes); + else + btrfs_delalloc_release_space(inode, + release_bytes); } - ret = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, buf); + release_bytes = dirty_pages << PAGE_CACHE_SHIFT; + + if (copied > 0) + ret = btrfs_dirty_pages(root, inode, pages, + dirty_pages, pos, copied, + NULL); + if (need_unlock) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state, + GFP_NOFS); if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); btrfs_drop_pages(pages, num_pages); - goto out; + break; } - ret = dirty_and_release_pages(NULL, root, file, pages, - num_pages, pos, write_bytes); - btrfs_drop_pages(pages, num_pages); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); - goto out; + release_bytes = 0; + if (only_release_metadata) + btrfs_end_nocow_write(root); + + if (only_release_metadata && copied > 0) { + u64 lockstart = round_down(pos, root->sectorsize); + u64 lockend = lockstart + + (dirty_pages << PAGE_CACHE_SHIFT) - 1; + + set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_NORESERVE, NULL, + NULL, GFP_NOFS); + only_release_metadata = false; } - if (will_write) { - filemap_fdatawrite_range(inode->i_mapping, pos, - pos + write_bytes - 1); + btrfs_drop_pages(pages, num_pages); + + cond_resched(); + + balance_dirty_pages_ratelimited(inode->i_mapping); + if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root); + + pos += copied; + num_written += copied; + } + + kfree(pages); + + if (release_bytes) { + if (only_release_metadata) { + btrfs_end_nocow_write(root); + btrfs_delalloc_release_metadata(inode, release_bytes); } else { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - num_pages); - if (num_pages < - (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); - btrfs_throttle(root); + btrfs_delalloc_release_space(inode, release_bytes); } + } - buf += write_bytes; - count -= write_bytes; - pos += write_bytes; - num_written += write_bytes; + return num_written ? num_written : ret; +} - cond_resched(); +static ssize_t __btrfs_direct_write(struct kiocb *iocb, + struct iov_iter *from, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t written; + ssize_t written_buffered; + loff_t endbyte; + int err; + + written = generic_file_direct_write(iocb, from, pos); + + if (written < 0 || !iov_iter_count(from)) + return written; + + pos += written; + written_buffered = __btrfs_buffered_write(file, from, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; } + endbyte = pos + written_buffered - 1; + err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + if (err) + goto out; + written += written_buffered; + iocb->ki_pos = pos + written_buffered; + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); out: - mutex_unlock(&inode->i_mutex); - if (ret) - err = ret; - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + return written ? written : err; +} -out_nolock: - kfree(pages); - if (pinned[0]) - page_cache_release(pinned[0]); - if (pinned[1]) - page_cache_release(pinned[1]); - *ppos = pos; +static void update_time_for_write(struct inode *inode) +{ + struct timespec now; + + if (IS_NOCMTIME(inode)) + return; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start_pos; + u64 end_pos; + ssize_t num_written = 0; + ssize_t err = 0; + size_t count = iov_iter_count(from); + bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + loff_t pos = iocb->ki_pos; + + mutex_lock(&inode->i_mutex); + + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + if (count == 0) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + iov_iter_truncate(from, count); + + err = file_remove_suid(file); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. + */ + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + mutex_unlock(&inode->i_mutex); + err = -EROFS; + goto out; + } + + /* + * We reserve space for updating the inode when we reserve space for the + * extent we are going to write, so we will enospc out there. We don't + * need to start yet another transaction to update the inode as we will + * update the inode when we finish writing whatever data we write. + */ + update_time_for_write(inode); + + start_pos = round_down(pos, root->sectorsize); + if (start_pos > i_size_read(inode)) { + /* Expand hole size to cover write data, preventing empty gap */ + end_pos = round_up(pos + count, root->sectorsize); + err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + + if (sync) + atomic_inc(&BTRFS_I(inode)->sync_writers); + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = __btrfs_direct_write(iocb, from, pos); + } else { + num_written = __btrfs_buffered_write(file, from, pos); + if (num_written > 0) + iocb->ki_pos = pos + num_written; + } + + mutex_unlock(&inode->i_mutex); /* * we want to make sure fsync finds this change @@ -997,38 +1816,22 @@ out_nolock: * this will either be one more than the running transaction * or the generation used for the next transaction if there isn't * one running right now. + * + * We also have to set last_sub_trans to the current log transid, + * otherwise subsequent syncs to a file that's been synced in this + * transaction will appear to have already occured. */ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; - - if (num_written > 0 && will_write) { - struct btrfs_trans_handle *trans; - - err = btrfs_wait_ordered_range(inode, start_pos, num_written); - if (err) + BTRFS_I(inode)->last_sub_trans = root->log_transid; + if (num_written > 0) { + err = generic_write_sync(file, pos, num_written); + if (err < 0) num_written = err; - - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - trans = btrfs_start_transaction(root, 1); - ret = btrfs_log_dentry_safe(trans, root, - file->f_dentry); - if (ret == 0) { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - btrfs_end_transaction(trans, root); - else - btrfs_commit_transaction(trans, root); - } else if (ret != BTRFS_NO_LOG_SYNC) { - btrfs_commit_transaction(trans, root); - } else { - btrfs_end_transaction(trans, root); - } - } - if (file->f_flags & O_DIRECT) { - invalidate_mapping_pages(inode->i_mapping, - start_pos >> PAGE_CACHE_SHIFT, - (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); - } } + + if (sync) + atomic_dec(&BTRFS_I(inode)->sync_writers); +out: current->backing_dev_info = NULL; return num_written ? num_written : err; } @@ -1041,9 +1844,22 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * flush down new bytes that may have been written if the * application were using truncate to replace a file in place. */ - if (BTRFS_I(inode)->ordered_data_close) { - BTRFS_I(inode)->ordered_data_close = 0; - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We need to block on a committing transaction to keep us from + * throwing a ordered operation on to the list and causing + * something like sync to deadlock trying to flush out this + * inode. + */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); + btrfs_end_transaction(trans, root); if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(inode->i_mapping); } @@ -1063,56 +1879,114 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * important optimization for directories because holding the mutex prevents * new operations on the dir while we write to disk. */ -int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; + bool full_sync = 0; + trace_btrfs_sync_file(file, datasync); + + /* + * We write the dirty pages in the range and wait until they complete + * out of the ->i_mutex. If so, we can flush the dirty pages by + * multi-task, and make the performance up. See + * btrfs_wait_ordered_range for an explanation of the ASYNC check. + */ + atomic_inc(&BTRFS_I(inode)->sync_writers); + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); + if (ret) + return ret; - /* we wait first, since the writeback may change the inode */ - root->log_batch++; - /* the VFS called filemap_fdatawrite for us */ - btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->log_batch++; + mutex_lock(&inode->i_mutex); + + /* + * We flush the dirty pages again to avoid some dirty pages in the + * range being left. + */ + atomic_inc(&root->log_batch); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + if (full_sync) { + ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + atomic_inc(&root->log_batch); /* * check the transaction that last modified this inode * and see if its already been committed */ - if (!BTRFS_I(inode)->last_trans) + if (!BTRFS_I(inode)->last_trans) { + mutex_unlock(&inode->i_mutex); goto out; + } /* * if the last transaction that changed this file was before * the current transaction, we can bail out now without any * syncing */ - mutex_lock(&root->fs_info->trans_mutex); - if (BTRFS_I(inode)->last_trans <= + smp_mb(); + if (btrfs_inode_in_log(inode, root->fs_info->generation) || + BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; - mutex_unlock(&root->fs_info->trans_mutex); + + /* + * We'v had everything committed since the last time we were + * modified so clear this flag in case it was set for whatever + * reason, it's no longer relevant. + */ + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + mutex_unlock(&inode->i_mutex); goto out; } - mutex_unlock(&root->fs_info->trans_mutex); /* * ok we haven't committed the transaction yet, lets do a commit */ - if (file && file->private_data) + if (file->private_data) btrfs_ioctl_trans_end(file); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; + /* + * We use start here because we will need to wait on the IO to complete + * in btrfs_sync_log, which could require joining a transaction (for + * example checking cross references in the nocow path). If we use join + * here we could get into a situation where we're waiting on IO to + * happen that is blocked on a transaction trying to commit. With start + * we inc the extwriter counter, so we wait for all extwriters to exit + * before we start blocking join'ers. This comment is to keep somebody + * from thinking they are super smart and changing this to + * btrfs_join_transaction *cough*Josef*cough*. + */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + mutex_unlock(&inode->i_mutex); goto out; } + trans->sync = true; - ret = btrfs_log_dentry_safe(trans, root, dentry); - if (ret < 0) - goto out; + btrfs_init_log_ctx(&ctx); + + ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); + if (ret < 0) { + /* Fallthrough and commit/free transaction. */ + ret = 1; + } /* we've logged all the items and now have a consistent * version of the file in the log. It is possible that @@ -1124,50 +1998,763 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&inode->i_mutex); if (ret != BTRFS_NO_LOG_SYNC) { - if (ret > 0) { - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); + if (!ret) { ret = btrfs_end_transaction(trans, root); - else - ret = btrfs_commit_transaction(trans, root); + goto out; + } } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, + end - start + 1); + if (ret) { + btrfs_end_transaction(trans, root); + goto out; + } + } + ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_end_transaction(trans, root); } - mutex_lock(&dentry->d_inode->i_mutex); out: return ret > 0 ? -EIO : ret; } static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) { - vma->vm_ops = &btrfs_file_vm_ops; + struct address_space *mapping = filp->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(filp); + vma->vm_ops = &btrfs_file_vm_ops; + return 0; } +static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_disk_bytenr(leaf, fi)) + return 0; + + if (key.offset == end) + return 1; + if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) + return 1; + return 0; +} + +static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, + struct btrfs_path *path, u64 offset, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct extent_map *hole_em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key key; + int ret; + + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + goto out; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + return ret; + BUG_ON(!ret); + + leaf = path->nodes[0]; + if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { + u64 num_bytes; + + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + + end - offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + + if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { + u64 num_bytes; + + path->slots[0]++; + key.offset = offset; + btrfs_set_item_key_safe(root, path, &key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - + offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + btrfs_release_path(path); + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, end - offset, 0, end - offset, + 0, 0, 0); + if (ret) + return ret; + +out: + btrfs_release_path(path); + + hole_em = alloc_extent_map(); + if (!hole_em) { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + hole_em->start = offset; + hole_em->len = end - offset; + hole_em->ram_bytes = hole_em->len; + hole_em->orig_start = offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->orig_block_len = 0; + hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = trans->transid; + + do { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, hole_em, 1); + write_unlock(&em_tree->lock); + } while (ret == -EEXIST); + free_extent_map(hole_em); + if (ret) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } + + return 0; +} + +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + * em->start + em->len > start) + * When a hole extent is found, return 1 and modify start/len. + */ +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +{ + struct extent_map *em; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + return ret; + } + + /* Hole or vacuum extent(only exists in no-hole mode) */ + if (em->block_start == EXTENT_MAP_HOLE) { + ret = 1; + *len = em->start + em->len > *start + *len ? + 0 : *start + *len - em->start - em->len; + *start = em->start + em->len; + } + free_extent_map(em); + return ret; +} + +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_block_rsv *rsv; + struct btrfs_trans_handle *trans; + u64 lockstart; + u64 lockend; + u64 tail_start; + u64 tail_len; + u64 orig_start = offset; + u64 cur_offset; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 drop_end; + int ret = 0; + int err = 0; + int rsv_count; + bool same_page; + bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); + u64 ino_size; + + ret = btrfs_wait_ordered_range(inode, offset, len); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + /* Already in a large hole */ + ret = 0; + goto out_only_mutex; + } + + lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); + lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; + same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + + /* + * We needn't truncate any page which is beyond the end of the file + * because we are sure there is no data there. + */ + /* + * Only do this if we are in the same page and we aren't doing the + * entire page. + */ + if (same_page && len < PAGE_CACHE_SIZE) { + if (offset < ino_size) + ret = btrfs_truncate_page(inode, offset, len, 0); + goto out_only_mutex; + } + + /* zero back part of the first page */ + if (offset < ino_size) { + ret = btrfs_truncate_page(inode, offset, 0, 0); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + /* Check the aligned pages after the first unaligned page, + * if offset != orig_start, which means the first unaligned page + * including serveral following pages are already in holes, + * the extra check can be skipped */ + if (offset == orig_start) { + /* after truncate page, check hole again */ + len = offset + len - lockstart; + offset = lockstart; + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + ret = 0; + goto out_only_mutex; + } + lockstart = offset; + } + + /* Check the tail unaligned part is in a hole */ + tail_start = lockend + 1; + tail_len = offset + len - tail_start; + if (tail_len) { + ret = find_first_non_hole(inode, &tail_start, &tail_len); + if (unlikely(ret < 0)) + goto out_only_mutex; + if (!ret) { + /* zero the front end of the last page */ + if (tail_start + tail_len < ino_size) { + ret = btrfs_truncate_page(inode, + tail_start + tail_len, 0, 1); + if (ret) + goto out_only_mutex; + } + } + } + + if (lockend < lockstart) { + mutex_unlock(&inode->i_mutex); + return 0; + } + + while (1) { + struct btrfs_ordered_extent *ordered; + + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + + /* + * We need to make sure we have no ordered extents in this range + * and nobody raced in and read a page in this range, if we did + * we need to try again. + */ + if ((!ordered || + (ordered->file_offset + ordered->len <= lockstart || + ordered->file_offset > lockend)) && + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state, GFP_NOFS); + ret = btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) { + ret = -ENOMEM; + goto out_free; + } + rsv->size = btrfs_calc_trunc_metadata_size(root, 1); + rsv->failfast = 1; + + /* + * 1 - update the inode + * 1 - removing the extents in the range + * 1 - adding the hole extent if no_holes isn't set + */ + rsv_count = no_holes ? 2 : 3; + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); + BUG_ON(ret); + trans->block_rsv = rsv; + + cur_offset = lockstart; + len = lockend - cur_offset; + while (cur_offset < lockend) { + ret = __btrfs_drop_extents(trans, root, inode, path, + cur_offset, lockend + 1, + &drop_end, 1, 0, 0, NULL); + if (ret != -ENOSPC) + break; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_end); + if (ret) { + err = ret; + break; + } + } + + cur_offset = drop_end; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + err = ret; + break; + } + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); + + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + rsv, min_size); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; + + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } + } + + if (ret) { + err = ret; + goto out_trans; + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (cur_offset < ino_size && cur_offset < drop_end) { + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } + } + +out_trans: + if (!trans) + goto out_free; + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); +out_free: + btrfs_free_path(path); + btrfs_free_block_rsv(root, rsv); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); +out_only_mutex: + mutex_unlock(&inode->i_mutex); + if (ret && !err) + err = ret; + return err; +} + +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct extent_state *cached_state = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + struct extent_map *em; + int blocksize = BTRFS_I(inode)->root->sectorsize; + int ret; + + alloc_start = round_down(offset, blocksize); + alloc_end = round_up(offset + len, blocksize); + + /* Make sure we aren't being give some crap mode */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return btrfs_punch_hole(inode, offset, len); + + /* + * Make sure we have enough space before we do the + * allocation. + */ + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + return ret; + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); + if (ret) + goto out_reserve_fail; + } + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, i_size_read(inode), + alloc_start); + if (ret) + goto out; + } else { + /* + * If we are fallocating from the end of the file onward we + * need to zero out the end of the page if i_size lands in the + * middle of a page. + */ + ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); + if (ret) + goto out; + } + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + u64 actual_end; + + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + break; + } + last_byte = min(extent_map_end(em), alloc_end); + actual_end = min_t(u64, extent_map_end(em), offset + len); + last_byte = ALIGN(last_byte, blocksize); + + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + + if (ret < 0) { + free_extent_map(em); + break; + } + } else if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size. + */ + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, NULL); + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); +out: + mutex_unlock(&inode->i_mutex); + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, alloc_end - alloc_start); +out_reserve_fail: + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); + return ret; +} + +static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + u64 lockstart = *offset; + u64 lockend = i_size_read(inode); + u64 start = *offset; + u64 len = i_size_read(inode); + int ret = 0; + + lockend = max_t(u64, root->sectorsize, lockend); + if (lockend <= lockstart) + lockend = lockstart + root->sectorsize; + + lockend--; + len = lockend - lockstart + 1; + + len = max_t(u64, len, root->sectorsize); + if (inode->i_size == 0) + return -ENXIO; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, + &cached_state); + + while (start < inode->i_size) { + em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + em = NULL; + break; + } + + if (whence == SEEK_HOLE && + (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; + else if (whence == SEEK_DATA && + (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; + + start = em->start + em->len; + free_extent_map(em); + em = NULL; + cond_resched(); + } + free_extent_map(em); + if (!ret) { + if (whence == SEEK_DATA && start >= inode->i_size) + ret = -ENXIO; + else + *offset = min_t(loff_t, start, inode->i_size); + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + return ret; +} + +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + mutex_lock(&inode->i_mutex); + switch (whence) { + case SEEK_END: + case SEEK_CUR: + offset = generic_file_llseek(file, offset, whence); + goto out; + case SEEK_DATA: + case SEEK_HOLE: + if (offset >= i_size_read(inode)) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + ret = find_desired_extent(inode, &offset, whence); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + const struct file_operations btrfs_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, + .llseek = btrfs_file_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, - .write = btrfs_file_write, + .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, #endif }; + +void btrfs_auto_defrag_exit(void) +{ + if (btrfs_inode_defrag_cachep) + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f488fac04d9..2b0a627cb5f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,37 +20,1259 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/ratelimit.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" +#include "disk-io.h" +#include "extent_io.h" +#include "inode-map.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) -static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, +static int link_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); + +static struct inode *__lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_path *path, + u64 offset) +{ + struct btrfs_key key; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode = NULL; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + btrfs_release_path(path); + return ERR_PTR(-ENOENT); + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_free_space_key(leaf, header, &disk_key); + btrfs_disk_key_to_cpu(&location, &disk_key); + btrfs_release_path(path); + + inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); + if (!inode) + return ERR_PTR(-ENOENT); + if (IS_ERR(inode)) + return inode; + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + + return inode; +} + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path) +{ + struct inode *inode = NULL; + u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + + spin_lock(&block_group->lock); + if (block_group->inode) + inode = igrab(block_group->inode); + spin_unlock(&block_group->lock); + if (inode) + return inode; + + inode = __lookup_free_space_inode(root, path, + block_group->key.objectid); + if (IS_ERR(inode)) + return inode; + + spin_lock(&block_group->lock); + if (!((BTRFS_I(inode)->flags & flags) == flags)) { + btrfs_info(root->fs_info, + "Old style space inode found, converting."); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | + BTRFS_INODE_NODATACOW; + block_group->disk_cache_state = BTRFS_DC_CLEAR; + } + + if (!block_group->iref) { + block_group->inode = igrab(inode); + block_group->iref = 1; + } + spin_unlock(&block_group->lock); + + return inode; +} + +static int __create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 ino, u64 offset) +{ + struct btrfs_key key; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; + int ret; + + ret = btrfs_insert_empty_inode(trans, root, path, ino); + if (ret) + return ret; + + /* We inline crc's for the free disk space cache */ + if (ino != BTRFS_FREE_INO_OBJECTID) + flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + btrfs_item_key(leaf, &disk_key, path->slots[0]); + memset_extent_buffer(leaf, 0, (unsigned long)inode_item, + sizeof(*inode_item)); + btrfs_set_inode_generation(leaf, inode_item, trans->transid); + btrfs_set_inode_size(leaf, inode_item, 0); + btrfs_set_inode_nbytes(leaf, inode_item, 0); + btrfs_set_inode_uid(leaf, inode_item, 0); + btrfs_set_inode_gid(leaf, inode_item, 0); + btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, inode_item, flags); + btrfs_set_inode_nlink(leaf, inode_item, 1); + btrfs_set_inode_transid(leaf, inode_item, trans->transid); + btrfs_set_inode_block_group(leaf, inode_item, offset); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_free_space_header)); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); + btrfs_set_free_space_key(leaf, header, &disk_key); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; +} + +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + int ret; + u64 ino; + + ret = btrfs_find_free_objectid(root, &ino); + if (ret < 0) + return ret; + + return __create_free_space_inode(root, trans, path, ino, + block_group->key.objectid); +} + +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + u64 needed_bytes; + int ret; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + + btrfs_calc_trans_metadata_size(root, 1); + + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return ret; +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct inode *inode) +{ + int ret = 0; + + btrfs_i_size_write(inode, 0); + truncate_pagecache(inode, 0); + + /* + * We don't need an orphan item because truncating the free space cache + * will never be split across transactions. + */ + ret = btrfs_truncate_inode_items(trans, root, inode, + 0, BTRFS_EXTENT_DATA_KEY); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_abort_transaction(trans, root, ret); + + return ret; +} + +static int readahead_cache(struct inode *inode) +{ + struct file_ra_state *ra; + unsigned long last_index; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + file_ra_state_init(ra, inode->i_mapping); + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); + + kfree(ra); + + return 0; +} + +struct io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + unsigned long size; + int index; + int num_pages; + unsigned check_crcs:1; +}; + +static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, + struct btrfs_root *root, int write) +{ + int num_pages; + int check_crcs = 0; + + num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + check_crcs = 1; + + /* Make sure we can fit our crcs into the first page */ + if (write && check_crcs && + (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) + return -ENOSPC; + + memset(io_ctl, 0, sizeof(struct io_ctl)); + + io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); + if (!io_ctl->pages) + return -ENOMEM; + + io_ctl->num_pages = num_pages; + io_ctl->root = root; + io_ctl->check_crcs = check_crcs; + + return 0; +} + +static void io_ctl_free(struct io_ctl *io_ctl) +{ + kfree(io_ctl->pages); +} + +static void io_ctl_unmap_page(struct io_ctl *io_ctl) +{ + if (io_ctl->cur) { + kunmap(io_ctl->page); + io_ctl->cur = NULL; + io_ctl->orig = NULL; + } +} + +static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +{ + ASSERT(io_ctl->index < io_ctl->num_pages); + io_ctl->page = io_ctl->pages[io_ctl->index++]; + io_ctl->cur = kmap(io_ctl->page); + io_ctl->orig = io_ctl->cur; + io_ctl->size = PAGE_CACHE_SIZE; + if (clear) + memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); +} + +static void io_ctl_drop_pages(struct io_ctl *io_ctl) +{ + int i; + + io_ctl_unmap_page(io_ctl); + + for (i = 0; i < io_ctl->num_pages; i++) { + if (io_ctl->pages[i]) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } + } +} + +static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, + int uptodate) +{ + struct page *page; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int i; + + for (i = 0; i < io_ctl->num_pages; i++) { + page = find_or_create_page(inode->i_mapping, i, mask); + if (!page) { + io_ctl_drop_pages(io_ctl); + return -ENOMEM; + } + io_ctl->pages[i] = page; + if (uptodate && !PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "error reading free space cache"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } + } + } + + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + + return 0; +} + +static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +{ + __le64 *val; + + io_ctl_map_page(io_ctl, 1); + + /* + * Skip the csum areas. If we don't check crcs then we just have a + * 64bit chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + val = io_ctl->cur; + *val = cpu_to_le64(generation); + io_ctl->cur += sizeof(u64); +} + +static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +{ + __le64 *gen; + + /* + * Skip the crc area. If we don't check crcs then we just have a 64bit + * chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + gen = io_ctl->cur; + if (le64_to_cpu(*gen) != generation) { + printk_ratelimited(KERN_ERR "BTRFS: space cache generation " + "(%Lu) does not match inode (%Lu)\n", *gen, + generation); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + io_ctl->cur += sizeof(u64); + return 0; +} + +static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_unmap_page(io_ctl); + return; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + crc = btrfs_csum_data(io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + io_ctl_unmap_page(io_ctl); + tmp = kmap(io_ctl->pages[0]); + tmp += index; + *tmp = crc; + kunmap(io_ctl->pages[0]); +} + +static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp, val; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_map_page(io_ctl, 0); + return 0; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + tmp = kmap(io_ctl->pages[0]); + tmp += index; + val = *tmp; + kunmap(io_ctl->pages[0]); + + io_ctl_map_page(io_ctl, 0); + crc = btrfs_csum_data(io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + if (val != crc) { + printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " + "space cache\n"); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + + return 0; +} + +static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, + void *bitmap) +{ + struct btrfs_free_space_entry *entry; + + if (!io_ctl->cur) + return -ENOSPC; + + entry = io_ctl->cur; + entry->offset = cpu_to_le64(offset); + entry->bytes = cpu_to_le64(bytes); + entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : + BTRFS_FREE_SPACE_EXTENT; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + + /* No more pages to map */ + if (io_ctl->index >= io_ctl->num_pages) + return 0; + + /* map the next page */ + io_ctl_map_page(io_ctl, 1); + return 0; +} + +static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +{ + if (!io_ctl->cur) + return -ENOSPC; + + /* + * If we aren't at the start of the current page, unmap this one and + * map the next one if there is any left. + */ + if (io_ctl->cur != io_ctl->orig) { + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index >= io_ctl->num_pages) + return -ENOSPC; + io_ctl_map_page(io_ctl, 0); + } + + memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); + return 0; +} + +static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +{ + /* + * If we're not on the boundary we know we've modified the page and we + * need to crc the page. + */ + if (io_ctl->cur != io_ctl->orig) + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + else + io_ctl_unmap_page(io_ctl); + + while (io_ctl->index < io_ctl->num_pages) { + io_ctl_map_page(io_ctl, 1); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + } +} + +static int io_ctl_read_entry(struct io_ctl *io_ctl, + struct btrfs_free_space *entry, u8 *type) +{ + struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } + + e = io_ctl->cur; + entry->offset = le64_to_cpu(e->offset); + entry->bytes = le64_to_cpu(e->bytes); + *type = e->type; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_unmap_page(io_ctl); + + return 0; +} + +static int io_ctl_read_bitmap(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + int ret; + + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); + io_ctl_unmap_page(io_ctl); + + return 0; +} + +/* + * Since we attach pinned extents after the fact we can have contiguous sections + * of free space that are split up in entries. This poses a problem with the + * tree logging stuff since it could have allocated across what appears to be 2 + * entries since we would have merged the entries when adding the pinned extents + * back to the free space cache. So run through the space cache that we just + * loaded and merge contiguous entries. This will make the log replay stuff not + * blow up and it will make for nicer allocator behavior. + */ +static void merge_space_tree(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *e, *prev = NULL; + struct rb_node *n; + +again: + spin_lock(&ctl->tree_lock); + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + e = rb_entry(n, struct btrfs_free_space, offset_index); + if (!prev) + goto next; + if (e->bitmap || prev->bitmap) + goto next; + if (prev->offset + prev->bytes == e->offset) { + unlink_free_space(ctl, prev); + unlink_free_space(ctl, e); + prev->bytes += e->bytes; + kmem_cache_free(btrfs_free_space_cachep, e); + link_free_space(ctl, prev); + prev = NULL; + spin_unlock(&ctl->tree_lock); + goto again; + } +next: + prev = e; + } + spin_unlock(&ctl->tree_lock); +} + +static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_path *path, u64 offset) +{ + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct io_ctl io_ctl; + struct btrfs_key key; + struct btrfs_free_space *e, *n; + struct list_head bitmaps; + u64 num_entries; + u64 num_bitmaps; + u64 generation; + u8 type; + int ret = 0; + + INIT_LIST_HEAD(&bitmaps); + + /* Nothing in the space cache, goodbye */ + if (!i_size_read(inode)) + return 0; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return 0; + else if (ret > 0) { + btrfs_release_path(path); + return 0; + } + + ret = -1; + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + num_entries = btrfs_free_space_entries(leaf, header); + num_bitmaps = btrfs_free_space_bitmaps(leaf, header); + generation = btrfs_free_space_generation(leaf, header); + btrfs_release_path(path); + + if (!BTRFS_I(inode)->generation) { + btrfs_info(root->fs_info, + "The free space cache file (%llu) is invalid. skip it\n", + offset); + return 0; + } + + if (BTRFS_I(inode)->generation != generation) { + btrfs_err(root->fs_info, + "free space inode generation (%llu) " + "did not match free space cache generation (%llu)", + BTRFS_I(inode)->generation, generation); + return 0; + } + + if (!num_entries) + return 0; + + ret = io_ctl_init(&io_ctl, inode, root, 0); + if (ret) + return ret; + + ret = readahead_cache(inode); + if (ret) + goto out; + + ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + if (ret) + goto out; + + ret = io_ctl_check_crc(&io_ctl, 0); + if (ret) + goto free_cache; + + ret = io_ctl_check_generation(&io_ctl, generation); + if (ret) + goto free_cache; + + while (num_entries) { + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) + goto free_cache; + + ret = io_ctl_read_entry(&io_ctl, e, &type); + if (ret) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + + if (!e->bytes) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + + if (type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + btrfs_err(root->fs_info, + "Duplicate entries in free space cache, dumping"); + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + } else { + ASSERT(num_bitmaps); + num_bitmaps--; + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kmem_cache_free( + btrfs_free_space_cachep, e); + goto free_cache; + } + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + if (ret) { + btrfs_err(root->fs_info, + "Duplicate entries in free space cache, dumping"); + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + list_add_tail(&e->list, &bitmaps); + } + + num_entries--; + } + + io_ctl_unmap_page(&io_ctl); + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. + */ + list_for_each_entry_safe(e, n, &bitmaps, list) { + list_del_init(&e->list); + ret = io_ctl_read_bitmap(&io_ctl, e); + if (ret) + goto free_cache; + } + + io_ctl_drop_pages(&io_ctl); + merge_space_tree(ctl); + ret = 1; +out: + io_ctl_free(&io_ctl); + return ret; +free_cache: + io_ctl_drop_pages(&io_ctl); + __btrfs_remove_free_space_cache(ctl); + goto out; +} + +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode; + struct btrfs_path *path; + int ret = 0; + bool matched; + u64 used = btrfs_block_group_used(&block_group->item); + + /* + * If this block group has been marked to be cleared for one reason or + * another then we can't trust the on disk cache, so just return. + */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + path = btrfs_alloc_path(); + if (!path) + return 0; + path->search_commit_root = 1; + path->skip_locking = 1; + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) { + btrfs_free_path(path); + return 0; + } + + /* We may have converted the inode and made the cache invalid. */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + btrfs_free_path(path); + goto out; + } + spin_unlock(&block_group->lock); + + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, + path, block_group->key.objectid); + btrfs_free_path(path); + if (ret <= 0) + goto out; + + spin_lock(&ctl->tree_lock); + matched = (ctl->free_space == (block_group->key.offset - used - + block_group->bytes_super)); + spin_unlock(&ctl->tree_lock); + + if (!matched) { + __btrfs_remove_free_space_cache(ctl); + btrfs_warn(fs_info, "block group %llu has wrong amount of free space", + block_group->key.objectid); + ret = -1; + } +out: + if (ret < 0) { + /* This cache is bogus, make sure it gets cleared */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + ret = 0; + + btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", + block_group->key.objectid); + } + + iput(inode); + return ret; +} + +static noinline_for_stack +int write_cache_extent_entries(struct io_ctl *io_ctl, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + int *entries, int *bitmaps, + struct list_head *bitmap_list) +{ + int ret; + struct btrfs_free_cluster *cluster = NULL; + struct rb_node *node = rb_first(&ctl->free_space_offset); + + /* Get the cluster for this block_group if it exists */ + if (block_group && !list_empty(&block_group->cluster_list)) { + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + } + + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } + + /* Write out the extent entries */ + while (node) { + struct btrfs_free_space *e; + + e = rb_entry(node, struct btrfs_free_space, offset_index); + *entries += 1; + + ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, + e->bitmap); + if (ret) + goto fail; + + if (e->bitmap) { + list_add_tail(&e->list, bitmap_list); + *bitmaps += 1; + } + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } + } + return 0; +fail: + return -ENOSPC; +} + +static noinline_for_stack int +update_cache_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, u64 offset, + int entries, int bitmaps) +{ + struct btrfs_key key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + goto fail; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + ASSERT(path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != offset) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); + btrfs_release_path(path); + goto fail; + } + } + + BTRFS_I(inode)->generation = trans->transid; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; + +fail: + return -1; +} + +static noinline_for_stack int +write_pinned_extent_entries(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct io_ctl *io_ctl, + int *entries) +{ + u64 start, extent_start, extent_end, len; + struct extent_io_tree *unpin = NULL; + int ret; + + if (!block_group) + return 0; + + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + * + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + + start = block_group->key.objectid; + + while (start < block_group->key.objectid + block_group->key.offset) { + ret = find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL); + if (ret) + return 0; + + /* This pinned extent is out of our range */ + if (extent_start >= block_group->key.objectid + + block_group->key.offset) + return 0; + + extent_start = max(extent_start, start); + extent_end = min(block_group->key.objectid + + block_group->key.offset, extent_end + 1); + len = extent_end - extent_start; + + *entries += 1; + ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); + if (ret) + return -ENOSPC; + + start = extent_end; + } + + return 0; +} + +static noinline_for_stack int +write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + int ret; + + /* Write out the bitmaps */ + list_for_each_safe(pos, n, bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + + ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); + if (ret) + return -ENOSPC; + list_del_init(&entry->list); + } + + return 0; +} + +static int flush_dirty_cache(struct inode *inode) +{ + int ret; + + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + + return ret; +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + + list_for_each_safe(pos, n, bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); + } + io_ctl_drop_pages(io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, cached_state, + GFP_NOFS); +} + +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. + */ +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) +{ + struct extent_state *cached_state = NULL; + struct io_ctl io_ctl; + LIST_HEAD(bitmap_list); + int entries = 0; + int bitmaps = 0; + int ret; + + if (!i_size_read(inode)) + return -1; + + ret = io_ctl_init(&io_ctl, inode, root, 1); + if (ret) + return -1; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { + down_write(&block_group->data_rwsem); + spin_lock(&block_group->lock); + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + up_write(&block_group->data_rwsem); + BTRFS_I(inode)->generation = 0; + ret = 0; + goto out; + } + spin_unlock(&block_group->lock); + } + + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state); + + io_ctl_set_generation(&io_ctl, trans->transid); + + /* Write out the extent entries in the free space cache */ + ret = write_cache_extent_entries(&io_ctl, ctl, + block_group, &entries, &bitmaps, + &bitmap_list); + if (ret) + goto out_nospc; + + /* + * Some spaces that are freed in the current transaction are pinned, + * they will be added into free space cache after the transaction is + * committed, we shouldn't lose them. + */ + ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); + if (ret) + goto out_nospc; + + /* At last, we write out all the bitmaps. */ + ret = write_bitmap_entries(&io_ctl, &bitmap_list); + if (ret) + goto out_nospc; + + /* Zero out the rest of the pages just to make sure */ + io_ctl_zero_remaining_pages(&io_ctl); + + /* Everything is written out, now we dirty the pages in the file. */ + ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + 0, i_size_read(inode), &cached_state); + if (ret) + goto out_nospc; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + /* + * Release the pages and unlock the extent, we will flush + * them out later + */ + io_ctl_drop_pages(&io_ctl); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; + + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + entries, bitmaps); +out: + io_ctl_free(&io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + } + btrfs_update_inode(trans, root, inode); + return ret; + +out_nospc: + cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + + goto out; +} + +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct inode *inode; + int ret = 0; + + root = root->fs_info->tree_root; + + spin_lock(&block_group->lock); + if (block_group->disk_cache_state < BTRFS_DC_SETUP) { + spin_unlock(&block_group->lock); + return 0; + } + + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) + return 0; + + ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, + path, block_group->key.objectid); + if (ret) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + ret = 0; +#ifdef DEBUG + btrfs_err(root->fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); +#endif + } + + iput(inode); + return ret; +} + +static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, u64 offset) { - BUG_ON(offset < bitmap_start); + ASSERT(offset >= bitmap_start); offset -= bitmap_start; - return (unsigned long)(div64_u64(offset, sectorsize)); + return (unsigned long)(div_u64(offset, unit)); } -static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) +static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) { - return (unsigned long)(div64_u64(bytes, sectorsize)); + return (unsigned long)(div_u64(bytes, unit)); } -static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, +static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; u64 bytes_per_bitmap; - bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; - bitmap_start = offset - block_group->key.objectid; + bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; + bitmap_start = offset - ctl->start; bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; - bitmap_start += block_group->key.objectid; + bitmap_start += ctl->start; return bitmap_start; } @@ -85,10 +1307,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, * logically. */ if (bitmap) { - WARN_ON(info->bitmap); + if (info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } p = &(*p)->rb_right; } else { - WARN_ON(!info->bitmap); + if (!info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } p = &(*p)->rb_left; } } @@ -108,10 +1336,10 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, * offset. */ static struct btrfs_free_space * -tree_search_offset(struct btrfs_block_group_cache *block_group, +tree_search_offset(struct btrfs_free_space_ctl *ctl, u64 offset, int bitmap_only, int fuzzy) { - struct rb_node *n = block_group->free_space_offset.rb_node; + struct rb_node *n = ctl->free_space_offset.rb_node; struct btrfs_free_space *entry, *prev = NULL; /* find entry that is closest to the 'offset' */ @@ -157,18 +1385,13 @@ tree_search_offset(struct btrfs_block_group_cache *block_group, * if previous extent entry covers the offset, * we should return it instead of the bitmap entry */ - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; + n = rb_prev(&entry->offset_index); + if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - entry = prev; - break; - } + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + entry = prev; } } return entry; @@ -184,7 +1407,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group, if (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(entry->offset > offset); + ASSERT(entry->offset <= offset); } else { if (fuzzy) return entry; @@ -194,21 +1417,15 @@ tree_search_offset(struct btrfs_block_group_cache *block_group, } if (entry->bitmap) { - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; + n = rb_prev(&entry->offset_index); + if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - return prev; - break; - } + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + return prev; } - if (entry->offset + BITS_PER_BITMAP * - block_group->sectorsize > offset) + if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) return entry; } else if (entry->offset + entry->bytes > offset) return entry; @@ -219,7 +1436,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group, while (1) { if (entry->bitmap) { if (entry->offset + BITS_PER_BITMAP * - block_group->sectorsize > offset) + ctl->unit > offset) break; } else { if (entry->offset + entry->bytes > offset) @@ -234,53 +1451,71 @@ tree_search_offset(struct btrfs_block_group_cache *block_group, return entry; } -static void unlink_free_space(struct btrfs_block_group_cache *block_group, +static inline void +__unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &ctl->free_space_offset); + ctl->free_extents--; +} + +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { - rb_erase(&info->offset_index, &block_group->free_space_offset); - block_group->free_extents--; - block_group->free_space -= info->bytes; + __unlink_free_space(ctl, info); + ctl->free_space -= info->bytes; } -static int link_free_space(struct btrfs_block_group_cache *block_group, +static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { int ret = 0; - BUG_ON(!info->bitmap && !info->bytes); - ret = tree_insert_offset(&block_group->free_space_offset, info->offset, + ASSERT(info->bytes || info->bitmap); + ret = tree_insert_offset(&ctl->free_space_offset, info->offset, &info->offset_index, (info->bitmap != NULL)); if (ret) return ret; - block_group->free_space += info->bytes; - block_group->free_extents++; + ctl->free_space += info->bytes; + ctl->free_extents++; return ret; } -static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { + struct btrfs_block_group_cache *block_group = ctl->private; u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; + u64 size = block_group->key.offset; + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + + max_bitmaps = max(max_bitmaps, 1); + + ASSERT(ctl->total_bitmaps <= max_bitmaps); /* * The goal is to keep the total amount of memory used per 1gb of space * at or below 32k, so we need to adjust how much memory we allow to be * used by extent based free space tracking */ - max_bytes = MAX_CACHE_BYTES_PER_GIG * - (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + if (size < 1024 * 1024 * 1024) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * + div64_u64(size, 1024 * 1024 * 1024); /* * we want to account for 1 more bitmap than what we have so we can make * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as * we add more bitmaps. */ - bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; + bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE; if (bitmap_bytes >= max_bytes) { - block_group->extents_thresh = 0; + ctl->extents_thresh = 0; return; } @@ -291,134 +1526,180 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) extent_bytes = max_bytes - bitmap_bytes; extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); - block_group->extents_thresh = + ctl->extents_thresh = div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); } -static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, - struct btrfs_free_space *info, u64 offset, - u64 bytes) +static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes) { - unsigned long start, end; - unsigned long i; + unsigned long start, count; - start = offset_to_bit(info->offset, block_group->sectorsize, offset); - end = start + bytes_to_bits(bytes, block_group->sectorsize); - BUG_ON(end > BITS_PER_BITMAP); + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + ASSERT(start + count <= BITS_PER_BITMAP); - for (i = start; i < end; i++) - clear_bit(i, info->bitmap); + bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; - block_group->free_space -= bytes; } -static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + __bitmap_clear_bits(ctl, info, offset, bytes); + ctl->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) { - unsigned long start, end; - unsigned long i; + unsigned long start, count; - start = offset_to_bit(info->offset, block_group->sectorsize, offset); - end = start + bytes_to_bits(bytes, block_group->sectorsize); - BUG_ON(end > BITS_PER_BITMAP); + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + ASSERT(start + count <= BITS_PER_BITMAP); - for (i = start; i < end; i++) - set_bit(i, info->bitmap); + bitmap_set(info->bitmap, start, count); info->bytes += bytes; - block_group->free_space += bytes; + ctl->free_space += bytes; } -static int search_bitmap(struct btrfs_block_group_cache *block_group, +/* + * If we can not find suitable extent, we will use bytes to record + * the size of the max extent. + */ +static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes) { unsigned long found_bits = 0; + unsigned long max_bits = 0; unsigned long bits, i; unsigned long next_zero; + unsigned long extent_bits; - i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, + i = offset_to_bit(bitmap_info->offset, ctl->unit, max_t(u64, *offset, bitmap_info->offset)); - bits = bytes_to_bits(*bytes, block_group->sectorsize); + bits = bytes_to_bits(*bytes, ctl->unit); - for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); - i < BITS_PER_BITMAP; - i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); - if ((next_zero - i) >= bits) { - found_bits = next_zero - i; + extent_bits = next_zero - i; + if (extent_bits >= bits) { + found_bits = extent_bits; break; + } else if (extent_bits > max_bits) { + max_bits = extent_bits; } i = next_zero; } if (found_bits) { - *offset = (u64)(i * block_group->sectorsize) + - bitmap_info->offset; - *bytes = (u64)(found_bits) * block_group->sectorsize; + *offset = (u64)(i * ctl->unit) + bitmap_info->offset; + *bytes = (u64)(found_bits) * ctl->unit; return 0; } + *bytes = (u64)(max_bits) * ctl->unit; return -1; } -static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache - *block_group, u64 *offset, - u64 *bytes, int debug) +/* Cache the size of the max extent in bytes */ +static struct btrfs_free_space * +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + unsigned long align, u64 *max_extent_size) { struct btrfs_free_space *entry; struct rb_node *node; + u64 tmp; + u64 align_off; int ret; - if (!block_group->free_space_offset.rb_node) - return NULL; + if (!ctl->free_space_offset.rb_node) + goto out; - entry = tree_search_offset(block_group, - offset_to_bitmap(block_group, *offset), - 0, 1); + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); if (!entry) - return NULL; + goto out; for (node = &entry->offset_index; node; node = rb_next(node)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); - if (entry->bytes < *bytes) + if (entry->bytes < *bytes) { + if (entry->bytes > *max_extent_size) + *max_extent_size = entry->bytes; + continue; + } + + /* make sure the space returned is big enough + * to match our requested alignment + */ + if (*bytes >= align) { + tmp = entry->offset - ctl->start + align - 1; + do_div(tmp, align); + tmp = tmp * align + ctl->start; + align_off = tmp - entry->offset; + } else { + align_off = 0; + tmp = entry->offset; + } + + if (entry->bytes < *bytes + align_off) { + if (entry->bytes > *max_extent_size) + *max_extent_size = entry->bytes; continue; + } if (entry->bitmap) { - ret = search_bitmap(block_group, entry, offset, bytes); - if (!ret) + u64 size = *bytes; + + ret = search_bitmap(ctl, entry, &tmp, &size); + if (!ret) { + *offset = tmp; + *bytes = size; return entry; + } else if (size > *max_extent_size) { + *max_extent_size = size; + } continue; } - *offset = entry->offset; - *bytes = entry->bytes; + *offset = tmp; + *bytes = entry->bytes - align_off; return entry; } - +out: return NULL; } -static void add_new_bitmap(struct btrfs_block_group_cache *block_group, +static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset) { - u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; - int max_bitmaps = (int)div64_u64(block_group->key.offset + - bytes_per_bg - 1, bytes_per_bg); - BUG_ON(block_group->total_bitmaps >= max_bitmaps); - - info->offset = offset_to_bitmap(block_group, offset); + info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; - link_free_space(block_group, info); - block_group->total_bitmaps++; + INIT_LIST_HEAD(&info->list); + link_free_space(ctl, info); + ctl->total_bitmaps++; + + ctl->op->recalc_thresholds(ctl); +} - recalculate_thresholds(block_group); +static void free_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info) +{ + unlink_free_space(ctl, bitmap_info); + kfree(bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); } -static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, +static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes) { @@ -427,44 +1708,35 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro int ret; again: - end = bitmap_info->offset + - (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; + end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; /* - * XXX - this can go away after a few releases. - * - * since the only user of btrfs_remove_free_space is the tree logging - * stuff, and the only way to test that is under crash conditions, we - * want to have this debug stuff here just in case somethings not - * working. Search the bitmap for the space we are trying to use to - * make sure its actually there. If its not there then we need to stop - * because something has gone wrong. + * We need to search for bits in this bitmap. We could only cover some + * of the extent in this bitmap thanks to how we add space, so we need + * to search for as much as it as we can and clear that amount, and then + * go searching for the next bit. */ search_start = *offset; - search_bytes = *bytes; - ret = search_bitmap(block_group, bitmap_info, &search_start, - &search_bytes); - BUG_ON(ret < 0 || search_start != *offset); + search_bytes = ctl->unit; + search_bytes = min(search_bytes, end - search_start + 1); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); + if (ret < 0 || search_start != *offset) + return -EINVAL; - if (*offset > bitmap_info->offset && *offset + *bytes > end) { - bitmap_clear_bits(block_group, bitmap_info, *offset, - end - *offset + 1); - *bytes -= end - *offset + 1; - *offset = end + 1; - } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { - bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); - *bytes = 0; - } + /* We may have found more bits than what we need */ + search_bytes = min(search_bytes, *bytes); + + /* Cannot clear past the end of the bitmap */ + search_bytes = min(search_bytes, end - search_start + 1); + + bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); + *offset += search_bytes; + *bytes -= search_bytes; if (*bytes) { struct rb_node *next = rb_next(&bitmap_info->offset_index); - if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); /* * no entry after this bitmap, but we still have bytes to @@ -490,75 +1762,149 @@ again: * everything over again. */ search_start = *offset; - search_bytes = *bytes; - ret = search_bitmap(block_group, bitmap_info, &search_start, + search_bytes = ctl->unit; + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); if (ret < 0 || search_start != *offset) return -EAGAIN; goto again; - } else if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + } else if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); return 0; } -static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_space *info) +static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) { - struct btrfs_free_space *bitmap_info; - int added = 0; - u64 bytes, offset, end; - int ret; + u64 bytes_to_set = 0; + u64 end; + + end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); + + bytes_to_set = min(end - offset, bytes); + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + + return bytes_to_set; + +} + +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_block_group_cache *block_group = ctl->private; /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ - if (block_group->free_extents < block_group->extents_thresh && - info->bytes > block_group->sectorsize * 4) - return 0; + if (ctl->free_extents < ctl->extents_thresh) { + /* + * If this block group has some small extents we don't want to + * use up all of our free slots in the cache with them, we want + * to reserve them to larger extents, however if we have plent + * of cache left then go ahead an dadd them, no sense in adding + * the overhead of a bitmap if we don't have to. + */ + if (info->bytes <= block_group->sectorsize * 4) { + if (ctl->free_extents * 2 <= ctl->extents_thresh) + return false; + } else { + return false; + } + } /* - * some block groups are so tiny they can't be enveloped by a bitmap, so - * don't even bother to create a bitmap for this + * The original block groups from mkfs can be really small, like 8 + * megabytes, so don't bother with a bitmap for those entries. However + * some block groups can be smaller than what a bitmap would cover but + * are still large enough that they could overflow the 32k memory limit, + * so allow those block groups to still be allowed to have a bitmap + * entry. */ - if (BITS_PER_BITMAP * block_group->sectorsize > - block_group->key.offset) - return 0; + if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) + return false; + + return true; +} + +static struct btrfs_free_space_op free_space_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + +static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + struct btrfs_block_group_cache *block_group = NULL; + int added = 0; + u64 bytes, offset, bytes_added; + int ret; bytes = info->bytes; offset = info->offset; + if (!ctl->op->use_bitmap(ctl, info)) + return 0; + + if (ctl->op == &free_space_op) + block_group = ctl->private; again: - bitmap_info = tree_search_offset(block_group, - offset_to_bitmap(block_group, offset), + /* + * Since we link bitmaps right into the cluster we need to see if we + * have a cluster here, and if so and it has our bitmap we need to add + * the free space to that bitmap. + */ + if (block_group && !list_empty(&block_group->cluster_list)) { + struct btrfs_free_cluster *cluster; + struct rb_node *node; + struct btrfs_free_space *entry; + + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + spin_lock(&cluster->lock); + node = rb_first(&cluster->root); + if (!node) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (!entry->bitmap) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + if (entry->offset == offset_to_bitmap(ctl, offset)) { + bytes_added = add_bytes_to_bitmap(ctl, entry, + offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + } + spin_unlock(&cluster->lock); + if (!bytes) { + ret = 1; + goto out; + } + } + +no_cluster_bitmap: + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { - BUG_ON(added); + ASSERT(added == 0); goto new_bitmap; } - end = bitmap_info->offset + - (u64)(BITS_PER_BITMAP * block_group->sectorsize); - - if (offset >= bitmap_info->offset && offset + bytes > end) { - bitmap_set_bits(block_group, bitmap_info, offset, - end - offset); - bytes -= end - offset; - offset = end; - added = 0; - } else if (offset >= bitmap_info->offset && offset + bytes <= end) { - bitmap_set_bits(block_group, bitmap_info, offset, bytes); - bytes = 0; - } else { - BUG(); - } + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + added = 0; if (!bytes) { ret = 1; @@ -568,19 +1914,19 @@ again: new_bitmap: if (info && info->bitmap) { - add_new_bitmap(block_group, info, offset); + add_new_bitmap(ctl, info, offset); added = 1; info = NULL; goto again; } else { - spin_unlock(&block_group->tree_lock); + spin_unlock(&ctl->tree_lock); /* no pre-allocated info, allocate a new one */ if (!info) { - info = kzalloc(sizeof(struct btrfs_free_space), - GFP_NOFS); + info = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); if (!info) { - spin_lock(&block_group->tree_lock); + spin_lock(&ctl->tree_lock); ret = -ENOMEM; goto out; } @@ -588,7 +1934,7 @@ new_bitmap: /* allocate the bitmap */ info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); - spin_lock(&block_group->tree_lock); + spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; goto out; @@ -600,81 +1946,98 @@ out: if (info) { if (info->bitmap) kfree(info->bitmap); - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); } return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, bool update_stat) { - struct btrfs_free_space *right_info = NULL; - struct btrfs_free_space *left_info = NULL; - struct btrfs_free_space *info = NULL; - int ret = 0; - - info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!info) - return -ENOMEM; - - info->offset = offset; - info->bytes = bytes; - - spin_lock(&block_group->tree_lock); + struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info; + bool merged = false; + u64 offset = info->offset; + u64 bytes = info->bytes; /* * first we want to see if there is free space adjacent to the range we * are adding, if there is remove that struct and add a new one to * cover the entire range */ - right_info = tree_search_offset(block_group, offset + bytes, 0, 0); + right_info = tree_search_offset(ctl, offset + bytes, 0, 0); if (right_info && rb_prev(&right_info->offset_index)) left_info = rb_entry(rb_prev(&right_info->offset_index), struct btrfs_free_space, offset_index); else - left_info = tree_search_offset(block_group, offset - 1, 0, 0); - - /* - * If there was no extent directly to the left or right of this new - * extent then we know we're going to have to allocate a new extent, so - * before we do that see if we need to drop this into a bitmap - */ - if ((!left_info || left_info->bitmap) && - (!right_info || right_info->bitmap)) { - ret = insert_into_bitmap(block_group, info); - - if (ret < 0) { - goto out; - } else if (ret) { - ret = 0; - goto out; - } - } + left_info = tree_search_offset(ctl, offset - 1, 0, 0); if (right_info && !right_info->bitmap) { - unlink_free_space(block_group, right_info); + if (update_stat) + unlink_free_space(ctl, right_info); + else + __unlink_free_space(ctl, right_info); info->bytes += right_info->bytes; - kfree(right_info); + kmem_cache_free(btrfs_free_space_cachep, right_info); + merged = true; } if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset) { - unlink_free_space(block_group, left_info); + if (update_stat) + unlink_free_space(ctl, left_info); + else + __unlink_free_space(ctl, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; - kfree(left_info); + kmem_cache_free(btrfs_free_space_cachep, left_info); + merged = true; } - ret = link_free_space(block_group, info); + return merged; +} + +int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + + spin_lock(&ctl->tree_lock); + + if (try_merge_free_space(ctl, info, true)) + goto link; + + /* + * There was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + ret = insert_into_bitmap(ctl, info); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } +link: + ret = link_free_space(ctl, info); if (ret) - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); out: - spin_unlock(&block_group->tree_lock); + spin_unlock(&ctl->tree_lock); if (ret) { - printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); - BUG_ON(ret == -EEXIST); + printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret); + ASSERT(ret != -EEXIST); } return ret; @@ -683,116 +2046,89 @@ out: int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; - struct btrfs_free_space *next_info = NULL; - int ret = 0; + int ret; + bool re_search = false; - spin_lock(&block_group->tree_lock); + spin_lock(&ctl->tree_lock); again: - info = tree_search_offset(block_group, offset, 0, 0); + ret = 0; + if (!bytes) + goto out_lock; + + info = tree_search_offset(ctl, offset, 0, 0); if (!info) { /* * oops didn't find an extent that matched the space we wanted * to remove, look for a bitmap instead */ - info = tree_search_offset(block_group, - offset_to_bitmap(block_group, offset), + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - WARN_ON(1); - goto out_lock; - } - } - - if (info->bytes < bytes && rb_next(&info->offset_index)) { - u64 end; - next_info = rb_entry(rb_next(&info->offset_index), - struct btrfs_free_space, - offset_index); - - if (next_info->bitmap) - end = next_info->offset + BITS_PER_BITMAP * - block_group->sectorsize - 1; - else - end = next_info->offset + next_info->bytes; - - if (next_info->bytes < bytes || - next_info->offset > offset || offset > end) { - printk(KERN_CRIT "Found free space at %llu, size %llu," - " trying to use %llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)bytes); - WARN_ON(1); - ret = -EINVAL; + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. + */ + WARN_ON(re_search); goto out_lock; } - - info = next_info; } - if (info->bytes == bytes) { - unlink_free_space(block_group, info); - if (info->bitmap) { - kfree(info->bitmap); - block_group->total_bitmaps--; - } - kfree(info); - goto out_lock; - } - - if (!info->bitmap && info->offset == offset) { - unlink_free_space(block_group, info); - info->offset += bytes; - info->bytes -= bytes; - link_free_space(block_group, info); - goto out_lock; - } + re_search = false; + if (!info->bitmap) { + unlink_free_space(ctl, info); + if (offset == info->offset) { + u64 to_free = min(bytes, info->bytes); + + info->bytes -= to_free; + info->offset += to_free; + if (info->bytes) { + ret = link_free_space(ctl, info); + WARN_ON(ret); + } else { + kmem_cache_free(btrfs_free_space_cachep, info); + } - if (!info->bitmap && info->offset <= offset && - info->offset + info->bytes >= offset + bytes) { - u64 old_start = info->offset; - /* - * we're freeing space in the middle of the info, - * this can happen during tree log replay - * - * first unlink the old info and then - * insert it again after the hole we're creating - */ - unlink_free_space(block_group, info); - if (offset + bytes < info->offset + info->bytes) { - u64 old_end = info->offset + info->bytes; + offset += to_free; + bytes -= to_free; + goto again; + } else { + u64 old_end = info->bytes + info->offset; - info->offset = offset + bytes; - info->bytes = old_end - info->offset; - ret = link_free_space(block_group, info); + info->bytes = offset - info->offset; + ret = link_free_space(ctl, info); WARN_ON(ret); if (ret) goto out_lock; - } else { - /* the hole we're creating ends at the end - * of the info struct, just free the info - */ - kfree(info); - } - spin_unlock(&block_group->tree_lock); - /* step two, insert a new info struct to cover - * anything before the hole - */ - ret = btrfs_add_free_space(block_group, old_start, - offset - old_start); - WARN_ON(ret); - goto out; + /* Not enough bytes in this entry to satisfy us */ + if (old_end < offset + bytes) { + bytes -= old_end - offset; + offset = old_end; + goto again; + } else if (old_end == offset + bytes) { + /* all done */ + goto out_lock; + } + spin_unlock(&ctl->tree_lock); + + ret = btrfs_add_free_space(block_group, offset + bytes, + old_end - (offset + bytes)); + WARN_ON(ret); + goto out; + } } - ret = remove_from_bitmap(block_group, info, &offset, &bytes); - if (ret == -EAGAIN) + ret = remove_from_bitmap(ctl, info, &offset, &bytes); + if (ret == -EAGAIN) { + re_search = true; goto again; - BUG_ON(ret); + } out_lock: - spin_unlock(&block_group->tree_lock); + spin_unlock(&ctl->tree_lock); out: return ret; } @@ -800,38 +2136,43 @@ out: void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes) { + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; struct rb_node *n; int count = 0; - for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); - if (info->bytes >= bytes) + if (info->bytes >= bytes && !block_group->ro) count++; - printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, + btrfs_crit(block_group->fs_info, + "entry offset %llu, bytes %llu, bitmap %s", + info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } - printk(KERN_INFO "block group has cluster?: %s\n", + btrfs_info(block_group->fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); - printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" - "\n", count); + btrfs_info(block_group->fs_info, + "%d blocks of free space at or bigger than bytes is", count); } -u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) +void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) { - struct btrfs_free_space *info; - struct rb_node *n; - u64 ret = 0; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - for (n = rb_first(&block_group->free_space_offset); n; - n = rb_next(n)) { - info = rb_entry(n, struct btrfs_free_space, offset_index); - ret += info->bytes; - } + spin_lock_init(&ctl->tree_lock); + ctl->unit = block_group->sectorsize; + ctl->start = block_group->key.objectid; + ctl->private = block_group; + ctl->op = &free_space_op; - return ret; + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + ctl->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); } /* @@ -845,31 +2186,31 @@ __btrfs_return_cluster_to_free_space( struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster) { + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; struct rb_node *node; - bool bitmap; spin_lock(&cluster->lock); if (cluster->block_group != block_group) goto out; - bitmap = cluster->points_to_bitmap; cluster->block_group = NULL; cluster->window_start = 0; list_del_init(&cluster->block_group_list); - cluster->points_to_bitmap = false; - - if (bitmap) - goto out; node = rb_first(&cluster->root); while (node) { + bool bitmap; + entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); - BUG_ON(entry->bitmap); - tree_insert_offset(&block_group->free_space_offset, - entry->offset, &entry->offset_index, 0); + + bitmap = (entry->bitmap != NULL); + if (!bitmap) + try_merge_free_space(ctl, entry, false); + tree_insert_offset(&ctl->free_space_offset, + entry->offset, &entry->offset_index, bitmap); } cluster->root = RB_ROOT; @@ -879,14 +2220,42 @@ out: return 0; } -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +static void __btrfs_remove_free_space_cache_locked( + struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; struct rb_node *node; + + while ((node = rb_last(&ctl->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } + if (need_resched()) { + spin_unlock(&ctl->tree_lock); + cond_resched(); + spin_lock(&ctl->tree_lock); + } + } +} + +void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ + spin_lock(&ctl->tree_lock); + __btrfs_remove_free_space_cache_locked(ctl); + spin_unlock(&ctl->tree_lock); +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_cluster *cluster; struct list_head *head; - spin_lock(&block_group->tree_lock); + spin_lock(&ctl->tree_lock); while ((head = block_group->cluster_list.next) != &block_group->cluster_list) { cluster = list_entry(head, struct btrfs_free_cluster, @@ -895,63 +2264,57 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); if (need_resched()) { - spin_unlock(&block_group->tree_lock); + spin_unlock(&ctl->tree_lock); cond_resched(); - spin_lock(&block_group->tree_lock); + spin_lock(&ctl->tree_lock); } } + __btrfs_remove_free_space_cache_locked(ctl); + spin_unlock(&ctl->tree_lock); - while ((node = rb_last(&block_group->free_space_offset)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, offset_index); - unlink_free_space(block_group, info); - if (info->bitmap) - kfree(info->bitmap); - kfree(info); - if (need_resched()) { - spin_unlock(&block_group->tree_lock); - cond_resched(); - spin_lock(&block_group->tree_lock); - } - } - - spin_unlock(&block_group->tree_lock); } u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes, u64 empty_size) + u64 offset, u64 bytes, u64 empty_size, + u64 *max_extent_size) { + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry = NULL; u64 bytes_search = bytes + empty_size; u64 ret = 0; + u64 align_gap = 0; + u64 align_gap_len = 0; - spin_lock(&block_group->tree_lock); - entry = find_free_space(block_group, &offset, &bytes_search, 0); + spin_lock(&ctl->tree_lock); + entry = find_free_space(ctl, &offset, &bytes_search, + block_group->full_stripe_len, max_extent_size); if (!entry) goto out; ret = offset; if (entry->bitmap) { - bitmap_clear_bits(block_group, entry, offset, bytes); - if (!entry->bytes) { - unlink_free_space(block_group, entry); - kfree(entry->bitmap); - kfree(entry); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + bitmap_clear_bits(ctl, entry, offset, bytes); + if (!entry->bytes) + free_bitmap(ctl, entry); } else { - unlink_free_space(block_group, entry); - entry->offset += bytes; - entry->bytes -= bytes; + unlink_free_space(ctl, entry); + align_gap_len = offset - entry->offset; + align_gap = entry->offset; + + entry->offset = offset + bytes; + WARN_ON(entry->bytes < bytes + align_gap_len); + + entry->bytes -= bytes + align_gap_len; if (!entry->bytes) - kfree(entry); + kmem_cache_free(btrfs_free_space_cachep, entry); else - link_free_space(block_group, entry); + link_free_space(ctl, entry); } - out: - spin_unlock(&block_group->tree_lock); + spin_unlock(&ctl->tree_lock); + if (align_gap_len) + __btrfs_add_free_space(ctl, align_gap, align_gap_len); return ret; } @@ -967,6 +2330,7 @@ int btrfs_return_cluster_to_free_space( struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster) { + struct btrfs_free_space_ctl *ctl; int ret; /* first, get a safe pointer to the block group */ @@ -985,10 +2349,12 @@ int btrfs_return_cluster_to_free_space( atomic_inc(&block_group->count); spin_unlock(&cluster->lock); + ctl = block_group->free_space_ctl; + /* now return any extents the cluster had on it */ - spin_lock(&block_group->tree_lock); + spin_lock(&ctl->tree_lock); ret = __btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&block_group->tree_lock); + spin_unlock(&ctl->tree_lock); /* finally drop our ref */ btrfs_put_block_group(block_group); @@ -997,48 +2363,28 @@ int btrfs_return_cluster_to_free_space( static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, - u64 bytes, u64 min_start) + struct btrfs_free_space *entry, + u64 bytes, u64 min_start, + u64 *max_extent_size) { - struct btrfs_free_space *entry; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int err; u64 search_start = cluster->window_start; u64 search_bytes = bytes; u64 ret = 0; - spin_lock(&block_group->tree_lock); - spin_lock(&cluster->lock); - - if (!cluster->points_to_bitmap) - goto out; - - if (cluster->block_group != block_group) - goto out; - - /* - * search_start is the beginning of the bitmap, but at some point it may - * be a good idea to point to the actual start of the free area in the - * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only - * to 1 to make sure we get the bitmap entry - */ - entry = tree_search_offset(block_group, - offset_to_bitmap(block_group, search_start), - 1, 0); - if (!entry || !entry->bitmap) - goto out; - search_start = min_start; search_bytes = bytes; - err = search_bitmap(block_group, entry, &search_start, - &search_bytes); - if (err) - goto out; + err = search_bitmap(ctl, entry, &search_start, &search_bytes); + if (err) { + if (search_bytes > *max_extent_size) + *max_extent_size = search_bytes; + return 0; + } ret = search_start; - bitmap_clear_bits(block_group, entry, ret, bytes); -out: - spin_unlock(&cluster->lock); - spin_unlock(&block_group->tree_lock); + __bitmap_clear_bits(ctl, entry, ret, bytes); return ret; } @@ -1050,16 +2396,13 @@ out: */ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 bytes, - u64 min_start) + u64 min_start, u64 *max_extent_size) { + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry = NULL; struct rb_node *node; u64 ret = 0; - if (cluster->points_to_bitmap) - return btrfs_alloc_from_bitmap(block_group, cluster, bytes, - min_start); - spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -1072,11 +2415,12 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, goto out; entry = rb_entry(node, struct btrfs_free_space, offset_index); + while (1) { + if (entry->bytes < bytes && entry->bytes > *max_extent_size) + *max_extent_size = entry->bytes; - while(1) { - if (entry->bytes < bytes || entry->offset < min_start) { - struct rb_node *node; - + if (entry->bytes < bytes || + (!entry->bitmap && entry->offset < min_start)) { node = rb_next(&entry->offset_index); if (!node) break; @@ -1084,50 +2428,83 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, offset_index); continue; } - ret = entry->offset; - entry->offset += bytes; - entry->bytes -= bytes; + if (entry->bitmap) { + ret = btrfs_alloc_from_bitmap(block_group, + cluster, entry, bytes, + cluster->window_start, + max_extent_size); + if (ret == 0) { + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + cluster->window_start += bytes; + } else { + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + } - if (entry->bytes == 0) { + if (entry->bytes == 0) rb_erase(&entry->offset_index, &cluster->root); - kfree(entry); - } break; } out: spin_unlock(&cluster->lock); + if (!ret) + return 0; + + spin_lock(&ctl->tree_lock); + + ctl->free_space -= bytes; + if (entry->bytes == 0) { + ctl->free_extents--; + if (entry->bitmap) { + kfree(entry->bitmap); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); + } + kmem_cache_free(btrfs_free_space_cachep, entry); + } + + spin_unlock(&ctl->tree_lock); + return ret; } static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 min_bytes) + u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) { + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; - unsigned long search_bits; - unsigned long total_bits; + unsigned long want_bits; + unsigned long min_bits; unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; - bool found = false; + int ret; - i = offset_to_bit(entry->offset, block_group->sectorsize, + i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); - search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); - total_bits = bytes_to_bits(bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, ctl->unit); + min_bits = bytes_to_bits(min_bytes, ctl->unit); again: found_bits = 0; - for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); - i < BITS_PER_BITMAP; - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); - if (next_zero - i >= search_bits) { + if (next_zero - i >= min_bits) { found_bits = next_zero - i; break; } @@ -1135,77 +2512,218 @@ again: } if (!found_bits) - return -1; + return -ENOSPC; - if (!found) { + if (!total_found) { start = i; - found = true; + cluster->max_size = 0; } total_found += found_bits; - if (cluster->max_size < found_bits * block_group->sectorsize) - cluster->max_size = found_bits * block_group->sectorsize; + if (cluster->max_size < found_bits * ctl->unit) + cluster->max_size = found_bits * ctl->unit; - if (total_found < total_bits) { - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); - if (i - start > total_bits * 2) { - total_found = 0; - cluster->max_size = 0; - found = false; - } + if (total_found < want_bits || cluster->max_size < cont1_bytes) { + i = next_zero + 1; goto again; } - cluster->window_start = start * block_group->sectorsize + - entry->offset; - cluster->points_to_bitmap = true; + cluster->window_start = start * ctl->unit + entry->offset; + rb_erase(&entry->offset_index, &ctl->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 1); + ASSERT(!ret); /* -EEXIST; Logic error */ + trace_btrfs_setup_cluster(block_group, cluster, + total_found * ctl->unit, 1); return 0; } /* + * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. + */ +static noinline int +setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *first = NULL; + struct btrfs_free_space *entry = NULL; + struct btrfs_free_space *last; + struct rb_node *node; + u64 window_free; + u64 max_extent; + u64 total_size = 0; + + entry = tree_search_offset(ctl, offset, 0, 1); + if (!entry) + return -ENOSPC; + + /* + * We don't want bitmaps, so just move along until we find a normal + * extent entry. + */ + while (entry->bitmap || entry->bytes < min_bytes) { + if (entry->bitmap && list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + window_free = entry->bytes; + max_extent = entry->bytes; + first = entry; + last = entry; + + for (node = rb_next(&entry->offset_index); node; + node = rb_next(&entry->offset_index)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + if (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + continue; + } + + if (entry->bytes < min_bytes) + continue; + + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + + if (window_free < bytes || max_extent < cont1_bytes) + return -ENOSPC; + + cluster->window_start = first->offset; + + node = &first->offset_index; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + */ + do { + int ret; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + if (entry->bitmap || entry->bytes < min_bytes) + continue; + + rb_erase(&entry->offset_index, &ctl->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 0); + total_size += entry->bytes; + ASSERT(!ret); /* -EEXIST; Logic error */ + } while (node && entry != last); + + cluster->max_size = max_extent; + trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); + return 0; +} + +/* + * This specifically looks for bitmaps that may work in the cluster, we assume + * that we have already failed to find extents that will work. + */ +static noinline int +setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); + + if (ctl->total_bitmaps == 0) + return -ENOSPC; + + /* + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. + */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + + list_for_each_entry(entry, bitmaps, list) { + if (entry->bytes < bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, cont1_bytes, min_bytes); + if (!ret) + return 0; + } + + /* + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. + */ + return -ENOSPC; +} + +/* * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes free and up to empty_size + bytes free. + * is to find at least bytes+empty_size. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise * it returns -enospc */ -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_find_space_cluster(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) { - struct btrfs_free_space *entry = NULL; - struct rb_node *node; - struct btrfs_free_space *next; - struct btrfs_free_space *last = NULL; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); u64 min_bytes; - u64 window_start; - u64 window_free; - u64 max_extent = 0; - bool found_bitmap = false; + u64 cont1_bytes; int ret; - /* for metadata, allow allocates with more holes */ + /* + * Choose the minimum extent size we'll require for this + * cluster. For SSD_SPREAD, don't allow any fragmentation. + * For metadata, allow allocates with smaller extents. For + * data, keep it dense. + */ if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; + cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - /* - * we want to do larger allocations when we are - * flushing out the delayed refs, it helps prevent - * making more work as we go along. - */ - if (trans->transaction->delayed_refs.flushing) - min_bytes = max(bytes, (bytes + empty_size) >> 1); - else - min_bytes = max(bytes, (bytes + empty_size) >> 4); - } else - min_bytes = max(bytes, (bytes + empty_size) >> 2); + cont1_bytes = bytes; + min_bytes = block_group->sectorsize; + } else { + cont1_bytes = max(bytes, (bytes + empty_size) >> 2); + min_bytes = block_group->sectorsize; + } + + spin_lock(&ctl->tree_lock); + + /* + * If we know we don't have enough space to make a cluster don't even + * bother doing all the work to try and find one. + */ + if (ctl->free_space < bytes) { + spin_unlock(&ctl->tree_lock); + return -ENOSPC; + } - spin_lock(&block_group->tree_lock); spin_lock(&cluster->lock); /* someone already found a cluster, hooray */ @@ -1213,153 +2731,547 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, ret = 0; goto out; } -again: - entry = tree_search_offset(block_group, offset, found_bitmap, 1); - if (!entry) { - ret = -ENOSPC; - goto out; + + trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, + min_bytes); + + INIT_LIST_HEAD(&bitmaps); + ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, + bytes + empty_size, + cont1_bytes, min_bytes); + if (ret) + ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, + offset, bytes + empty_size, + cont1_bytes, min_bytes); + + /* Clear our temporary list */ + list_for_each_entry_safe(entry, tmp, &bitmaps, list) + list_del_init(&entry->list); + + if (!ret) { + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, + &block_group->cluster_list); + cluster->block_group = block_group; + } else { + trace_btrfs_failed_cluster_setup(block_group); } +out: + spin_unlock(&cluster->lock); + spin_unlock(&ctl->tree_lock); - /* - * If found_bitmap is true, we exhausted our search for extent entries, - * and we just want to search all of the bitmaps that we can find, and - * ignore any extent entries we find. - */ - while (entry->bitmap || found_bitmap || - (!entry->bitmap && entry->bytes < min_bytes)) { - struct rb_node *node = rb_next(&entry->offset_index); - - if (entry->bitmap && entry->bytes > bytes + empty_size) { - ret = btrfs_bitmap_cluster(block_group, entry, cluster, - offset, bytes + empty_size, - min_bytes); - if (!ret) - goto got_it; + return ret; +} + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root = RB_ROOT; + cluster->max_size = 0; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + +static int do_trimming(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 bytes, + u64 reserved_start, u64 reserved_bytes) +{ + struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_fs_info *fs_info = block_group->fs_info; + int ret; + int update = 0; + u64 trimmed = 0; + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += reserved_bytes; + space_info->bytes_reserved += reserved_bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, bytes, &trimmed); + if (!ret) + *total_trimmed += trimmed; + + btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += reserved_bytes; + block_group->reserved -= reserved_bytes; + space_info->bytes_reserved -= reserved_bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } + + return ret; +} + +static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = 0; + u64 extent_start; + u64 extent_bytes; + u64 bytes; + + while (start < end) { + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + break; } - if (!node) { - ret = -ENOSPC; - goto out; + entry = tree_search_offset(ctl, start, 0, 1); + if (!entry) { + spin_unlock(&ctl->tree_lock); + break; } - entry = rb_entry(node, struct btrfs_free_space, offset_index); - } - /* - * We already searched all the extent entries from the passed in offset - * to the end and didn't find enough space for the cluster, and we also - * didn't find any bitmaps that met our criteria, just go ahead and exit - */ - if (found_bitmap) { - ret = -ENOSPC; - goto out; + /* skip bitmaps */ + while (entry->bitmap) { + node = rb_next(&entry->offset_index); + if (!node) { + spin_unlock(&ctl->tree_lock); + goto out; + } + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + } + + if (entry->offset >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + + extent_start = entry->offset; + extent_bytes = entry->bytes; + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; + } + + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + + spin_unlock(&ctl->tree_lock); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + extent_start, extent_bytes); + if (ret) + break; +next: + start += bytes; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); } +out: + return ret; +} - cluster->points_to_bitmap = false; - window_start = entry->offset; - window_free = entry->bytes; - last = entry; - max_extent = entry->bytes; +static int trim_bitmaps(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = 0; + int ret2; + u64 bytes; + u64 offset = offset_to_bitmap(ctl, start); - while (1) { - /* out window is just right, lets fill it */ - if (window_free >= bytes + empty_size) + while (offset < end) { + bool next_bitmap = false; + + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); break; + } - node = rb_next(&last->offset_index); - if (!node) { - if (found_bitmap) - goto again; - ret = -ENOSPC; - goto out; + entry = tree_search_offset(ctl, offset, 1, 0); + if (!entry) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; } - next = rb_entry(node, struct btrfs_free_space, offset_index); - /* - * we found a bitmap, so if this search doesn't result in a - * cluster, we know to go and search again for the bitmaps and - * start looking for space there - */ - if (next->bitmap) { - if (!found_bitmap) - offset = next->offset; - found_bitmap = true; - last = next; - continue; + bytes = minlen; + ret2 = search_bitmap(ctl, entry, &start, &bytes); + if (ret2 || start >= end) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; } - /* - * we haven't filled the empty size and the window is - * very large. reset and try again - */ - if (next->offset - (last->offset + last->bytes) > 128 * 1024 || - next->offset - window_start > (bytes + empty_size) * 2) { - entry = next; - window_start = entry->offset; - window_free = entry->bytes; - last = entry; - max_extent = entry->bytes; + bytes = min(bytes, end - start); + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; + } + + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + + spin_unlock(&ctl->tree_lock); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + start, bytes); + if (ret) + break; +next: + if (next_bitmap) { + offset += BITS_PER_BITMAP * ctl->unit; } else { - last = next; - window_free += next->bytes; - if (entry->bytes > max_extent) - max_extent = entry->bytes; + start += bytes; + if (start >= offset + BITS_PER_BITMAP * ctl->unit) + offset += BITS_PER_BITMAP * ctl->unit; } + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + return ret; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); + + return ret; +} + +/* + * Find the left-most item in the cache tree, and then return the + * smallest inode number in the item. + * + * Note: the returned inode number may not be the smallest one in + * the tree, if the left-most item is a bitmap. + */ +u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) +{ + struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; + struct btrfs_free_space *entry = NULL; + u64 ino = 0; + + spin_lock(&ctl->tree_lock); + + if (RB_EMPTY_ROOT(&ctl->free_space_offset)) + goto out; + + entry = rb_entry(rb_first(&ctl->free_space_offset), + struct btrfs_free_space, offset_index); + + if (!entry->bitmap) { + ino = entry->offset; + + unlink_free_space(ctl, entry); + entry->offset++; + entry->bytes--; + if (!entry->bytes) + kmem_cache_free(btrfs_free_space_cachep, entry); + else + link_free_space(ctl, entry); + } else { + u64 offset = 0; + u64 count = 1; + int ret; + + ret = search_bitmap(ctl, entry, &offset, &count); + /* Logic error; Should be empty if it can't find anything */ + ASSERT(!ret); + + ino = offset; + bitmap_clear_bits(ctl, entry, offset, 1); + if (entry->bytes == 0) + free_bitmap(ctl, entry); } +out: + spin_unlock(&ctl->tree_lock); - cluster->window_start = entry->offset; + return ino; +} + +struct inode *lookup_free_ino_inode(struct btrfs_root *root, + struct btrfs_path *path) +{ + struct inode *inode = NULL; + + spin_lock(&root->cache_lock); + if (root->cache_inode) + inode = igrab(root->cache_inode); + spin_unlock(&root->cache_lock); + if (inode) + return inode; + + inode = __lookup_free_space_inode(root, path, 0); + if (IS_ERR(inode)) + return inode; + + spin_lock(&root->cache_lock); + if (!btrfs_fs_closing(root->fs_info)) + root->cache_inode = igrab(inode); + spin_unlock(&root->cache_lock); + + return inode; +} + +int create_free_ino_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + return __create_free_space_inode(root, trans, path, + BTRFS_FREE_INO_OBJECTID, 0); +} + +int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_path *path; + struct inode *inode; + int ret = 0; + u64 root_gen = btrfs_root_generation(&root->root_item); + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; /* - * now we've found our entries, pull them out of the free space - * cache and put them into the cluster rbtree - * - * The cluster includes an rbtree, but only uses the offset index - * of each free space cache entry. + * If we're unmounting then just return, since this does a search on the + * normal root and not the commit root and we could deadlock. */ - while (1) { - node = rb_next(&entry->offset_index); - if (entry->bitmap && node) { - entry = rb_entry(node, struct btrfs_free_space, - offset_index); - continue; - } else if (entry->bitmap && !node) { - break; - } + if (btrfs_fs_closing(fs_info)) + return 0; - rb_erase(&entry->offset_index, &block_group->free_space_offset); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 0); - BUG_ON(ret); + path = btrfs_alloc_path(); + if (!path) + return 0; - if (!node || entry == last) - break; + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode)) + goto out; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - } + if (root_gen != BTRFS_I(inode)->generation) + goto out_put; - cluster->max_size = max_extent; -got_it: - ret = 0; - atomic_inc(&block_group->count); - list_add_tail(&cluster->block_group_list, &block_group->cluster_list); - cluster->block_group = block_group; + ret = __load_free_space_cache(root, inode, ctl, path, 0); + + if (ret < 0) + btrfs_err(fs_info, + "failed to load free ino cache for root %llu", + root->root_key.objectid); +out_put: + iput(inode); out: - spin_unlock(&cluster->lock); - spin_unlock(&block_group->tree_lock); + btrfs_free_path(path); + return ret; +} + +int btrfs_write_out_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + int ret; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG + btrfs_err(root->fs_info, + "failed to write free ino cache for root %llu", + root->root_key.objectid); +#endif + } return ret; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* - * simple code to zero out a cluster + * Use this if you need to make a bitmap or extent entry specifically, it + * doesn't do any of the merging that add_free_space does, this acts a lot like + * how the free space cache loading stuff works, so you can get really weird + * configurations. */ -void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +int test_add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap) { - spin_lock_init(&cluster->lock); - spin_lock_init(&cluster->refill_lock); - cluster->root = RB_ROOT; - cluster->max_size = 0; - cluster->points_to_bitmap = false; - INIT_LIST_HEAD(&cluster->block_group_list); - cluster->block_group = NULL; + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info = NULL, *bitmap_info; + void *map = NULL; + u64 bytes_added; + int ret; + +again: + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + } + + if (!bitmap) { + spin_lock(&ctl->tree_lock); + info->offset = offset; + info->bytes = bytes; + ret = link_free_space(ctl, info); + spin_unlock(&ctl->tree_lock); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); + return ret; + } + + if (!map) { + map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!map) { + kmem_cache_free(btrfs_free_space_cachep, info); + return -ENOMEM; + } + } + + spin_lock(&ctl->tree_lock); + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + info->bitmap = map; + map = NULL; + add_new_bitmap(ctl, info, offset); + bitmap_info = info; + } + + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + spin_unlock(&ctl->tree_lock); + + if (bytes) + goto again; + + if (map) + kfree(map); + return 0; } +/* + * Checks to see if the given range is in the free space cache. This is really + * just used to check the absence of space, so if there is free space in the + * range at all we will return 1. + */ +int test_check_exists(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info; + int ret = 0; + + spin_lock(&ctl->tree_lock); + info = tree_search_offset(ctl, offset, 0, 0); + if (!info) { + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!info) + goto out; + } + +have_info: + if (info->bitmap) { + u64 bit_off, bit_bytes; + struct rb_node *n; + struct btrfs_free_space *tmp; + + bit_off = offset; + bit_bytes = ctl->unit; + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); + if (!ret) { + if (bit_off == offset) { + ret = 1; + goto out; + } else if (bit_off > offset && + offset + bytes > bit_off) { + ret = 1; + goto out; + } + } + + n = rb_prev(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (tmp->offset + tmp->bytes < offset) + break; + if (offset + bytes < tmp->offset) { + n = rb_prev(&info->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + n = rb_next(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (offset + bytes < tmp->offset) + break; + if (tmp->offset + tmp->bytes < offset) { + n = rb_next(&info->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + goto out; + } + + if (info->offset == offset) { + ret = 1; + goto out; + } + + if (offset > info->offset && offset < info->offset + info->bytes) + ret = 1; +out: + spin_unlock(&ctl->tree_lock); + return ret; +} +#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 890a8e79011..0cf4977ef70 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -27,27 +27,98 @@ struct btrfs_free_space { struct list_head list; }; -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); +struct btrfs_free_space_ctl { + spinlock_t tree_lock; + struct rb_root free_space_offset; + u64 free_space; + int extents_thresh; + int free_extents; + int total_bitmaps; + int unit; + u64 start; + struct btrfs_free_space_op *op; + void *private; +}; + +struct btrfs_free_space_op { + void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl); + bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); +}; + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path); +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct inode *inode); +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +struct inode *lookup_free_ino_inode(struct btrfs_root *root, + struct btrfs_path *path); +int create_free_ino_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path); +int load_free_ino_cache(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +int btrfs_write_out_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode); + +void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); +int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, + u64 bytenr, u64 size); +static inline int +btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size) +{ + return __btrfs_add_free_space(block_group->free_space_ctl, + bytenr, size); +} int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); +void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); void btrfs_remove_free_space_cache(struct btrfs_block_group_cache - *block_group); + *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes, u64 empty_size); + u64 offset, u64 bytes, u64 empty_size, + u64 *max_extent_size); +u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); -u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_find_space_cluster(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 bytes, - u64 min_start); + u64 min_start, u64 *max_extent_size); int btrfs_return_cluster_to_free_space( struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster); +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen); + +/* Support functions for runnint our sanity tests */ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int test_add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap); +int test_check_exists(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes); +#endif + #endif diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c new file mode 100644 index 00000000000..85889aa82c6 --- /dev/null +++ b/fs/btrfs/hash.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <crypto/hash.h> +#include <linux/err.h> +#include "hash.h" + +static struct crypto_shash *tfm; + +int __init btrfs_hash_init(void) +{ + tfm = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + return 0; +} + +void btrfs_hash_exit(void) +{ + crypto_free_shash(tfm); +} + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(tfm)]; + } desc; + int err; + + desc.shash.tfm = tfm; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index db2ff9773b9..118a2316e5d 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -19,9 +19,24 @@ #ifndef __HASH__ #define __HASH__ -#include <linux/crc32c.h> +int __init btrfs_hash_init(void); + +void btrfs_hash_exit(void); + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); + static inline u64 btrfs_name_hash(const char *name, int len) { - return crc32c((u32)~1, name, len); + return btrfs_crc32c((u32)~1, name, len); } + +/* + * Figure the key offset of an extended inode ref + */ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, + int len) +{ + return (u64) btrfs_crc32c(parent_objectid, name, len); +} + #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 72ce3c173d6..2be38df703c 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -18,7 +18,9 @@ #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" +#include "print-tree.h" static int find_name_in_backref(struct btrfs_path *path, const char *name, int name_len, struct btrfs_inode_ref **ref_ret) @@ -49,10 +51,149 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, return 0; } +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, + const char *name, int name_len, + struct btrfs_inode_extref **extref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_extref *extref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + + /* + * Search all extended backrefs in this item. We're only + * looking through any collisions so most of the time this is + * just going to compare against one buffer. If all is well, + * we'll return success and the inode ref object. + */ + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_ptr = (unsigned long)(&extref->name); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (ref_name_len == name_len && + btrfs_inode_extref_parent(leaf, extref) == ref_objectid && + (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) { + if (extref_ret) + *extref_ret = extref; + return 1; + } + + cur_offset += ref_name_len + sizeof(*extref); + } + return 0; +} + +/* Returns NULL if no extref found */ +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow) +{ + int ret; + struct btrfs_key key; + struct btrfs_inode_extref *extref; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) + return NULL; + return extref; +} + +static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, + u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + int ret; + int del_len = name_len + sizeof(*extref); + unsigned long ptr; + unsigned long item_start; + u32 item_size; + + key.objectid = inode_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + + /* + * Sanity check - did we find the right item for this name? + * This should always succeed so error here will make the FS + * readonly. + */ + if (!btrfs_find_name_in_ext_backref(path, ref_objectid, + name, name_len, &extref)) { + btrfs_std_error(root->fs_info, -ENOENT); + ret = -EROFS; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (index) + *index = btrfs_inode_extref_index(leaf, extref); + + if (del_len == item_size) { + /* + * Common case only one ref in the item, remove the + * whole item. + */ + ret = btrfs_del_item(trans, root, path); + goto out; + } + + ptr = (unsigned long)extref; + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + + memmove_extent_buffer(leaf, ptr, ptr + del_len, + item_size - (ptr + del_len - item_start)); + + btrfs_truncate_item(root, path, item_size - del_len, 1); + +out: + btrfs_free_path(path); + + return ret; +} + int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index) + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; struct btrfs_key key; @@ -63,6 +204,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, u32 item_size; u32 sub_item_len; int ret; + int search_ext_refs = 0; int del_len = name_len + sizeof(*ref); key.objectid = inode_objectid; @@ -78,12 +220,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; + search_ext_refs = 1; goto out; } else if (ret < 0) { goto out; } if (!find_name_in_backref(path, name, name_len, &ref)) { ret = -ENOENT; + search_ext_refs = 1; goto out; } leaf = path->nodes[0]; @@ -101,14 +245,84 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - ret = btrfs_truncate_item(trans, root, path, - item_size - sub_item_len, 1); - BUG_ON(ret); + btrfs_truncate_item(root, path, item_size - sub_item_len, 1); +out: + btrfs_free_path(path); + + if (search_ext_refs) { + /* + * No refs were found, or we could not find the + * name in our ref array. Find and remove the extended + * inode ref then. + */ + return btrfs_del_inode_extref(trans, root, name, name_len, + inode_objectid, ref_objectid, index); + } + + return ret; +} + +/* + * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree. + * + * The caller must have checked against BTRFS_LINK_MAX already. + */ +static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_inode_extref *extref; + int ret; + int ins_len = name_len + sizeof(*extref); + unsigned long ptr; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_item *item; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + if (btrfs_find_name_in_ext_backref(path, ref_objectid, + name, name_len, NULL)) + goto out; + + btrfs_extend_item(root, path, ins_len); + ret = 0; + } + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_nr(path->slots[0]); + ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); + ptr += btrfs_item_size(leaf, item) - ins_len; + extref = (struct btrfs_inode_extref *)ptr; + + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); + btrfs_set_inode_extref_index(path->nodes[0], extref, index); + btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); + + ptr = (unsigned long)&extref->name; + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + out: btrfs_free_path(path); return ret; } +/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -139,8 +353,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - ret = btrfs_extend_item(trans, root, path, ins_len); - BUG_ON(ret); + btrfs_extend_item(root, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); @@ -164,6 +377,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); + + if (ret == -EMLINK) { + struct btrfs_super_block *disk_super = root->fs_info->super_copy; + /* We ran out of space in the ref array. Need to + * add an extended ref. */ + if (btrfs_super_incompat_flags(disk_super) + & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) + ret = btrfs_insert_inode_extref(trans, root, name, + name_len, + inode_objectid, + ref_objectid, index); + } + return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index c56eb590917..888fbe19079 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -16,11 +16,497 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> + #include "ctree.h" #include "disk-io.h" +#include "free-space-cache.h" +#include "inode-map.h" #include "transaction.h" -int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) +static int caching_kthread(void *data) +{ + struct btrfs_root *root = data; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + u64 last = (u64)-1; + int slot; + int ret; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Since the commit root is read-only, we can safely skip locking. */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 2; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; +again: + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->commit_root_sem); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + if (btrfs_fs_closing(fs_info)) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + + if (need_resched() || + btrfs_transaction_in_commit(fs_info)) { + leaf = path->nodes[0]; + + if (WARN_ON(btrfs_header_nritems(leaf) == 0)) + break; + + /* + * Save the key so we can advances forward + * in the next search. + */ + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(path); + root->cache_progress = last; + up_read(&fs_info->commit_root_sem); + schedule_timeout(1); + goto again; + } else + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_INODE_ITEM_KEY) + goto next; + + if (key.objectid >= root->highest_objectid) + break; + + if (last != (u64)-1 && last + 1 != key.objectid) { + __btrfs_add_free_space(ctl, last + 1, + key.objectid - last - 1); + wake_up(&root->cache_wait); + } + + last = key.objectid; +next: + path->slots[0]++; + } + + if (last < root->highest_objectid - 1) { + __btrfs_add_free_space(ctl, last + 1, + root->highest_objectid - last - 1); + } + + spin_lock(&root->cache_lock); + root->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&root->cache_lock); + + root->cache_progress = (u64)-1; + btrfs_unpin_free_ino(root); +out: + wake_up(&root->cache_wait); + up_read(&fs_info->commit_root_sem); + + btrfs_free_path(path); + + return ret; +} + +static void start_caching(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct task_struct *tsk; + int ret; + u64 objectid; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + + spin_lock(&root->cache_lock); + if (root->cached != BTRFS_CACHE_NO) { + spin_unlock(&root->cache_lock); + return; + } + + root->cached = BTRFS_CACHE_STARTED; + spin_unlock(&root->cache_lock); + + ret = load_free_ino_cache(root->fs_info, root); + if (ret == 1) { + spin_lock(&root->cache_lock); + root->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&root->cache_lock); + return; + } + + /* + * It can be quite time-consuming to fill the cache by searching + * through the extent tree, and this can keep ino allocation path + * waiting. Therefore at start we quickly find out the highest + * inode number and we know we can use inode numbers which fall in + * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID]. + */ + ret = btrfs_find_free_objectid(root, &objectid); + if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { + __btrfs_add_free_space(ctl, objectid, + BTRFS_LAST_FREE_OBJECTID - objectid + 1); + } + + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", + root->root_key.objectid); + if (IS_ERR(tsk)) { + btrfs_warn(root->fs_info, "failed to start inode caching task"); + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); + } +} + +int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) +{ + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return btrfs_find_free_objectid(root, objectid); + +again: + *objectid = btrfs_find_ino_for_alloc(root); + + if (*objectid != 0) + return 0; + + start_caching(root); + + wait_event(root->cache_wait, + root->cached == BTRFS_CACHE_FINISHED || + root->free_ino_ctl->free_space > 0); + + if (root->cached == BTRFS_CACHE_FINISHED && + root->free_ino_ctl->free_space == 0) + return -ENOSPC; + else + goto again; +} + +void btrfs_return_ino(struct btrfs_root *root, u64 objectid) +{ + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; +again: + if (root->cached == BTRFS_CACHE_FINISHED) { + __btrfs_add_free_space(pinned, objectid, 1); + } else { + down_write(&root->fs_info->commit_root_sem); + spin_lock(&root->cache_lock); + if (root->cached == BTRFS_CACHE_FINISHED) { + spin_unlock(&root->cache_lock); + up_write(&root->fs_info->commit_root_sem); + goto again; + } + spin_unlock(&root->cache_lock); + + start_caching(root); + + __btrfs_add_free_space(pinned, objectid, 1); + + up_write(&root->fs_info->commit_root_sem); + } +} + +/* + * When a transaction is committed, we'll move those inode numbers which + * are smaller than root->cache_progress from pinned tree to free_ino tree, + * and others will just be dropped, because the commit root we were + * searching has changed. + * + * Must be called with root->fs_info->commit_root_sem held + */ +void btrfs_unpin_free_ino(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; + struct btrfs_free_space *info; + struct rb_node *n; + u64 count; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + + while (1) { + n = rb_first(rbroot); + if (!n) + break; + + info = rb_entry(n, struct btrfs_free_space, offset_index); + BUG_ON(info->bitmap); /* Logic error */ + + if (info->offset > root->cache_progress) + goto free; + else if (info->offset + info->bytes > root->cache_progress) + count = root->cache_progress - info->offset + 1; + else + count = info->bytes; + + __btrfs_add_free_space(ctl, info->offset, count); +free: + rb_erase(&info->offset_index, rbroot); + kfree(info); + } +} + +#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) +#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) + +/* + * The goal is to keep the memory used by the free_ino tree won't + * exceed the memory if we use bitmaps only. + */ +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int max_ino; + int max_bitmaps; + + n = rb_last(&ctl->free_space_offset); + if (!n) { + ctl->extents_thresh = INIT_THRESHOLD; + return; + } + info = rb_entry(n, struct btrfs_free_space, offset_index); + + /* + * Find the maximum inode number in the filesystem. Note we + * ignore the fact that this can be a bitmap, because we are + * not doing precise calculation. + */ + max_ino = info->bytes - 1; + + max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP; + if (max_bitmaps <= ctl->total_bitmaps) { + ctl->extents_thresh = 0; + return; + } + + ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * + PAGE_CACHE_SIZE / sizeof(*info); +} + +/* + * We don't fall back to bitmap, if we are below the extents threshold + * or this chunk of inode numbers is a big one. + */ +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + if (ctl->free_extents < ctl->extents_thresh || + info->bytes > INODES_PER_BITMAP / 10) + return false; + + return true; +} + +static struct btrfs_free_space_op free_ino_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + +static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl) +{ +} + +static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + /* + * We always use extents for two reasons: + * + * - The pinned tree is only used during the process of caching + * work. + * - Make code simpler. See btrfs_unpin_free_ino(). + */ + return false; +} + +static struct btrfs_free_space_op pinned_free_ino_op = { + .recalc_thresholds = pinned_recalc_thresholds, + .use_bitmap = pinned_use_bitmap, +}; + +void btrfs_init_free_ino_ctl(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + spin_lock_init(&ctl->tree_lock); + ctl->unit = 1; + ctl->start = 0; + ctl->private = NULL; + ctl->op = &free_ino_op; + + /* + * Initially we allow to use 16K of ram to cache chunks of + * inode numbers before we resort to bitmaps. This is somewhat + * arbitrary, but it will be adjusted in runtime. + */ + ctl->extents_thresh = INIT_THRESHOLD; + + spin_lock_init(&pinned->tree_lock); + pinned->unit = 1; + pinned->start = 0; + pinned->private = NULL; + pinned->extents_thresh = 0; + pinned->op = &pinned_free_ino_op; +} + +int btrfs_save_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_path *path; + struct inode *inode; + struct btrfs_block_rsv *rsv; + u64 num_bytes; + u64 alloc_hint = 0; + int ret; + int prealloc; + bool retry = false; + + /* only fs tree and subvol/snap needs ino cache */ + if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && + (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || + root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) + return 0; + + /* Don't save inode cache if we are deleting this root */ + if (btrfs_root_refs(&root->root_item) == 0) + return 0; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->trans_block_rsv; + + num_bytes = trans->bytes_reserved; + /* + * 1 item for inode item insertion if need + * 4 items for inode item update (in the worst case) + * 1 items for slack space if we need do truncation + * 1 item for free space object + * 3 items for pre-allocation + */ + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + trans->bytes_reserved, + BTRFS_RESERVE_NO_FLUSH); + if (ret) + goto out; + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + trans->transid, trans->bytes_reserved, 1); +again: + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { + ret = PTR_ERR(inode); + goto out_release; + } + + if (IS_ERR(inode)) { + BUG_ON(retry); /* Logic error */ + retry = true; + + ret = create_free_ino_inode(root, trans, path); + if (ret) + goto out_release; + goto again; + } + + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } + + if (i_size_read(inode) > 0) { + ret = btrfs_truncate_free_space_cache(root, trans, inode); + if (ret) { + if (ret != -ENOSPC) + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } + } + + spin_lock(&root->cache_lock); + if (root->cached != BTRFS_CACHE_FINISHED) { + ret = -1; + spin_unlock(&root->cache_lock); + goto out_put; + } + spin_unlock(&root->cache_lock); + + spin_lock(&ctl->tree_lock); + prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; + prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE); + prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE; + spin_unlock(&ctl->tree_lock); + + /* Just to make sure we have enough space */ + prealloc += 8 * PAGE_CACHE_SIZE; + + ret = btrfs_delalloc_reserve_space(inode, prealloc); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, + prealloc, prealloc, &alloc_hint); + if (ret) { + btrfs_delalloc_release_space(inode, prealloc); + goto out_put; + } + btrfs_free_reserved_data_space(inode, prealloc); + + ret = btrfs_write_out_ino_cache(root, trans, path, inode); +out_put: + iput(inode); +out_release: + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + trans->transid, trans->bytes_reserved, 0); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); +out: + trans->block_rsv = rsv; + trans->bytes_reserved = num_bytes; + + btrfs_free_path(path); + return ret; +} + +static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) { struct btrfs_path *path; int ret; @@ -30,7 +516,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) int slot; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; search_key.objectid = BTRFS_LAST_FREE_OBJECTID; search_key.type = -1; @@ -38,7 +525,7 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; @@ -54,15 +541,14 @@ error: return ret; } -int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 dirid, u64 *objectid) +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) { int ret; mutex_lock(&root->objectid_mutex); if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_find_highest_inode(root, &root->highest_objectid); + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); if (ret) goto out; } diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h new file mode 100644 index 00000000000..ddb347bfee2 --- /dev/null +++ b/fs/btrfs/inode-map.h @@ -0,0 +1,13 @@ +#ifndef __BTRFS_INODE_MAP +#define __BTRFS_INODE_MAP + +void btrfs_init_free_ino_ctl(struct btrfs_root *root); +void btrfs_unpin_free_ino(struct btrfs_root *root); +void btrfs_return_ino(struct btrfs_root *root, u64 objectid); +int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid); +int btrfs_save_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans); + +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2bfdc641d4e..3668048e16f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,27 +32,36 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> +#include <linux/aio.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/falloc.h> #include <linux/slab.h> -#include "compat.h" +#include <linux/ratelimit.h> +#include <linux/mount.h> +#include <linux/btrfs.h> +#include <linux/blkdev.h> +#include <linux/posix_acl_xattr.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" -#include "volumes.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" +#include "volumes.h" #include "compression.h" #include "locking.h" +#include "free-space-cache.h" +#include "inode-map.h" +#include "backref.h" +#include "hash.h" +#include "props.h" struct btrfs_iget_args { - u64 ino; + struct btrfs_key *location; struct btrfs_root *root; }; @@ -67,9 +76,11 @@ static const struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; +static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; +struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { @@ -82,21 +93,30 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static void btrfs_truncate(struct inode *inode); -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static int btrfs_setsize(struct inode *inode, struct iattr *attr); +static int btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, + u64 len, u64 orig_start, + u64 block_start, u64 block_len, + u64 orig_block_len, u64 ram_bytes, + int type); + +static int btrfs_dirty_inode(struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) + struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; err = btrfs_init_acl(trans, inode, dir); if (!err) - err = btrfs_xattr_security_init(trans, inode, dir); + err = btrfs_xattr_security_init(trans, inode, dir, qstr); return err; } @@ -105,13 +125,13 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * the btree. The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree */ -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, +static int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, + int compress_type, struct page **compressed_pages) { - struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -120,34 +140,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, int err = 0; int ret; size_t cur_size = size; - size_t datasize; unsigned long offset; - int use_compress = 0; - if (compressed_size && compressed_pages) { - use_compress = 1; + if (compressed_size && compressed_pages) cur_size = compressed_size; - } - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + inode_add_bytes(inode, size); - path->leave_spinning = 1; - btrfs_set_trans_block_group(trans, inode); + if (!extent_inserted) { + struct btrfs_key key; + size_t datasize; - key.objectid = inode->i_ino; - key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - datasize = btrfs_file_extent_calc_inline_size(cur_size); + key.objectid = btrfs_ino(inode); + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - inode_add_bytes(inode, size); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - BUG_ON(ret); - if (ret) { - err = ret; - goto fail; + datasize = btrfs_file_extent_calc_inline_size(cur_size); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (ret) { + err = ret; + goto fail; + } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -159,7 +174,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); - if (use_compress) { + if (compress_type != BTRFS_COMPRESS_NONE) { struct page *cpage; int i = 0; while (compressed_size > 0) { @@ -167,28 +182,28 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_CACHE_SIZE); - kaddr = kmap_atomic(cpage, KM_USER0); + kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); i++; ptr += cur_size; compressed_size -= cur_size; } btrfs_set_file_extent_compression(leaf, ei, - BTRFS_COMPRESS_ZLIB); + compress_type); } else { page = find_get_page(inode->i_mapping, start >> PAGE_CACHE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); offset = start & (PAGE_CACHE_SIZE - 1); write_extent_buffer(leaf, kaddr + offset, ptr, size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); page_cache_release(page); } btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); + btrfs_release_path(path); /* * we're an inline extent, so nobody can @@ -200,11 +215,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, * could end up racing with unlink. */ BTRFS_I(inode)->disk_i_size = inode->i_size; - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); - return 0; + return ret; fail: - btrfs_free_path(path); return err; } @@ -214,20 +228,22 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, u64 start, u64 end, - size_t compressed_size, - struct page **compressed_pages) +static noinline int cow_file_range_inline(struct btrfs_root *root, + struct inode *inode, u64 start, + u64 end, size_t compressed_size, + int compress_type, + struct page **compressed_pages) { + struct btrfs_trans_handle *trans; u64 isize = i_size_read(inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; - u64 aligned_end = (end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - u64 hint_byte; + u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; + struct btrfs_path *path; + int extent_inserted = 0; + u32 extent_item_size; if (compressed_size) data_len = compressed_size; @@ -242,18 +258,53 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } - ret = btrfs_drop_extents(trans, inode, start, aligned_end, - &hint_byte, 1); - BUG_ON(ret); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (compressed_size && compressed_pages) + extent_item_size = btrfs_file_extent_calc_inline_size( + compressed_size); + else + extent_item_size = btrfs_file_extent_calc_inline_size( + inline_len); + + ret = __btrfs_drop_extents(trans, root, inode, path, + start, aligned_end, NULL, + 1, 1, extent_item_size, &extent_inserted); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, root, inode, start, + ret = insert_inline_extent(trans, path, extent_inserted, + root, inode, start, inline_len, compressed_size, - compressed_pages); - BUG_ON(ret); + compress_type, compressed_pages); + if (ret && ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } else if (ret == -ENOSPC) { + ret = 1; + goto out; + } + + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); - return 0; +out: + btrfs_free_path(path); + btrfs_end_transaction(trans, root); + return ret; } struct async_extent { @@ -262,6 +313,7 @@ struct async_extent { u64 compressed_size; struct page **pages; unsigned long nr_pages; + int compress_type; struct list_head list; }; @@ -279,16 +331,19 @@ static noinline int add_async_extent(struct async_cow *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, - unsigned long nr_pages) + unsigned long nr_pages, + int compress_type) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + BUG_ON(!async_extent); /* -ENOMEM */ async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; async_extent->pages = pages; async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } @@ -307,7 +362,8 @@ static noinline int add_async_extent(struct async_cow *cow, * If this code finds it can't get good compression, it puts an * entry onto the work queue to write the uncompressed bytes. This * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that pdflush sent them down. + * are written in the same order that the flusher thread sent them + * down. */ static noinline int compress_file_range(struct inode *inode, struct page *locked_page, @@ -316,10 +372,7 @@ static noinline int compress_file_range(struct inode *inode, int *num_added) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; u64 num_bytes; - u64 orig_start; - u64 disk_num_bytes; u64 blocksize = root->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); @@ -333,8 +386,21 @@ static noinline int compress_file_range(struct inode *inode, unsigned long max_uncompressed = 128 * 1024; int i; int will_compress; + int compress_type = root->fs_info->compress_type; + int redirty = 0; + + /* if this is a small write inside eof, kick off a defrag */ + if ((end - start + 1) < 16 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + btrfs_add_inode_defrag(NULL, inode); - orig_start = start; + /* + * skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it dosen't save disk space at all. + */ + if ((end - start + 1) <= blocksize && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + goto cleanup_and_bail_uncompressed; actual_end = min_t(u64, isize, end + 1); again: @@ -368,9 +434,8 @@ again: * a compressed extent to 128k. */ total_compressed = min(total_compressed, max_uncompressed); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); - disk_num_bytes = num_bytes; total_in = 0; ret = 0; @@ -381,16 +446,36 @@ again: */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && (btrfs_test_opt(root, COMPRESS) || - (BTRFS_I(inode)->force_compress))) { + (BTRFS_I(inode)->force_compress) || + (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + if (!pages) { + /* just bail out to the uncompressed code */ + goto cont; + } - ret = btrfs_zlib_compress_pages(inode->i_mapping, start, - total_compressed, pages, - nr_pages, &nr_pages_ret, - &total_in, - &total_compressed, - max_compressed); + if (BTRFS_I(inode)->force_compress) + compress_type = BTRFS_I(inode)->force_compress; + + /* + * we need to call clear_page_dirty_for_io on each + * page in the range. Otherwise applications with the file + * mmap'd can wander in and change the page contents while + * we are compressing them. + * + * If the compression fails for any reason, we set the pages + * dirty again later on. + */ + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; + ret = btrfs_compress_pages(compress_type, + inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); if (!ret) { unsigned long offset = total_compressed & @@ -402,50 +487,46 @@ again: * sending it down to disk */ if (offset) { - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } will_compress = 1; } } +cont: if (start == 0) { - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - btrfs_set_trans_block_group(trans, inode); - /* lets try to make an inline extent */ if (ret || total_in < (actual_end - start)) { /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(trans, root, inode, - start, end, 0, NULL); + ret = cow_file_range_inline(root, inode, start, end, + 0, 0, NULL); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(trans, root, inode, - start, end, - total_compressed, pages); + ret = cow_file_range_inline(root, inode, start, end, + total_compressed, + compress_type, pages); } - if (ret == 0) { + if (ret <= 0) { + unsigned long clear_flags = EXTENT_DELALLOC | + EXTENT_DEFRAG; + clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; + /* - * inline extent creation worked, we don't need - * to create any more async work items. Unlock - * and free up our temp pages. + * inline extent creation worked or returned error, + * we don't need to create any more async work items. + * Unlock and free up our temp pages. */ - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | - EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); - - btrfs_end_transaction(trans, root); + extent_clear_unlock_delalloc(inode, start, end, NULL, + clear_flags, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); goto free_pages_out; } - btrfs_end_transaction(trans, root); } if (will_compress) { @@ -454,19 +535,16 @@ again: * up to a block size boundary so the allocator does sane * things */ - total_compressed = (total_compressed + blocksize - 1) & - ~(blocksize - 1); + total_compressed = ALIGN(total_compressed, blocksize); /* * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = (total_in + PAGE_CACHE_SIZE - 1) & - ~(PAGE_CACHE_SIZE - 1); + total_in = ALIGN(total_in, PAGE_CACHE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { - disk_num_bytes = total_compressed; num_bytes = total_in; } } @@ -498,9 +576,10 @@ again: * and will submit them to the elevator. */ add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret); + total_compressed, pages, nr_pages_ret, + compress_type); - if (start + num_bytes < end && start + num_bytes < actual_end) { + if (start + num_bytes < end) { start += num_bytes; pages = NULL; cond_resched(); @@ -520,12 +599,15 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } - add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + if (redirty) + extent_range_redirty_for_io(inode, start, end); + add_async_extent(async_cow, start, end - start + 1, + 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; } out: - return 0; + return ret; free_pages_out: for (i = 0; i < nr_pages_ret; i++) { @@ -548,7 +630,6 @@ static noinline int submit_compressed_extents(struct inode *inode, { struct async_extent *async_extent; u64 alloc_hint = 0; - struct btrfs_trans_handle *trans; struct btrfs_key ins; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -559,7 +640,7 @@ static noinline int submit_compressed_extents(struct inode *inode, if (list_empty(&async_cow->extents)) return 0; - +again: while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, struct async_extent, list); @@ -575,7 +656,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + - async_extent->ram_size - 1, GFP_NOFS); + async_extent->ram_size - 1); /* allocate blocks */ ret = cow_file_range(inode, async_cow->locked_page, @@ -584,6 +665,8 @@ retry: async_extent->ram_size - 1, &page_started, &nr_written, 0); + /* JDM XXX */ + /* * if page_started, cow_file_range inserted an * inline extent and took care of all the unlocking @@ -597,25 +680,23 @@ retry: async_extent->ram_size - 1, btrfs_get_extent, WB_SYNC_ALL); + else if (ret) + unlock_page(async_cow->locked_page); kfree(async_extent); cond_resched(); continue; } lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1, - GFP_NOFS); + async_extent->start + async_extent->ram_size - 1); - trans = btrfs_join_transaction(root, 1); - ret = btrfs_reserve_extent(trans, root, + ret = btrfs_reserve_extent(root, async_extent->compressed_size, async_extent->compressed_size, - 0, alloc_hint, - (u64)-1, &ins, 1); - btrfs_end_transaction(trans, root); - + 0, alloc_hint, &ins, 1, 1); if (ret) { int i; + for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); page_cache_release(async_extent->pages[i]); @@ -623,10 +704,14 @@ retry: kfree(async_extent->pages); async_extent->nr_pages = 0; async_extent->pages = NULL; - unlock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, GFP_NOFS); - goto retry; + + if (ret == -ENOSPC) { + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1); + goto retry; + } + goto out_free; } /* @@ -637,20 +722,30 @@ retry: async_extent->start + async_extent->ram_size - 1, 0); - em = alloc_extent_map(GFP_NOFS); + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto out_free_reserve; + } em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; + em->ram_bytes = async_extent->ram_size; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -661,40 +756,87 @@ retry: async_extent->ram_size - 1, 0); } - ret = btrfs_add_ordered_extent(inode, async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - BTRFS_ORDERED_COMPRESSED); - BUG_ON(ret); + if (ret) + goto out_free_reserve; + + ret = btrfs_add_ordered_extent_compress(inode, + async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); + if (ret) + goto out_free_reserve; /* * clear dirty, set writeback and unlock the pages. */ - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - async_extent->start, + extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - NULL, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); - + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages); - - BUG_ON(ret); alloc_hint = ins.objectid + ins.offset; kfree(async_extent); + if (ret) + goto out; cond_resched(); } + ret = 0; +out: + return ret; +out_free_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); +out_free: + extent_clear_unlock_delalloc(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + kfree(async_extent); + goto again; +} - return 0; +static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; } /* @@ -717,106 +859,91 @@ static noinline int cow_file_range(struct inode *inode, int unlock) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; u64 disk_num_bytes; u64 cur_alloc_size; u64 blocksize = root->sectorsize; - u64 actual_end; - u64 isize = i_size_read(inode); struct btrfs_key ins; struct extent_map *em; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - btrfs_set_trans_block_group(trans, inode); - - actual_end = min_t(u64, isize, end + 1); + if (btrfs_is_free_space_inode(inode)) { + WARN_ON_ONCE(1); + ret = -EINVAL; + goto out_unlock; + } - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; - ret = 0; + + /* if this is a small write inside eof, kick off defrag */ + if (num_bytes < 64 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + btrfs_add_inode_defrag(NULL, inode); if (start == 0) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(trans, root, inode, - start, end, 0, NULL); + ret = cow_file_range_inline(root, inode, start, end, 0, 0, + NULL); if (ret == 0) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, start, end, NULL, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; - ret = 0; goto out; + } else if (ret < 0) { + goto out_unlock; } } BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(&root->fs_info->super_copy)); - + btrfs_super_total_bytes(root->fs_info->super_copy)); - read_lock(&BTRFS_I(inode)->extent_tree.lock); - em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, - start, num_bytes); - if (em) { - /* - * if block start isn't an actual block number then find the - * first block in this inode and use that as a hint. If that - * block is also bogus then just don't worry about it. - */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; - if (em) - free_extent_map(em); - } else { - alloc_hint = em->block_start; - free_extent_map(em); - } - } - read_unlock(&BTRFS_I(inode)->extent_tree.lock); + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { unsigned long op; cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + ret = btrfs_reserve_extent(root, cur_alloc_size, root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); - BUG_ON(ret); + &ins, 1, 1); + if (ret < 0) + goto out_unlock; - em = alloc_extent_map(GFP_NOFS); + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto out_reserve; + } em->start = start; em->orig_start = em->start; ram_size = ins.offset; em->len = ins.offset; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; + em->ram_bytes = ram_size; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -825,17 +952,21 @@ static noinline int cow_file_range(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); } + if (ret) + goto out_reserve; cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); - BUG_ON(ret); + if (ret) + goto out_reserve; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); - BUG_ON(ret); + if (ret) + goto out_reserve; } if (disk_num_bytes < cur_alloc_size) @@ -848,23 +979,30 @@ static noinline int cow_file_range(struct inode *inode, * Do set the Private2 bit so we know this page was properly * setup for writepage */ - op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; - op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_SET_PRIVATE2; + op = unlock ? PAGE_UNLOCK : 0; + op |= PAGE_SET_PRIVATE2; - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, start + ram_size - 1, - locked_page, op); + extent_clear_unlock_delalloc(inode, start, + start + ram_size - 1, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC, + op); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } out: - ret = 0; - btrfs_end_transaction(trans, root); - return ret; + +out_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); +out_unlock: + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DELALLOC | EXTENT_DEFRAG, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + goto out; } /* @@ -879,8 +1017,10 @@ static noinline void async_cow_start(struct btrfs_work *work) compress_file_range(async_cow->inode, async_cow->locked_page, async_cow->start, async_cow->end, async_cow, &num_added); - if (num_added == 0) + if (num_added == 0) { + btrfs_add_delayed_iput(async_cow->inode); async_cow->inode = NULL; + } } /* @@ -898,10 +1038,8 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; - atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); - - if (atomic_read(&root->fs_info->async_delalloc_pages) < - 5 * 1042 * 1024 && + if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < + 5 * 1024 * 1024 && waitqueue_active(&root->fs_info->async_submit_wait)) wake_up(&root->fs_info->async_submit_wait); @@ -913,6 +1051,8 @@ static noinline void async_cow_free(struct btrfs_work *work) { struct async_cow *async_cow; async_cow = container_of(work, struct async_cow, work); + if (async_cow->inode) + btrfs_add_delayed_iput(async_cow->inode); kfree(async_cow); } @@ -924,13 +1064,14 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; - int limit = 10 * 1024 * 1042; + int limit = 10 * 1024 * 1024; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); - async_cow->inode = inode; + BUG_ON(!async_cow); /* -ENOMEM */ + async_cow->inode = igrab(inode); async_cow->root = root; async_cow->locked_page = locked_page; async_cow->start = start; @@ -943,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); - async_cow->work.func = async_cow_start; - async_cow->work.ordered_func = async_cow_submit; - async_cow->work.ordered_free = async_cow_free; - async_cow->work.flags = 0; + btrfs_init_work(&async_cow->work, async_cow_start, + async_cow_submit, async_cow_free); nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); - btrfs_queue_worker(&root->fs_info->delalloc_workers, - &async_cow->work); + btrfs_queue_work(root->fs_info->delalloc_workers, + &async_cow->work); if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { wait_event(root->fs_info->async_submit_wait, @@ -983,7 +1122,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root, LIST_HEAD(list); ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, - bytenr + num_bytes - 1, &list); + bytenr + num_bytes - 1, &list, 0); if (ret == 0 && list_empty(&list)) return 0; @@ -1019,28 +1158,61 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 extent_offset; u64 disk_bytenr; u64 num_bytes; + u64 disk_num_bytes; + u64 ram_bytes; int extent_type; - int ret; + int ret, err; int type; int nocow; int check_prev = 1; + bool nolock; + u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); - BUG_ON(!path); - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + if (!path) { + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); + return -ENOMEM; + } + + nolock = btrfs_is_free_space_inode(inode); + + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_free_path(path); + return PTR_ERR(trans); + } + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; cow_start = (u64)-1; cur_offset = start; while (1) { - ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + ret = btrfs_lookup_file_extent(trans, root, path, ino, cur_offset, 0); - BUG_ON(ret < 0); + if (ret < 0) + goto error; if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); - if (found_key.objectid == inode->i_ino && + if (found_key.objectid == ino && found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } @@ -1050,7 +1222,7 @@ next_slot: if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - BUG_ON(1); + goto error; if (ret > 0) break; leaf = path->nodes[0]; @@ -1061,7 +1233,7 @@ next_slot: num_bytes = 0; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid > inode->i_ino || + if (found_key.objectid > ino || found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; @@ -1076,12 +1248,15 @@ next_slot: struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); + disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); if (extent_end <= start) { path->slots[0]++; goto next_slot; @@ -1096,7 +1271,7 @@ next_slot: goto out_check; if (btrfs_extent_readonly(root, disk_bytenr)) goto out_check; - if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + if (btrfs_cross_ref_exist(trans, root, ino, found_key.offset - extent_offset, disk_bytenr)) goto out_check; @@ -1104,6 +1279,15 @@ next_slot: disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* + * if there are pending snapshots for this root, + * we fall into common COW way. + */ + if (!nolock) { + err = btrfs_start_nocow_write(root); + if (!err) + goto out_check; + } + /* * force cow if csum exists in the range. * this ensure that csum for a given extent are * either valid or do not exist. @@ -1113,7 +1297,8 @@ next_slot: nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); extent_end = ALIGN(extent_end, root->sectorsize); } else { BUG_ON(1); @@ -1121,6 +1306,8 @@ next_slot: out_check: if (extent_end <= start) { path->slots[0]++; + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto next_slot; } if (!nocow) { @@ -1133,12 +1320,16 @@ out_check: goto next_slot; } - btrfs_release_path(root, path); + btrfs_release_path(path); if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, - found_key.offset - 1, page_started, - nr_written, 1); - BUG_ON(ret); + ret = cow_file_range(inode, locked_page, + cow_start, found_key.offset - 1, + page_started, nr_written, 1); + if (ret) { + if (!nolock && nocow) + btrfs_end_nocow_write(root); + goto error; + } cow_start = (u64)-1; } @@ -1146,17 +1337,24 @@ out_check: struct extent_map *em; struct extent_map_tree *em_tree; em_tree = &BTRFS_I(inode)->extent_tree; - em = alloc_extent_map(GFP_NOFS); + em = alloc_extent_map(); + BUG_ON(!em); /* -ENOMEM */ em->start = cur_offset; - em->orig_start = em->start; + em->orig_start = found_key.offset - extent_offset; em->len = num_bytes; em->block_len = num_bytes; em->block_start = disk_bytenr; + em->orig_block_len = disk_num_bytes; + em->ram_bytes = ram_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->mod_start = em->start; + em->mod_len = em->len; set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_FILLING, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -1172,31 +1370,59 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, cur_offset, + num_bytes); + if (ret) { + if (!nolock && nocow) + btrfs_end_nocow_write(root); + goto error; + } + } - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - cur_offset, cur_offset + num_bytes - 1, - locked_page, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_SET_PRIVATE2); + extent_clear_unlock_delalloc(inode, cur_offset, + cur_offset + num_bytes - 1, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC, PAGE_UNLOCK | + PAGE_SET_PRIVATE2); + if (!nolock && nocow) + btrfs_end_nocow_write(root); cur_offset = extent_end; if (cur_offset > end) break; } - btrfs_release_path(root, path); + btrfs_release_path(path); - if (cur_offset <= end && cow_start == (u64)-1) + if (cur_offset <= end && cow_start == (u64)-1) { cow_start = cur_offset; + cur_offset = end; + } + if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1); - BUG_ON(ret); + if (ret) + goto error; } - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); +error: + err = btrfs_end_transaction(trans, root); + if (!ret) + ret = err; + + if (ret && cur_offset < end) + extent_clear_unlock_delalloc(inode, cur_offset, end, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); btrfs_free_path(path); - return 0; + return ret; } /* @@ -1209,33 +1435,36 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) + } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress)) + } else if (!btrfs_test_opt(root, COMPRESS) && + !(BTRFS_I(inode)->force_compress) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); - else + } else { + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags); ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); + } return ret; } -static int btrfs_split_extent_hook(struct inode *inode, +static void btrfs_split_extent_hook(struct inode *inode, struct extent_state *orig, u64 split) { + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) - return 0; + return; - spin_lock(&BTRFS_I(inode)->accounting_lock); + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - - return 0; + spin_unlock(&BTRFS_I(inode)->lock); } /* @@ -1244,19 +1473,57 @@ static int btrfs_split_extent_hook(struct inode *inode, * extents, such as when we are doing sequential writes, so we can properly * account for the metadata space we'll need. */ -static int btrfs_merge_extent_hook(struct inode *inode, - struct extent_state *new, - struct extent_state *other) +static void btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) { /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) - return 0; + return; - spin_lock(&BTRFS_I(inode)->accounting_lock); + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); + spin_unlock(&BTRFS_I(inode)->lock); +} - return 0; +static void btrfs_add_delalloc_inodes(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(!list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, + &root->fs_info->delalloc_roots); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes--; + if (!root->nr_delalloc_inodes) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(list_empty(&root->delalloc_root)); + list_del_init(&root->delalloc_root); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); } /* @@ -1264,98 +1531,99 @@ static int btrfs_merge_extent_hook(struct inode *inode, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ -static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) +static void btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, unsigned long *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore - * but in this case, we are only testeing for the DELALLOC + * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; + bool do_list = !btrfs_is_free_space_inode(inode); - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_delalloc_reserve_space(root, inode, end - start + 1); - - spin_lock(&root->fs_info->delalloc_lock); - BTRFS_I(inode)->delalloc_bytes += end - start + 1; - root->fs_info->delalloc_bytes += end - start + 1; - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); + if (*bits & EXTENT_FIRST_DELALLOC) { + *bits &= ~EXTENT_FIRST_DELALLOC; + } else { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); } - spin_unlock(&root->fs_info->delalloc_lock); + + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->delalloc_bytes += len; + if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); + spin_unlock(&BTRFS_I(inode)->lock); } - return 0; } /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static int btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, unsigned long bits) +static void btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, + unsigned long *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore - * but in this case, we are only testeing for the DELALLOC + * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; + bool do_list = !btrfs_is_free_space_inode(inode); - if (bits & EXTENT_DO_ACCOUNTING) { - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); + if (*bits & EXTENT_FIRST_DELALLOC) { + *bits &= ~EXTENT_FIRST_DELALLOC; + } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + spin_unlock(&BTRFS_I(inode)->lock); } - spin_lock(&root->fs_info->delalloc_lock); - if (state->end - state->start + 1 > - root->fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs warning: delalloc account " - "%llu %llu\n", - (unsigned long long) - state->end - state->start + 1, - (unsigned long long) - root->fs_info->delalloc_bytes); - btrfs_delalloc_free_space(root, inode, (u64)-1); - root->fs_info->delalloc_bytes = 0; - BTRFS_I(inode)->delalloc_bytes = 0; - } else { - btrfs_delalloc_free_space(root, inode, - state->end - - state->start + 1); - root->fs_info->delalloc_bytes -= state->end - - state->start + 1; - BTRFS_I(inode)->delalloc_bytes -= state->end - - state->start + 1; - } - if (BTRFS_I(inode)->delalloc_bytes == 0 && - !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); - } - spin_unlock(&root->fs_info->delalloc_lock); + /* + * We don't reserve metadata space for space cache inodes so we + * don't need to call dellalloc_release_metadata if there is an + * error. + */ + if (*bits & EXTENT_DO_ACCOUNTING && + root != root->fs_info->tree_root) + btrfs_delalloc_release_metadata(inode, len); + + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && do_list && !(state->state & EXTENT_NORESERVE)) + btrfs_free_reserved_data_space(inode, len); + + __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->delalloc_bytes -= len; + if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && + test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) + btrfs_del_delalloc_inode(root, inode); + spin_unlock(&BTRFS_I(inode)->lock); } - return 0; } /* * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct btrfs_mapping_tree *map_tree; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -1363,12 +1631,12 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; + length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(map_tree, READ, logical, + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); - + /* Will always return 0 with map_multi == NULL */ + BUG_ON(ret < 0); if (map_length < length + size) return 1; return 0; @@ -1384,13 +1652,14 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, */ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -1403,10 +1672,16 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, * are inserted into the btree */ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; - return btrfs_map_bio(root, rw, bio, mirror_num, 1); + int ret; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; } /* @@ -1414,37 +1689,60 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * on write, or reading the csums from the tree before a read */ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; int skip_sum; + int metadata = 0; + int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + if (btrfs_is_free_space_inode(inode)) + metadata = 2; + + if (!(rw & REQ_WRITE)) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); + if (ret) + goto out; - if (!(rw & (1 << BIO_RW))) { if (bio_flags & EXTENT_BIO_COMPRESSED) { - return btrfs_submit_compressed_read(inode, bio, - mirror_num, bio_flags); - } else if (!skip_sum) - btrfs_lookup_bio_sums(root, inode, bio, NULL); + ret = btrfs_submit_compressed_read(inode, bio, + mirror_num, + bio_flags); + goto out; + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); + if (ret) + goto out; + } goto mapit; - } else if (!skip_sum) { + } else if (async && !skip_sum) { /* csum items have already been cloned */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, - bio_flags, __btrfs_submit_bio_start, + bio_flags, bio_offset, + __btrfs_submit_bio_start, __btrfs_submit_bio_done); + goto out; + } else if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + if (ret) + goto out; } mapit: - return btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + +out: + if (ret < 0) + bio_endio(bio, ret); + return ret; } /* @@ -1457,11 +1755,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, { struct btrfs_ordered_sum *sum; - btrfs_set_trans_block_group(trans, inode); - list_for_each_entry(sum, list, list) { + trans->adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); + trans->adding_csums = 0; } return 0; } @@ -1469,8 +1767,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state) { - if ((end & (PAGE_CACHE_SIZE - 1)) == 0) - WARN_ON(1); + WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, cached_state, GFP_NOFS); } @@ -1490,6 +1787,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct inode *inode; u64 page_start; u64 page_end; + int ret; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; @@ -1505,7 +1803,7 @@ again: page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, - &cached_state, GFP_NOFS); + &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) @@ -1517,17 +1815,28 @@ again: page_end, &cached_state, GFP_NOFS); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); goto again; } + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) { + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + ClearPageChecked(page); + goto out; + } + btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); + set_page_dirty(page); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_page: unlock_page(page); page_cache_release(page); + kfree(fixup); } /* @@ -1560,10 +1869,10 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) SetPageChecked(page); page_cache_get(page); - fixup->work.func = btrfs_writepage_fixup_worker; + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); - return -EAGAIN; + btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); + return -EBUSY; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -1578,13 +1887,12 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; - u64 hint; + int extent_inserted = 0; int ret; path = btrfs_alloc_path(); - BUG_ON(!path); - - path->leave_spinning = 1; + if (!path) + return -ENOMEM; /* * we may be replacing one extent in the tree with another. @@ -1595,15 +1903,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, - &hint, 0); - BUG_ON(ret); + ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, + file_pos + num_bytes, NULL, 0, + 1, sizeof(*fi), &extent_inserted); + if (ret) + goto out; - ins.objectid = inode->i_ino; - ins.offset = file_pos; - ins.type = BTRFS_EXTENT_DATA_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - BUG_ON(ret); + if (!extent_inserted) { + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*fi)); + if (ret) + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1618,10 +1934,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_encryption(leaf, fi, encryption); btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); - btrfs_unlock_up_safe(path, 1); - btrfs_set_lock_blocking(leaf); - btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); inode_add_bytes(inode, num_bytes); @@ -1630,266 +1944,910 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, - inode->i_ino, file_pos, &ins); - BUG_ON(ret); + btrfs_ino(inode), file_pos, &ins); +out: btrfs_free_path(path); + return ret; +} + +/* snapshot-aware defrag */ +struct sa_defrag_extent_backref { + struct rb_node node; + struct old_sa_defrag_extent *old; + u64 root_id; + u64 inum; + u64 file_pos; + u64 extent_offset; + u64 num_bytes; + u64 generation; +}; + +struct old_sa_defrag_extent { + struct list_head list; + struct new_sa_defrag_extent *new; + + u64 extent_offset; + u64 bytenr; + u64 offset; + u64 len; + int count; +}; + +struct new_sa_defrag_extent { + struct rb_root root; + struct list_head head; + struct btrfs_path *path; + struct inode *inode; + u64 file_pos; + u64 len; + u64 bytenr; + u64 disk_len; + u8 compress_type; +}; + +static int backref_comp(struct sa_defrag_extent_backref *b1, + struct sa_defrag_extent_backref *b2) +{ + if (b1->root_id < b2->root_id) + return -1; + else if (b1->root_id > b2->root_id) + return 1; + + if (b1->inum < b2->inum) + return -1; + else if (b1->inum > b2->inum) + return 1; + + if (b1->file_pos < b2->file_pos) + return -1; + else if (b1->file_pos > b2->file_pos) + return 1; + + /* + * [------------------------------] ===> (a range of space) + * |<--->| |<---->| =============> (fs/file tree A) + * |<---------------------------->| ===> (fs/file tree B) + * + * A range of space can refer to two file extents in one tree while + * refer to only one file extent in another tree. + * + * So we may process a disk offset more than one time(two extents in A) + * and locate at the same extent(one extent in B), then insert two same + * backrefs(both refer to the extent in B). + */ return 0; } +static void backref_insert(struct rb_root *root, + struct sa_defrag_extent_backref *backref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sa_defrag_extent_backref *entry; + int ret; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct sa_defrag_extent_backref, node); + + ret = backref_comp(backref, entry); + if (ret < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&backref->node, parent, p); + rb_insert_color(&backref->node, root); +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, + void *ctx) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_fs_info *fs_info; + struct old_sa_defrag_extent *old = ctx; + struct new_sa_defrag_extent *new = old->new; + struct btrfs_path *path = new->path; + struct btrfs_key key; + struct btrfs_root *root; + struct sa_defrag_extent_backref *backref; + struct extent_buffer *leaf; + struct inode *inode = new->inode; + int slot; + int ret; + u64 extent_offset; + u64 num_bytes; + + if (BTRFS_I(inode)->root->root_key.objectid == root_id && + inum == btrfs_ino(inode)) + return 0; + + key.objectid = root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(inode)->root->fs_info; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + if (PTR_ERR(root) == -ENOENT) + return 0; + WARN_ON(1); + pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", + inum, offset, root_id); + return PTR_ERR(root); + } + + key.objectid = inum; + key.type = BTRFS_EXTENT_DATA_KEY; + if (offset > (u64)-1 << 32) + key.offset = 0; + else + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (WARN_ON(ret < 0)) + return ret; + ret = 0; + + while (1) { + cond_resched(); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + continue; + } + + path->slots[0]++; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid > inum) + goto out; + + if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) + continue; + + /* + * 'offset' refers to the exact key.offset, + * NOT the 'offset' field in btrfs_extent_data_ref, ie. + * (key.offset - extent_offset). + */ + if (key.offset != offset) + continue; + + extent_offset = btrfs_file_extent_offset(leaf, extent); + num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + + if (extent_offset >= old->extent_offset + old->offset + + old->len || extent_offset + num_bytes <= + old->extent_offset + old->offset) + continue; + break; + } + + backref = kmalloc(sizeof(*backref), GFP_NOFS); + if (!backref) { + ret = -ENOENT; + goto out; + } + + backref->root_id = root_id; + backref->inum = inum; + backref->file_pos = offset; + backref->num_bytes = num_bytes; + backref->extent_offset = extent_offset; + backref->generation = btrfs_file_extent_generation(leaf, extent); + backref->old = old; + backref_insert(&new->root, backref); + old->count++; +out: + btrfs_release_path(path); + WARN_ON(ret); + return ret; +} + +static noinline bool record_extent_backrefs(struct btrfs_path *path, + struct new_sa_defrag_extent *new) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; + struct old_sa_defrag_extent *old, *tmp; + int ret; + + new->path = path; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + ret = iterate_inodes_from_logical(old->bytenr + + old->extent_offset, fs_info, + path, record_one_backref, + old); + if (ret < 0 && ret != -ENOENT) + return false; + + /* no backref to be processed for this extent */ + if (!old->count) { + list_del(&old->list); + kfree(old); + } + } + + if (list_empty(&new->head)) + return false; + + return true; +} + +static int relink_is_mergable(struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + struct new_sa_defrag_extent *new) +{ + if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) + return 0; + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) + return 0; + + if (btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + return 1; +} + /* - * helper function for btrfs_finish_ordered_io, this - * just reads in some of the csum leaves to prime them into ram - * before we start the transaction. It limits the amount of btree - * reads required while inside the transaction. + * Note the backref might has changed, and in this case we just return 0. */ +static noinline int relink_extent_backref(struct btrfs_path *path, + struct sa_defrag_extent_backref *prev, + struct sa_defrag_extent_backref *backref) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *item; + struct btrfs_ordered_extent *ordered; + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct old_sa_defrag_extent *old = backref->old; + struct new_sa_defrag_extent *new = old->new; + struct inode *src_inode = new->inode; + struct inode *inode; + struct extent_state *cached = NULL; + int ret = 0; + u64 start; + u64 len; + u64 lock_start; + u64 lock_end; + bool merge = false; + int index; + + if (prev && prev->root_id == backref->root_id && + prev->inum == backref->inum && + prev->file_pos + prev->num_bytes == backref->file_pos) + merge = true; + + /* step 1: get root */ + key.objectid = backref->root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(src_inode)->root->fs_info; + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + if (PTR_ERR(root) == -ENOENT) + return 0; + return PTR_ERR(root); + } + + if (btrfs_root_readonly(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + + /* step 2: get inode */ + key.objectid = backref->inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + /* step 3: relink backref */ + lock_start = backref->file_pos; + lock_end = backref->file_pos + backref->num_bytes - 1; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + 0, &cached); + + ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + key.objectid = backref->inum; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = backref->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out_free_path; + } else if (ret > 0) { + ret = 0; + goto out_free_path; + } + + extent = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(path->nodes[0], extent) != + backref->generation) + goto out_free_path; + + btrfs_release_path(path); + + start = backref->file_pos; + if (backref->extent_offset < old->extent_offset + old->offset) + start += old->extent_offset + old->offset - + backref->extent_offset; + + len = min(backref->extent_offset + backref->num_bytes, + old->extent_offset + old->offset + old->len); + len -= max(backref->extent_offset, old->extent_offset + old->offset); + + ret = btrfs_drop_extents(trans, root, inode, start, + start + len, 1); + if (ret) + goto out_free_path; +again: + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + path->leave_spinning = 1; + if (merge) { + struct btrfs_file_extent_item *fi; + u64 extent_len; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out_free_path; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + + if (extent_len + found_key.offset == start && + relink_is_mergable(leaf, fi, new)) { + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len + len); + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = 1; + goto out_free_path; + } else { + merge = false; + btrfs_release_path(path); + goto again; + } + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); + btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); + btrfs_set_file_extent_num_bytes(leaf, item, len); + btrfs_set_file_extent_ram_bytes(leaf, item, new->len); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, new->compress_type); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); + + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + btrfs_release_path(path); + + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, + new->disk_len, 0, + backref->root_id, backref->inum, + new->file_pos, 0); /* start - extent_offset */ + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + ret = 1; +out_free_path: + btrfs_release_path(path); + path->leave_spinning = 0; + btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + &cached, GFP_NOFS); + iput(inode); + return ret; +} + +static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) +{ + struct old_sa_defrag_extent *old, *tmp; + + if (!new) + return; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } + kfree(new); +} + +static void relink_file_extents(struct new_sa_defrag_extent *new) +{ + struct btrfs_path *path; + struct sa_defrag_extent_backref *backref; + struct sa_defrag_extent_backref *prev = NULL; + struct inode *inode; + struct btrfs_root *root; + struct rb_node *node; + int ret; + + inode = new->inode; + root = BTRFS_I(inode)->root; + + path = btrfs_alloc_path(); + if (!path) + return; + + if (!record_extent_backrefs(path, new)) { + btrfs_free_path(path); + goto out; + } + btrfs_release_path(path); + + while (1) { + node = rb_first(&new->root); + if (!node) + break; + rb_erase(node, &new->root); + + backref = rb_entry(node, struct sa_defrag_extent_backref, node); + + ret = relink_extent_backref(path, prev, backref); + WARN_ON(ret < 0); + + kfree(prev); + + if (ret == 1) + prev = backref; + else + prev = NULL; + cond_resched(); + } + kfree(prev); + + btrfs_free_path(path); +out: + free_sa_defrag_extent(new); + + atomic_dec(&root->fs_info->defrag_running); + wake_up(&root->fs_info->transaction_wait); +} + +static struct new_sa_defrag_extent * +record_old_file_extents(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct old_sa_defrag_extent *old; + struct new_sa_defrag_extent *new; + int ret; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return NULL; + + new->inode = inode; + new->file_pos = ordered->file_offset; + new->len = ordered->len; + new->bytenr = ordered->start; + new->disk_len = ordered->disk_len; + new->compress_type = ordered->compress_type; + new->root = RB_ROOT; + INIT_LIST_HEAD(&new->head); + + path = btrfs_alloc_path(); + if (!path) + goto out_kfree; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = new->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out_free_path; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + /* find out all the old extents for the file range */ + while (1) { + struct btrfs_file_extent_item *extent; + struct extent_buffer *l; + int slot; + u64 num_bytes; + u64 offset; + u64 end; + u64 disk_bytenr; + u64 extent_offset; + + l = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out_free_path; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid != btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + break; + if (key.offset >= new->file_pos + new->len) + break; + + extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); + + num_bytes = btrfs_file_extent_num_bytes(l, extent); + if (key.offset + num_bytes < new->file_pos) + goto next; + + disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); + if (!disk_bytenr) + goto next; + + extent_offset = btrfs_file_extent_offset(l, extent); + + old = kmalloc(sizeof(*old), GFP_NOFS); + if (!old) + goto out_free_path; + + offset = max(new->file_pos, key.offset); + end = min(new->file_pos + new->len, key.offset + num_bytes); + + old->bytenr = disk_bytenr; + old->extent_offset = extent_offset; + old->offset = offset - key.offset; + old->len = end - offset; + old->new = new; + old->count = 0; + list_add_tail(&old->list, &new->head); +next: + path->slots[0]++; + cond_resched(); + } + + btrfs_free_path(path); + atomic_inc(&root->fs_info->defrag_running); + + return new; + +out_free_path: + btrfs_free_path(path); +out_kfree: + free_sa_defrag_extent(new); + return NULL; +} + +static void btrfs_release_delalloc_bytes(struct btrfs_root *root, + u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, start); + ASSERT(cache); + + spin_lock(&cache->lock); + cache->delalloc_bytes -= len; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); +} + /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. */ -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { + struct inode *inode = ordered_extent->inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - struct btrfs_ordered_extent *ordered_extent = NULL; + struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - int compressed = 0; - int ret; + struct new_sa_defrag_extent *new = NULL; + int compress_type = 0; + int ret = 0; + u64 logical_len = ordered_extent->len; + bool nolock; + bool truncated = false; - ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1); - if (!ret) - return 0; - BUG_ON(!ordered_extent); + nolock = btrfs_is_free_space_inode(inode); + + if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + ret = -EIO; + goto out; + } + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; + logical_len = ordered_extent->truncated_len; + /* Truncated the entire extent, don't bother adding */ + if (!logical_len) + goto out; + } if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret) { - trans = btrfs_join_transaction(root, 1); - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); + BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); goto out; } lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); + + ret = test_range_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 1, cached_state); + if (ret) { + u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); + if (0 && last_snapshot >= BTRFS_I(inode)->generation) + /* the inode is shared */ + new = record_old_file_extents(inode, ordered_extent); - trans = btrfs_join_transaction(root, 1); + clear_extent_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + } + + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out_unlock; + } + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) - compressed = 1; + compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - BUG_ON(compressed); + BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + - ordered_extent->len); - BUG_ON(ret); + logical_len); } else { + BUG_ON(root == root->fs_info->tree_root); ret = insert_reserved_file_extent(trans, inode, ordered_extent->file_offset, ordered_extent->start, ordered_extent->disk_len, - ordered_extent->len, - ordered_extent->len, - compressed, 0, 0, + logical_len, logical_len, + compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, - ordered_extent->len); - BUG_ON(ret); + if (!ret) + btrfs_release_delalloc_bytes(root, + ordered_extent->start, + ordered_extent->disk_len); + } + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, ordered_extent->len, + trans->transid); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; } - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - /* this also removes the ordered extent from the tree */ btrfs_ordered_update_i_size(inode, 0, ordered_extent); - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) { /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; + } + ret = 0; +out_unlock: + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); out: + if (root != root->fs_info->tree_root) + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); + + if (ret || truncated) { + u64 start, end; + + if (truncated) + start = ordered_extent->file_offset + logical_len; + else + start = ordered_extent->file_offset; + end = ordered_extent->file_offset + ordered_extent->len - 1; + clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); + + /* Drop the cache for the part of the extent we didn't write. */ + btrfs_drop_extent_cache(inode, start, end, 0); + + /* + * If the ordered extent had an IOERR or something else went + * wrong we need to return the space for this ordered extent + * back to the allocator. We only free the extent in the + * truncated case if we didn't write out the extent at all. + */ + if ((ret || !logical_len) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + btrfs_free_reserved_extent(root, ordered_extent->start, + ordered_extent->disk_len, 1); + } + + + /* + * This needs to be done to make sure anybody waiting knows we are done + * updating everything for this ordered extent. + */ + btrfs_remove_ordered_extent(inode, ordered_extent); + + /* for snapshot-aware defrag */ + if (new) { + if (ret) { + free_sa_defrag_extent(new); + atomic_dec(&root->fs_info->defrag_running); + } else { + relink_file_extents(new); + } + } + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - return 0; + return ret; } -static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate) +static void finish_ordered_fn(struct btrfs_work *work) { - ClearPagePrivate2(page); - return btrfs_finish_ordered_io(page->mapping->host, start, end); + struct btrfs_ordered_extent *ordered_extent; + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); } -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. - */ -struct io_failure_record { - struct page *page; - u64 start; - u64 len; - u64 logical; - unsigned long bio_flags; - int last_mirror; -}; - -static int btrfs_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - struct extent_state *state) +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) { - struct io_failure_record *failrec = NULL; - u64 private; - struct extent_map *em; struct inode *inode = page->mapping->host; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct bio *bio; - int num_copies; - int ret; - int rw; - u64 logical; - - ret = get_state_private(failure_tree, start, &private); - if (ret) { - failrec = kmalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - failrec->start = start; - failrec->len = end - start + 1; - failrec->last_mirror = 0; - failrec->bio_flags = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (em->start > start || em->start + em->len < start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered_extent = NULL; + struct btrfs_workqueue *workers; - if (!em || IS_ERR(em)) { - kfree(failrec); - return -EIO; - } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - } - failrec->logical = logical; - free_extent_map(em); - set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | - EXTENT_DIRTY, GFP_NOFS); - set_state_private(failure_tree, start, - (u64)(unsigned long)failrec); - } else { - failrec = (struct io_failure_record *)(unsigned long)private; - } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); - failrec->last_mirror++; - if (!state) { - spin_lock(&BTRFS_I(inode)->io_tree.lock); - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, - failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&BTRFS_I(inode)->io_tree.lock); - } - if (!state || failrec->last_mirror > num_copies) { - set_state_private(failure_tree, failrec->start, 0); - clear_extent_bits(failure_tree, failrec->start, - failrec->start + failrec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); - kfree(failrec); - return -EIO; - } - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_private = state; - bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; - bio->bi_bdev = failed_bio->bi_bdev; - bio->bi_size = 0; + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); - bio_add_page(bio, page, failrec->len, start - page_offset(page)); - if (failed_bio->bi_rw & (1 << BIO_RW)) - rw = WRITE; - else - rw = READ; + ClearPagePrivate2(page); + if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1, uptodate)) + return 0; - BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, - failrec->last_mirror, - failrec->bio_flags); - return 0; -} + btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -static int btrfs_clean_io_failures(struct inode *inode, u64 start) -{ - u64 private; - u64 private_failure; - struct io_failure_record *failure; - int ret; + if (btrfs_is_free_space_inode(inode)) + workers = root->fs_info->endio_freespace_worker; + else + workers = root->fs_info->endio_write_workers; + btrfs_queue_work(workers, &ordered_extent->work); - private = 0; - if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY)) { - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, - start, &private_failure); - if (ret == 0) { - failure = (struct io_failure_record *)(unsigned long) - private_failure; - set_state_private(&BTRFS_I(inode)->io_failure_tree, - failure->start, 0); - clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, - failure->start, - failure->start + failure->len - 1, - EXTENT_DIRTY | EXTENT_LOCKED, - GFP_NOFS); - kfree(failure); - } - } return 0; } /* * when reads are done, we need to check csums to verify the data is correct - * if there's a match, we allow the bio to finish. If not, we go through - * the io_failure_record routines to find good copies + * if there's a match, we allow the bio to finish. If not, the code in + * extent_io.c will try to find good copies for us. */ -static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state) +static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + u64 phy_offset, struct page *page, + u64 start, u64 end, int mirror) { - size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; char *kaddr; - u64 private = ~(u32)0; - int ret; struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum_expected; u32 csum = ~(u32)0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (PageChecked(page)) { ClearPageChecked(page); @@ -1897,7 +2855,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, } if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - return 0; + goto good; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { @@ -1906,40 +2864,27 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, return 0; } - if (state && state->start == start) { - private = state->private; - ret = 0; - } else { - ret = get_state_private(io_tree, start, &private); - } - kaddr = kmap_atomic(page, KM_USER0); - if (ret) - goto zeroit; + phy_offset >>= inode->i_sb->s_blocksize_bits; + csum_expected = *(((u32 *)io_bio->csum) + phy_offset); - csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + kaddr = kmap_atomic(page); + csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1); btrfs_csum_final(csum, (char *)&csum); - if (csum != private) + if (csum != csum_expected) goto zeroit; - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); good: - /* if the io failure tree for this inode is non-empty, - * check to see if we've recovered from a failed IO - */ - btrfs_clean_io_failures(inode, start); return 0; zeroit: - if (printk_ratelimit()) { - printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " - "private %llu\n", page->mapping->host->i_ino, - (unsigned long long)start, csum, - (unsigned long long)private); - } + if (__ratelimit(&_rs)) + btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(page->mapping->host), start, csum, csum_expected); memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - if (private == 0) + kunmap_atomic(kaddr); + if (csum_expected == 0) return 0; return -EIO; } @@ -1949,6 +2894,8 @@ struct delayed_iput { struct inode *inode; }; +/* JDM: If this is fs-wide, why can't we add a pointer to + * btrfs_inode instead and avoid the allocation? */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -1978,7 +2925,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) if (empty) return; - down_read(&root->fs_info->cleanup_work_sem); spin_lock(&fs_info->delayed_iput_lock); list_splice_init(&fs_info->delayed_iputs, &list); spin_unlock(&fs_info->delayed_iput_lock); @@ -1989,63 +2935,175 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) iput(delayed->inode); kfree(delayed); } - up_read(&root->fs_info->cleanup_work_sem); +} + +/* + * This is called in transaction commit time. If there are no orphan + * files in the subvolume, it removes orphan item and frees block_rsv + * structure. + */ +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + int ret; + + if (atomic_read(&root->orphan_inodes) || + root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) + return; + + spin_lock(&root->orphan_lock); + if (atomic_read(&root->orphan_inodes)) { + spin_unlock(&root->orphan_lock); + return; + } + + if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { + spin_unlock(&root->orphan_lock); + return; + } + + block_rsv = root->orphan_block_rsv; + root->orphan_block_rsv = NULL; + spin_unlock(&root->orphan_lock); + + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && + btrfs_root_refs(&root->root_item) > 0) { + ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state); + } + + if (block_rsv) { + WARN_ON(block_rsv->size > 0); + btrfs_free_block_rsv(root, block_rsv); + } } /* * This creates an orphan entry for the given inode in case something goes * wrong in the middle of an unlink/truncate. + * + * NOTE: caller of this function should reserve 5 units of metadata for + * this function. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; + struct btrfs_block_rsv *block_rsv = NULL; + int reserve = 0; + int insert = 0; + int ret; - spin_lock(&root->list_lock); + if (!root->orphan_block_rsv) { + block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + if (!block_rsv) + return -ENOMEM; + } - /* already on the orphan list, we're good */ - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; + spin_lock(&root->orphan_lock); + if (!root->orphan_block_rsv) { + root->orphan_block_rsv = block_rsv; + } else if (block_rsv) { + btrfs_free_block_rsv(root, block_rsv); + block_rsv = NULL; } - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) { +#if 0 + /* + * For proper ENOSPC handling, we should do orphan + * cleanup when mounting. But this introduces backward + * compatibility issue. + */ + if (!xchg(&root->orphan_item_inserted, 1)) + insert = 2; + else + insert = 1; +#endif + insert = 1; + atomic_inc(&root->orphan_inodes); + } - spin_unlock(&root->list_lock); + if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) + reserve = 1; + spin_unlock(&root->orphan_lock); - /* - * insert an orphan item to track this unlinked/truncated file - */ - ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + /* grab metadata reservation from transaction handle */ + if (reserve) { + ret = btrfs_orphan_reserve_metadata(trans, inode); + BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ + } - return ret; + /* insert an orphan item to track this unlinked/truncated file */ + if (insert >= 1) { + ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); + if (ret) { + atomic_dec(&root->orphan_inodes); + if (reserve) { + clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + btrfs_orphan_release_metadata(inode); + } + if (ret != -EEXIST) { + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } + ret = 0; + } + + /* insert an orphan item to track subvolume contains orphan files */ + if (insert >= 2) { + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } + return 0; } /* * We have done the truncate/delete so we can go ahead and remove the orphan * item for this particular inode. */ -int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +static int btrfs_orphan_del(struct btrfs_trans_handle *trans, + struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + int delete_item = 0; + int release_rsv = 0; int ret = 0; - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); + if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) + delete_item = 1; - if (list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; - } + if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) + release_rsv = 1; + spin_unlock(&root->orphan_lock); - list_del_init(&BTRFS_I(inode)->i_orphan); - if (!trans) { - spin_unlock(&root->list_lock); - return 0; + if (delete_item) { + atomic_dec(&root->orphan_inodes); + if (trans) + ret = btrfs_del_orphan_item(trans, root, + btrfs_ino(inode)); } - spin_unlock(&root->list_lock); - - ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + if (release_rsv) + btrfs_orphan_release_metadata(inode); return ret; } @@ -2054,21 +3112,24 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) * this cleans up any orphans that may be left on the list from the last use * of this root. */ -void btrfs_orphan_cleanup(struct btrfs_root *root) +int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_item *item; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; + u64 last_objectid = 0; int ret = 0, nr_unlink = 0, nr_truncate = 0; - if (!xchg(&root->clean_orphans, 0)) - return; + if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) + return 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto out; + } path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; @@ -2077,18 +3138,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - printk(KERN_ERR "Error searching slot for orphan: %d" - "\n", ret); - break; - } + if (ret < 0) + goto out; /* * if ret == 0 means we found what we were searching for, which - * is weird, but possible, so only screw with path if we didnt + * is weird, but possible, so only screw with path if we didn't * find the key and see if we have stuff that matches */ if (ret > 0) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -2096,7 +3155,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) /* pull out the item */ leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* make sure the item matches what we want */ @@ -2106,60 +3164,152 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) break; /* release the path since we're done with it */ - btrfs_release_path(root, path); + btrfs_release_path(path); /* * this is where we are basically btrfs_lookup, without the * crossing root thing. we store the inode number in the * offset of the orphan item. */ + + if (found_key.offset == last_objectid) { + btrfs_err(root->fs_info, + "Error removing orphan entry, stopping orphan cleanup"); + ret = -EINVAL; + goto out; + } + + last_objectid = found_key.offset; + found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) - break; + ret = PTR_ERR_OR_ZERO(inode); + if (ret && ret != -ESTALE) + goto out; - /* - * add this inode to the orphan list so btrfs_orphan_del does - * the proper thing when we hit it - */ - spin_lock(&root->list_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->list_lock); + if (ret == -ESTALE && root == root->fs_info->tree_root) { + struct btrfs_root *dead_root; + struct btrfs_fs_info *fs_info = root->fs_info; + int is_dead_root = 0; + /* + * this is an orphan in the tree root. Currently these + * could come from 2 sources: + * a) a snapshot deletion in progress + * b) a free space cache inode + * We need to distinguish those two, as the snapshot + * orphan must not get deleted. + * find_dead_roots already ran before us, so if this + * is a snapshot deletion, we should find the root + * in the dead_roots list + */ + spin_lock(&fs_info->trans_lock); + list_for_each_entry(dead_root, &fs_info->dead_roots, + root_list) { + if (dead_root->root_key.objectid == + found_key.objectid) { + is_dead_root = 1; + break; + } + } + spin_unlock(&fs_info->trans_lock); + if (is_dead_root) { + /* prevent this orphan from being found again */ + key.offset = found_key.objectid - 1; + continue; + } + } /* - * if this is a bad inode, means we actually succeeded in - * removing the inode, but not the orphan record, which means - * we need to manually delete the orphan since iput will just - * do a destroy_inode + * Inode is already gone but the orphan item is still there, + * kill the orphan item. */ - if (is_bad_inode(inode)) { + if (ret == -ESTALE) { trans = btrfs_start_transaction(root, 1); - btrfs_orphan_del(trans, inode); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_debug(root->fs_info, "auto deleting %Lu", + found_key.objectid); + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); btrfs_end_transaction(trans, root); - iput(inode); + if (ret) + goto out; continue; } + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + atomic_inc(&root->orphan_inodes); + /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { + if (WARN_ON(!S_ISREG(inode->i_mode))) { + iput(inode); + continue; + } nr_truncate++; - btrfs_truncate(inode); + + /* 1 for the orphan item deletion. */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + iput(inode); + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) { + iput(inode); + goto out; + } + + ret = btrfs_truncate(inode); + if (ret) + btrfs_orphan_del(NULL, inode); } else { nr_unlink++; } /* this will do delete_inode and everything for us */ iput(inode); + if (ret) + goto out; + } + /* release the path since we're done with it */ + btrfs_release_path(path); + + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; + + if (root->orphan_block_rsv) + btrfs_block_rsv_release(root, root->orphan_block_rsv, + (u64)-1); + + if (root->orphan_block_rsv || + test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { + trans = btrfs_join_transaction(root); + if (!IS_ERR(trans)) + btrfs_end_transaction(trans, root); } if (nr_unlink) - printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink); if (nr_truncate) - printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate); +out: + if (ret) + btrfs_crit(root->fs_info, + "could not do orphan cleanup %d", ret); btrfs_free_path(path); + return ret; } /* @@ -2169,13 +3319,24 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) * slot is the slot the inode is in, objectid is the objectid of the inode */ static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid) + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; + static u64 xattr_access = 0; + static u64 xattr_default = 0; int scanned = 0; + if (!xattr_access) { + xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)); + xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)); + } + slot++; + *first_xattr_slot = -1; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -2184,8 +3345,13 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, return 0; /* we found an xattr, assume we've got an acl */ - if (found_key.type == BTRFS_XATTR_ITEM_KEY) - return 1; + if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; + if (found_key.offset == xattr_access || + found_key.offset == xattr_default) + return 1; + } /* * we found a key greater than an xattr key, there can't @@ -2210,6 +3376,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, * something larger than an xattr. We have to assume the inode * has acls */ + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; return 1; } @@ -2224,13 +3392,21 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + unsigned long ptr; int maybe_acls; - u64 alloc_group_block; u32 rdev; int ret; + bool filled = false; + int first_xattr_slot; + + ret = btrfs_fill_inode(inode, &rdev); + if (!ret) + filled = true; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + goto make_bad; + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -2238,13 +3414,16 @@ static void btrfs_read_locked_inode(struct inode *inode) goto make_bad; leaf = path->nodes[0]; + + if (filled) + goto cache_index; + inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); - inode->i_uid = btrfs_inode_uid(leaf, inode_item); - inode->i_gid = btrfs_inode_gid(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); tspec = btrfs_inode_atime(inode_item); @@ -2261,7 +3440,19 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + + /* + * If we were modified in the current generation and evicted from memory + * and then re-read we need to do a full sync since we don't have any + * idea about which extents were modified before we were evicted from + * cache. + */ + if (BTRFS_I(inode)->last_trans == root->fs_info->generation) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + + inode->i_version = btrfs_inode_sequence(leaf, inode_item); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); @@ -2269,21 +3460,50 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - alloc_group_block = btrfs_inode_block_group(leaf, inode_item); +cache_index: + path->slots[0]++; + if (inode->i_nlink != 1 || + path->slots[0] >= btrfs_header_nritems(leaf)) + goto cache_acl; + + btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); + if (location.objectid != btrfs_ino(inode)) + goto cache_acl; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + if (location.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + } else if (location.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + extref = (struct btrfs_inode_extref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, + extref); + } +cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls */ - maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + maybe_acls = acls_after_inode_item(leaf, path->slots[0], + btrfs_ino(inode), &first_xattr_slot); + if (first_xattr_slot != -1) { + path->slots[0] = first_xattr_slot; + ret = btrfs_load_inode_props(inode, path); + if (ret) + btrfs_err(root->fs_info, + "error loading props for ino %llu (root %llu): %d", + btrfs_ino(inode), + root->root_key.objectid, ret); + } + btrfs_free_path(path); + if (!maybe_acls) cache_no_acl(inode); - BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, - alloc_group_block, 0); - btrfs_free_path(path); - inode_item = NULL; - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; @@ -2326,40 +3546,47 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - btrfs_set_inode_uid(leaf, item, inode->i_uid); - btrfs_set_inode_gid(leaf, item, inode->i_gid); - btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); - - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); - btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, + &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); } /* * copy everything in the in-memory inode into the btree. */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_inode_item *inode_item; @@ -2368,20 +3595,21 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, int ret; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(inode)->location, 1); + ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, + 1); if (ret) { if (ret > 0) ret = -ENOENT; goto failed; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_inode_item); + struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, inode); btrfs_mark_buffer_dirty(leaf); @@ -2392,16 +3620,55 @@ failed: return ret; } +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + btrfs_update_root_times(trans, root); + + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} /* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory */ -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, struct inode *inode, - const char *name, int name_len) +static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) { struct btrfs_path *path; int ret = 0; @@ -2409,15 +3676,17 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di; struct btrfs_key key; u64 index; + u64 ino = btrfs_ino(inode); + u64 dir_ino = btrfs_ino(dir); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto err; + goto out; } path->leave_spinning = 1; - di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -2432,91 +3701,154 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; - btrfs_release_path(root, path); + btrfs_release_path(path); - ret = btrfs_del_inode_ref(trans, root, name, name_len, - inode->i_ino, - dir->i_ino, &index); - if (ret) { - printk(KERN_INFO "btrfs failed to delete reference to %.*s, " - "inode %lu parent %lu\n", name_len, name, - inode->i_ino, dir->i_ino); - goto err; + /* + * If we don't have dir index, we have to get it by looking up + * the inode ref, since we get the inode ref, remove it directly, + * it is unnecessary to do delayed deletion. + * + * But if we have dir index, needn't search inode ref to get it. + * Since the inode ref is close to the inode item, it is better + * that we delay to delete it, and just do this deletion when + * we update the inode item. + */ + if (BTRFS_I(inode)->dir_index) { + ret = btrfs_delayed_delete_inode_ref(inode); + if (!ret) { + index = BTRFS_I(inode)->dir_index; + goto skip_backref; + } } - di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, - index, name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); + ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, + dir_ino, &index); + if (ret) { + btrfs_info(root->fs_info, + "failed to delete reference to %.*s, inode %llu parent %llu", + name_len, name, ino, dir_ino); + btrfs_abort_transaction(trans, root, ret); goto err; } - if (!di) { - ret = -ENOENT; +skip_backref: + ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto err; } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - btrfs_release_path(root, path); ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, - inode, dir->i_ino); - BUG_ON(ret != 0 && ret != -ENOENT); + inode, dir_ino); + if (ret != 0 && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto err; + } ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); - BUG_ON(ret); + if (ret == -ENOENT) + ret = 0; + else if (ret) + btrfs_abort_transaction(trans, root, ret); err: btrfs_free_path(path); if (ret) goto out; btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(inode); + inode_inc_iversion(dir); inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; - btrfs_update_inode(trans, root, dir); - btrfs_drop_nlink(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, dir); out: return ret; } -static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) { - struct btrfs_root *root; - struct btrfs_trans_handle *trans; - struct inode *inode = dentry->d_inode; int ret; - unsigned long nr = 0; + ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + if (!ret) { + drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + } + return ret; +} - root = BTRFS_I(dir)->root; +/* + * helper to start transaction for unlink and rmdir. + * + * unlink and rmdir are special in btrfs, they do not always free space, so + * if we cannot make our reservations the normal way try and see if there is + * plenty of slack room in the global reserve to migrate, otherwise we cannot + * allow the unlink to occur. + */ +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret; /* - * 5 items for unlink inode - * 1 for orphan + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 6); - return PTR_ERR(trans); + trans = btrfs_start_transaction(root, 5); + if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) + return trans; + + if (PTR_ERR(trans) == -ENOSPC) { + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return trans; + ret = btrfs_cond_migrate_bytes(root->fs_info, + &root->fs_info->trans_block_rsv, + num_bytes, 5); + if (ret) { + btrfs_end_transaction(trans, root); + return ERR_PTR(ret); + } + trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->bytes_reserved = num_bytes; } + return trans; +} - btrfs_set_trans_block_group(trans, dir); +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + + trans = __unlink_start_trans(dir); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); + if (ret) + goto out; - if (inode->i_nlink == 0) + if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + } - nr = trans->blocks_used; - - btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 6); - btrfs_btree_balance_dirty(root, nr); +out: + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); return ret; } @@ -2531,84 +3863,92 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_key key; u64 index; int ret; + u64 dir_ino = btrfs_ino(dir); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); - BUG_ON(!di || IS_ERR(di)); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + goto out; + } leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); - btrfs_release_path(root, path); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + btrfs_release_path(path); ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, - dir->i_ino, &index, name, name_len); + dir_ino, &index, name, name_len); if (ret < 0) { - BUG_ON(ret != -ENOENT); - di = btrfs_search_dir_index_item(root, path, dir->i_ino, + if (ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + di = btrfs_search_dir_index_item(root, path, dir_ino, name, name_len); - BUG_ON(!di || IS_ERR(di)); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + btrfs_abort_transaction(trans, root, ret); + goto out; + } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(root, path); + btrfs_release_path(path); index = key.offset; } + btrfs_release_path(path); - di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, - index, name, name_len, -1); - BUG_ON(!di || IS_ERR(di)); - - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); - WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); - btrfs_release_path(root, path); + ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(dir); dir->i_mtime = dir->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); - dir->i_sb->s_dirt = 1; - + ret = btrfs_update_inode_fallback(trans, root, dir); + if (ret) + btrfs_abort_transaction(trans, root, ret); +out: btrfs_free_path(path); - return 0; + return ret; } static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; int err = 0; - int ret; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; - unsigned long nr = 0; - if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || - inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) + return -EPERM; - ret = btrfs_reserve_metadata_space(root, 5); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 5); + trans = __unlink_start_trans(dir); + if (IS_ERR(trans)) return PTR_ERR(trans); - } - btrfs_set_trans_block_group(trans, dir); - - if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, root, dir, BTRFS_I(inode)->location.objectid, dentry->d_name.name, @@ -2626,188 +3966,12 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (!err) btrfs_i_size_write(inode, 0); out: - nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 5); - btrfs_btree_balance_dirty(root, nr); + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); - if (ret && !err) - err = ret; return err; } -#if 0 -/* - * when truncating bytes in a file, it is possible to avoid reading - * the leaves that contain only checksum items. This can be the - * majority of the IO required to delete a large file, but it must - * be done carefully. - * - * The keys in the level just above the leaves are checked to make sure - * the lowest key in a given leaf is a csum key, and starts at an offset - * after the new size. - * - * Then the key for the next leaf is checked to make sure it also has - * a checksum item for the same file. If it does, we know our target leaf - * contains only checksum items, and it can be safely freed without reading - * it. - * - * This is just an optimization targeted at large files. It may do - * nothing. It will return 0 unless things went badly. - */ -static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct inode *inode, u64 new_size) -{ - struct btrfs_key key; - int ret; - int nritems; - struct btrfs_key found_key; - struct btrfs_key other_key; - struct btrfs_leaf_ref *ref; - u64 leaf_gen; - u64 leaf_start; - - path->lowest_level = 1; - key.objectid = inode->i_ino; - key.type = BTRFS_CSUM_ITEM_KEY; - key.offset = new_size; -again: - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (path->nodes[1] == NULL) { - ret = 0; - goto out; - } - ret = 0; - btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); - nritems = btrfs_header_nritems(path->nodes[1]); - - if (!nritems) - goto out; - - if (path->slots[1] >= nritems) - goto next_node; - - /* did we find a key greater than anything we want to delete? */ - if (found_key.objectid > inode->i_ino || - (found_key.objectid == inode->i_ino && found_key.type > key.type)) - goto out; - - /* we check the next key in the node to make sure the leave contains - * only checksum items. This comparison doesn't work if our - * leaf is the last one in the node - */ - if (path->slots[1] + 1 >= nritems) { -next_node: - /* search forward from the last key in the node, this - * will bring us into the next node in the tree - */ - btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); - - /* unlikely, but we inc below, so check to be safe */ - if (found_key.offset == (u64)-1) - goto out; - - /* search_forward needs a path with locks held, do the - * search again for the original key. It is possible - * this will race with a balance and return a path that - * we could modify, but this drop is just an optimization - * and is allowed to miss some leaves. - */ - btrfs_release_path(root, path); - found_key.offset++; - - /* setup a max key for search_forward */ - other_key.offset = (u64)-1; - other_key.type = key.type; - other_key.objectid = key.objectid; - - path->keep_locks = 1; - ret = btrfs_search_forward(root, &found_key, &other_key, - path, 0, 0); - path->keep_locks = 0; - if (ret || found_key.objectid != key.objectid || - found_key.type != key.type) { - ret = 0; - goto out; - } - - key.offset = found_key.offset; - btrfs_release_path(root, path); - cond_resched(); - goto again; - } - - /* we know there's one more slot after us in the tree, - * read that key so we can verify it is also a checksum item - */ - btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); - - if (found_key.objectid < inode->i_ino) - goto next_key; - - if (found_key.type != key.type || found_key.offset < new_size) - goto next_key; - - /* - * if the key for the next leaf isn't a csum key from this objectid, - * we can't be sure there aren't good items inside this leaf. - * Bail out - */ - if (other_key.objectid != inode->i_ino || other_key.type != key.type) - goto out; - - leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); - leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); - /* - * it is safe to delete this leaf, it contains only - * csum items from this inode at an offset >= new_size - */ - ret = btrfs_del_leaf(trans, root, path, leaf_start); - BUG_ON(ret); - - if (root->ref_cows && leaf_gen < trans->transid) { - ref = btrfs_alloc_leaf_ref(root, 0); - if (ref) { - ref->root_gen = root->root_key.offset; - ref->bytenr = leaf_start; - ref->owner = 0; - ref->generation = leaf_gen; - ref->nritems = 0; - - btrfs_sort_leaf_ref(ref); - - ret = btrfs_add_leaf_ref(root, ref, 0); - WARN_ON(ret); - btrfs_free_leaf_ref(root, ref); - } else { - WARN_ON(1); - } - } -next_key: - btrfs_release_path(root, path); - - if (other_key.objectid == inode->i_ino && - other_key.type == key.type && other_key.offset > key.offset) { - key.offset = other_key.offset; - cond_resched(); - goto again; - } - ret = 0; -out: - /* fixup any changes we've made to the path */ - path->lowest_level = 0; - path->keep_locks = 0; - btrfs_release_path(root, path); - return ret; -} - -#endif - /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -2833,27 +3997,44 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 mask = root->sectorsize - 1; + u64 last_size = (u64)-1; u32 found_type = (u8)-1; int found_extent; int del_item; int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; - int encoding; int ret; int err = 0; + u64 ino = btrfs_ino(inode); BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); - if (root->ref_cows) - btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); - path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->reada = -1; - key.objectid = inode->i_ino; + /* + * We want to drop from the next block forward in case this new size is + * not block aligned since we will be keeping the last block of the + * extent just the way it is. + */ + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root) + btrfs_drop_extent_cache(inode, ALIGN(new_size, + root->sectorsize), (u64)-1, 0); + + /* + * This function is also used to drop the items in the log tree before + * we relog the inode, so if root != BTRFS_I(inode)->root, it means + * it is used to drop the loged items. So we shouldn't kill the delayed + * items. + */ + if (min_type == 0 && root == BTRFS_I(inode)->root) + btrfs_kill_delayed_inode_items(inode); + + key.objectid = ino; key.offset = (u64)-1; key.type = (u8)-1; @@ -2879,9 +4060,8 @@ search_again: leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - encoding = 0; - if (found_key.objectid != inode->i_ino) + if (found_key.objectid != ino) break; if (found_type < min_type) @@ -2892,16 +4072,12 @@ search_again: fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); - encoding = btrfs_file_extent_compression(leaf, fi); - encoding |= btrfs_file_extent_encryption(leaf, fi); - encoding |= btrfs_file_extent_other_encoding(leaf, fi); - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, - fi); + path->slots[0], fi); } item_end--; } @@ -2920,21 +4096,27 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; + if (del_item) + last_size = found_key.offset; + else + last_size = new_size; + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item && !encoding) { + if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = new_size - - found_key.offset + root->sectorsize - 1; - extent_num_bytes = extent_num_bytes & - ~((u64)root->sectorsize - 1); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + root->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); - if (root->ref_cows && extent_start != 0) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state) && + extent_start != 0) inode_sub_bytes(inode, num_dec); btrfs_mark_buffer_dirty(leaf); } else { @@ -2948,7 +4130,8 @@ search_again: num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { found_extent = 1; - if (root->ref_cows) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) inode_sub_bytes(inode, num_dec); } } @@ -2963,16 +4146,20 @@ search_again: btrfs_file_extent_other_encoding(leaf, fi) == 0) { u32 size = new_size - found_key.offset; - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); - } + + /* + * update the ram bytes to properly reflect + * the new size of our item + */ + btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); - ret = btrfs_truncate_item(trans, root, path, - size, 1); - BUG_ON(ret); - } else if (root->ref_cows) { + btrfs_truncate_item(root, path, size, 1); + } else if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); } @@ -2994,12 +4181,14 @@ delete: } else { break; } - if (found_extent && root->ref_cows) { + if (found_extent && + (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - inode->i_ino, extent_offset); + ino, extent_offset, 0); BUG_ON(ret); } @@ -3008,18 +4197,18 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot) { - if (root->ref_cows) { - err = -EAGAIN; - goto out; - } if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, + root, ret); + goto error; + } pending_del_nr = 0; } - btrfs_release_path(root, path); + btrfs_release_path(path); goto search_again; } else { path->slots[0]--; @@ -3029,18 +4218,31 @@ out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); + if (ret) + btrfs_abort_transaction(trans, root, ret); } +error: + if (last_size != (u64)-1) + btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; } /* - * taken from block_truncate_page, but does cow as it zeros out - * any bytes left in the last page in the file. + * btrfs_truncate_page - read, zero a chunk and write a page + * @inode - inode that we're zeroing + * @from - the offset to start zeroing + * @len - the length to zero, 0 to zero the entire range respective to the + * offset + * @front - zero up to the offset instead of from the offset on + * + * This will find the page for the "from" offset and cow the page and zero the + * part we want to zero. This is used with truncate and hole punching. */ -static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, + int front) { - struct inode *inode = mapping->host; + struct address_space *mapping = inode->i_mapping; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; @@ -3050,26 +4252,23 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); struct page *page; + gfp_t mask = btrfs_alloc_write_mask(mapping); int ret = 0; u64 page_start; u64 page_end; - if ((offset & (blocksize - 1)) == 0) + if ((offset & (blocksize - 1)) == 0 && + (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) - goto out; - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) goto out; - ret = -ENOMEM; again: - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + ret = -ENOMEM; goto out; } @@ -3091,8 +4290,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, page_start); @@ -3107,7 +4305,8 @@ again: } clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); ret = btrfs_set_extent_delalloc(inode, page_start, page_end, @@ -3118,10 +4317,14 @@ again: goto out_unlock; } - ret = 0; if (offset != PAGE_CACHE_SIZE) { + if (!len) + len = PAGE_CACHE_SIZE - offset; kaddr = kmap(page); - memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + if (front) + memset(kaddr, 0, offset); + else + memset(kaddr + offset, 0, len); flush_dcache_page(page); kunmap(page); } @@ -3132,43 +4335,100 @@ again: out_unlock: if (ret) - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: return ret; } -int btrfs_cont_expand(struct inode *inode, loff_t size) +static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, + u64 offset, u64 len) { struct btrfs_trans_handle *trans; + int ret; + + /* + * Still need to make sure the inode looks like it's been updated so + * that any holes get logged if we fsync. + */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) { + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = root->log_transid; + BTRFS_I(inode)->last_log_commit = root->last_log_commit; + return 0; + } + + /* + * 1 - for the one we're dropping + * 1 - for the one we're adding + * 1 - for updating the inode. + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + return ret; + } + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, len, 0, len, 0, 0, 0); + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + return ret; +} + +/* + * This function puts in dummy file extents for the area we're creating a hole + * for. So if we are truncating this file to a larger size we need to insert + * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for + * the range between oldsize and size + */ +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) +{ struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em; + struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - u64 mask = root->sectorsize - 1; - u64 hole_start = (inode->i_size + mask) & ~mask; - u64 block_end = (size + mask) & ~mask; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 hole_start = ALIGN(oldsize, root->sectorsize); + u64 block_end = ALIGN(size, root->sectorsize); u64 last_byte; u64 cur_offset; u64 hole_size; int err = 0; + /* + * If our size started in the middle of a page we need to zero out the + * rest of the page before we expand the i_size, otherwise we could + * expose stale data. + */ + err = btrfs_truncate_page(inode, oldsize, 0, 0); + if (err) + return err; + if (size <= hole_start) return 0; while (1) { struct btrfs_ordered_extent *ordered; - btrfs_wait_ordered_range(inode, hole_start, - block_end - hole_start); + lock_extent_bits(io_tree, hole_start, block_end - 1, 0, - &cached_state, GFP_NOFS); - ordered = btrfs_lookup_ordered_extent(inode, hole_start); + &cached_state); + ordered = btrfs_lookup_ordered_range(inode, hole_start, + block_end - hole_start); if (!ordered) break; unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } @@ -3176,197 +4436,381 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, block_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); + if (IS_ERR(em)) { + err = PTR_ERR(em); + em = NULL; + break; + } last_byte = min(extent_map_end(em), block_end); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte , root->sectorsize); if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - u64 hint_byte = 0; + struct extent_map *hole_em; hole_size = last_byte - cur_offset; - err = btrfs_reserve_metadata_space(root, 2); + err = maybe_insert_hole(root, inode, cur_offset, + hole_size); if (err) break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + hole_size - 1, 0); + hole_em = alloc_extent_map(); + if (!hole_em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + goto next; + } + hole_em->start = cur_offset; + hole_em->len = hole_size; + hole_em->orig_start = cur_offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->orig_block_len = 0; + hole_em->ram_bytes = hole_size; + hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = root->fs_info->generation; - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - - err = btrfs_drop_extents(trans, inode, cur_offset, - cur_offset + hole_size, - &hint_byte, 1); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, - inode->i_ino, cur_offset, 0, - 0, hole_size, 0, hole_size, - 0, 0, 0); - BUG_ON(err); - - btrfs_drop_extent_cache(inode, hole_start, - last_byte - 1, 0); - - btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 2); + while (1) { + write_lock(&em_tree->lock); + err = add_extent_mapping(em_tree, hole_em, 1); + write_unlock(&em_tree->lock); + if (err != -EEXIST) + break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + + hole_size - 1, 0); + } + free_extent_map(hole_em); } +next: free_extent_map(em); + em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - + free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); return err; } -static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) +static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - unsigned long nr; + loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; int ret; - if (attr->ia_size == inode->i_size) - return 0; - - if (attr->ia_size > inode->i_size) { - unsigned long limit; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (attr->ia_size > inode->i_sb->s_maxbytes) - return -EFBIG; - if (limit != RLIM_INFINITY && attr->ia_size > limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) + inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); } - ret = btrfs_reserve_metadata_space(root, 1); - if (ret) - return ret; + if (newsize > oldsize) { + truncate_pagecache(inode, newsize); + ret = btrfs_cont_expand(inode, oldsize, newsize); + if (ret) + return ret; - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); - ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); + i_size_write(inode, newsize); + btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + ret = btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + } else { - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 1); - btrfs_btree_balance_dirty(root, nr); + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. + */ + if (newsize == 0) + set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags); - if (attr->ia_size > inode->i_size) { - ret = btrfs_cont_expand(inode, attr->ia_size); - if (ret) { - btrfs_truncate(inode); + /* + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * We need to do this in case we fail at _any_ point during the + * actual truncate. Once we do the truncate_setsize we could + * invalidate pages which forces any outstanding ordered io to + * be instantly completed which will give us extents that need + * to be truncated. If we fail to get an orphan inode down we + * could have left over extents that were never meant to live, + * so we need to garuntee from this point on that everything + * will be consistent. + */ + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) return ret; - } - i_size_write(inode, attr->ia_size); - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + truncate_setsize(inode, newsize); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); + /* Disable nonlocked read DIO to avoid the end less truncate */ + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - if (inode->i_nlink > 0) { - ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); + ret = btrfs_truncate(inode); + if (ret && inode->i_nlink) { + int err; + + /* + * failed to truncate, disk_i_size is only adjusted down + * as we remove extents, so it should represent the true + * size of the inode, so reset the in memory size and + * delete our orphan entry. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + return ret; + } + i_size_write(inode, BTRFS_I(inode)->disk_i_size); + err = btrfs_orphan_del(trans, inode); + if (err) + btrfs_abort_transaction(trans, root, err); + btrfs_end_transaction(trans, root); } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - return 0; } - /* - * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. - */ - if (attr->ia_size == 0) - BTRFS_I(inode)->ordered_data_close = 1; - - /* we don't support swapfiles, so vmtruncate shouldn't fail */ - ret = vmtruncate(inode, attr->ia_size); - BUG_ON(ret); - - return 0; + return ret; } static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; int err; + if (btrfs_root_readonly(root)) + return -EROFS; + err = inode_change_ok(inode, attr); if (err) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setattr_size(inode, attr); + err = btrfs_setsize(inode, attr); if (err) return err; } - attr->ia_valid &= ~ATTR_SIZE; - if (attr->ia_valid) - err = inode_setattr(inode, attr); + if (attr->ia_valid) { + setattr_copy(inode, attr); + inode_inc_iversion(inode); + err = btrfs_dirty_inode(inode); + + if (!err && attr->ia_valid & ATTR_MODE) + err = posix_acl_chmod(inode, inode->i_mode); + } - if (!err && ((attr->ia_valid & ATTR_MODE))) - err = btrfs_acl_chmod(inode); return err; } -void btrfs_delete_inode(struct inode *inode) +/* + * While truncating the inode pages during eviction, we get the VFS calling + * btrfs_invalidatepage() against each page of the inode. This is slow because + * the calls to btrfs_invalidatepage() result in a huge amount of calls to + * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting + * extent_state structures over and over, wasting lots of time. + * + * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all + * those expensive operations on a per page basis and do only the ordered io + * finishing, while we release here the extent_map and extent_state structures, + * without the excessive merging and splitting. + */ +static void evict_inode_truncate_pages(struct inode *inode) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; + struct rb_node *node; + + ASSERT(inode->i_state & I_FREEING); + truncate_inode_pages_final(&inode->i_data); + + write_lock(&map_tree->lock); + while (!RB_EMPTY_ROOT(&map_tree->map)) { + struct extent_map *em; + + node = rb_first(&map_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + remove_extent_mapping(map_tree, em); + free_extent_map(em); + } + write_unlock(&map_tree->lock); + + spin_lock(&io_tree->lock); + while (!RB_EMPTY_ROOT(&io_tree->state)) { + struct extent_state *state; + struct extent_state *cached_state = NULL; + + node = rb_first(&io_tree->state); + state = rb_entry(node, struct extent_state, rb_node); + atomic_inc(&state->refs); + spin_unlock(&io_tree->lock); + + lock_extent_bits(io_tree, state->start, state->end, + 0, &cached_state); + clear_extent_bit(io_tree, state->start, state->end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + free_extent_state(state); + + spin_lock(&io_tree->lock); + } + spin_unlock(&io_tree->lock); +} + +void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - unsigned long nr; + struct btrfs_block_rsv *rsv, *global_rsv; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); int ret; - truncate_inode_pages(&inode->i_data, 0); + trace_btrfs_inode_evict(inode); + + evict_inode_truncate_pages(inode); + + if (inode->i_nlink && + ((btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_is_free_space_inode(inode))) + goto no_delete; + if (is_bad_inode(inode)) { btrfs_orphan_del(NULL, inode); goto no_delete; } + /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ btrfs_wait_ordered_range(inode, 0, (u64)-1); if (root->fs_info->log_root_recovering) { - BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)); goto no_delete; } if (inode->i_nlink > 0) { - BUG_ON(btrfs_root_refs(&root->root_item) != 0); + BUG_ON(btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); goto no_delete; } + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + rsv->size = min_size; + rsv->failfast = 1; + global_rsv = &root->fs_info->global_block_rsv; + btrfs_i_size_write(inode, 0); + /* + * This is a bit simpler than btrfs_truncate since we've already + * reserved our space for our orphan item in the unlink, so we just + * need to reserve some slack space in case we add bytes and update + * inode item when doing the truncate. + */ while (1) { - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + ret = btrfs_block_rsv_refill(root, rsv, min_size, + BTRFS_RESERVE_FLUSH_LIMIT); + + /* + * Try and steal from the global reserve since we will + * likely not use this space anyway, we want to try as + * hard as possible to get this to work. + */ + if (ret) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); - if (ret != -EAGAIN) + if (ret) { + btrfs_warn(root->fs_info, + "Could not get space for a delete, will truncate on mount %d", + ret); + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + + trans->block_rsv = rsv; + + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + if (ret != -ENOSPC) break; - nr = trans->blocks_used; + trans->block_rsv = &root->fs_info->trans_block_rsv; btrfs_end_transaction(trans, root); trans = NULL; - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); } + btrfs_free_block_rsv(root, rsv); + + /* + * Errors here aren't a big deal, it just means we leave orphan items + * in the tree. They will be cleaned up on the next mount. + */ if (ret == 0) { - ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); + trans->block_rsv = root->orphan_block_rsv; + btrfs_orphan_del(trans, inode); + } else { + btrfs_orphan_del(NULL, inode); } - nr = trans->blocks_used; + trans->block_rsv = &root->fs_info->trans_block_rsv; + if (!(root == root->fs_info->tree_root || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) + btrfs_return_ino(root, btrfs_ino(inode)); + btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); no_delete: + btrfs_remove_delayed_node(inode); clear_inode(inode); return; } @@ -3386,14 +4830,15 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, int ret = 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; - di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, namelen, 0); if (IS_ERR(di)) ret = PTR_ERR(di); - if (!di || IS_ERR(di)) + if (IS_ERR_OR_NULL(di)) goto out_err; btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); @@ -3430,9 +4875,9 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_root_ref(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid); + ret = btrfs_find_item(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid, BTRFS_ROOT_REF_KEY, NULL); if (ret) { if (ret < 0) err = ret; @@ -3441,7 +4886,7 @@ static int fixup_tree_root_location(struct btrfs_root *root, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) goto out; @@ -3451,7 +4896,7 @@ static int fixup_tree_root_location(struct btrfs_root *root, if (ret) goto out; - btrfs_release_path(root->fs_info->tree_root, path); + btrfs_release_path(path); new_root = btrfs_read_fs_root_no_name(root->fs_info, location); if (IS_ERR(new_root)) { @@ -3459,11 +4904,6 @@ static int fixup_tree_root_location(struct btrfs_root *root, goto out; } - if (btrfs_root_refs(&new_root->root_item) == 0) { - err = -ENOENT; - goto out; - } - *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); location->type = BTRFS_INODE_ITEM_KEY; @@ -3480,33 +4920,33 @@ static void inode_tree_add(struct inode *inode) struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; -again: - p = &root->inode_tree.rb_node; - parent = NULL; + struct rb_node *new = &BTRFS_I(inode)->rb_node; + u64 ino = btrfs_ino(inode); - if (hlist_unhashed(&inode->i_hash)) + if (inode_unhashed(inode)) return; - + parent = NULL; spin_lock(&root->inode_lock); + p = &root->inode_tree.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); - if (inode->i_ino < entry->vfs_inode.i_ino) + if (ino < btrfs_ino(&entry->vfs_inode)) p = &parent->rb_left; - else if (inode->i_ino > entry->vfs_inode.i_ino) + else if (ino > btrfs_ino(&entry->vfs_inode)) p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING | I_CLEAR))); - rb_erase(parent, &root->inode_tree); + (I_WILL_FREE | I_FREEING))); + rb_replace_node(parent, new, &root->inode_tree); RB_CLEAR_NODE(parent); spin_unlock(&root->inode_lock); - goto again; + return; } } - rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); - rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); + rb_link_node(new, parent, p); + rb_insert_color(new, &root->inode_tree); spin_unlock(&root->inode_lock); } @@ -3533,7 +4973,7 @@ static void inode_tree_del(struct inode *inode) } } -int btrfs_invalidate_inodes(struct btrfs_root *root) +void btrfs_invalidate_inodes(struct btrfs_root *root) { struct rb_node *node; struct rb_node *prev; @@ -3541,7 +4981,8 @@ int btrfs_invalidate_inodes(struct btrfs_root *root) struct inode *inode; u64 objectid = 0; - WARN_ON(btrfs_root_refs(&root->root_item) != 0); + if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); again: @@ -3551,9 +4992,9 @@ again: prev = node; entry = rb_entry(node, struct btrfs_inode, rb_node); - if (objectid < entry->vfs_inode.i_ino) + if (objectid < btrfs_ino(&entry->vfs_inode)) node = node->rb_left; - else if (objectid > entry->vfs_inode.i_ino) + else if (objectid > btrfs_ino(&entry->vfs_inode)) node = node->rb_right; else break; @@ -3561,7 +5002,7 @@ again: if (!node) { while (prev) { entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= entry->vfs_inode.i_ino) { + if (objectid <= btrfs_ino(&entry->vfs_inode)) { node = prev; break; } @@ -3570,14 +5011,14 @@ again: } while (node) { entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = entry->vfs_inode.i_ino + 1; + objectid = btrfs_ino(&entry->vfs_inode) + 1; inode = igrab(&entry->vfs_inode); if (inode) { spin_unlock(&root->inode_lock); if (atomic_read(&inode->i_count) > 1) d_prune_aliases(inode); /* - * btrfs_drop_inode will remove it from + * btrfs_drop_inode will have it removed from * the inode cache when its usage count * hits zero. */ @@ -3593,65 +5034,37 @@ again: node = rb_next(node); } spin_unlock(&root->inode_lock); - return 0; -} - -static noinline void init_btrfs_i(struct inode *inode) -{ - struct btrfs_inode *bi = BTRFS_I(inode); - - bi->generation = 0; - bi->sequence = 0; - bi->last_trans = 0; - bi->last_sub_trans = 0; - bi->logged_trans = 0; - bi->delalloc_bytes = 0; - bi->reserved_bytes = 0; - bi->disk_i_size = 0; - bi->flags = 0; - bi->index_cnt = (u64)-1; - bi->last_unlink_trans = 0; - bi->ordered_data_close = 0; - bi->force_compress = 0; - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, - inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, - inode->i_mapping, GFP_NOFS); - INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); - mutex_init(&BTRFS_I(inode)->log_mutex); } static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; - init_btrfs_i(inode); + inode->i_ino = args->location->objectid; + memcpy(&BTRFS_I(inode)->location, args->location, + sizeof(*args->location)); BTRFS_I(inode)->root = args->root; - btrfs_set_inode_space_info(args->root, inode); return 0; } static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == inode->i_ino && + return args->location->objectid == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } static struct inode *btrfs_iget_locked(struct super_block *s, - u64 objectid, + struct btrfs_key *location, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; - args.ino = objectid; + unsigned long hashval = btrfs_inode_hash(location->objectid, root); + + args.location = location; args.root = root; - inode = iget5_locked(s, objectid, btrfs_find_actor, + inode = iget5_locked(s, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; @@ -3665,19 +5078,22 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, { struct inode *inode; - inode = btrfs_iget_locked(s, location->objectid, root); + inode = btrfs_iget_locked(s, location, root); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); - - inode_tree_add(inode); - unlock_new_inode(inode); - if (new) - *new = 1; + if (!is_bad_inode(inode)) { + inode_tree_add(inode); + unlock_new_inode(inode); + if (new) + *new = 1; + } else { + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(-ESTALE); + } } return inode; @@ -3692,14 +5108,12 @@ static struct inode *new_simple_dir(struct super_block *s, if (!inode) return ERR_PTR(-ENOMEM); - init_btrfs_i(inode); - BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); - BTRFS_I(inode)->dummy_inode = 1; + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; - inode->i_op = &simple_dir_inode_operations; + inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -3714,20 +5128,17 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *sub_root = root; struct btrfs_key location; int index; - int ret; - - dentry->d_op = &btrfs_dentry_operations; + int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); ret = btrfs_inode_by_name(dir, dentry, &location); - if (ret < 0) return ERR_PTR(ret); if (location.objectid == 0) - return NULL; + return ERR_PTR(-ENOENT); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, NULL); @@ -3749,62 +5160,78 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } srcu_read_unlock(&root->fs_info->subvol_srcu, index); - if (root != sub_root) { + if (!IS_ERR(inode) && root != sub_root) { down_read(&root->fs_info->cleanup_work_sem); if (!(inode->i_sb->s_flags & MS_RDONLY)) - btrfs_orphan_cleanup(sub_root); + ret = btrfs_orphan_cleanup(sub_root); up_read(&root->fs_info->cleanup_work_sem); + if (ret) { + iput(inode); + inode = ERR_PTR(ret); + } } return inode; } -static int btrfs_dentry_delete(struct dentry *dentry) +static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; + struct inode *inode = dentry->d_inode; - if (!dentry->d_inode && !IS_ROOT(dentry)) - dentry = dentry->d_parent; + if (!inode && !IS_ROOT(dentry)) + inode = dentry->d_parent->d_inode; - if (dentry->d_inode) { - root = BTRFS_I(dentry->d_inode)->root; + if (inode) { + root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0) return 1; + + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return 1; } return 0; } +static void btrfs_dentry_release(struct dentry *dentry) +{ + kfree(dentry->d_fsdata); +} + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode; inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOENT) + inode = NULL; + else + return ERR_CAST(inode); + } - return d_splice_alias(inode, dentry); + return d_materialise_unique(dentry, inode); } -static unsigned char btrfs_filetype_table[] = { +unsigned char btrfs_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int btrfs_real_readdir(struct file *filp, void *dirent, - filldir_t filldir) +static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; + struct list_head ins_list; + struct list_head del_list; int ret; - u32 nritems; struct extent_buffer *leaf; int slot; - int advance; unsigned char d_type; int over = 0; u32 di_cur; @@ -3814,71 +5241,63 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, char tmp_name[32]; char *name_ptr; int name_len; + int is_curr = 0; /* ctx->pos points to the current index? */ /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) key_type = BTRFS_DIR_ITEM_KEY; - /* special case for "." */ - if (filp->f_pos == 0) { - over = filldir(dirent, ".", 1, - 1, inode->i_ino, - DT_DIR); - if (over) - return 0; - filp->f_pos = 1; - } - /* special case for .., just use the back ref */ - if (filp->f_pos == 1) { - u64 pino = parent_ino(filp->f_path.dentry); - over = filldir(dirent, "..", 2, - 2, pino, DT_DIR); - if (over) - return 0; - filp->f_pos = 2; - } + if (!dir_emit_dots(file, ctx)) + return 0; + path = btrfs_alloc_path(); - path->reada = 2; + if (!path) + return -ENOMEM; + + path->reada = 1; + + if (key_type == BTRFS_DIR_INDEX_KEY) { + INIT_LIST_HEAD(&ins_list); + INIT_LIST_HEAD(&del_list); + btrfs_get_delayed_items(inode, &ins_list, &del_list); + } btrfs_set_key_type(&key, key_type); - key.offset = filp->f_pos; - key.objectid = inode->i_ino; + key.offset = ctx->pos; + key.objectid = btrfs_ino(inode); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; - advance = 0; while (1) { leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; - if (advance || slot >= nritems) { - if (slot >= nritems - 1) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } else { - slot++; - path->slots[0]++; - } + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; } - advance = 1; - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) break; if (btrfs_key_type(&found_key) != key_type) break; - if (found_key.offset < filp->f_pos) - continue; + if (found_key.offset < ctx->pos) + goto next; + if (key_type == BTRFS_DIR_INDEX_KEY && + btrfs_should_delete_dir_index(&del_list, + found_key.offset)) + goto next; - filp->f_pos = found_key.offset; + ctx->pos = found_key.offset; + is_curr = 1; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); di_cur = 0; @@ -3887,6 +5306,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; + if (verify_dir_item(root, leaf, di)) + break; + name_len = btrfs_dir_name_len(leaf, di); if (name_len <= sizeof(tmp_name)) { name_ptr = tmp_name; @@ -3903,17 +5325,23 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); + /* is this a reference to our own snapshot? If so - * skip it + * skip it. + * + * In contrast to old kernels, we insert the snapshot's + * dir item and dir index after it has been created, so + * we won't find a reference to our own snapshot. We + * still keep the following code for backward + * compatibility. */ if (location.type == BTRFS_ROOT_ITEM_KEY && location.objectid == root->root_key.objectid) { over = 0; goto skip; } - over = filldir(dirent, name_ptr, name_len, - found_key.offset, location.objectid, - d_type); + over = !dir_emit(ctx, name_ptr, name_len, + location.objectid, d_type); skip: if (name_ptr != tmp_name) @@ -3926,20 +5354,49 @@ skip: di_cur += di_len; di = (struct btrfs_dir_item *)((char *)di + di_len); } +next: + path->slots[0]++; + } + + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (is_curr) + ctx->pos++; + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); + if (ret) + goto nopos; } /* Reached end of directory/root. Bump pos past the last item. */ - if (key_type == BTRFS_DIR_INDEX_KEY) - /* - * 32-bit glibc will use getdents64, but then strtol - - * so the last number we can serve is this. - */ - filp->f_pos = 0x7fffffff; - else - filp->f_pos++; + ctx->pos++; + + /* + * Stop new entries from being returned after we return the last + * entry. + * + * New directory entries are assigned a strictly increasing + * offset. This means that new entries created during readdir + * are *guaranteed* to be seen in the future by that readdir. + * This has broken buggy programs which operate on names as + * they're returned by readdir. Until we re-use freed offsets + * we have this hack to stop new entries from being returned + * under the assumption that they'll never reach this huge + * offset. + * + * This is being careful not to overflow 32bit loff_t unless the + * last entry requires it because doing so has broken 32bit apps + * in the past. + */ + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (ctx->pos >= INT_MAX) + ctx->pos = LLONG_MAX; + else + ctx->pos = INT_MAX; + } nopos: ret = 0; err: + if (key_type == BTRFS_DIR_INDEX_KEY) + btrfs_put_delayed_items(&ins_list, &del_list); btrfs_free_path(path); return ret; } @@ -3949,13 +5406,21 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret = 0; + bool nolock = false; - if (root->fs_info->btree_inode == inode) + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; + if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) + nolock = true; + if (wbc->sync_mode == WB_SYNC_ALL) { - trans = btrfs_join_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); } return ret; @@ -3967,15 +5432,57 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -void btrfs_dirty_inode(struct inode *inode) +static int btrfs_dirty_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; + int ret; + + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) + return 0; - trans = btrfs_join_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - btrfs_update_inode(trans, root, inode); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret == -ENOSPC) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans, root); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_update_inode(trans, root, inode); + } btrfs_end_transaction(trans, root); + if (BTRFS_I(inode)->delayed_node) + btrfs_balance_delayed_items(root); + + return ret; +} + +/* + * This is a copy of file_update_time. We need this so we can return error on + * ENOSPC for updating the inode in the case of file write and mmap writes. + */ +static int btrfs_update_time(struct inode *inode, struct timespec *now, + int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root)) + return -EROFS; + + if (flags & S_VERSION) + inode_inc_iversion(inode); + if (flags & S_CTIME) + inode->i_ctime = *now; + if (flags & S_MTIME) + inode->i_mtime = *now; + if (flags & S_ATIME) + inode->i_atime = *now; + return btrfs_dirty_inode(inode); } /* @@ -3991,7 +5498,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) struct extent_buffer *leaf; int ret; - key.objectid = inode->i_ino; + key.objectid = btrfs_ino(inode); btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); key.offset = (u64)-1; @@ -4023,7 +5530,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != inode->i_ino || + if (found_key.objectid != btrfs_ino(inode) || btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { BTRFS_I(inode)->index_cnt = 2; goto out; @@ -4044,9 +5551,12 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index) int ret = 0; if (BTRFS_I(dir)->index_cnt == (u64)-1) { - ret = btrfs_set_inode_index_count(dir); - if (ret) - return ret; + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } } *index = BTRFS_I(dir)->index_cnt; @@ -4060,7 +5570,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct inode *dir, const char *name, int name_len, u64 ref_objectid, u64 objectid, - u64 alloc_hint, int mode, u64 *index) + umode_t mode, u64 *index) { struct inode *inode; struct btrfs_inode_item *inode_item; @@ -4069,81 +5579,99 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; + int nitems = name ? 2 : 1; unsigned long ptr; int ret; - int owner; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return ERR_PTR(-ENOMEM); inode = new_inode(root->fs_info->sb); - if (!inode) + if (!inode) { + btrfs_free_path(path); return ERR_PTR(-ENOMEM); + } + + /* + * we have to initialize this early, so we can reclaim the inode + * number if we fail afterwards in this function. + */ + inode->i_ino = objectid; + + if (dir && name) { + trace_btrfs_inode_request(dir); - if (dir) { ret = btrfs_set_inode_index(dir, index); if (ret) { + btrfs_free_path(path); iput(inode); return ERR_PTR(ret); } + } else if (dir) { + *index = 0; } /* * index_cnt is ignored for everything but a dir, * btrfs_get_inode_index_count has an explanation for the magic * number */ - init_btrfs_i(inode); BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->dir_index = *index; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; - btrfs_set_inode_space_info(root, inode); + inode->i_generation = BTRFS_I(inode)->generation; - if (mode & S_IFDIR) - owner = 0; - else - owner = 1; - BTRFS_I(inode)->block_group = - btrfs_find_block_group(root, 0, alloc_hint, owner); + /* + * We could have gotten an inode number from somebody who was fsynced + * and then removed in this same transaction, so let's just set full + * sync since it will be a full sync anyway and this will blow away the + * old info in the log. + */ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; - key[1].objectid = objectid; - btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); - key[1].offset = ref_objectid; - sizes[0] = sizeof(struct btrfs_inode_item); - sizes[1] = name_len + sizeof(*ref); + + if (name) { + /* + * Start new inodes with an inode_ref. This is slightly more + * efficient for small numbers of hard links since they will + * be packed into one item. Extended refs will kick in if we + * add more hard links than can fit in the ref item. + */ + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[1] = name_len + sizeof(*ref); + } path->leave_spinning = 1; - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) goto fail; - inode->i_uid = current_fsuid(); - - if (dir && (dir->i_mode & S_ISGID)) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_mode = mode; - inode->i_ino = objectid; + inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, + sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); - ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, - struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); - ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + if (name) { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); @@ -4155,18 +5683,31 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); - if ((mode & S_IFREG)) { + if (S_ISREG(mode)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } - insert_inode_hash(inode); + btrfs_insert_inode_hash(inode); inode_tree_add(inode); + + trace_btrfs_inode_new(inode); + btrfs_set_inode_last_trans(trans, inode); + + btrfs_update_root_times(trans, root); + + ret = btrfs_inode_inherit_props(trans, inode, dir); + if (ret) + btrfs_err(root->fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(inode), root->root_key.objectid, ret); + return inode; fail: - if (dir) + if (dir && name) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); iput(inode); @@ -4191,58 +5732,81 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, int ret = 0; struct btrfs_key key; struct btrfs_root *root = BTRFS_I(parent_inode)->root; + u64 ino = btrfs_ino(inode); + u64 parent_ino = btrfs_ino(parent_inode); - if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); } else { - key.objectid = inode->i_ino; + key.objectid = ino; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; } - if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, key.objectid, root->root_key.objectid, - parent_inode->i_ino, - index, name, name_len); + parent_ino, index, name, name_len); } else if (add_backref) { - ret = btrfs_insert_inode_ref(trans, root, - name, name_len, inode->i_ino, - parent_inode->i_ino, index); + ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, + parent_ino, index); } - if (ret == 0) { - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode->i_ino, &key, - btrfs_inode_type(inode), index); - BUG_ON(ret); + /* Nothing to clean up yet */ + if (ret) + return ret; - btrfs_i_size_write(parent_inode, parent_inode->i_size + - name_len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, parent_inode); + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode, &key, + btrfs_inode_type(inode), index); + if (ret == -EEXIST || ret == -EOVERFLOW) + goto fail_dir_item; + else if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + inode_inc_iversion(parent_inode); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; + +fail_dir_item: + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + u64 local_index; + int err; + err = btrfs_del_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_ino, &local_index, name, name_len); + + } else if (add_backref) { + u64 local_index; + int err; + + err = btrfs_del_inode_ref(trans, root, name, name_len, + ino, parent_ino, &local_index); } return ret; } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct dentry *dentry, struct inode *inode, - int backref, u64 index) + struct inode *dir, struct dentry *dentry, + struct inode *inode, int backref, u64 index) { - int err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, backref, index); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } + int err = btrfs_add_link(trans, dir, inode, + dentry->d_name.name, dentry->d_name.len, + backref, index); if (err > 0) err = -EEXIST; return err; } static int btrfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -4250,7 +5814,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, int err; int drop_inode = 0; u64 objectid; - unsigned long nr = 0; u64 index = 0; if (!new_valid_dev(rdev)) @@ -4261,68 +5824,63 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; - - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; - btrfs_set_trans_block_group(trans, dir); + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; + err = btrfs_find_free_ino(root, &objectid); + if (err) goto out_unlock; - } inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, - BTRFS_I(dir)->block_group, mode, &index); - err = PTR_ERR(inode); - if (IS_ERR(inode)) + dentry->d_name.len, btrfs_ino(dir), objectid, + mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto out_unlock; + } - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + + inode->i_op = &btrfs_special_inode_operations; + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { - inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); + d_instantiate(dentry, inode); } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_unlock: - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); -fail: - btrfs_unreserve_metadata_space(root, 5); + btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); return err; } static int btrfs_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, bool excl) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; + int drop_inode_on_err = 0; int err; - int drop_inode = 0; - unsigned long nr = 0; u64 objectid; u64 index = 0; @@ -4331,59 +5889,57 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; - btrfs_set_trans_block_group(trans, dir); + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto out_unlock; } + drop_inode_on_err = 1; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, - objectid, BTRFS_I(dir)->block_group, mode, - &index); - err = PTR_ERR(inode); - if (IS_ERR(inode)) + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); - if (err) { - drop_inode = 1; + err = btrfs_update_inode(trans, root, inode); + if (err) goto out_unlock; - } - btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); + goto out_unlock; + + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + d_instantiate(dentry, inode); + out_unlock: - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); -fail: - btrfs_unreserve_metadata_space(root, 5); - if (drop_inode) { + btrfs_end_transaction(trans, root); + if (err && drop_inode_on_err) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); return err; } @@ -4394,60 +5950,73 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = old_dentry->d_inode; u64 index; - unsigned long nr = 0; int err; int drop_inode = 0; - if (inode->i_nlink == 0) - return -ENOENT; - /* do not allow sys_link's with other subvols of the same device */ if (root->objectid != BTRFS_I(inode)->root->objectid) - return -EPERM; - - /* - * 1 item for inode ref - * 2 items for dir items - */ - err = btrfs_reserve_metadata_space(root, 3); - if (err) - return err; + return -EXDEV; - btrfs_inc_nlink(inode); + if (inode->i_nlink >= BTRFS_LINK_MAX) + return -EMLINK; err = btrfs_set_inode_index(dir, &index); if (err) goto fail; - trans = btrfs_start_transaction(root, 1); + /* + * 2 items for inode and inode ref + * 2 items for dir items + * 1 item for parent inode + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto fail; + } - btrfs_set_trans_block_group(trans, dir); - atomic_inc(&inode->i_count); + /* There are several dir indexes for this inode, clear the cache. */ + BTRFS_I(inode)->dir_index = 0ULL; + inc_nlink(inode); + inode_inc_iversion(inode); + inode->i_ctime = CURRENT_TIME; + ihold(inode); + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_nondir(trans, dentry, inode, 1, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); if (err) { drop_inode = 1; } else { - btrfs_update_inode_block_group(trans, dir); + struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); - BUG_ON(err); - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + if (err) + goto fail; + if (inode->i_nlink == 1) { + /* + * If new hard link count is 1, it's a file created + * with open(2) O_TMPFILE flag. + */ + err = btrfs_orphan_del(trans, inode); + if (err) + goto fail; + } + d_instantiate(dentry, inode); + btrfs_log_new_name(trans, inode, NULL, parent); } - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); fail: - btrfs_unreserve_metadata_space(root, 3); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } -static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -4456,35 +6025,23 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) int drop_on_err = 0; u64 objectid = 0; u64 index = 0; - unsigned long nr = 1; /* * 2 items for inode and ref * 2 items for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; - - trans = btrfs_start_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - goto out_unlock; - } - btrfs_set_trans_block_group(trans, dir); + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; + err = btrfs_find_free_ino(root, &objectid); + if (err) goto out_fail; - } inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, - BTRFS_I(dir)->block_group, S_IFDIR | mode, - &index); + dentry->d_name.len, btrfs_ino(dir), objectid, + S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fail; @@ -4492,39 +6049,32 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) drop_on_err = 1; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) goto out_fail; inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, root, inode); if (err) goto out_fail; - err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, 0, index); + err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, + dentry->d_name.len, 0, index); if (err) goto out_fail; d_instantiate(dentry, inode); drop_on_err = 0; - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_fail: - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); - -out_unlock: - btrfs_unreserve_metadata_space(root, 5); + btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); - btrfs_btree_balance_dirty(root, nr); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); return err; } @@ -4548,7 +6098,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, em->block_start += start_diff; em->block_len -= start_diff; } - return add_extent_mapping(em_tree, em); + return add_extent_mapping(em_tree, em, 0); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -4562,29 +6112,25 @@ static noinline int uncompress_inline(struct btrfs_path *path, size_t max_size; unsigned long inline_size; unsigned long ptr; + int compress_type; WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, - btrfs_item_nr(leaf, path->slots[0])); + btrfs_item_nr(path->slots[0])); tmp = kmalloc(inline_size, GFP_NOFS); + if (!tmp) + return -ENOMEM; ptr = btrfs_file_extent_inline_start(item); read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); - ret = btrfs_zlib_decompress(tmp, page, extent_offset, - inline_size, max_size); - if (ret) { - char *kaddr = kmap_atomic(page, KM_USER0); - unsigned long copy_size = min_t(u64, - PAGE_CACHE_SIZE - pg_offset, - max_size - extent_offset); - memset(kaddr + pg_offset, 0, copy_size); - kunmap_atomic(kaddr, KM_USER0); - } + ret = btrfs_decompress(compress_type, tmp, page, + extent_offset, inline_size, max_size); kfree(tmp); - return 0; + return ret; } /* @@ -4602,10 +6148,9 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, { int ret; int err = 0; - u64 bytenr; u64 extent_start = 0; u64 extent_end = 0; - u64 objectid = inode->i_ino; + u64 objectid = btrfs_ino(inode); u32 found_type; struct btrfs_path *path = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4616,7 +6161,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compressed; + const bool new_inline = !page || create; again: read_lock(&em_tree->lock); @@ -4633,7 +6178,7 @@ again: else goto out; } - em = alloc_extent_map(GFP_NOFS); + em = alloc_extent_map(); if (!em) { err = -ENOMEM; goto out; @@ -4646,7 +6191,15 @@ again: if (!path) { path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + goto out; + } + /* + * Chances are we'll be called again, so go ahead and do + * readahead + */ + path->reada = 1; } ret = btrfs_lookup_file_extent(trans, root, path, @@ -4670,23 +6223,28 @@ again: found_type = btrfs_key_type(&found_key); if (found_key.objectid != objectid || found_type != BTRFS_EXTENT_DATA_KEY) { - goto not_found; + /* + * If we backup past the first extent we want to move forward + * and see if there is an extent in front of us, otherwise we'll + * say there is a hole for our whole search range which can + * cause problems. + */ + extent_end = start; + goto next; } found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compressed = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; - size = btrfs_file_extent_inline_len(leaf, item); - extent_end = (extent_start + size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); + extent_end = ALIGN(extent_start + size, root->sectorsize); } - +next: if (start >= extent_end) { path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -4706,33 +6264,15 @@ again: if (start + len <= found_key.offset) goto not_found; em->start = start; + em->orig_start = start; em->len = found_key.offset - start; goto not_found_em; } + btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); + if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - em->start = extent_start; - em->len = extent_end - extent_start; - em->orig_start = extent_start - - btrfs_file_extent_offset(leaf, item); - bytenr = btrfs_file_extent_disk_bytenr(leaf, item); - if (bytenr == 0) { - em->block_start = EXTENT_MAP_HOLE; - goto insert; - } - if (compressed) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->block_start = bytenr; - em->block_len = btrfs_file_extent_disk_num_bytes(leaf, - item); - } else { - bytenr += btrfs_file_extent_offset(leaf, item); - em->block_start = bytenr; - em->block_len = em->len; - if (found_type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); - } goto insert; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; @@ -4741,31 +6281,28 @@ again: size_t extent_offset; size_t copy_size; - em->block_start = EXTENT_MAP_INLINE; - if (!page || create) { - em->start = extent_start; - em->len = extent_end - extent_start; + if (new_inline) goto out; - } - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); em->start = extent_start + extent_offset; - em->len = (copy_size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - em->orig_start = EXTENT_MAP_INLINE; - if (compressed) - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->len = ALIGN(copy_size, root->sectorsize); + em->orig_block_len = em->len; + em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) == - BTRFS_COMPRESS_ZLIB) { + if (btrfs_file_extent_compression(leaf, item) != + BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); - BUG_ON(ret); + if (ret) { + err = ret; + goto out; + } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -4779,12 +6316,17 @@ again: } flush_dcache_page(page); } else if (create && PageUptodate(page)) { + BUG(); if (!trans) { kunmap(page); free_extent_map(em); em = NULL; - btrfs_release_path(root, path); - trans = btrfs_join_transaction(root, 1); + + btrfs_release_path(path); + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) + return ERR_CAST(trans); goto again; } map = kmap(page); @@ -4794,33 +6336,28 @@ again: btrfs_mark_buffer_dirty(leaf); } set_extent_uptodate(io_tree, em->start, - extent_map_end(em) - 1, GFP_NOFS); + extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; - } else { - printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); - WARN_ON(1); } not_found: em->start = start; + em->orig_start = start; em->len = len; not_found_em: em->block_start = EXTENT_MAP_HOLE; set_bit(EXTENT_FLAG_VACANCY, &em->flags); insert: - btrfs_release_path(root, path); + btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { - printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " - "[%llu %llu]\n", (unsigned long long)em->start, - (unsigned long long)em->len, - (unsigned long long)start, - (unsigned long long)len); + btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", + em->start, em->len, start, len); err = -EIO; goto out; } err = 0; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that * an overlapping map exists in the tree @@ -4861,6 +6398,9 @@ insert: } write_unlock(&em_tree->lock); out: + + trace_btrfs_get_extent(root, em); + if (path) btrfs_free_path(path); if (trans) { @@ -4872,27 +6412,1179 @@ out: free_extent_map(em); return ERR_PTR(err); } + BUG_ON(!em); /* Error is always set */ + return em; +} + +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. + */ + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len == -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start, range_start); + found = found_end - range_start; + + if (found > 0) { + u64 hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map *em; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, + alloc_hint, &ins, 1, 1); + if (ret) + return ERR_PTR(ret); + + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, ins.offset, 0); + if (IS_ERR(em)) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + return em; + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + free_extent_map(em); + return ERR_PTR(ret); + } + return em; } +/* + * returns 1 when the nocow is safe, < 1 on error, 0 if the + * block must be cow'd + */ +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + int ret; + struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_bytenr; + u64 backref_offset; + u64 extent_end; + u64 num_bytes; + int slot; + int found_type; + bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); + if (ret < 0) + goto out; + + slot = path->slots[0]; + if (ret == 1) { + if (slot == 0) { + /* can't find the item, must cow */ + ret = 0; + goto out; + } + slot--; + } + ret = 0; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + /* not our file or wrong item type, must cow */ + goto out; + } + + if (key.offset > offset) { + /* Wrong offset, must cow */ + goto out; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (found_type != BTRFS_FILE_EXTENT_REG && + found_type != BTRFS_FILE_EXTENT_PREALLOC) { + /* not a regular extent, must cow */ + goto out; + } + + if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) + goto out; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= offset) + goto out; + + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + goto out; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + + backref_offset = btrfs_file_extent_offset(leaf, fi); + + if (orig_start) { + *orig_start = key.offset - backref_offset; + *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + } + + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out; + + num_bytes = min(offset + *len, extent_end) - offset; + if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 range_end; + + range_end = round_up(offset + num_bytes, root->sectorsize) - 1; + ret = test_range_bit(io_tree, offset, range_end, + EXTENT_DELALLOC, 0, NULL); + if (ret) { + ret = -EAGAIN; + goto out; + } + } + + btrfs_release_path(path); + + /* + * look for other files referencing this extent, if we + * find any we must cow + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = 0; + goto out; + } + + ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), + key.offset - backref_offset, disk_bytenr); + btrfs_end_transaction(trans, root); + if (ret) { + ret = 0; + goto out; + } + + /* + * adjust disk_bytenr and num_bytes to cover just the bytes + * in this extent we are about to write. If there + * are any csums in that range we have to cow in order + * to keep the csums correct + */ + disk_bytenr += backref_offset; + disk_bytenr += offset - key.offset; + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out; + /* + * all of the above have passed, it is safe to overwrite this extent + * without cow + */ + *len = num_bytes; + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) +{ + struct radix_tree_root *root = &inode->i_mapping->page_tree; + int found = false; + void **pagep = NULL; + struct page *page = NULL; + int start_idx; + int end_idx; + + start_idx = start >> PAGE_CACHE_SHIFT; + + /* + * end is the last byte in the last page. end == start is legal + */ + end_idx = end >> PAGE_CACHE_SHIFT; + + rcu_read_lock(); + + /* Most of the code in this while loop is lifted from + * find_get_page. It's been modified to begin searching from a + * page and return just the first page found in that range. If the + * found idx is less than or equal to the end idx then we know that + * a page exists. If no pages are found or if those pages are + * outside of the range then we're fine (yay!) */ + while (page == NULL && + radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page)) + break; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + page = NULL; + continue; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + page = NULL; + break; /* TODO: Is this relevant for this use case? */ + } + + if (!page_cache_get_speculative(page)) { + page = NULL; + continue; + } + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + page = NULL; + } + } + + if (page) { + if (page->index <= end_idx) + found = true; + page_cache_release(page); + } + + rcu_read_unlock(); + return found; +} + +static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, + struct extent_state **cached_state, int writing) +{ + struct btrfs_ordered_extent *ordered; + int ret = 0; + + while (1) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, cached_state); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + + /* + * We need to make sure there are no buffered pages in this + * range either, we could have raced between the invalidate in + * generic_file_direct_write and locking the extent. The + * invalidate needs to happen so that reads after a write do not + * get stale data. + */ + if (!ordered && + (!writing || + !btrfs_page_exists_in_range(inode, lockstart, lockend))) + break; + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state, GFP_NOFS); + + if (ordered) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } else { + /* Screw you mmap */ + ret = filemap_write_and_wait_range(inode->i_mapping, + lockstart, + lockend); + if (ret) + break; + + /* + * If we found a page that couldn't be invalidated just + * fall back to buffered. + */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + lockstart >> PAGE_CACHE_SHIFT, + lockend >> PAGE_CACHE_SHIFT); + if (ret) + break; + } + + cond_resched(); + } + + return ret; +} + +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, + u64 len, u64 orig_start, + u64 block_start, u64 block_len, + u64 orig_block_len, u64 ram_bytes, + int type) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(); + if (!em) + return ERR_PTR(-ENOMEM); + + em->start = start; + em->orig_start = orig_start; + em->mod_start = start; + em->mod_len = len; + em->len = len; + em->block_len = block_len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->orig_block_len = orig_block_len; + em->ram_bytes = ram_bytes; + em->generation = -1; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + if (type == BTRFS_ORDERED_PREALLOC) + set_bit(EXTENT_FLAG_FILLING, &em->flags); + + do { + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + } while (ret == -EEXIST); + + if (ret) { + free_extent_map(em); + return ERR_PTR(ret); + } + + return em; +} + + +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_state *cached_state = NULL; + u64 start = iblock << inode->i_blkbits; + u64 lockstart, lockend; + u64 len = bh_result->b_size; + int unlock_bits = EXTENT_LOCKED; + int ret = 0; + + if (create) + unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; + else + len = min_t(u64, len, root->sectorsize); + + lockstart = start; + lockend = start + len - 1; + + /* + * If this errors out it's because we couldn't invalidate pagecache for + * this range and we need to fallback to buffered. + */ + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) + return -ENOTBLK; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + ret = -ENOTBLK; + goto unlock_err; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + goto unlock_err; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) { + len = min(len, em->len - (start - em->start)); + lockstart = start + len; + goto unlock; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + int ret; + u64 block_start, orig_start, orig_block_len, ram_bytes; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + if (can_nocow_extent(inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes) == 1) { + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + em = create_pinned_em(inode, start, len, + orig_start, + block_start, len, + orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) + goto unlock_err; + } + + ret = btrfs_add_ordered_extent_dio(inode, start, + block_start, len, len, type); + if (ret) { + free_extent_map(em); + goto unlock_err; + } + goto unlock; + } + } + + /* + * this will cow the extent, reset the len in case we changed + * it above + */ + len = bh_result->b_size; + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } + len = min(len, em->len - (start - em->start)); +unlock: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create) { + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. + */ + if (start + len > i_size_read(inode)) + i_size_write(inode, start + len); + + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockstart + len - 1, EXTENT_DELALLOC, NULL, + &cached_state, GFP_NOFS); + BUG_ON(ret); + } + + /* + * In the case of write we need to clear and unlock the entire range, + * in the case of read we need to unlock only the end area that we + * aren't using if there is any left over space. + */ + if (lockstart < lockend) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state, GFP_NOFS); + } else { + free_extent_state(cached_state); + } + + free_extent_map(em); + + return 0; + +unlock_err: + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, 1, 0, &cached_state, GFP_NOFS); + return ret; +} + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct bio_vec *bvec; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct bio *dio_bio; + u32 *csums = (u32 *)dip->csum; + u64 start; + int i; + + start = dip->logical_offset; + bio_for_each_segment_all(bvec, bio, i) { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + struct page *page = bvec->bv_page; + char *kaddr; + u32 csum = ~(u32)0; + unsigned long flags; + + local_irq_save(flags); + kaddr = kmap_atomic(page); + csum = btrfs_csum_data(kaddr + bvec->bv_offset, + csum, bvec->bv_len); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr); + local_irq_restore(flags); + + flush_dcache_page(bvec->bv_page); + if (csum != csums[i]) { + btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(inode), start, csum, + csums[i]); + err = -EIO; + } + } + + start += bvec->bv_len; + } + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1); + dio_bio = dip->dio_bio; + + kfree(dip); + + /* If we had a csum failure make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); +} + +static void btrfs_endio_direct_write(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered = NULL; + u64 ordered_offset = dip->logical_offset; + u64 ordered_bytes = dip->bytes; + struct bio *dio_bio; + int ret; + + if (err) + goto out_done; +again: + ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes, !err); + if (!ret) + goto out_test; + + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(root->fs_info->endio_write_workers, + &ordered->work); +out_test: + /* + * our bio might span multiple ordered extents. If we haven't + * completed the accounting for the whole dio, go back and try again + */ + if (ordered_offset < dip->logical_offset + dip->bytes) { + ordered_bytes = dip->logical_offset + dip->bytes - + ordered_offset; + ordered = NULL; + goto again; + } +out_done: + dio_bio = dip->dio_bio; + + kfree(dip); + + /* If we had an error make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); +} + +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 offset) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + BUG_ON(ret); /* -ENOMEM */ + return 0; +} + +static void btrfs_end_dio_bio(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + + if (err) { + btrfs_err(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio->bi_rw, + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); + dip->errors = 1; + + /* + * before atomic variable goto zero, we must make sure + * dip->errors is perceived to be set. + */ + smp_mb__before_atomic(); + } + + /* if there are more bios still pending for this dio, just exit */ + if (!atomic_dec_and_test(&dip->pending_bios)) + goto out; + + if (dip->errors) { + bio_io_error(dip->orig_bio); + } else { + set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); + bio_endio(dip->orig_bio, 0); + } +out: + bio_put(bio); +} + +static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, + u64 first_sector, gfp_t gfp_flags) +{ + int nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); +} + +static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + int rw, u64 file_offset, int skip_sum, + int async_submit) +{ + struct btrfs_dio_private *dip = bio->bi_private; + int write = rw & REQ_WRITE; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (async_submit) + async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); + + bio_get(bio); + + if (!write) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + } + + if (skip_sum) + goto map; + + if (write && async_submit) { + ret = btrfs_wq_submit_bio(root->fs_info, + inode, rw, bio, 0, 0, + file_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + goto err; + } else if (write) { + /* + * If we aren't doing async submit, calculate the csum of the + * bio now. + */ + ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); + if (ret) + goto err; + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, + file_offset); + if (ret) + goto err; + } + +map: + ret = btrfs_map_bio(root, rw, bio, 0, async_submit); +err: + bio_put(bio); + return ret; +} + +static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, + int skip_sum) +{ + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct bio *bio; + struct bio *orig_bio = dip->orig_bio; + struct bio_vec *bvec = orig_bio->bi_io_vec; + u64 start_sector = orig_bio->bi_iter.bi_sector; + u64 file_offset = dip->logical_offset; + u64 submit_len = 0; + u64 map_length; + int nr_pages = 0; + int ret = 0; + int async_submit = 0; + + map_length = orig_bio->bi_iter.bi_size; + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(orig_bio); + return -EIO; + } + + if (map_length >= orig_bio->bi_iter.bi_size) { + bio = orig_bio; + goto submit; + } + + /* async crcs make it difficult to collect full stripe writes. */ + if (btrfs_get_alloc_profile(root, 1) & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) + async_submit = 0; + else + async_submit = 1; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + atomic_inc(&dip->pending_bios); + + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { + if (unlikely(map_length < submit_len + bvec->bv_len || + bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len)) { + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the dip might get freed + * before we're done setting it up + */ + atomic_inc(&dip->pending_bios); + ret = __btrfs_submit_dio_bio(bio, inode, rw, + file_offset, skip_sum, + async_submit); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } + + start_sector += submit_len >> 9; + file_offset += submit_len; + + submit_len = 0; + nr_pages = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, + start_sector, GFP_NOFS); + if (!bio) + goto out_err; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + + map_length = orig_bio->bi_iter.bi_size; + ret = btrfs_map_block(root->fs_info, rw, + start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + goto out_err; + } + } else { + submit_len += bvec->bv_len; + nr_pages++; + bvec++; + } + } + +submit: + ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + async_submit); + if (!ret) + return 0; + + bio_put(bio); +out_err: + dip->errors = 1; + /* + * before atomic variable goto zero, we must + * make sure dip->errors is perceived to be set. + */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&dip->pending_bios)) + bio_io_error(dip->orig_bio); + + /* bio_end_io() will handle error, so we needn't return it */ + return 0; +} + +static void btrfs_submit_direct(int rw, struct bio *dio_bio, + struct inode *inode, loff_t file_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio *io_bio; + int skip_sum; + int sum_len; + int write = rw & REQ_WRITE; + int ret = 0; + u16 csum_size; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); + if (!io_bio) { + ret = -ENOMEM; + goto free_ordered; + } + + if (!skip_sum && !write) { + csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + sum_len = dio_bio->bi_iter.bi_size >> + inode->i_sb->s_blocksize_bits; + sum_len *= csum_size; + } else { + sum_len = 0; + } + + dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_io_bio; + } + + dip->private = dio_bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; + io_bio->bi_private = dip; + dip->errors = 0; + dip->orig_bio = io_bio; + dip->dio_bio = dio_bio; + atomic_set(&dip->pending_bios, 0); + + if (write) + io_bio->bi_end_io = btrfs_endio_direct_write; + else + io_bio->bi_end_io = btrfs_endio_direct_read; + + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + if (!ret) + return; + +free_io_bio: + bio_put(io_bio); + +free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len, 1); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(dio_bio, ret); +} + +static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, + const struct iov_iter *iter, loff_t offset) +{ + int seg; + int i; + unsigned blocksize_mask = root->sectorsize - 1; + ssize_t retval = -EINVAL; + + if (offset & blocksize_mask) + goto out; + + if (iov_iter_alignment(iter) & blocksize_mask) + goto out; + + /* If this is a write we don't need to check anymore */ + if (rw & WRITE) + return 0; + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. + */ + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + goto out; + } + } + retval = 0; +out: + return retval; +} + static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { - return -EINVAL; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + size_t count = 0; + int flags = 0; + bool wakeup = true; + bool relock = false; + ssize_t ret; + + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset)) + return 0; + + atomic_inc(&inode->i_dio_count); + smp_mb__after_atomic(); + + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so + * we need to flush the dirty pages again to make absolutely sure + * that any outstanding dirty pages are on disk. + */ + count = iov_iter_count(iter); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, offset, count); + + if (rw & WRITE) { + /* + * If the write DIO is beyond the EOF, we need update + * the isize, but it is protected by i_mutex. So we can + * not unlock the i_mutex at this case. + */ + if (offset + count <= inode->i_size) { + mutex_unlock(&inode->i_mutex); + relock = true; + } + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; + } + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iter, offset, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (rw & WRITE) { + if (ret < 0 && ret != -EIOCBQUEUED) + btrfs_delalloc_release_space(inode, count); + else if (ret >= 0 && (size_t)ret < count) + btrfs_delalloc_release_space(inode, + count - (size_t)ret); + else + btrfs_delalloc_release_metadata(inode, 0); + } +out: + if (wakeup) + inode_dio_done(inode); + if (relock) + mutex_lock(&inode->i_mutex); + + return ret; } +#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); + int ret; + + ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + if (ret) + return ret; + + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } int btrfs_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent); + return extent_read_full_page(tree, page, btrfs_get_extent, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -4909,8 +7601,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) return extent_write_full_page(tree, page, btrfs_get_extent, wbc); } -int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { struct extent_io_tree *tree; @@ -4951,14 +7643,16 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } -static void btrfs_invalidatepage(struct page *page, unsigned long offset) +static void btrfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { + struct inode *inode = page->mapping->host; struct extent_io_tree *tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; - + int inode_evicting = inode->i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -4969,41 +7663,65 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ wait_on_page_writeback(page); - tree = &BTRFS_I(page->mapping->host)->io_tree; + tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); - ordered = btrfs_lookup_ordered_extent(page->mapping->host, - page_offset(page)); + + if (!inode_evicting) + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* * IO on this page will never be started, so we need * to account for any ordered extents now */ - clear_extent_bit(tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, - &cached_state, GFP_NOFS); + if (!inode_evicting) + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, &cached_state, + GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io */ if (TestClearPagePrivate2(page)) { - btrfs_finish_ordered_io(page->mapping->host, - page_start, page_end); + struct btrfs_ordered_inode_tree *tree; + u64 new_len; + + tree = &BTRFS_I(inode)->ordered_tree; + + spin_lock_irq(&tree->lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + new_len = page_start - ordered->file_offset; + if (new_len < ordered->truncated_len) + ordered->truncated_len = new_len; + spin_unlock_irq(&tree->lock); + + if (btrfs_dec_test_ordered_pending(inode, &ordered, + page_start, + PAGE_CACHE_SIZE, 1)) + btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); - cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + if (!inode_evicting) { + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, + &cached_state); + } + } + + if (!inode_evicting) { + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + + __btrfs_releasepage(page, GFP_NOFS); } - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); - __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); if (PagePrivate(page)) { @@ -5031,7 +7749,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct inode *inode = file_inode(vma->vm_file); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; @@ -5040,23 +7758,24 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long zero_start; loff_t size; int ret; + int reserved = 0; u64 page_start; u64 page_end; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + sb_start_pagefault(inode->i_sb); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (!ret) { + ret = file_update_time(vma->vm_file); + reserved = 1; + } if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; else /* -ENOSPC, -EIO, etc */ ret = VM_FAULT_SIGBUS; - goto out; - } - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - ret = VM_FAULT_SIGBUS; - goto out; + if (reserved) + goto out; + goto out_noreserve; } ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ @@ -5068,14 +7787,12 @@ again: if ((page->mapping != inode->i_mapping) || (page_start >= size)) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); set_page_extent_mapped(page); /* @@ -5100,7 +7817,8 @@ again: * prepare_pages in the normal write path. */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); ret = btrfs_set_extent_delalloc(inode, page_start, page_end, @@ -5109,7 +7827,6 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); ret = VM_FAULT_SIGBUS; - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); goto out_unlock; } ret = 0; @@ -5132,40 +7849,94 @@ again: BTRFS_I(inode)->last_trans = root->fs_info->generation; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); - if (!ret) + if (!ret) { + sb_end_pagefault(inode->i_sb); return VM_FAULT_LOCKED; + } unlock_page(page); out: + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out_noreserve: + sb_end_pagefault(inode->i_sb); return ret; } -static void btrfs_truncate(struct inode *inode) +static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; + struct btrfs_block_rsv *rsv; + int ret = 0; + int err = 0; struct btrfs_trans_handle *trans; - unsigned long nr; u64 mask = root->sectorsize - 1; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - if (!S_ISREG(inode->i_mode)) { - WARN_ON(1); - return; - } - - ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + (u64)-1); if (ret) - return; + return ret; - btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + /* + * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * 3 things going on here + * + * 1) We need to reserve space for our orphan item and the space to + * delete our orphan item. Lord knows we don't want to have a dangling + * orphan item because we didn't reserve space to remove it. + * + * 2) We need to reserve space to update our inode. + * + * 3) We need to have something to cache all the space that is going to + * be free'd up by the truncate operation, but also have some slack + * space reserved in case it uses space during the truncate (thank you + * very much snapshotting). + * + * And we need these to all be seperate. The fact is we can use alot of + * space doing the truncate, and we have no earthly idea how much space + * we will use, so we need the truncate reservation to be seperate so it + * doesn't end up using space reserved for updating the inode or + * removing the orphan item. We also need to be able to stop the + * transaction and start a new one, which means we need to be able to + * update the inode several times, and we have no idea of knowing how + * many times that will be, so we can't just reserve 1 item for the + * entirety of the opration, so that has to be done seperately as well. + * Then there is the orphan item, which does indeed need to be held on + * to for the whole operation, and we need nobody to touch this reserved + * space except the orphan code. + * + * So that leaves us with + * + * 1) root->orphan_block_rsv - for the orphan deletion. + * 2) rsv - for the truncate reservation, which we will steal from the + * transaction reservation. + * 3) fs_info->trans_block_rsv - this will have 1 items worth left for + * updating the inode. + */ + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) + return -ENOMEM; + rsv->size = min_size; + rsv->failfast = 1; + + /* + * 1 for the truncate slack space + * 1 for updating the inode. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); + /* Migrate the slack space for the truncate to our reserve */ + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); + BUG_ON(ret); /* * setattr is responsible for setting the ordered_data_close flag, @@ -5184,39 +7955,76 @@ static void btrfs_truncate(struct inode *inode) * using truncate to replace the contents of the file will * end up with a zero length file after a crash. */ - if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) btrfs_add_ordered_operation(trans, root, inode); + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + */ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + trans->block_rsv = rsv; + while (1) { ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -EAGAIN) + if (ret != -ENOSPC) { + err = ret; break; + } + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + if (ret) { + err = ret; + break; + } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = err = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + rsv, min_size); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; } if (ret == 0 && inode->i_nlink > 0) { + trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); + if (ret) + err = ret; } - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + if (trans) { + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; - nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - BUG_ON(ret); - btrfs_btree_balance_dirty(root, nr); + ret = btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); + } + +out: + btrfs_free_block_rsv(root, rsv); + + if (ret && !err) + err = ret; + + return err; } /* @@ -5224,60 +8032,98 @@ static void btrfs_truncate(struct inode *inode) */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - u64 new_dirid, u64 alloc_hint) + struct btrfs_root *parent_root, + u64 new_dirid) { struct inode *inode; int err; u64 index = 0; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, - new_dirid, alloc_hint, S_IFDIR | 0700, &index); + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, + new_dirid, new_dirid, + S_IFDIR | (~current_umask() & S_IRWXUGO), + &index); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - inode->i_nlink = 1; + set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + err = btrfs_subvol_inherit_props(trans, new_root, parent_root); + if (err) + btrfs_err(new_root->fs_info, + "error inheriting subvolume %llu properties: %d", + new_root->root_key.objectid, err); + err = btrfs_update_inode(trans, new_root, inode); - BUG_ON(err); iput(inode); - return 0; -} - -/* helper function for file defrag and space balancing. This - * forces readahead on a given range of bytes in an inode - */ -unsigned long btrfs_force_ra(struct address_space *mapping, - struct file_ra_state *ra, struct file *file, - pgoff_t offset, pgoff_t last_index) -{ - pgoff_t req_size = last_index - offset + 1; - - page_cache_sync_readahead(mapping, ra, file, offset, req_size); - return offset + req_size; + return err; } struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_inode *ei; + struct inode *inode; ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; + + ei->root = NULL; + ei->generation = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; + ei->delalloc_bytes = 0; + ei->disk_i_size = 0; + ei->flags = 0; + ei->csum_bytes = 0; + ei->index_cnt = (u64)-1; + ei->dir_index = 0; + ei->last_unlink_trans = 0; + ei->last_log_commit = 0; + + spin_lock_init(&ei->lock); ei->outstanding_extents = 0; ei->reserved_extents = 0; - ei->root = NULL; - spin_lock_init(&ei->accounting_lock); + + ei->runtime_flags = 0; + ei->force_compress = BTRFS_COMPRESS_NONE; + + ei->delayed_node = NULL; + + inode = &ei->vfs_inode; + extent_map_tree_init(&ei->extent_tree); + extent_io_tree_init(&ei->io_tree, &inode->i_data); + extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + ei->io_tree.track_uptodate = 1; + ei->io_failure_tree.track_uptodate = 1; + atomic_set(&ei->sync_writers, 0); + mutex_init(&ei->log_mutex); + mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); - INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->ordered_operations); - return &ei->vfs_inode; + RB_CLEAR_NODE(&ei->rb_node); + + return inode; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_destroy_inode(struct inode *inode) +{ + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} +#endif + +static void btrfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } void btrfs_destroy_inode(struct inode *inode) @@ -5285,8 +8131,12 @@ void btrfs_destroy_inode(struct inode *inode) struct btrfs_ordered_extent *ordered; struct btrfs_root *root = BTRFS_I(inode)->root; - WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(!hlist_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(BTRFS_I(inode)->outstanding_extents); + WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->csum_bytes); /* * This can happen where we create an inode, but somebody else also @@ -5302,28 +8152,25 @@ void btrfs_destroy_inode(struct inode *inode) */ smp_mb(); if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); } - spin_lock(&root->list_lock); - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", - inode->i_ino); - list_del_init(&BTRFS_I(inode)->i_orphan); + if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) { + btrfs_info(root->fs_info, "inode %llu still on the orphan list", + btrfs_ino(inode)); + atomic_dec(&root->orphan_inodes); } - spin_unlock(&root->list_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) break; else { - printk(KERN_ERR "btrfs found ordered " - "extent %llu %llu on inode cleanup\n", - (unsigned long long)ordered->file_offset, - (unsigned long long)ordered->len); + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", + ordered->file_offset, ordered->len); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -5332,16 +8179,21 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: - kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + call_rcu(&inode->i_rcu, btrfs_i_callback); } -void btrfs_drop_inode(struct inode *inode) +int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) - generic_delete_inode(inode); + + if (root == NULL) + return 1; + + /* the snap/subvol tree is on deleting */ + if (btrfs_root_refs(&root->root_item) == 0) + return 1; else - generic_drop_inode(inode); + return generic_drop_inode(inode); } static void init_once(void *foo) @@ -5353,6 +8205,11 @@ static void init_once(void *foo) void btrfs_destroy_cachep(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); if (btrfs_inode_cachep) kmem_cache_destroy(btrfs_inode_cachep); if (btrfs_trans_handle_cachep) @@ -5361,34 +8218,51 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_transaction_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); + if (btrfs_free_space_cachep) + kmem_cache_destroy(btrfs_free_space_cachep); + if (btrfs_delalloc_work_cachep) + kmem_cache_destroy(btrfs_delalloc_work_cachep); } int btrfs_init_cachep(void) { - btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", sizeof(struct btrfs_transaction), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", + sizeof(struct btrfs_free_space), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + goto fail; + + btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", + sizeof(struct btrfs_delalloc_work), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_delalloc_work_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -5398,12 +8272,19 @@ fail: static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { + u64 delalloc_bytes; struct inode *inode = dentry->d_inode; + u32 blocksize = inode->i_sb->s_blocksize; + generic_fillattr(inode, stat); - stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; + stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (inode_get_bytes(inode) + - BTRFS_I(inode)->delalloc_bytes) >> 9; + + spin_lock(&BTRFS_I(inode)->lock); + delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + spin_unlock(&BTRFS_I(inode)->lock); + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(delalloc_bytes, blocksize)) >> 9; return 0; } @@ -5419,33 +8300,42 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, u64 index = 0; u64 root_objectid; int ret; + u64 old_ino = btrfs_ino(old_inode); - if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; /* we only allow rename subvolume link between subvolumes */ - if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; - if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || - (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) + if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || + (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) return -ENOTEMPTY; if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - */ - ret = btrfs_reserve_metadata_space(root, 11); - if (ret) - return ret; + + /* check for collisions, even if the name isn't there */ + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, + new_dentry->d_name.name, + new_dentry->d_name.len); + + if (ret) { + if (ret == -EEXIST) { + /* we shouldn't get + * eexist without a new_inode */ + if (WARN_ON(!new_inode)) { + return ret; + } + } else { + /* maybe -EOVERFLOW */ + return ret; + } + } + ret = 0; /* * we're using rename to replace one file with another. @@ -5457,11 +8347,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, filemap_flush(old_inode->i_mapping); /* close the racy window with snapshot create/destroy ioctl */ - if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&root->fs_info->subvol_sem); - - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, new_dir); + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 11); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -5470,15 +8370,16 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; - if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + BTRFS_I(old_inode)->dir_index = 0ULL; + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); } else { ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, - old_inode->i_ino, - new_dir->i_ino, index); + old_ino, + btrfs_ino(new_dir), index); if (ret) goto out_fail; /* @@ -5494,11 +8395,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * make sure the inode gets flushed if it is replacing * something. */ - if (new_inode && new_inode->i_size && - old_inode && S_ISREG(old_inode->i_mode)) { + if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) btrfs_add_ordered_operation(trans, root, old_inode); - } + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; @@ -5506,23 +8408,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); - if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, old_dentry->d_name.name, old_dentry->d_name.len); } else { - btrfs_inc_nlink(old_dentry->d_inode); - ret = btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; } - BUG_ON(ret); if (new_inode) { + inode_inc_iversion(new_inode); new_inode->i_ctime = CURRENT_TIME; - if (unlikely(new_inode->i_ino == + if (unlikely(btrfs_ino(new_inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { root_objectid = BTRFS_I(new_inode)->location.objectid; ret = btrfs_unlink_subvol(trans, dest, new_dir, @@ -5536,67 +8443,168 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len); } - BUG_ON(ret); - if (new_inode->i_nlink == 0) { + if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, new_dentry->d_inode); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; } } ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = index; - if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_log_new_name(trans, old_inode, old_dir, - new_dentry->d_parent); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + struct dentry *parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); btrfs_end_log_trans(root); } out_fail: - btrfs_end_transaction_throttle(trans, root); - - if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_transaction(trans, root); +out_notrans: + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); - btrfs_unreserve_metadata_space(root, 11); return ret; } +static void btrfs_run_delalloc_work(struct btrfs_work *work) +{ + struct btrfs_delalloc_work *delalloc_work; + struct inode *inode; + + delalloc_work = container_of(work, struct btrfs_delalloc_work, + work); + inode = delalloc_work->inode; + if (delalloc_work->wait) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + } else { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + + if (delalloc_work->delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + complete(&delalloc_work->completion); +} + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput) +{ + struct btrfs_delalloc_work *work; + + work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + if (!work) + return NULL; + + init_completion(&work->completion); + INIT_LIST_HEAD(&work->list); + work->inode = inode; + work->wait = wait; + work->delay_iput = delay_iput; + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + + return work; +} + +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) +{ + wait_for_completion(&work->completion); + kmem_cache_free(btrfs_delalloc_work_cachep, work); +} + /* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, + int nr) { - struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; + struct btrfs_delalloc_work *work, *next; + struct list_head works; + struct list_head splice; + int ret = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + INIT_LIST_HEAD(&works); + INIT_LIST_HEAD(&splice); - spin_lock(&root->fs_info->delalloc_lock); - while (!list_empty(head)) { - binode = list_entry(head->next, struct btrfs_inode, + mutex_lock(&root->delalloc_mutex); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); + while (!list_empty(&splice)) { + binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); + + list_move_tail(&binode->delalloc_inodes, + &root->delalloc_inodes); inode = igrab(&binode->vfs_inode); - if (!inode) - list_del_init(&binode->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); - if (inode) { - filemap_flush(inode->i_mapping); + if (!inode) { + cond_resched_lock(&root->delalloc_lock); + continue; + } + spin_unlock(&root->delalloc_lock); + + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (unlikely(!work)) { if (delay_iput) btrfs_add_delayed_iput(inode); else iput(inode); + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); + ret++; + if (nr != -1 && ret >= nr) + goto out; cond_resched(); - spin_lock(&root->fs_info->delalloc_lock); + spin_lock(&root->delalloc_lock); + } + spin_unlock(&root->delalloc_lock); + +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); } - spin_unlock(&root->fs_info->delalloc_lock); - /* the filemap_flush will queue IO into the worker threads, but + if (!list_empty_careful(&splice)) { + spin_lock(&root->delalloc_lock); + list_splice_tail(&splice, &root->delalloc_inodes); + spin_unlock(&root->delalloc_lock); + } + mutex_unlock(&root->delalloc_mutex); + return ret; +} + +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ + int ret; + + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + return -EROFS; + + ret = __start_delalloc_inodes(root, delay_iput, -1); + if (ret > 0) + ret = 0; + /* + * the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return */ @@ -5608,7 +8616,63 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - return 0; + return ret; +} + +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr) +{ + struct btrfs_root *root; + struct list_head splice; + int ret; + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + return -EROFS; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&fs_info->delalloc_root_mutex); + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice) && nr) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->delalloc_root, + &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + + ret = __start_delalloc_inodes(root, delay_iput, nr); + btrfs_put_fs_root(root); + if (ret < 0) + goto out; + + if (nr != -1) { + nr -= ret; + WARN_ON(nr < 0); + } + spin_lock(&fs_info->delalloc_root_lock); + } + spin_unlock(&fs_info->delalloc_root_lock); + + ret = 0; + atomic_inc(&fs_info->async_submit_draining); + while (atomic_read(&fs_info->nr_async_submits) || + atomic_read(&fs_info->async_delalloc_pages)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0 && + atomic_read(&fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&fs_info->async_submit_draining); +out: + if (!list_empty_careful(&splice)) { + spin_lock(&fs_info->delalloc_root_lock); + list_splice_tail(&splice, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + } + mutex_unlock(&fs_info->delalloc_root_mutex); + return ret; } static int btrfs_symlink(struct inode *dir, struct dentry *dentry, @@ -5622,15 +8686,14 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, int err; int drop_inode = 0; u64 objectid; - u64 index = 0 ; + u64 index = 0; int name_len; int datasize; unsigned long ptr; struct btrfs_file_extent_item *ei; struct extent_buffer *leaf; - unsigned long nr = 0; - name_len = strlen(symname) + 1; + name_len = strlen(symname); if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; @@ -5639,55 +8702,55 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, * 2 items for dir items * 1 item for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; - - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto out_fail; - btrfs_set_trans_block_group(trans, dir); + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; + err = btrfs_find_free_ino(root, &objectid); + if (err) goto out_unlock; - } inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, - BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, - &index); - err = PTR_ERR(inode); - if (IS_ERR(inode)) + dentry->d_name.len, btrfs_ino(dir), objectid, + S_IFLNK|S_IRWXUGO, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto out_unlock; + } - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { inode->i_mapping->a_ops = &btrfs_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); if (drop_inode) goto out_unlock; path = btrfs_alloc_path(); - BUG_ON(!path); - key.objectid = inode->i_ino; + if (!path) { + err = -ENOMEM; + drop_inode = 1; + goto out_unlock; + } + key.objectid = btrfs_ino(inode); key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); datasize = btrfs_file_extent_calc_inline_size(name_len); @@ -5695,6 +8758,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, datasize); if (err) { drop_inode = 1; + btrfs_free_path(path); goto out_unlock; } leaf = path->nodes[0]; @@ -5717,51 +8781,57 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_symlink_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode_set_bytes(inode, name_len); - btrfs_i_size_write(inode, name_len - 1); + btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); if (err) drop_inode = 1; out_unlock: - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); -out_fail: - btrfs_unreserve_metadata_space(root, 5); + if (!err) + d_instantiate(dentry, inode); + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode, loff_t actual_len) +static int __btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint, + struct btrfs_trans_handle *trans) { - struct btrfs_trans_handle *trans; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; - u64 num_bytes = end - start; - int ret = 0; u64 i_size; + u64 cur_bytes; + int ret = 0; + bool own_trans = true; + if (trans) + own_trans = false; while (num_bytes > 0) { - trans = btrfs_start_transaction(root, 1); - - ret = btrfs_reserve_extent(trans, root, num_bytes, - root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); - if (ret) { - WARN_ON(1); - goto stop_trans; + if (own_trans) { + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } } - ret = btrfs_reserve_metadata_space(root, 3); + cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = max(cur_bytes, min_size); + ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, + *alloc_hint, &ins, 1, 0); if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, - ins.offset); - goto stop_trans; + if (own_trans) + btrfs_end_transaction(trans, root); + break; } ret = insert_reserved_file_extent(trans, inode, @@ -5769,22 +8839,59 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, ins.offset, ins.offset, ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); - BUG_ON(ret); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, + ins.offset, 0); + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + goto next; + } + + em->start = cur_offset; + em->orig_start = cur_offset; + em->len = ins.offset; + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->orig_block_len = ins.offset; + em->ram_bytes = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->generation = trans->transid; + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset - 1, + 0); + } + free_extent_map(em); +next: num_bytes -= ins.offset; cur_offset += ins.offset; - alloc_hint = ins.objectid + ins.offset; + *alloc_hint = ins.objectid + ins.offset; + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (actual_len > inode->i_size) && - (cur_offset > inode->i_size)) { - + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { if (cur_offset > actual_len) - i_size = actual_len; + i_size = actual_len; else i_size = cur_offset; i_size_write(inode, i_size); @@ -5792,133 +8899,116 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 3); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } + + if (own_trans) + btrfs_end_transaction(trans, root); } return ret; +} -stop_trans: - btrfs_end_transaction(trans, root); - return ret; +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, + NULL); +} +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, trans); } -static long btrfs_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) +static int btrfs_set_page_dirty(struct page *page) { - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; + return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + umode_t mode = inode->i_mode; - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; + if (mask & MAY_WRITE && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + if (btrfs_root_readonly(root)) + return -EROFS; + if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) + return -EACCES; + } + return generic_permission(inode, mask); +} + +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + u64 objectid; + u64 index; + int ret = 0; /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. + * 5 units required for adding orphan entry */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - mutex_lock(&inode->i_mutex); - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); - if (ret) - goto out; + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + goto out; + + inode = btrfs_new_inode(trans, root, dir, NULL, 0, + btrfs_ino(dir), objectid, mode, &index); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + goto out; } - ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) goto out; - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out; - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; - cur_offset = alloc_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); - last_byte = min(extent_map_end(em), alloc_end); - last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = prealloc_file_range(inode, - cur_offset, last_byte, - alloc_hint, mode, offset+len); - if (ret < 0) { - free_extent_map(em); - break; - } - } - if (em->block_start <= EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; - free_extent_map(em); + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; - btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} + d_tmpfile(dentry, inode); + mark_inode_dirty(inode); -static int btrfs_set_page_dirty(struct page *page) -{ - return __set_page_dirty_nobuffers(page); -} +out: + btrfs_end_transaction(trans, root); + if (ret) + iput(inode); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); -static int btrfs_permission(struct inode *inode, int mask) -{ - if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) - return -EACCES; - return generic_permission(inode, mask, btrfs_check_acl); + return ret; } static const struct inode_operations btrfs_dir_inode_operations = { @@ -5938,16 +9028,23 @@ static const struct inode_operations btrfs_dir_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, + .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, + .tmpfile = btrfs_tmpfile, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, + .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, }; static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = btrfs_real_readdir, + .iterate = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, @@ -5963,7 +9060,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_end_io_hook = btrfs_readpage_end_io_hook, .writepage_end_io_hook = btrfs_writepage_end_io_hook, .writepage_start_hook = btrfs_writepage_start_hook, - .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, @@ -5987,7 +9083,6 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readpages = btrfs_readpages, - .sync_page = block_sync_page, .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, @@ -6003,7 +9098,6 @@ static const struct address_space_operations btrfs_symlink_aops = { }; static const struct inode_operations btrfs_file_inode_operations = { - .truncate = btrfs_truncate, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .setxattr = btrfs_setxattr, @@ -6011,8 +9105,10 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, + .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, @@ -6022,18 +9118,25 @@ static const struct inode_operations btrfs_special_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, + .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, + .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, + .update_time = btrfs_update_time, }; const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, + .d_release = btrfs_dentry_release, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e84ef60ffe3..47aceb494d1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -40,15 +40,54 @@ #include <linux/xattr.h> #include <linux/vmalloc.h> #include <linux/slab.h> -#include "compat.h" +#include <linux/blkdev.h> +#include <linux/uuid.h> +#include <linux/btrfs.h> +#include <linux/uaccess.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "volumes.h" #include "locking.h" +#include "inode-map.h" +#include "backref.h" +#include "rcu-string.h" +#include "send.h" +#include "dev-replace.h" +#include "props.h" +#include "sysfs.h" +#include "qgroup.h" + +#ifdef CONFIG_64BIT +/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI + * structures are incorrect, as the timespec structure from userspace + * is 4 bytes too small. We define these alternatives here to teach + * the kernel about the 32-bit struct packing. + */ +struct btrfs_ioctl_timespec_32 { + __u64 sec; + __u32 nsec; +} __attribute__ ((__packed__)); + +struct btrfs_ioctl_received_subvol_args_32 { + char uuid[BTRFS_UUID_SIZE]; /* in */ + __u64 stransid; /* in */ + __u64 rtransid; /* out */ + struct btrfs_ioctl_timespec_32 stime; /* in */ + struct btrfs_ioctl_timespec_32 rtime; /* out */ + __u64 flags; /* in */ + __u64 reserved[16]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ + struct btrfs_ioctl_received_subvol_args_32) +#endif + + +static int btrfs_clone(struct inode *src, struct inode *inode, + u64 off, u64 olen, u64 olen_aligned, u64 destoff); /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -80,6 +119,13 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) iflags |= FS_NOATIME_FL; if (flags & BTRFS_INODE_DIRSYNC) iflags |= FS_DIRSYNC_FL; + if (flags & BTRFS_INODE_NODATACOW) + iflags |= FS_NOCOW_FL; + + if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) + iflags |= FS_COMPR_FL; + else if (flags & BTRFS_INODE_NOCOMPRESS) + iflags |= FS_NOCOMP_FL; return iflags; } @@ -90,25 +136,28 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) void btrfs_update_iflags(struct inode *inode) { struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + unsigned int new_fl = 0; if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); } /* * Inherit flags from the parent inode. * - * Unlike extN we don't have any flags we don't want to inherit currently. + * Currently only the compression flags and the cow flags are inherited. */ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { @@ -119,18 +168,26 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) flags = BTRFS_I(dir)->flags; - if (S_ISREG(inode->i_mode)) - flags &= ~BTRFS_INODE_DIRSYNC; - else if (!S_ISDIR(inode->i_mode)) - flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & BTRFS_INODE_COMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + } + + if (flags & BTRFS_INODE_NODATACOW) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + } - BTRFS_I(inode)->flags = flags; btrfs_update_iflags(inode); } static int btrfs_ioctl_getflags(struct file *file, void __user *arg) { - struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); + struct btrfs_inode *ip = BTRFS_I(file_inode(file)); unsigned int flags = btrfs_flags_to_ioctl(ip->flags); if (copy_to_user(arg, &flags, sizeof(flags))) @@ -138,28 +195,56 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg) return 0; } +static int check_flags(unsigned int flags) +{ + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL | \ + FS_NOCOMP_FL | FS_COMPR_FL | + FS_NOCOW_FL)) + return -EOPNOTSUPP; + + if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) + return -EINVAL; + + return 0; +} + static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct btrfs_inode *ip = BTRFS_I(inode); struct btrfs_root *root = ip->root; struct btrfs_trans_handle *trans; unsigned int flags, oldflags; int ret; + u64 ip_oldflags; + unsigned int i_oldflags; + umode_t mode; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (btrfs_root_readonly(root)) + return -EROFS; if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ - FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL | FS_DIRSYNC_FL)) - return -EOPNOTSUPP; + ret = check_flags(flags); + if (ret) + return ret; - if (!is_owner_or_cap(inode)) - return -EACCES; + ret = mnt_want_write_file(file); + if (ret) + return ret; mutex_lock(&inode->i_mutex); + ip_oldflags = ip->flags; + i_oldflags = inode->i_flags; + mode = inode->i_mode; + flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -169,10 +254,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } - ret = mnt_want_write(file->f_path.mnt); - if (ret) - goto out_unlock; - if (flags & FS_SYNC_FL) ip->flags |= BTRFS_INODE_SYNC; else @@ -197,63 +278,202 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags |= BTRFS_INODE_DIRSYNC; else ip->flags &= ~BTRFS_INODE_DIRSYNC; + if (flags & FS_NOCOW_FL) { + if (S_ISREG(mode)) { + /* + * It's safe to turn csums off here, no extents exist. + * Otherwise we want the flag to reflect the real COW + * status of the file and will not set it. + */ + if (inode->i_size == 0) + ip->flags |= BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM; + } else { + ip->flags |= BTRFS_INODE_NODATACOW; + } + } else { + /* + * Revert back under same assuptions as above + */ + if (S_ISREG(mode)) { + if (inode->i_size == 0) + ip->flags &= ~(BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM); + } else { + ip->flags &= ~BTRFS_INODE_NODATACOW; + } + } + /* + * The COMPRESS flag can only be changed by users, while the NOCOMPRESS + * flag may be changed automatically if compression code won't make + * things smaller. + */ + if (flags & FS_NOCOMP_FL) { + ip->flags &= ~BTRFS_INODE_COMPRESS; + ip->flags |= BTRFS_INODE_NOCOMPRESS; + + ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); + if (ret && ret != -ENODATA) + goto out_drop; + } else if (flags & FS_COMPR_FL) { + const char *comp; + + ip->flags |= BTRFS_INODE_COMPRESS; + ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + + if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO) + comp = "lzo"; + else + comp = "zlib"; + ret = btrfs_set_prop(inode, "btrfs.compression", + comp, strlen(comp), 0); + if (ret) + goto out_drop; - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + } else { + ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + } - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop; + } btrfs_update_iflags(inode); + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + out_drop: + if (ret) { + ip->flags = ip_oldflags; + inode->i_flags = i_oldflags; + } - mnt_drop_write(file->f_path.mnt); out_unlock: mutex_unlock(&inode->i_mutex); - return 0; + mnt_drop_write_file(file); + return ret; } static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); return put_user(inode->i_generation, arg); } -static noinline int create_subvol(struct btrfs_root *root, +static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct btrfs_device *device; + struct request_queue *q; + struct fstrim_range range; + u64 minlen = ULLONG_MAX; + u64 num_devices = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, + dev_list) { + if (!device->bdev) + continue; + q = bdev_get_queue(device->bdev); + if (blk_queue_discard(q)) { + num_devices++; + minlen = min((u64)q->limits.discard_granularity, + minlen); + } + } + rcu_read_unlock(); + + if (!num_devices) + return -EOPNOTSUPP; + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + if (range.start > total_bytes || + range.len < fs_info->sb->s_blocksize) + return -EINVAL; + + range.len = min(range.len, total_bytes - range.start); + range.minlen = max(range.minlen, minlen); + ret = btrfs_trim_fs(fs_info->tree_root, &range); + if (ret < 0) + return ret; + + if (copy_to_user(arg, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +int btrfs_is_empty_uuid(u8 *uuid) +{ + int i; + + for (i = 0; i < BTRFS_UUID_SIZE; i++) { + if (uuid[i]) + return 0; + } + return 1; +} + +static noinline int create_subvol(struct inode *dir, struct dentry *dentry, - char *name, int namelen) + char *name, int namelen, + u64 *async_transid, + struct btrfs_qgroup_inherit *inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; - struct inode *dir = dentry->d_parent->d_inode; + struct btrfs_block_rsv block_rsv; + struct timespec cur_time = CURRENT_TIME; + struct inode *inode; int ret; int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; + u64 qgroup_reserved; + uuid_le new_uuid; + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); + if (ret) + return ret; + + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items + * The same as the snapshot creation, please see the comment + * of create_snapshot(). */ - ret = btrfs_reserve_metadata_space(root, 6); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 8, &qgroup_reserved, false); if (ret) return ret; - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_subvolume_release_metadata(root, &block_rsv, + qgroup_reserved); + return ret; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; - ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, - 0, &objectid); + ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); if (ret) goto fail; @@ -270,21 +490,25 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); - write_extent_buffer(leaf, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), + write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + btrfs_header_chunk_tree_uuid(leaf), BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); + memset(&root_item, 0, sizeof(root_item)); + inode_item = &root_item.inode; - memset(inode_item, 0, sizeof(*inode_item)); - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); + + btrfs_set_root_flags(&root_item, 0); + btrfs_set_root_limit(&root_item, 0); + btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); btrfs_set_root_bytenr(&root_item, leaf->start); btrfs_set_root_generation(&root_item, trans->transid); @@ -293,8 +517,15 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_used(&root_item, leaf->len); btrfs_set_root_last_snapshot(&root_item, 0); - memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); - root_item.drop_level = 0; + btrfs_set_root_generation_v2(&root_item, + btrfs_root_generation(&root_item)); + uuid_le_gen(&new_uuid); + memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); + btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); + root_item.ctime = root_item.otime; + btrfs_set_root_ctransid(&root_item, trans->transid); + btrfs_set_root_otransid(&root_item, trans->transid); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); @@ -312,23 +543,37 @@ static noinline int create_subvol(struct btrfs_root *root, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); - BUG_ON(IS_ERR(new_root)); + if (IS_ERR(new_root)) { + btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); + ret = PTR_ERR(new_root); + goto fail; + } btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, new_dirid, - BTRFS_I(dir)->block_group); + ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); + if (ret) { + /* We potentially lose an unused inode item here */ + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + /* * insert the directory item */ ret = btrfs_set_inode_index(dir, &index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } ret = btrfs_insert_dir_item(trans, root, - name, namelen, dir->i_ino, &key, + name, namelen, dir, &key, BTRFS_FT_DIR, index); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto fail; + } btrfs_i_size_write(dir, dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); @@ -336,78 +581,258 @@ static noinline int create_subvol(struct btrfs_root *root, ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, - dir->i_ino, index, name, namelen); - + btrfs_ino(dir), index, name, namelen); BUG_ON(ret); - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, + root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + objectid); + if (ret) + btrfs_abort_transaction(trans, root, ret); + fail: - err = btrfs_commit_transaction(trans, root); + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + + if (async_transid) { + *async_transid = trans->transid; + err = btrfs_commit_transaction_async(trans, root, 1); + if (err) + err = btrfs_commit_transaction(trans, root); + } else { + err = btrfs_commit_transaction(trans, root); + } if (err && !ret) ret = err; - btrfs_unreserve_metadata_space(root, 6); + if (!ret) { + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + } return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen) +static void btrfs_wait_nocow_write(struct btrfs_root *root) +{ + s64 writers; + DEFINE_WAIT(wait); + + do { + prepare_to_wait(&root->subv_writers->wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&root->subv_writers->counter); + if (writers) + schedule(); + + finish_wait(&root->subv_writers->wait, &wait); + } while (writers); +} + +static int create_snapshot(struct btrfs_root *root, struct inode *dir, + struct dentry *dentry, char *name, int namelen, + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; - /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items - */ - ret = btrfs_reserve_metadata_space(root, 6); + atomic_inc(&root->will_be_snapshoted); + smp_mb__after_atomic(); + btrfs_wait_nocow_write(root); + + ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - goto fail; + goto out; + + btrfs_wait_ordered_extents(root, -1); pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) { ret = -ENOMEM; - btrfs_unreserve_metadata_space(root, 6); - goto fail; - } - pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); - if (!pending_snapshot->name) { - ret = -ENOMEM; - kfree(pending_snapshot); - btrfs_unreserve_metadata_space(root, 6); - goto fail; + goto out; } - memcpy(pending_snapshot->name, name, namelen); - pending_snapshot->name[namelen] = '\0'; + + btrfs_init_block_rsv(&pending_snapshot->block_rsv, + BTRFS_BLOCK_RSV_TEMP); + /* + * 1 - parent dir inode + * 2 - dir entries + * 1 - root item + * 2 - root ref/backref + * 1 - root of snapshot + * 1 - UUID item + */ + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, 8, + &pending_snapshot->qgroup_reserved, + false); + if (ret) + goto free; + pending_snapshot->dentry = dentry; - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); pending_snapshot->root = root; + pending_snapshot->readonly = readonly; + pending_snapshot->dir = dir; + pending_snapshot->inherit = inherit; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fail; + } + + spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - btrfs_unreserve_metadata_space(root, 6); + spin_unlock(&root->fs_info->trans_lock); + if (async_transid) { + *async_transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, + root->fs_info->extent_root, 1); + if (ret) + ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_commit_transaction(trans, + root->fs_info->extent_root); + } + if (ret) + goto fail; + + ret = pending_snapshot->error; + if (ret) + goto fail; + + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; + + /* + * If orphan cleanup did remove any orphans, it means the tree was + * modified and therefore the commit root is not the same as the + * current root anymore. This is a problem, because send uses the + * commit root and therefore can see inode items that don't exist + * in the current root anymore, and for example make calls to + * btrfs_iget, which will do tree lookups based on the current root + * and not on the commit root. Those lookups will fail, returning a + * -ESTALE error, and making send fail with that error. So make sure + * a send does not see any orphans we have just removed, and that it + * will see the same inodes regardless of whether a transaction + * commit happened before it started (meaning that the commit root + * will be the same as the current root) or not. + */ + if (readonly && pending_snapshot->snap->node != + pending_snapshot->snap->commit_root) { + trans = btrfs_join_transaction(pending_snapshot->snap); + if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto fail; + } + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, + pending_snapshot->snap); + if (ret) + goto fail; + } + } inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; } - BUG_ON(!inode); + d_instantiate(dentry, inode); ret = 0; fail: + btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, + pending_snapshot->qgroup_reserved); +free: + kfree(pending_snapshot); +out: + atomic_dec(&root->will_be_snapshoted); return ret; } +/* copy of check_sticky in fs/namei.c() +* It's inline, so penalty for filesystems that don't use sticky bit is +* minimal. +*/ +static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) +{ + kuid_t fsuid = current_fsuid(); + + if (!(dir->i_mode & S_ISVTX)) + return 0; + if (uid_eq(inode->i_uid, fsuid)) + return 0; + if (uid_eq(dir->i_uid, fsuid)) + return 0; + return !capable(CAP_FOWNER); +} + +/* copy of may_delete in fs/namei.c() + * Check whether we can remove a link victim from directory dir, check + * whether the type of victim is right. + * 1. We can't do it if dir is read-only (done in permission()) + * 2. We should have write and exec permissions on dir + * 3. We can't remove anything from append-only dir + * 4. We can't do anything with immutable dir (done in permission()) + * 5. If the sticky bit on dir is set we should either + * a. be owner of dir, or + * b. be owner of victim, or + * c. have CAP_FOWNER capability + * 6. If the victim is append-only or immutable we can't do antyhing with + * links pointing to it. + * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 9. We can't remove a root or mountpoint. + * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * nfs_async_unlink(). + */ + +static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) +{ + int error; + + if (!victim->d_inode) + return -ENOENT; + + BUG_ON(victim->d_parent->d_inode != dir); + audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); + + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + return error; + if (IS_APPEND(dir)) + return -EPERM; + if (btrfs_check_sticky(dir, victim->d_inode)|| + IS_APPEND(victim->d_inode)|| + IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + return -EPERM; + if (isdir) { + if (!S_ISDIR(victim->d_inode->i_mode)) + return -ENOTDIR; + if (IS_ROOT(victim)) + return -EBUSY; + } else if (S_ISDIR(victim->d_inode->i_mode)) + return -EISDIR; + if (IS_DEADDIR(dir)) + return -ENOENT; + if (victim->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + return 0; +} + /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) { @@ -425,13 +850,17 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) */ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, - struct btrfs_root *snap_src) + struct btrfs_root *snap_src, + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; int error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + if (error == -EINTR) + return error; dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -442,13 +871,19 @@ static noinline int btrfs_mksubvol(struct path *parent, if (dentry->d_inode) goto out_dput; - error = mnt_want_write(parent->mnt); + error = btrfs_may_create(dir, dentry); if (error) goto out_dput; - error = btrfs_may_create(dir, dentry); + /* + * even if this name doesn't exist, we may get hash collisions. + * check for them now when we can safely fail + */ + error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, + dir->i_ino, name, + namelen); if (error) - goto out_drop_write; + goto out_dput; down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); @@ -456,18 +891,16 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, - name, namelen); + error = create_snapshot(snap_src, dir, dentry, name, namelen, + async_transid, readonly, inherit); } else { - error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen); + error = create_subvol(dir, dentry, name, namelen, + async_transid, inherit); } if (!error) fsnotify_mkdir(dir, dentry); out_up_read: up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); -out_drop_write: - mnt_drop_write(parent->mnt); out_dput: dput(dentry); out_unlock: @@ -475,27 +908,115 @@ out_unlock: return error; } -static int should_defrag_range(struct inode *inode, u64 start, u64 len, - int thresh, u64 *last_len, u64 *skip, - u64 *defrag_end) +/* + * When we're defragging a range, we don't want to kick it off again + * if it is really just waiting for delalloc to send it down. + * If we find a nice big extent or delalloc range for the bytes in the + * file you want to defrag, we return 0 to let you know to skip this + * part of the file + */ +static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - int ret = 1; + u64 end; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); - if (thresh == 0) - thresh = 256 * 1024; + if (em) { + end = extent_map_end(em); + free_extent_map(em); + if (end - offset > thresh) + return 0; + } + /* if we already have a nice delalloc here, just stop */ + thresh /= 2; + end = count_range_bits(io_tree, &offset, offset + thresh, + thresh, EXTENT_DELALLOC, 1); + if (end >= thresh) + return 0; + return 1; +} - /* - * make sure that once we start defragging and extent, we keep on - * defragging it - */ - if (start < *defrag_end) - return 1; +/* + * helper function to walk through a file and find extents + * newer than a specific transid, and smaller than thresh. + * + * This is used by the defragging code to find new and small + * extents + */ +static int find_new_extents(struct btrfs_root *root, + struct inode *inode, u64 newer_than, + u64 *off, int thresh) +{ + struct btrfs_path *path; + struct btrfs_key min_key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + int type; + int ret; + u64 ino = btrfs_ino(inode); - *skip = 0; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + min_key.objectid = ino; + min_key.type = BTRFS_EXTENT_DATA_KEY; + min_key.offset = *off; + + while (1) { + path->keep_locks = 1; + ret = btrfs_search_forward(root, &min_key, path, newer_than); + if (ret != 0) + goto none; + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); +process_slot: + if (min_key.objectid != ino) + goto none; + if (min_key.type != BTRFS_EXTENT_DATA_KEY) + goto none; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_num_bytes(leaf, extent) < thresh && + check_defrag_in_cache(inode, min_key.offset, thresh)) { + *off = min_key.offset; + btrfs_free_path(path); + return 0; + } + + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); + goto process_slot; + } + + if (min_key.offset == (u64)-1) + goto none; + + min_key.offset++; + btrfs_release_path(path); + } +none: + btrfs_free_path(path); + return -ENOENT; +} + +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 len = PAGE_CACHE_SIZE; /* * hopefully we have this extent in the tree already, try without @@ -506,25 +1027,76 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, read_unlock(&em_tree->lock); if (!em) { + struct extent_state *cached = NULL; + u64 end = start + len - 1; + /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1, GFP_NOFS); + lock_extent_bits(io_tree, start, end, 0, &cached); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1, GFP_NOFS); + unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); if (IS_ERR(em)) - return 0; + return NULL; } + return em; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +{ + struct extent_map *next; + bool ret = true; + + /* this is the last extent */ + if (em->start + em->len >= i_size_read(inode)) + return false; + + next = defrag_lookup_extent(inode, em->start + em->len); + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || + (em->block_start + em->block_len == next->block_start)) + ret = false; + + free_extent_map(next); + return ret; +} + +static int should_defrag_range(struct inode *inode, u64 start, int thresh, + u64 *last_len, u64 *skip, u64 *defrag_end, + int compress) +{ + struct extent_map *em; + int ret = 1; + bool next_mergeable = true; + + /* + * make sure that once we start defragging an extent, we keep on + * defragging it + */ + if (start < *defrag_end) + return 1; + + *skip = 0; + + em = defrag_lookup_extent(inode, start); + if (!em) + return 0; + /* this will cover holes, and inline extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { ret = 0; + goto out; + } + + next_mergeable = defrag_check_next_extent(inode, em); /* - * we hit a real extent, if it is big don't bother defragging it again + * we hit a real extent, if it is big or the next extent is not a + * real extent, don't bother defragging it */ - if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) + if (!compress && (*last_len == 0 || *last_len >= thresh) && + (em->len >= thresh || !next_mergeable)) ret = 0; - +out: /* * last_len ends up being a counter of how many bytes we've defragged. * every time we choose not to defrag an extent, we reset *last_len @@ -534,7 +1106,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, * extent will force at least part of that big extent to be defragged. */ if (ret) { - *last_len += len; *defrag_end = extent_map_end(em); } else { *last_len = 0; @@ -546,84 +1117,84 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, return ret; } -static int btrfs_defrag_file(struct file *file, - struct btrfs_ioctl_defrag_range_args *range) +/* + * it doesn't do much good to defrag one or two pages + * at a time. This pulls in a nice chunk of pages + * to COW and defrag. + * + * It also makes sure the delalloc code has enough + * dirty data to avoid making new small extents as part + * of the defrag + * + * It's a good idea to start RA on this range + * before calling this. + */ +static int cluster_pages_for_defrag(struct inode *inode, + struct page **pages, + unsigned long start_index, + unsigned long num_pages) { - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - struct page *page; - unsigned long last_index; - unsigned long ra_pages = root->fs_info->bdi.ra_pages; - unsigned long total_read = 0; + unsigned long file_end; + u64 isize = i_size_read(inode); u64 page_start; u64 page_end; - u64 last_len = 0; - u64 skip = 0; - u64 defrag_end = 0; - unsigned long i; + u64 page_cnt; int ret; + int i; + int i_done; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct extent_io_tree *tree; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - if (inode->i_size == 0) + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + if (!isize || start_index > file_end) return 0; - if (range->start + range->len > range->start) { - last_index = min_t(u64, inode->i_size - 1, - range->start + range->len - 1) >> PAGE_CACHE_SHIFT; - } else { - last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; - } + page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - i = range->start >> PAGE_CACHE_SHIFT; - while (i <= last_index) { - if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, - range->extent_thresh, - &last_len, &skip, - &defrag_end)) { - unsigned long next; - /* - * the should_defrag function tells us how much to skip - * bump our counter by the suggested amount - */ - next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - i = max(i + 1, next); - continue; - } - - if (total_read % ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, - min(last_index, i + ra_pages - 1)); - } - total_read++; - mutex_lock(&inode->i_mutex); - if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = 1; + ret = btrfs_delalloc_reserve_space(inode, + page_cnt << PAGE_CACHE_SHIFT); + if (ret) + return ret; + i_done = 0; + tree = &BTRFS_I(inode)->io_tree; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) { - ret = -ENOSPC; + /* step one, lock all the pages */ + for (i = 0; i < page_cnt; i++) { + struct page *page; +again: + page = find_or_create_page(inode->i_mapping, + start_index + i, mask); + if (!page) break; - } - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); - ret = -ENOSPC; - break; - } -again: - if (inode->i_size == 0 || - i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { - ret = 0; - goto err_reservations; - } + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + while (1) { + lock_extent_bits(tree, page_start, page_end, + 0, &cached_state); + ordered = btrfs_lookup_ordered_extent(inode, + page_start); + unlock_extent_cached(tree, page_start, page_end, + &cached_state, GFP_NOFS); + if (!ordered) + break; - page = grab_cache_page(inode->i_mapping, i); - if (!page) - goto err_reservations; + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * we unlocked the page above, so we need check if + * it was released or not. + */ + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + } if (!PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -631,7 +1202,8 @@ again: if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); - goto err_reservations; + ret = -EIO; + break; } } @@ -641,55 +1213,253 @@ again: goto again; } - wait_on_page_writeback(page); + pages[i] = page; + i_done++; + } + if (!i_done || ret) + goto out; - if (PageDirty(page)) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); - goto loop_unlock; - } + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + goto out; - page_start = (u64)page->index << PAGE_CACHE_SHIFT; - page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); + /* + * so now we have a nice long stream of locked + * and up to date pages, lets wait on them + */ + for (i = 0; i < i_done; i++) + wait_on_page_writeback(pages[i]); + + page_start = page_offset(pages[0]); + page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, 0, &cached_state); + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, + page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, + &cached_state, GFP_NOFS); + + if (i_done != page_cnt) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + btrfs_delalloc_release_space(inode, + (page_cnt - i_done) << PAGE_CACHE_SHIFT); + } - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } - set_page_extent_mapped(page); + set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, + &cached_state, GFP_NOFS); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, &cached_state, + GFP_NOFS); + + for (i = 0; i < i_done; i++) { + clear_page_dirty_for_io(pages[i]); + ClearPageChecked(pages[i]); + set_page_extent_mapped(pages[i]); + set_page_dirty(pages[i]); + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + return i_done; +out: + for (i = 0; i < i_done; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); + return ret; + +} + +int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file_ra_state *ra = NULL; + unsigned long last_index; + u64 isize = i_size_read(inode); + u64 last_len = 0; + u64 skip = 0; + u64 defrag_end = 0; + u64 newer_off = range->start; + unsigned long i; + unsigned long ra_index = 0; + int ret; + int defrag_count = 0; + int compress_type = BTRFS_COMPRESS_ZLIB; + int extent_thresh = range->extent_thresh; + unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long cluster = max_cluster; + u64 new_align = ~((u64)128 * 1024 - 1); + struct page **pages = NULL; + + if (isize == 0) + return 0; + + if (range->start >= isize) + return -EINVAL; + + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (range->compress_type > BTRFS_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } + + if (extent_thresh == 0) + extent_thresh = 256 * 1024; + + /* + * if we were not given a file, allocate a readahead + * context + */ + if (!file) { + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + file_ra_state_init(ra, inode->i_mapping); + } else { + ra = &file->f_ra; + } + + pages = kmalloc_array(max_cluster, sizeof(struct page *), + GFP_NOFS); + if (!pages) { + ret = -ENOMEM; + goto out_ra; + } + + /* find the last page to defrag */ + if (range->start + range->len > range->start) { + last_index = min_t(u64, isize - 1, + range->start + range->len - 1) >> PAGE_CACHE_SHIFT; + } else { + last_index = (isize - 1) >> PAGE_CACHE_SHIFT; + } + + if (newer_than) { + ret = find_new_extents(root, inode, newer_than, + &newer_off, 64 * 1024); + if (!ret) { + range->start = newer_off; + /* + * we always align our defrag to help keep + * the extents in the file evenly spaced + */ + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + } else + goto out_ra; + } else { + i = range->start >> PAGE_CACHE_SHIFT; + } + if (!max_to_defrag) + max_to_defrag = last_index + 1; + + /* + * make writeback starts from i, so the defrag range can be + * written sequentially. + */ + if (i < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = i; + + while (i <= last_index && defrag_count < max_to_defrag && + (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) { /* - * this makes sure page_mkwrite is called on the - * page if it is dirtied again later + * make sure we stop running if someone unmounts + * the FS */ - clear_page_dirty_for_io(page); - clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, - page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, GFP_NOFS); - - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); - ClearPageChecked(page); - set_page_dirty(page); - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - -loop_unlock: - unlock_page(page); - page_cache_release(page); + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + break; + + if (btrfs_defrag_cancelled(root->fs_info)) { + printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); + ret = -EAGAIN; + break; + } + + if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + extent_thresh, &last_len, &skip, + &defrag_end, range->flags & + BTRFS_DEFRAG_RANGE_COMPRESS)) { + unsigned long next; + /* + * the should_defrag function tells us how much to skip + * bump our counter by the suggested amount + */ + next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + i = max(i + 1, next); + continue; + } + + if (!newer_than) { + cluster = (PAGE_CACHE_ALIGN(defrag_end) >> + PAGE_CACHE_SHIFT) - i; + cluster = min(cluster, max_cluster); + } else { + cluster = max_cluster; + } + + if (i + cluster > ra_index) { + ra_index = max(i, ra_index); + btrfs_force_ra(inode->i_mapping, ra, file, ra_index, + cluster); + ra_index += max_cluster; + } + + mutex_lock(&inode->i_mutex); + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) + BTRFS_I(inode)->force_compress = compress_type; + ret = cluster_pages_for_defrag(inode, pages, i, cluster); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + goto out_ra; + } + + defrag_count += ret; + balance_dirty_pages_ratelimited(inode->i_mapping); mutex_unlock(&inode->i_mutex); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); - i++; + if (newer_than) { + if (newer_off == (u64)-1) + break; + + if (ret > 0) + i += ret; + + newer_off = max(newer_off + 1, + (u64)i << PAGE_CACHE_SHIFT); + + ret = find_new_extents(root, inode, + newer_than, &newer_off, + 64 * 1024); + if (!ret) { + range->start = newer_off; + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + } else { + break; + } + } else { + if (ret > 0) { + i += ret; + last_len += ret << PAGE_CACHE_SHIFT; + } else { + i++; + last_len = 0; + } + } } - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) + if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { /* the filemap_flush will queue IO into the worker threads, but @@ -704,68 +1474,96 @@ loop_unlock: atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); + } - mutex_lock(&inode->i_mutex); - BTRFS_I(inode)->force_compress = 0; - mutex_unlock(&inode->i_mutex); + if (range->compress_type == BTRFS_COMPRESS_LZO) { + btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO); } - return 0; + ret = defrag_count; -err_reservations: - mutex_unlock(&inode->i_mutex); - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); +out_ra: + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + mutex_unlock(&inode->i_mutex); + } + if (!file) + kfree(ra); + kfree(pages); return ret; } -static noinline int btrfs_ioctl_resize(struct btrfs_root *root, +static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { u64 new_size; u64 old_size; u64 devid = 1; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; + char *retptr; char *devstr = NULL; int ret = 0; - int namelen; int mod = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + mnt_drop_write_file(file); + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + } + + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); - mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { - char *end; sizestr = devstr + 1; *devstr = '\0'; devstr = vol_args->name; - devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", - (unsigned long long)devid); + ret = kstrtoull(devstr, 10, &devid); + if (ret) + goto out_free; + if (!devid) { + ret = -EINVAL; + goto out_free; + } + btrfs_info(root->fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(root, devid, NULL, NULL); + + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", - (unsigned long long)devid); - ret = -EINVAL; - goto out_unlock; + btrfs_info(root->fs_info, "resizer unable to find device %llu", + devid); + ret = -ENODEV; + goto out_free; + } + + if (!device->writeable) { + btrfs_info(root->fs_info, + "resizer unable to apply on readonly device %llu", + devid); + ret = -EPERM; + goto out_free; } + if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; else { @@ -776,102 +1574,308 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, mod = 1; sizestr++; } - new_size = memparse(sizestr, NULL); - if (new_size == 0) { + new_size = memparse(sizestr, &retptr); + if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; - goto out_unlock; + goto out_free; } } + if (device->is_tgtdev_for_dev_replace) { + ret = -EPERM; + goto out_free; + } + old_size = device->total_bytes; if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_unlock; + goto out_free; } new_size = old_size - new_size; } else if (mod > 0) { + if (new_size > ULLONG_MAX - old_size) { + ret = -ERANGE; + goto out_free; + } new_size = old_size + new_size; } if (new_size < 256 * 1024 * 1024) { ret = -EINVAL; - goto out_unlock; + goto out_free; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_unlock; + goto out_free; } do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "new size for %s is %llu\n", - device->name, (unsigned long long)new_size); + printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", + rcu_str_deref(device->name), new_size); if (new_size > old_size) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_free; + } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); - } else { + } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); - } + } /* equal, nothing need to do */ -out_unlock: - mutex_unlock(&root->fs_info->volume_mutex); +out_free: kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); + return ret; +} + +static noinline int btrfs_ioctl_snap_create_transid(struct file *file, + char *name, unsigned long fd, int subvol, + u64 *transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) +{ + int namelen; + int ret = 0; + + ret = mnt_want_write_file(file); + if (ret) + goto out; + + namelen = strlen(name); + if (strchr(name, '/')) { + ret = -EINVAL; + goto out_drop_write; + } + + if (name[0] == '.' && + (namelen == 1 || (name[1] == '.' && namelen == 2))) { + ret = -EEXIST; + goto out_drop_write; + } + + if (subvol) { + ret = btrfs_mksubvol(&file->f_path, name, namelen, + NULL, transid, readonly, inherit); + } else { + struct fd src = fdget(fd); + struct inode *src_inode; + if (!src.file) { + ret = -EINVAL; + goto out_drop_write; + } + + src_inode = file_inode(src.file); + if (src_inode->i_sb != file_inode(file)->i_sb) { + btrfs_info(BTRFS_I(src_inode)->root->fs_info, + "Snapshot src from another FS"); + ret = -EXDEV; + } else if (!inode_owner_or_capable(src_inode)) { + /* + * Subvolume creation is not restricted, but snapshots + * are limited to own subvolumes only + */ + ret = -EPERM; + } else { + ret = btrfs_mksubvol(&file->f_path, name, namelen, + BTRFS_I(src_inode)->root, + transid, readonly, inherit); + } + fdput(src); + } +out_drop_write: + mnt_drop_write_file(file); +out: return ret; } static noinline int btrfs_ioctl_snap_create(struct file *file, void __user *arg, int subvol) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; - struct file *src_file; - int namelen; - int ret = 0; + int ret; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + NULL, false, NULL); + + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + u64 transid = 0; + u64 *ptr = NULL; + bool readonly = false; + struct btrfs_qgroup_inherit *inherit = NULL; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); - if (strchr(vol_args->name, '/')) { - ret = -EINVAL; + if (vol_args->flags & + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | + BTRFS_SUBVOL_QGROUP_INHERIT)) { + ret = -EOPNOTSUPP; goto out; } - if (subvol) { - ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, - NULL); - } else { - struct inode *src_inode; - src_file = fget(vol_args->fd); - if (!src_file) { + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { + if (vol_args->size > PAGE_CACHE_SIZE) { ret = -EINVAL; goto out; } - - src_inode = src_file->f_path.dentry->d_inode; - if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { - printk(KERN_INFO "btrfs: Snapshot src from " - "another FS\n"); - ret = -EINVAL; - fput(src_file); + inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); + if (IS_ERR(inherit)) { + ret = PTR_ERR(inherit); goto out; } - ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, - BTRFS_I(src_inode)->root); - fput(src_file); } + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, ptr, + readonly, inherit); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; out: kfree(vol_args); + kfree(inherit); + return ret; +} + +static noinline int btrfs_ioctl_subvol_getflags(struct file *file, + void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&root->fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= BTRFS_SUBVOL_RDONLY; + up_read(&root->fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; + + return ret; +} + +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + goto out; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + ret = -EINVAL; + goto out_drop_write; + } + + if (copy_from_user(&flags, arg, sizeof(flags))) { + ret = -EFAULT; + goto out_drop_write; + } + + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) { + ret = -EINVAL; + goto out_drop_write; + } + + if (flags & ~BTRFS_SUBVOL_RDONLY) { + ret = -EOPNOTSUPP; + goto out_drop_write; + } + + down_write(&root->fs_info->subvol_sem); + + /* nothing to do */ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out_drop_sem; + + root_flags = btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) { + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + } else { + /* + * Block RO -> RW transition if this subvolume is involved in + * send + */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress == 0) { + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + spin_unlock(&root->root_item_lock); + } else { + spin_unlock(&root->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to set subvolume %llu read-write during send", + root->root_key.objectid); + ret = -EPERM; + goto out_drop_sem; + } + } + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + + btrfs_commit_transaction(trans, root); +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out_drop_sem: + up_write(&root->fs_info->subvol_sem); +out_drop_write: + mnt_drop_write_file(file); +out: return ret; } @@ -881,13 +1885,30 @@ out: static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_path *path; + struct btrfs_dir_item *di; struct btrfs_key key; + u64 dir_id; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + /* Make sure this root isn't set as the default subvol */ + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path, + dir_id, "default", 7, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.objectid == root->root_key.objectid) { + ret = -EPERM; + btrfs_err(root->fs_info, "deleting default subvolume " + "%llu is not allowed", key.objectid); + goto out; + } + btrfs_release_path(path); + } + key.objectid = root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; @@ -939,7 +1960,8 @@ static noinline int copy_to_sk(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - char *buf, + size_t *buf_size, + char __user *ubuf, unsigned long *sk_offset, int *num_found) { @@ -951,7 +1973,6 @@ static noinline int copy_to_sk(struct btrfs_root *root, int nritems; int i; int slot; - int found = 0; int ret = 0; leaf = path->nodes[0]; @@ -968,19 +1989,31 @@ static noinline int copy_to_sk(struct btrfs_root *root, item_off = btrfs_item_ptr_offset(leaf, i); item_len = btrfs_item_size_nr(leaf, i); - if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + if (sizeof(sh) + item_len > *buf_size) { + if (*num_found) { + ret = 1; + goto out; + } + + /* + * return one empty item back for v1, which does not + * handle -EOVERFLOW + */ + + *buf_size = sizeof(sh) + item_len; item_len = 0; + ret = -EOVERFLOW; + } - if (sizeof(sh) + item_len + *sk_offset > - BTRFS_SEARCH_ARGS_BUFSIZE) { + if (sizeof(sh) + item_len + *sk_offset > *buf_size) { ret = 1; - goto overflow; + goto out; } - btrfs_item_key_to_cpu(leaf, key, i); - if (!key_in_sk(key, sk)) - continue; - sh.objectid = key->objectid; sh.offset = key->offset; sh.type = key->type; @@ -988,20 +2021,33 @@ static noinline int copy_to_sk(struct btrfs_root *root, sh.transid = found_transid; /* copy search result header */ - memcpy(buf + *sk_offset, &sh, sizeof(sh)); + if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = -EFAULT; + goto out; + } + *sk_offset += sizeof(sh); if (item_len) { - char *p = buf + *sk_offset; + char __user *up = ubuf + *sk_offset; /* copy the item */ - read_extent_buffer(leaf, p, - item_off, item_len); + if (read_extent_buffer_to_user(leaf, up, + item_off, item_len)) { + ret = -EFAULT; + goto out; + } + *sk_offset += item_len; } - found++; + (*num_found)++; - if (*num_found >= sk->nr_items) - break; + if (ret) /* -EOVERFLOW from above */ + goto out; + + if (*num_found >= sk->nr_items) { + ret = 1; + goto out; + } } advance_key: ret = 0; @@ -1016,24 +2062,37 @@ advance_key: key->objectid++; } else ret = 1; -overflow: - *num_found += found; +out: + /* + * 0: all items from this leaf copied, continue with next + * 1: * more items can be copied, but unused buffer is too small + * * all items were found + * Either way, it will stops the loop which iterates to the next + * leaf + * -EOVERFLOW: item was to large for buffer + * -EFAULT: could not copy extent buffer back to userspace + */ return ret; } static noinline int search_ioctl(struct inode *inode, - struct btrfs_ioctl_search_args *args) + struct btrfs_ioctl_search_key *sk, + size_t *buf_size, + char __user *ubuf) { struct btrfs_root *root; struct btrfs_key key; - struct btrfs_key max_key; struct btrfs_path *path; - struct btrfs_ioctl_search_key *sk = &args->key; struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; int ret; int num_found = 0; unsigned long sk_offset = 0; + if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { + *buf_size = sizeof(struct btrfs_ioctl_search_header); + return -EOVERFLOW; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1047,7 +2106,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", + printk(KERN_ERR "BTRFS: could not find root %llu\n", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -1058,28 +2117,24 @@ static noinline int search_ioctl(struct inode *inode, key.type = sk->min_type; key.offset = sk->min_offset; - max_key.objectid = sk->max_objectid; - max_key.type = sk->max_type; - max_key.offset = sk->max_offset; - path->keep_locks = 1; - while(1) { - ret = btrfs_search_forward(root, &key, &max_key, path, 0, - sk->min_transid); + while (1) { + ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) ret = 0; goto err; } - ret = copy_to_sk(root, path, &key, sk, args->buf, + ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); - btrfs_release_path(root, path); - if (ret || num_found >= sk->nr_items) + btrfs_release_path(path); + if (ret) break; } - ret = 0; + if (ret > 0) + ret = 0; err: sk->nr_items = num_found; btrfs_free_path(path); @@ -1089,26 +2144,73 @@ err: static noinline int btrfs_ioctl_tree_search(struct file *file, void __user *argp) { - struct btrfs_ioctl_search_args *args; - struct inode *inode; - int ret; + struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_key sk; + struct inode *inode; + int ret; + size_t buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - args = kmalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return -ENOMEM; + uargs = (struct btrfs_ioctl_search_args __user *)argp; - if (copy_from_user(args, argp, sizeof(*args))) { - kfree(args); + if (copy_from_user(&sk, &uargs->key, sizeof(sk))) return -EFAULT; - } - inode = fdentry(file)->d_inode; - ret = search_ioctl(inode, args); - if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + + buf_size = sizeof(uargs->buf); + + inode = file_inode(file); + ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + + /* + * In the origin implementation an overflow is handled by returning a + * search header with a len of zero, so reset ret. + */ + if (ret == -EOVERFLOW) + ret = 0; + + if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) ret = -EFAULT; - kfree(args); + return ret; +} + +static noinline int btrfs_ioctl_tree_search_v2(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 args; + struct inode *inode; + int ret; + size_t buf_size; + const size_t buf_limit = 16 * 1024 * 1024; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* copy search header and buffer size */ + uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; + if (copy_from_user(&args, uarg, sizeof(args))) + return -EFAULT; + + buf_size = args.buf_size; + + if (buf_size < sizeof(struct btrfs_ioctl_search_header)) + return -EOVERFLOW; + + /* limit result size to 16MB */ + if (buf_size > buf_limit) + buf_size = buf_limit; + + inode = file_inode(file); + ret = search_ioctl(inode, &args.key, &buf_size, + (char *)(&uarg->buf[0])); + if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) + ret = -EFAULT; + else if (ret == -EOVERFLOW && + copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) + ret = -EFAULT; + return ret; } @@ -1146,7 +2248,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", tree_id); + printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); ret = -ENOENT; goto out; } @@ -1155,46 +2257,47 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; - while(1) { + while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; + else if (ret > 0) { + ret = btrfs_previous_item(root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) + goto out; + else if (ret > 0) { + ret = -ENOENT; + goto out; + } + } l = path->nodes[0]; slot = path->slots[0]; - if (ret > 0 && slot > 0) - slot--; btrfs_item_key_to_cpu(l, &key, slot); - if (ret > 0 && (key.objectid != dirid || - key.type != BTRFS_INODE_REF_KEY)) { - ret = -ENOENT; - goto out; - } - iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(l, iref); ptr -= len + 1; total_len += len + 1; - if (ptr < name) + if (ptr < name) { + ret = -ENAMETOOLONG; goto out; + } *(ptr + len) = '/'; - read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len); + read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); if (key.offset == BTRFS_FIRST_FREE_OBJECTID) break; - btrfs_release_path(root, path); + btrfs_release_path(path); key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; - } - if (ptr < name) - goto out; - memcpy(name, ptr, total_len); - name[total_len]='\0'; + memmove(name, ptr, total_len); + name[total_len] = '\0'; ret = 0; out: btrfs_free_path(path); @@ -1211,15 +2314,11 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - args = kmalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return -ENOMEM; + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); - if (copy_from_user(args, argp, sizeof(*args))) { - kfree(args); - return -EFAULT; - } - inode = fdentry(file)->d_inode; + inode = file_inode(file); if (args->treeid == 0) args->treeid = BTRFS_I(inode)->root->root_key.objectid; @@ -1238,7 +2337,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, static noinline int btrfs_ioctl_snap_destroy(struct file *file, void __user *arg) { - struct dentry *parent = fdentry(file); + struct dentry *parent = file->f_path.dentry; struct dentry *dentry; struct inode *dir = parent->d_inode; struct inode *inode; @@ -1246,13 +2345,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + u64 root_flags; + u64 qgroup_reserved; int namelen; int ret; int err = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1265,11 +2364,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + + err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + if (err == -EINTR) + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -1282,14 +2384,73 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } inode = dentry->d_inode; - if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { + dest = BTRFS_I(inode)->root; + if (!capable(CAP_SYS_ADMIN)) { + /* + * Regular user. Only allow this with a special mount + * option, when the user has write+exec access to the + * subvol root, and when rmdir(2) would have been + * allowed. + * + * Note that this is _not_ check that the subvol is + * empty or doesn't contain data that we wouldn't + * otherwise be able to delete. + * + * Users who want to delete empty subvols should try + * rmdir(2). + */ + err = -EPERM; + if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + goto out_dput; + + /* + * Do not allow deletion if the parent dir is the same + * as the dir to be deleted. That means the ioctl + * must be called on the dentry referencing the root + * of the subvol, not a random directory contained + * within it. + */ err = -EINVAL; - goto out_dput; + if (root == dest) + goto out_dput; + + err = inode_permission(inode, MAY_WRITE | MAY_EXEC); + if (err) + goto out_dput; } - dest = BTRFS_I(inode)->root; + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out_dput; + } mutex_lock(&inode->i_mutex); + + /* + * Don't allow to delete a subvolume with send in progress. This is + * inside the i_mutex so the error handling that has to drop the bit + * again is not run concurrently. + */ + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + if (dest->send_in_progress == 0) { + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + spin_unlock(&dest->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to delete subvolume %llu during send", + dest->root_key.objectid); + err = -EPERM; + goto out_dput; + } + err = d_invalidate(dentry); if (err) goto out_unlock; @@ -1300,12 +2461,33 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_up_write; - trans = btrfs_start_transaction(root, 1); + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * One for dir inode, two for dir entries, two for root + * ref/backref. + */ + err = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 5, &qgroup_reserved, true); + if (err) + goto out_up_write; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_release; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid, dentry->d_name.name, dentry->d_name.len); - BUG_ON(ret); + if (ret) { + err = ret; + btrfs_abort_transaction(trans, root, ret); + goto out_end_trans; + } btrfs_record_root_in_trans(trans, dest); @@ -1314,28 +2496,75 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.drop_level = 0; btrfs_set_root_refs(&dest->root_item, 0); - ret = btrfs_insert_orphan_item(trans, - root->fs_info->tree_root, - dest->root_key.objectid); - BUG_ON(ret); + if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + } - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { + ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + dest->root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + } + +out_end_trans: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + ret = btrfs_end_transaction(trans, root); + if (ret && !err) + err = ret; inode->i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); out_up_write: up_write(&root->fs_info->subvol_sem); out_unlock: + if (err) { + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } mutex_unlock(&inode->i_mutex); if (!err) { shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); + ASSERT(dest->send_in_progress == 0); + + /* the last ref */ + if (dest->cache_inode) { + iput(dest->cache_inode); + dest->cache_inode = NULL; + } } out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); - mnt_drop_write(file->f_path.mnt); +out_drop_write: + mnt_drop_write_file(file); out: kfree(vol_args); return err; @@ -1343,23 +2572,30 @@ out: static int btrfs_ioctl_defrag(struct file *file, void __user *argp) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_defrag_range_args *range; int ret; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; + } + switch (inode->i_mode & S_IFMT) { case S_IFDIR: if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out; } - btrfs_defrag_root(root, 0); - btrfs_defrag_root(root->fs_info->extent_root, 0); + ret = btrfs_defrag_root(root); + if (ret) + goto out; + ret = btrfs_defrag_root(root->fs_info->extent_root); break; case S_IFREG: if (!(file->f_mode & FMODE_WRITE)) { @@ -1389,12 +2625,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range->len = (u64)-1; } - btrfs_defrag_file(file, range); + ret = btrfs_defrag_file(file_inode(file), file, + range, 0, 0); + if (ret > 0) + ret = 0; kfree(range); break; + default: + ret = -EINVAL; } out: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } @@ -1406,168 +2647,628 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + } + + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } -static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_vol_args *vol_args; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out; + } + + mutex_lock(&root->fs_info->volume_mutex); ret = btrfs_rm_device(root, vol_args->name); + mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +out: kfree(vol_args); + mnt_drop_write_file(file); return ret; } -static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_fs_info_args *fi_args; + struct btrfs_device *device; + struct btrfs_device *next; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; + + fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); + if (!fi_args) + return -ENOMEM; + + mutex_lock(&fs_devices->device_list_mutex); + fi_args->num_devices = fs_devices->num_devices; + memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); + + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (device->devid > fi_args->max_id) + fi_args->max_id = device->devid; + } + mutex_unlock(&fs_devices->device_list_mutex); + + fi_args->nodesize = root->fs_info->super_copy->nodesize; + fi_args->sectorsize = root->fs_info->super_copy->sectorsize; + fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; + + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) + ret = -EFAULT; + + kfree(fi_args); + return ret; +} + +static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_info_args *di_args; + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; + char *s_uuid = NULL; + + di_args = memdup_user(arg, sizeof(*di_args)); + if (IS_ERR(di_args)) + return PTR_ERR(di_args); + + if (!btrfs_is_empty_uuid(di_args->uuid)) + s_uuid = di_args->uuid; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); + + if (!dev) { + ret = -ENODEV; + goto out; + } + + di_args->devid = dev->devid; + di_args->bytes_used = dev->bytes_used; + di_args->total_bytes = dev->total_bytes; + memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); + if (dev->name) { + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + strncpy(di_args->path, name->str, sizeof(di_args->path)); + rcu_read_unlock(); + di_args->path[sizeof(di_args->path) - 1] = 0; + } else { + di_args->path[0] = '\0'; + } + +out: + mutex_unlock(&fs_devices->device_list_mutex); + if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) + ret = -EFAULT; + + kfree(di_args); + return ret; +} + +static struct page *extent_same_get_page(struct inode *inode, u64 off) +{ + struct page *page; + pgoff_t index; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + + index = off >> PAGE_CACHE_SHIFT; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return NULL; + + if (!PageUptodate(page)) { + if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, + 0)) + return NULL; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + } + unlock_page(page); + + return page; +} + +static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) +{ + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); + ordered = btrfs_lookup_first_ordered_extent(inode, + off + len - 1); + if ((!ordered || + ordered->file_offset + ordered->len <= off || + ordered->file_offset >= off + len) && + !test_range_bit(&BTRFS_I(inode)->io_tree, off, + off + len - 1, EXTENT_DELALLOC, 0, NULL)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(inode, off, len); + } +} + +static void btrfs_double_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + + mutex_unlock(&inode1->i_mutex); + mutex_unlock(&inode2->i_mutex); +} + +static void btrfs_double_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + lock_extent_range(inode1, loff1, len); + if (inode1 != inode2) { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + lock_extent_range(inode2, loff2, len); + } +} + +static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, + u64 dst_loff, u64 len) +{ + int ret = 0; + struct page *src_page, *dst_page; + unsigned int cmp_len = PAGE_CACHE_SIZE; + void *addr, *dst_addr; + + while (len) { + if (len < PAGE_CACHE_SIZE) + cmp_len = len; + + src_page = extent_same_get_page(src, loff); + if (!src_page) + return -EINVAL; + dst_page = extent_same_get_page(dst, dst_loff); + if (!dst_page) { + page_cache_release(src_page); + return -EINVAL; + } + addr = kmap_atomic(src_page); + dst_addr = kmap_atomic(dst_page); + + flush_dcache_page(src_page); + flush_dcache_page(dst_page); + + if (memcmp(addr, dst_addr, cmp_len)) + ret = BTRFS_SAME_DATA_DIFFERS; + + kunmap_atomic(addr); + kunmap_atomic(dst_addr); + page_cache_release(src_page); + page_cache_release(dst_page); + + if (ret) + break; + + loff += cmp_len; + dst_loff += cmp_len; + len -= cmp_len; + } + + return ret; +} + +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +{ + u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; + + if (off + len > inode->i_size || off + len < off) + return -EINVAL; + /* Check that we are block aligned - btrfs_clone() requires this */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) + return -EINVAL; + + return 0; +} + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, + struct inode *dst, u64 dst_loff) { - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct file *src_file; - struct inode *src; - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct extent_buffer *leaf; - char *buf; - struct btrfs_key key; - u32 nritems; - int slot; int ret; - u64 len = olen; - u64 bs = root->fs_info->sb->s_blocksize; - u64 hint_byte; /* - * TODO: - * - split compressed inline extents. annoying: we need to - * decompress into destination's address_space (the file offset - * may change, so source mapping won't do), then recompress (or - * otherwise reinsert) a subrange. - * - allow ranges within the same file to be cloned (provided - * they don't overlap)? + * btrfs_clone() can't handle extents in the same file + * yet. Once that works, we can drop this check and replace it + * with a check for the same inode, but overlapping extents. */ + if (src == dst) + return -EINVAL; - /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE)) + btrfs_double_lock(src, loff, dst, dst_loff, len); + + ret = extent_same_check_offsets(src, loff, len); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, len); + if (ret) + goto out_unlock; + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + + ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + if (ret == 0) + ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + +out_unlock: + btrfs_double_unlock(src, loff, dst, dst_loff, len); + + return ret; +} + +#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) + +static long btrfs_ioctl_file_extent_same(struct file *file, + struct btrfs_ioctl_same_args __user *argp) +{ + struct btrfs_ioctl_same_args *same; + struct btrfs_ioctl_same_extent_info *info; + struct inode *src = file_inode(file); + u64 off; + u64 len; + int i; + int ret; + unsigned long size; + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + bool is_admin = capable(CAP_SYS_ADMIN); + u16 count; + + if (!(file->f_mode & FMODE_READ)) return -EINVAL; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; - src_file = fget(srcfd); - if (!src_file) { - ret = -EBADF; - goto out_drop_write; + if (get_user(count, &argp->dest_count)) { + ret = -EFAULT; + goto out; } - src = src_file->f_dentry->d_inode; - ret = -EINVAL; - if (src == inode) - goto out_fput; + size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); + + same = memdup_user(argp, size); + + if (IS_ERR(same)) { + ret = PTR_ERR(same); + goto out; + } + + off = same->logical_offset; + len = same->length; + + /* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > BTRFS_MAX_DEDUPE_LEN) + len = BTRFS_MAX_DEDUPE_LEN; + + if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { + /* + * Btrfs does not support blocksize < page_size. As a + * result, btrfs_cmp_data() won't correctly handle + * this situation without an update. + */ + ret = -EINVAL; + goto out; + } ret = -EISDIR; - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - goto out_fput; + if (S_ISDIR(src->i_mode)) + goto out; - ret = -EXDEV; - if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) - goto out_fput; + ret = -EACCES; + if (!S_ISREG(src->i_mode)) + goto out; - ret = -ENOMEM; - buf = vmalloc(btrfs_level_size(root, 0)); - if (!buf) - goto out_fput; + /* pre-format output fields to sane values */ + for (i = 0; i < count; i++) { + same->info[i].bytes_deduped = 0ULL; + same->info[i].status = 0; + } - path = btrfs_alloc_path(); - if (!path) { - vfree(buf); - goto out_fput; + for (i = 0, info = same->info; i < count; i++, info++) { + struct inode *dst; + struct fd dst_file = fdget(info->fd); + if (!dst_file.file) { + info->status = -EBADF; + continue; + } + dst = file_inode(dst_file.file); + + if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { + info->status = -EINVAL; + } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { + info->status = -EXDEV; + } else if (S_ISDIR(dst->i_mode)) { + info->status = -EISDIR; + } else if (!S_ISREG(dst->i_mode)) { + info->status = -EACCES; + } else { + info->status = btrfs_extent_same(src, off, len, dst, + info->logical_offset); + if (info->status == 0) + info->bytes_deduped += len; + } + fdput(dst_file); } - path->reada = 2; - if (inode < src) { - mutex_lock(&inode->i_mutex); - mutex_lock(&src->i_mutex); - } else { - mutex_lock(&src->i_mutex); - mutex_lock(&inode->i_mutex); + ret = copy_to_user(argp, same, size); + if (ret) + ret = -EFAULT; + +out: + mnt_drop_write_file(file); + return ret; +} + +/* Helper to check and see if this root currently has a ref on the given disk + * bytenr. If it does then we need to update the quota for this root. This + * doesn't do anything if quotas aren't enabled. + */ +static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 disko) +{ + struct seq_list tree_mod_seq_elem = {}; + struct ulist *roots; + struct ulist_iterator uiter; + struct ulist_node *root_node = NULL; + int ret; + + if (!root->fs_info->quota_enabled) + return 1; + + btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + ret = btrfs_find_all_roots(trans, root->fs_info, disko, + tree_mod_seq_elem.seq, &roots); + if (ret < 0) + goto out; + ret = 0; + ULIST_ITER_INIT(&uiter); + while ((root_node = ulist_next(roots, &uiter))) { + if (root_node->val == root->objectid) { + ret = 1; + break; + } } + ulist_free(roots); +out: + btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + return ret; +} - /* determine range to clone */ - ret = -EINVAL; - if (off >= src->i_size || off + len > src->i_size) - goto out_unlock; - if (len == 0) - olen = len = src->i_size - off; - /* if we extend to eof, continue to block boundary */ - if (off + len == src->i_size) - len = ((src->i_size + bs-1) & ~(bs-1)) - - off; +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, + struct inode *inode, + u64 endoff, + const u64 destoff, + const u64 olen) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; - /* verify the end result is block aligned */ - if ((off & (bs-1)) || - ((off + len) & (bs-1))) - goto out_unlock; + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* + * We round up to the block size at eof when determining which + * extents to clone above, but shouldn't round up the file size. + */ + if (endoff > destoff + olen) + endoff = destoff + olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); +out: + return ret; +} + +static void clone_update_extent_map(struct inode *inode, + const struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + const u64 hole_offset, + const u64 hole_len) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + return; + } + + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_extent_item_to_extent_map(inode, path, fi, false, em); + em->generation = -1; + if (btrfs_file_extent_type(path->nodes[0], fi) == + BTRFS_FILE_EXTENT_INLINE) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + em->start = hole_offset; + em->len = hole_len; + em->ram_bytes = em->len; + em->orig_start = hole_offset; + em->block_start = EXTENT_MAP_HOLE; + em->block_len = 0; + em->orig_block_len = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = trans->transid; + } - /* do any pending delalloc/csum calc on src, one way or - another, and lock file content */ while (1) { - struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, off+len); - if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); break; - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); - if (ordered) - btrfs_put_ordered_extent(ordered); - btrfs_wait_ordered_range(src, off, off+len); + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); } - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + if (unlikely(ret)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); +} + +/** + * btrfs_clone() - clone a range from inode file to another + * + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen, extent_same uses + * identical values here + * @destoff: Offset within @inode to start clone + */ +static int btrfs_clone(struct inode *src, struct inode *inode, + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path = NULL; + struct extent_buffer *leaf; + struct btrfs_trans_handle *trans; + char *buf = NULL; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + int no_quota; + const u64 len = olen_aligned; + u64 last_disko = 0; + u64 last_dest_end = destoff; - /* punch hole in destination first */ - btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1); + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + return ret; + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + return ret; + } + + path->reada = 2; /* clone data */ - key.objectid = src->i_ino; + key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = off; while (1) { /* * note the key will change type as we walk through the * tree. */ - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + path->leave_spinning = 1; + ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, + 0, 0); if (ret < 0) goto out; + /* + * First search, if no extent item that starts at offset off was + * found but the previous item is an extent item, it's possible + * it might overlap our target range, therefore process it. + */ + if (key.offset == off && ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: + no_quota = 1; if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) goto out; if (ret > 0) @@ -1579,7 +3280,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_item_key_to_cpu(leaf, &key, slot); if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || - key.objectid != src->i_ino) + key.objectid != btrfs_ino(src)) break; if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { @@ -1590,11 +3291,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; - - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); + u64 drop_start; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -1614,22 +3311,94 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datal = btrfs_file_extent_ram_bytes(leaf, extent); } - btrfs_release_path(root, path); - if (key.offset + datal < off || - key.offset >= off+len) - goto next; + /* + * The first search might have left us at an extent + * item that ends before our target range's start, can + * happen if we have holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { + path->slots[0]++; + goto process_slot; + } else if (key.offset >= off + len) { + break; + } + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = inode->i_ino; - new_key.offset = key.offset + destoff - off; + new_key.objectid = btrfs_ino(inode); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + /* + * Deal with a hole that doesn't have an extent item + * that represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning + * range or at the beginning (fully overlaps it or + * partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + /* + * 1 - adjusting old extent (we may have to split it) + * 1 - add new extent + * 1 - inode update + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { + /* + * a | --- range to clone ---| b + * | ------------- extent ------------- | + */ + + /* subtract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* subtract range a */ + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + ret = btrfs_drop_extents(trans, root, inode, + drop_start, + new_key.offset + datal, + 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, + root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); goto out; + } leaf = path->nodes[0]; slot = path->slots[0]; @@ -1640,14 +3409,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - if (key.offset + datal > off + len) - datal = off + len - key.offset; - /* disko == 0 means it's a hole */ if (!disko) datao = 0; @@ -1656,36 +3417,89 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datao); btrfs_set_file_extent_num_bytes(leaf, extent, datal); + + /* + * We need to look up the roots that point at + * this bytenr and see if the new root does. If + * it does not we need to make sure we update + * quotas appropriately. + */ + if (disko && root != BTRFS_I(src)->root && + disko != last_disko) { + no_quota = check_ref(trans, root, + disko); + if (no_quota < 0) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + ret = no_quota; + goto out; + } + } + if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, disko, diskl, 0, root->root_key.objectid, - inode->i_ino, - new_key.offset - datao); - BUG_ON(ret); + btrfs_ino(inode), + new_key.offset - datao, + no_quota); + if (ret) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + goto out; + + } } } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; + u64 aligned_end = 0; + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; } - if (key.offset + datal > off+len) - trim = key.offset + datal - (off+len); + if (key.offset + datal > off + len) + trim = key.offset + datal - (off + len); if (comp && (skip || trim)) { ret = -EINVAL; + btrfs_end_transaction(trans, root); goto out; } size -= skip + trim; datal -= skip + trim; + + aligned_end = ALIGN(new_key.offset + datal, + root->sectorsize); + ret = btrfs_drop_extents(trans, root, inode, + drop_start, + aligned_end, + 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, + root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); goto out; + } if (skip) { u32 start = @@ -1702,36 +3516,224 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, inode_add_bytes(inode, datal); } + /* If we have an implicit hole (NO_HOLES feature). */ + if (drop_start < new_key.offset) + clone_update_extent_map(inode, trans, + NULL, drop_start, + new_key.offset - drop_start); + + clone_update_extent_map(inode, trans, path, 0, 0); + btrfs_mark_buffer_dirty(leaf); - } + btrfs_release_path(path); -next: - btrfs_release_path(root, path); + last_dest_end = new_key.offset + datal; + ret = clone_finish_inode_update(trans, inode, + last_dest_end, + destoff, olen); + if (ret) + goto out; + if (new_key.offset + datal >= destoff + len) + break; + } + btrfs_release_path(path); key.offset++; } ret = 0; -out: - btrfs_release_path(root, path); - if (ret == 0) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - if (destoff + olen > inode->i_size) - btrfs_i_size_write(inode, destoff + olen); - BTRFS_I(inode)->flags = BTRFS_I(src)->flags; - ret = btrfs_update_inode(trans, root, inode); + + if (last_dest_end < destoff + len) { + /* + * We have an implicit hole (NO_HOLES feature is enabled) that + * fully or partially overlaps our cloning range at its end. + */ + btrfs_release_path(path); + + /* + * 1 - remove extent(s) + * 1 - inode update + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_drop_extents(trans, root, inode, + last_dest_end, destoff + len, 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen); } - btrfs_end_transaction(trans, root); - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + +out: + btrfs_free_path(path); + vfree(buf); + return ret; +} + +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct fd src_file; + struct inode *src; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + int same_inode = 0; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * + * - split destination inode's inline extents. The inline extents can + * be either compressed or non-compressed. + */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) + return -EINVAL; + + if (btrfs_root_readonly(root)) + return -EROFS; + + ret = mnt_want_write_file(file); if (ret) - vmtruncate(inode, 0); + return ret; + + src_file = fdget(srcfd); + if (!src_file.file) { + ret = -EBADF; + goto out_drop_write; + } + + ret = -EXDEV; + if (src_file.file->f_path.mnt != file->f_path.mnt) + goto out_fput; + + src = file_inode(src_file.file); + + ret = -EINVAL; + if (src == inode) + same_inode = 1; + + /* the src must be open for reading */ + if (!(src_file.file->f_mode & FMODE_READ)) + goto out_fput; + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb) + goto out_fput; + + if (!same_inode) { + if (inode < src) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + } + } else { + mutex_lock(&src->i_mutex); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off + len > src->i_size || off + len < off) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) + goto out_unlock; + + /* verify if ranges are overlapped within the same file */ + if (same_inode) { + if (destoff + len > off && destoff < off + len) + goto out_unlock; + } + + if (destoff > inode->i_size) { + ret = btrfs_cont_expand(inode, inode->i_size, destoff); + if (ret) + goto out_unlock; + } + + /* + * Lock the target range too. Right after we replace the file extent + * items in the fs tree (which now point to the cloned data), we might + * have a worker replace them with extent items relative to a write + * operation that was issued before this clone operation (i.e. confront + * with inode.c:btrfs_finish_ordered_io). + */ + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_len = max_t(u64, off, destoff) + len - lock_start; + + lock_extent_range(src, lock_start, lock_len); + } else { + lock_extent_range(src, off, len); + lock_extent_range(inode, destoff, len); + } + + ret = btrfs_clone(src, inode, off, olen, len, destoff); + + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_end = max_t(u64, off, destoff) + len - 1; + + unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); + } else { + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, + destoff + len - 1); + } + /* + * Truncate page cache pages so that future reads will see the cloned + * data immediately and not the previous data. + */ + truncate_inode_pages_range(&inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: - mutex_unlock(&src->i_mutex); - mutex_unlock(&inode->i_mutex); - vfree(buf); - btrfs_free_path(path); + if (!same_inode) { + if (inode < src) { + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + } else { + mutex_unlock(&inode->i_mutex); + mutex_unlock(&src->i_mutex); + } + } else { + mutex_unlock(&src->i_mutex); + } out_fput: - fput(src_file); + fdput(src_file); out_drop_write: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } @@ -1753,7 +3755,7 @@ static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) */ static long btrfs_ioctl_trans_start(struct file *file) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret; @@ -1766,34 +3768,34 @@ static long btrfs_ioctl_trans_start(struct file *file) if (file->private_data) goto out; - ret = mnt_want_write(file->f_path.mnt); + ret = -EROFS; + if (btrfs_root_readonly(root)) + goto out; + + ret = mnt_want_write_file(file); if (ret) goto out; - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans++; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_inc(&root->fs_info->open_ioctl_trans); ret = -ENOMEM; - trans = btrfs_start_ioctl_transaction(root, 0); - if (!trans) + trans = btrfs_start_ioctl_transaction(root); + if (IS_ERR(trans)) goto out_drop; file->private_data = trans; return 0; out_drop: - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans--; - mutex_unlock(&root->fs_info->trans_mutex); - mnt_drop_write(file->f_path.mnt); + atomic_dec(&root->fs_info->open_ioctl_trans); + mnt_drop_write_file(file); out: return ret; } static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; @@ -1801,51 +3803,59 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_path *path; struct btrfs_key location; struct btrfs_disk_key disk_key; - struct btrfs_super_block *disk_super; - u64 features; u64 objectid = 0; u64 dir_id; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&objectid, argp, sizeof(objectid))) - return -EFAULT; + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (copy_from_user(&objectid, argp, sizeof(objectid))) { + ret = -EFAULT; + goto out; + } if (!objectid) - objectid = root->root_key.objectid; + objectid = BTRFS_FS_TREE_OBJECTID; location.objectid = objectid; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); - if (IS_ERR(new_root)) - return PTR_ERR(new_root); - - if (btrfs_root_refs(&new_root->root_item) == 0) - return -ENOENT; + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out; + } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); - if (!trans) { + if (IS_ERR(trans)) { btrfs_free_path(path); - return -ENOMEM; + ret = PTR_ERR(trans); + goto out; } - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); - if (!di) { + if (IS_ERR_OR_NULL(di)) { btrfs_free_path(path); btrfs_end_transaction(trans, root); - printk(KERN_ERR "Umm, you don't have the default dir item, " - "this isn't going to work\n"); - return -ENOENT; + btrfs_err(new_root->fs_info, "Umm, you don't have the default dir" + "item, this isn't going to work"); + ret = -ENOENT; + goto out; } btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); @@ -1853,46 +3863,92 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - disk_super = &root->fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { - features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; - btrfs_set_super_incompat_flags(disk_super, features); - } + btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans, root); +out: + mnt_drop_write_file(file); + return ret; +} - return 0; +void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) +{ + struct btrfs_block_group_cache *block_group; + + space->total_bytes = 0; + space->used_bytes = 0; + space->flags = 0; + list_for_each_entry(block_group, groups_list, list) { + space->flags = block_group->flags; + space->total_bytes += block_group->key.offset; + space->used_bytes += + btrfs_block_group_used(&block_group->item); + } } -long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_space_args space_args; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; - struct btrfs_ioctl_space_info *user_dest; + struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; int alloc_size; int ret = 0; - int slot_count = 0; + u64 slot_count = 0; + int i, c; if (copy_from_user(&space_args, (struct btrfs_ioctl_space_args __user *)arg, sizeof(space_args))) return -EFAULT; - /* first we count slots */ - rcu_read_lock(); - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) - slot_count++; - rcu_read_unlock(); + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) + slot_count++; + } + up_read(&info->groups_sem); + } + + /* + * Global block reserve, exported as a space_info + */ + slot_count++; /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; goto out; } + + slot_count = min_t(u64, space_args.space_slots, slot_count); + alloc_size = sizeof(*dest) * slot_count; + /* we generally have at most 6 or so space infos, one for each raid * level. So, a whole page should be more than enough for everyone */ @@ -1906,29 +3962,57 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) dest_orig = dest; /* now we have a buffer to copy into */ - rcu_read_lock(); - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { - /* make sure we don't copy more than we allocated - * in our buffer - */ - if (slot_count == 0) - break; - slot_count--; + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; - /* make sure userland has enough room in their buffer */ - if (space_args.total_spaces >= space_args.space_slots) + if (!slot_count) break; - space.flags = info->flags; - space.total_bytes = info->total_bytes; - space.used_bytes = info->bytes_used; + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) { + btrfs_get_block_group_info( + &info->block_groups[c], &space); + memcpy(dest, &space, sizeof(space)); + dest++; + space_args.total_spaces++; + slot_count--; + } + if (!slot_count) + break; + } + up_read(&info->groups_sem); + } + + /* + * Add global block reserve + */ + if (slot_count) { + struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; + + spin_lock(&block_rsv->lock); + space.total_bytes = block_rsv->size; + space.used_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; memcpy(dest, &space, sizeof(space)); - dest++; space_args.total_spaces++; } - rcu_read_unlock(); - user_dest = (struct btrfs_ioctl_space_info *) + user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); if (copy_to_user(user_dest, dest_orig, alloc_size)) @@ -1950,7 +4034,7 @@ out: */ long btrfs_ioctl_trans_end(struct file *file) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -1961,18 +4045,1200 @@ long btrfs_ioctl_trans_end(struct file *file) btrfs_end_transaction(trans, root); - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans--; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_dec(&root->fs_info->open_ioctl_trans); + + mnt_drop_write_file(file); + return 0; +} + +static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + void __user *argp) +{ + struct btrfs_trans_handle *trans; + u64 transid; + int ret; + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); - mnt_drop_write(file->f_path.mnt); + /* No running transaction, don't bother */ + transid = root->fs_info->last_trans_committed; + goto out; + } + transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, root, 0); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } +out: + if (argp) + if (copy_to_user(argp, &transid, sizeof(transid))) + return -EFAULT; return 0; } +static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, + void __user *argp) +{ + u64 transid; + + if (argp) { + if (copy_from_user(&transid, argp, sizeof(transid))) + return -EFAULT; + } else { + transid = 0; /* current trans */ + } + return btrfs_wait_for_commit(root, transid); +} + +static long btrfs_ioctl_scrub(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_scrub_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { + ret = mnt_want_write_file(file); + if (ret) + goto out; + } + + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, + 0); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + if (!(sa->flags & BTRFS_SCRUB_READONLY)) + mnt_drop_write_file(file); +out: + kfree(sa); + return ret; +} + +static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_scrub_cancel(root->fs_info); +} + +static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_ioctl_scrub_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_ioctl_get_dev_stats *sa; + int ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { + kfree(sa); + return -EPERM; + } + + ret = btrfs_get_dev_stats(root, sa); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_replace_args *p; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = memdup_user(arg, sizeof(*p)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (p->cmd) { + case BTRFS_IOCTL_DEV_REPLACE_CMD_START: + if (root->fs_info->sb->s_flags & MS_RDONLY) { + ret = -EROFS; + goto out; + } + if (atomic_xchg( + &root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + } else { + ret = btrfs_dev_replace_start(root, p); + atomic_set( + &root->fs_info->mutually_exclusive_operation_running, + 0); + } + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: + btrfs_dev_replace_status(root->fs_info, p); + ret = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: + ret = btrfs_dev_replace_cancel(root->fs_info, p); + break; + default: + ret = -EINVAL; + break; + } + + if (copy_to_user(arg, p, sizeof(*p))) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + +static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) +{ + int ret = 0; + int i; + u64 rel_ptr; + int size; + struct btrfs_ioctl_ino_path_args *ipa = NULL; + struct inode_fs_paths *ipath = NULL; + struct btrfs_path *path; + + if (!capable(CAP_DAC_READ_SEARCH)) + return -EPERM; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ipa = memdup_user(arg, sizeof(*ipa)); + if (IS_ERR(ipa)) { + ret = PTR_ERR(ipa); + ipa = NULL; + goto out; + } + + size = min_t(u32, ipa->size, 4096); + ipath = init_ipath(size, root, path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto out; + } + + ret = paths_from_inode(ipa->inum, ipath); + if (ret < 0) + goto out; + + for (i = 0; i < ipath->fspath->elem_cnt; ++i) { + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; + ipath->fspath->val[i] = rel_ptr; + } + + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); + if (ret) { + ret = -EFAULT; + goto out; + } + +out: + btrfs_free_path(path); + free_ipath(ipath); + kfree(ipa); + + return ret; +} + +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + +static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, + void __user *arg) +{ + int ret = 0; + int size; + struct btrfs_ioctl_logical_ino_args *loi; + struct btrfs_data_container *inodes = NULL; + struct btrfs_path *path = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + loi = memdup_user(arg, sizeof(*loi)); + if (IS_ERR(loi)) { + ret = PTR_ERR(loi); + loi = NULL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + size = min_t(u32, loi->size, 64 * 1024); + inodes = init_data_container(size); + if (IS_ERR(inodes)) { + ret = PTR_ERR(inodes); + inodes = NULL; + goto out; + } + + ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path, + build_ino_list, inodes); + if (ret == -EINVAL) + ret = -ENOENT; + if (ret < 0) + goto out; + + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); + if (ret) + ret = -EFAULT; + +out: + btrfs_free_path(path); + vfree(inodes); + kfree(loi); + + return ret; +} + +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + bargs->flags = bctl->flags; + + if (atomic_read(&fs_info->balance_running)) + bargs->state |= BTRFS_BALANCE_STATE_RUNNING; + if (atomic_read(&fs_info->balance_pause_req)) + bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + if (atomic_read(&fs_info->balance_cancel_req)) + bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; + + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); + memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); + memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); + + if (lock) { + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); + } else { + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + } +} + +static long btrfs_ioctl_balance(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + struct btrfs_balance_control *bctl; + bool need_unlock; /* for mut. excl. ops lock */ + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + +again: + if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + need_unlock = true; + goto locked; + } + + /* + * mut. excl. ops lock is locked. Three possibilites: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* this is either (2) or (3) */ + if (!atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + if (!mutex_trylock(&fs_info->volume_mutex)) + goto again; + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !atomic_read(&fs_info->balance_running)) { + /* this is (3) */ + need_unlock = false; + goto locked; + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + goto again; + } else { + /* this is (2) */ + mutex_unlock(&fs_info->balance_mutex); + ret = -EINPROGRESS; + goto out; + } + } else { + /* this is (1) */ + mutex_unlock(&fs_info->balance_mutex); + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out; + } + +locked: + BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); + + if (arg) { + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + goto out_unlock; + } + + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out_bargs; + } + + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + + goto do_balance; + } + } else { + bargs = NULL; + } + + if (fs_info->balance_ctl) { + ret = -EINPROGRESS; + goto out_bargs; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out_bargs; + } + + bctl->fs_info = fs_info; + if (arg) { + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + + bctl->flags = bargs->flags; + } else { + /* balance everything - no filters */ + bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + } + +do_balance: + /* + * Ownership of bctl and mutually_exclusive_operation_running + * goes to to btrfs_balance. bctl is freed in __cancel_balance, + * or, if restriper was paused all the way until unmount, in + * free_fs_info. mutually_exclusive_operation_running is + * cleared in __cancel_balance. + */ + need_unlock = false; + + ret = btrfs_balance(bctl, bargs); + + if (arg) { + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + } + +out_bargs: + kfree(bargs); +out_unlock: + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + if (need_unlock) + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case BTRFS_BALANCE_CTL_PAUSE: + return btrfs_pause_balance(root->fs_info); + case BTRFS_BALANCE_CTL_CANCEL: + return btrfs_cancel_balance(root->fs_info); + } + + return -EINVAL; +} + +static long btrfs_ioctl_balance_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out; + } + + bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + if (!bargs) { + ret = -ENOMEM; + goto out; + } + + update_ioctl_balance_args(fs_info, 1, bargs); + + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + +static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_quota_ctl_args *sa; + struct btrfs_trans_handle *trans = NULL; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + down_write(&root->fs_info->subvol_sem); + trans = btrfs_start_transaction(root->fs_info->tree_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + switch (sa->cmd) { + case BTRFS_QUOTA_CTL_ENABLE: + ret = btrfs_quota_enable(trans, root->fs_info); + break; + case BTRFS_QUOTA_CTL_DISABLE: + ret = btrfs_quota_disable(trans, root->fs_info); + break; + default: + ret = -EINVAL; + break; + } + + err = btrfs_commit_transaction(trans, root->fs_info->tree_root); + if (err && !ret) + ret = err; +out: + kfree(sa); + up_write(&root->fs_info->subvol_sem); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_qgroup_assign_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* FIXME: check if the IDs really exist */ + if (sa->assign) { + ret = btrfs_add_qgroup_relation(trans, root->fs_info, + sa->src, sa->dst); + } else { + ret = btrfs_del_qgroup_relation(trans, root->fs_info, + sa->src, sa->dst); + } + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_qgroup_create_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* FIXME: check if the IDs really exist */ + if (sa->create) { + ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid, + NULL); + } else { + ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); + } + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_qgroup_limit_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + u64 qgroupid; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + qgroupid = sa->qgroupid; + if (!qgroupid) { + /* take the current subvol as qgroup */ + qgroupid = root->root_key.objectid; + } + + /* FIXME: check if the IDs really exist */ + ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim); + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_quota_rescan_args *qsa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + qsa = memdup_user(arg, sizeof(*qsa)); + if (IS_ERR(qsa)) { + ret = PTR_ERR(qsa); + goto drop_write; + } + + if (qsa->flags) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_qgroup_rescan(root->fs_info); + +out: + kfree(qsa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_quota_rescan_args *qsa; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + qsa = kzalloc(sizeof(*qsa), GFP_NOFS); + if (!qsa) + return -ENOMEM; + + if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + qsa->flags = 1; + qsa->progress = root->fs_info->qgroup_rescan_progress.objectid; + } + + if (copy_to_user(arg, qsa, sizeof(*qsa))) + ret = -EFAULT; + + kfree(qsa); + return ret; +} + +static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_qgroup_wait_for_completion(root->fs_info); +} + +static long _btrfs_ioctl_set_received_subvol(struct file *file, + struct btrfs_ioctl_received_subvol_args *sa) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root_item *root_item = &root->root_item; + struct btrfs_trans_handle *trans; + struct timespec ct = CURRENT_TIME; + int ret = 0; + int received_uuid_changed; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret < 0) + return ret; + + down_write(&root->fs_info->subvol_sem); + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + ret = -EINVAL; + goto out; + } + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; + } + + /* + * 1 - root item + * 2 - uuid items (received uuid + subvol uuid) + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + sa->rtransid = trans->transid; + sa->rtime.sec = ct.tv_sec; + sa->rtime.nsec = ct.tv_nsec; + + received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, + BTRFS_UUID_SIZE); + if (received_uuid_changed && + !btrfs_is_empty_uuid(root_item->received_uuid)) + btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); + btrfs_set_root_stransid(root_item, sa->stransid); + btrfs_set_root_rtransid(root_item, sa->rtransid); + btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); + btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); + btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); + btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + if (ret < 0) { + btrfs_end_transaction(trans, root); + goto out; + } + if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { + ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, + sa->uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + if (ret < 0 && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + } + ret = btrfs_commit_transaction(trans, root); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + +out: + up_write(&root->fs_info->subvol_sem); + mnt_drop_write_file(file); + return ret; +} + +#ifdef CONFIG_64BIT +static long btrfs_ioctl_set_received_subvol_32(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; + struct btrfs_ioctl_received_subvol_args *args64 = NULL; + int ret = 0; + + args32 = memdup_user(arg, sizeof(*args32)); + if (IS_ERR(args32)) { + ret = PTR_ERR(args32); + args32 = NULL; + goto out; + } + + args64 = kmalloc(sizeof(*args64), GFP_NOFS); + if (!args64) { + ret = -ENOMEM; + goto out; + } + + memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); + args64->stransid = args32->stransid; + args64->rtransid = args32->rtransid; + args64->stime.sec = args32->stime.sec; + args64->stime.nsec = args32->stime.nsec; + args64->rtime.sec = args32->rtime.sec; + args64->rtime.nsec = args32->rtime.nsec; + args64->flags = args32->flags; + + ret = _btrfs_ioctl_set_received_subvol(file, args64); + if (ret) + goto out; + + memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); + args32->stransid = args64->stransid; + args32->rtransid = args64->rtransid; + args32->stime.sec = args64->stime.sec; + args32->stime.nsec = args64->stime.nsec; + args32->rtime.sec = args64->rtime.sec; + args32->rtime.nsec = args64->rtime.nsec; + args32->flags = args64->flags; + + ret = copy_to_user(arg, args32, sizeof(*args32)); + if (ret) + ret = -EFAULT; + +out: + kfree(args32); + kfree(args64); + return ret; +} +#endif + +static long btrfs_ioctl_set_received_subvol(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args *sa = NULL; + int ret = 0; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + sa = NULL; + goto out; + } + + ret = _btrfs_ioctl_set_received_subvol(file, sa); + + if (ret) + goto out; + + ret = copy_to_user(arg, sa, sizeof(*sa)); + if (ret) + ret = -EFAULT; + +out: + kfree(sa); + return ret; +} + +static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + size_t len; + int ret; + char label[BTRFS_LABEL_SIZE]; + + spin_lock(&root->fs_info->super_lock); + memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE); + spin_unlock(&root->fs_info->super_lock); + + len = strnlen(label, BTRFS_LABEL_SIZE); + + if (len == BTRFS_LABEL_SIZE) { + btrfs_warn(root->fs_info, + "label is too long, return the first %zu bytes", --len); + } + + ret = copy_to_user(arg, label, len); + + return ret ? -EFAULT : 0; +} + +static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + char label[BTRFS_LABEL_SIZE]; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, arg, sizeof(label))) + return -EFAULT; + + if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { + btrfs_err(root->fs_info, "unable to set label with more than %d bytes", + BTRFS_LABEL_SIZE - 1); + return -EINVAL; + } + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + spin_lock(&root->fs_info->super_lock); + strcpy(super_block->label, label); + spin_unlock(&root->fs_info->super_lock); + ret = btrfs_commit_transaction(trans, root); + +out_unlock: + mnt_drop_write_file(file); + return ret; +} + +#define INIT_FEATURE_FLAGS(suffix) \ + { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ + .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ + .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } + +static int btrfs_ioctl_get_supported_features(struct file *file, + void __user *arg) +{ + static struct btrfs_ioctl_feature_flags features[3] = { + INIT_FEATURE_FLAGS(SUPP), + INIT_FEATURE_FLAGS(SAFE_SET), + INIT_FEATURE_FLAGS(SAFE_CLEAR) + }; + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int btrfs_ioctl_get_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags features; + + features.compat_flags = btrfs_super_compat_flags(super_block); + features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); + features.incompat_flags = btrfs_super_incompat_flags(super_block); + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int check_feature_bits(struct btrfs_root *root, + enum btrfs_feature_set set, + u64 change_mask, u64 flags, u64 supported_flags, + u64 safe_set, u64 safe_clear) +{ + const char *type = btrfs_feature_set_names[set]; + char *names; + u64 disallowed, unsupported; + u64 set_mask = flags & change_mask; + u64 clear_mask = ~flags & change_mask; + + unsupported = set_mask & ~supported_flags; + if (unsupported) { + names = btrfs_printable_features(set, unsupported); + if (names) { + btrfs_warn(root->fs_info, + "this kernel does not support the %s feature bit%s", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "this kernel does not support %s bits 0x%llx", + type, unsupported); + return -EOPNOTSUPP; + } + + disallowed = set_mask & ~safe_set; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't set the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't set %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + disallowed = clear_mask & ~safe_clear; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't clear the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't clear %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + return 0; +} + +#define check_feature(root, change_mask, flags, mask_base) \ +check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \ + BTRFS_FEATURE_ ## mask_base ## _SUPP, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) + +static int btrfs_ioctl_set_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags flags[2]; + struct btrfs_trans_handle *trans; + u64 newflags; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(flags, arg, sizeof(flags))) + return -EFAULT; + + /* Nothing to do */ + if (!flags[0].compat_flags && !flags[0].compat_ro_flags && + !flags[0].incompat_flags) + return 0; + + ret = check_feature(root, flags[0].compat_flags, + flags[1].compat_flags, COMPAT); + if (ret) + return ret; + + ret = check_feature(root, flags[0].compat_ro_flags, + flags[1].compat_ro_flags, COMPAT_RO); + if (ret) + return ret; + + ret = check_feature(root, flags[0].incompat_flags, + flags[1].incompat_flags, INCOMPAT); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + newflags = btrfs_super_compat_flags(super_block); + newflags |= flags[0].compat_flags & flags[1].compat_flags; + newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); + btrfs_set_super_compat_flags(super_block, newflags); + + newflags = btrfs_super_compat_ro_flags(super_block); + newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; + newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); + btrfs_set_super_compat_ro_flags(super_block, newflags); + + newflags = btrfs_super_incompat_flags(super_block); + newflags |= flags[0].incompat_flags & flags[1].incompat_flags; + newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); + btrfs_set_super_incompat_flags(super_block, newflags); + spin_unlock(&root->fs_info->super_lock); + + return btrfs_commit_transaction(trans, root); +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; void __user *argp = (void __user *)arg; switch (cmd) { @@ -1982,12 +5248,22 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_setflags(file, argp); case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); + case FITRIM: + return btrfs_ioctl_fitrim(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SNAP_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SUBVOL_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(file, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: return btrfs_ioctl_default_subvol(file, argp); case BTRFS_IOC_DEFRAG: @@ -1995,13 +5271,17 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEFRAG_RANGE: return btrfs_ioctl_defrag(file, argp); case BTRFS_IOC_RESIZE: - return btrfs_ioctl_resize(root, argp); + return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: - return btrfs_ioctl_rm_dev(root, argp); + return btrfs_ioctl_rm_dev(file, argp); + case BTRFS_IOC_FS_INFO: + return btrfs_ioctl_fs_info(root, argp); + case BTRFS_IOC_DEV_INFO: + return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_balance(root->fs_info->dev_root); + return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -2012,13 +5292,79 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_trans_end(file); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(file, argp); + case BTRFS_IOC_TREE_SEARCH_V2: + return btrfs_ioctl_tree_search_v2(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_INO_PATHS: + return btrfs_ioctl_ino_to_path(root, argp); + case BTRFS_IOC_LOGICAL_INO: + return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); - case BTRFS_IOC_SYNC: - btrfs_sync_fs(file->f_dentry->d_sb, 1); - return 0; + case BTRFS_IOC_SYNC: { + int ret; + + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); + if (ret) + return ret; + ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); + return ret; + } + case BTRFS_IOC_START_SYNC: + return btrfs_ioctl_start_sync(root, argp); + case BTRFS_IOC_WAIT_SYNC: + return btrfs_ioctl_wait_sync(root, argp); + case BTRFS_IOC_SCRUB: + return btrfs_ioctl_scrub(file, argp); + case BTRFS_IOC_SCRUB_CANCEL: + return btrfs_ioctl_scrub_cancel(root, argp); + case BTRFS_IOC_SCRUB_PROGRESS: + return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_BALANCE_V2: + return btrfs_ioctl_balance(file, argp); + case BTRFS_IOC_BALANCE_CTL: + return btrfs_ioctl_balance_ctl(root, arg); + case BTRFS_IOC_BALANCE_PROGRESS: + return btrfs_ioctl_balance_progress(root, argp); + case BTRFS_IOC_SET_RECEIVED_SUBVOL: + return btrfs_ioctl_set_received_subvol(file, argp); +#ifdef CONFIG_64BIT + case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: + return btrfs_ioctl_set_received_subvol_32(file, argp); +#endif + case BTRFS_IOC_SEND: + return btrfs_ioctl_send(file, argp); + case BTRFS_IOC_GET_DEV_STATS: + return btrfs_ioctl_get_dev_stats(root, argp); + case BTRFS_IOC_QUOTA_CTL: + return btrfs_ioctl_quota_ctl(file, argp); + case BTRFS_IOC_QGROUP_ASSIGN: + return btrfs_ioctl_qgroup_assign(file, argp); + case BTRFS_IOC_QGROUP_CREATE: + return btrfs_ioctl_qgroup_create(file, argp); + case BTRFS_IOC_QGROUP_LIMIT: + return btrfs_ioctl_qgroup_limit(file, argp); + case BTRFS_IOC_QUOTA_RESCAN: + return btrfs_ioctl_quota_rescan(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_STATUS: + return btrfs_ioctl_quota_rescan_status(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_WAIT: + return btrfs_ioctl_quota_rescan_wait(file, argp); + case BTRFS_IOC_DEV_REPLACE: + return btrfs_ioctl_dev_replace(root, argp); + case BTRFS_IOC_GET_FSLABEL: + return btrfs_ioctl_get_fslabel(file, argp); + case BTRFS_IOC_SET_FSLABEL: + return btrfs_ioctl_set_fslabel(file, argp); + case BTRFS_IOC_FILE_EXTENT_SAME: + return btrfs_ioctl_file_extent_same(file, argp); + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + return btrfs_ioctl_get_supported_features(file, argp); + case BTRFS_IOC_GET_FEATURES: + return btrfs_ioctl_get_features(file, argp); + case BTRFS_IOC_SET_FEATURES: + return btrfs_ioctl_set_features(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h deleted file mode 100644 index 424694aa517..00000000000 --- a/fs/btrfs/ioctl.h +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __IOCTL_ -#define __IOCTL_ -#include <linux/ioctl.h> - -#define BTRFS_IOCTL_MAGIC 0x94 -#define BTRFS_VOL_NAME_MAX 255 -#define BTRFS_PATH_NAME_MAX 4087 - -/* this should be 4k */ -struct btrfs_ioctl_vol_args { - __s64 fd; - char name[BTRFS_PATH_NAME_MAX + 1]; -}; - -#define BTRFS_INO_LOOKUP_PATH_MAX 4080 -struct btrfs_ioctl_ino_lookup_args { - __u64 treeid; - __u64 objectid; - char name[BTRFS_INO_LOOKUP_PATH_MAX]; -}; - -struct btrfs_ioctl_search_key { - /* which root are we searching. 0 is the tree of tree roots */ - __u64 tree_id; - - /* keys returned will be >= min and <= max */ - __u64 min_objectid; - __u64 max_objectid; - - /* keys returned will be >= min and <= max */ - __u64 min_offset; - __u64 max_offset; - - /* max and min transids to search for */ - __u64 min_transid; - __u64 max_transid; - - /* keys returned will be >= min and <= max */ - __u32 min_type; - __u32 max_type; - - /* - * how many items did userland ask for, and how many are we - * returning - */ - __u32 nr_items; - - /* align to 64 bits */ - __u32 unused; - - /* some extra for later */ - __u64 unused1; - __u64 unused2; - __u64 unused3; - __u64 unused4; -}; - -struct btrfs_ioctl_search_header { - __u64 transid; - __u64 objectid; - __u64 offset; - __u32 type; - __u32 len; -}; - -#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) -/* - * the buf is an array of search headers where - * each header is followed by the actual item - * the type field is expanded to 32 bits for alignment - */ -struct btrfs_ioctl_search_args { - struct btrfs_ioctl_search_key key; - char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; -}; - -struct btrfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_offset, src_length; - __u64 dest_offset; -}; - -/* flags for the defrag range ioctl */ -#define BTRFS_DEFRAG_RANGE_COMPRESS 1 -#define BTRFS_DEFRAG_RANGE_START_IO 2 - -struct btrfs_ioctl_defrag_range_args { - /* start of the defrag operation */ - __u64 start; - - /* number of bytes to defrag, use (u64)-1 to say all */ - __u64 len; - - /* - * flags for the operation, which can include turning - * on compression for this one defrag - */ - __u64 flags; - - /* - * any extent bigger than this will be considered - * already defragged. Use 0 to take the kernel default - * Use 1 to say every single extent must be rewritten - */ - __u32 extent_thresh; - - /* spare for later */ - __u32 unused[5]; -}; - -struct btrfs_ioctl_space_info { - __u64 flags; - __u64 total_bytes; - __u64 used_bytes; -}; - -struct btrfs_ioctl_space_args { - __u64 space_slots; - __u64 total_spaces; - struct btrfs_ioctl_space_info spaces[0]; -}; - -#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ - struct btrfs_ioctl_vol_args) -/* trans start and trans end are dangerous, and only for - * use by applications that know how to avoid the - * resulting deadlocks - */ -#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) -#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) -#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) - -#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) -#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ - struct btrfs_ioctl_vol_args) - -#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ - struct btrfs_ioctl_clone_range_args) - -#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ - struct btrfs_ioctl_defrag_range_args) -#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ - struct btrfs_ioctl_search_args) -#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ - struct btrfs_ioctl_ino_lookup_args) -#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) -#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ - struct btrfs_ioctl_space_args) -#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 6151f2ea38b..5665d214924 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -24,210 +24,259 @@ #include "extent_io.h" #include "locking.h" -static inline void spin_nested(struct extent_buffer *eb) -{ - spin_lock(&eb->lock); -} +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); /* - * Setting a lock to blocking will drop the spinlock and set the - * flag that forces other procs who want the lock to wait. After - * this you can safely schedule with the lock held. + * if we currently have a spinning reader or writer lock + * (indicated by the rw flag) this will bump the count + * of blocking holders and drop the spinlock. */ -void btrfs_set_lock_blocking(struct extent_buffer *eb) +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { - set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); - spin_unlock(&eb->lock); + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + if (rw == BTRFS_WRITE_LOCK) { + if (atomic_read(&eb->blocking_writers) == 0) { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + btrfs_assert_tree_locked(eb); + atomic_inc(&eb->blocking_writers); + write_unlock(&eb->lock); + } + } else if (rw == BTRFS_READ_LOCK) { + btrfs_assert_tree_read_locked(eb); + atomic_inc(&eb->blocking_readers); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + read_unlock(&eb->lock); } - /* exit with the spin lock released and the bit set */ + return; } /* - * clearing the blocking flag will take the spinlock again. - * After this you can't safely schedule + * if we currently have a blocking lock, take the spinlock + * and drop our blocking count */ -void btrfs_clear_lock_blocking(struct extent_buffer *eb) +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { - spin_nested(eb); - clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); - smp_mb__after_clear_bit(); + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + + if (rw == BTRFS_WRITE_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_writers) != 1); + write_lock(&eb->lock); + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + if (atomic_dec_and_test(&eb->blocking_writers) && + waitqueue_active(&eb->write_lock_wq)) + wake_up(&eb->write_lock_wq); + } else if (rw == BTRFS_READ_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_readers) == 0); + read_lock(&eb->lock); + atomic_inc(&eb->spinning_readers); + if (atomic_dec_and_test(&eb->blocking_readers) && + waitqueue_active(&eb->read_lock_wq)) + wake_up(&eb->read_lock_wq); } - /* exit with the spin lock held */ + return; } /* - * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for very long, and often they don't block - * at all. For a dbench 50 run, if we don't spin on the blocking bit - * at all, the context switch rate can jump up to 400,000/sec or more. - * - * So, we're still stuck with this crummy spin on the blocking bit, - * at least until the most common causes of the short blocks - * can be dealt with. + * take a spinning read lock. This will wait for any blocking + * writers */ -static int btrfs_spin_on_block(struct extent_buffer *eb) +void btrfs_tree_read_lock(struct extent_buffer *eb) { - int i; - - for (i = 0; i < 512; i++) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - if (need_resched()) - break; - cpu_relax(); +again: + BUG_ON(!atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner); + + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner) { + /* + * This extent is already write-locked by our thread. We allow + * an additional read lock to be added because it's for the same + * thread. btrfs_find_all_roots() depends on this as it may be + * called on a partly (write-)locked tree. + */ + BUG_ON(eb->lock_nested); + eb->lock_nested = 1; + read_unlock(&eb->lock); + return; } - return 0; + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; + } + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); } /* - * This is somewhat different from trylock. It will take the - * spinlock but if it finds the lock is set to blocking, it will - * return without the lock held. - * - * returns 1 if it was able to take the lock and zero otherwise - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers */ -int btrfs_try_spin_lock(struct extent_buffer *eb) +int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - int i; + if (atomic_read(&eb->blocking_writers)) + return 0; - if (btrfs_spin_on_block(eb)) { - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); - } - /* spin for a bit on the BLOCKING flag */ - for (i = 0; i < 2; i++) { - cpu_relax(); - if (!btrfs_spin_on_block(eb)) - break; - - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); + if (!read_trylock(&eb->lock)) + return 0; + + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + return 0; } - return 0; + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); + return 1; } /* - * the autoremove wake function will return 0 if it tried to wake up - * a process that was already awake, which means that process won't - * count as an exclusive wakeup. The waitq code will continue waking - * procs until it finds one that was actually sleeping. - * - * For btrfs, this isn't quite what we want. We want a single proc - * to be notified that the lock is ready for taking. If that proc - * already happen to be awake, great, it will loop around and try for - * the lock. - * - * So, btrfs_wake_function always returns 1, even when the proc that we - * tried to wake up was already awake. + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers or readers */ -static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, - int sync, void *key) +int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - autoremove_wake_function(wait, mode, sync, key); + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) + return 0; + + if (!write_trylock(&eb->lock)) + return 0; + + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + return 0; + } + atomic_inc(&eb->write_locks); + atomic_inc(&eb->spinning_writers); + eb->lock_owner = current->pid; return 1; } /* - * returns with the extent buffer spinlocked. - * - * This will spin and/or wait as required to take the lock, and then - * return with the spinlock held. - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * drop a spinning read lock */ -int btrfs_tree_lock(struct extent_buffer *eb) +void btrfs_tree_read_unlock(struct extent_buffer *eb) { - DEFINE_WAIT(wait); - wait.func = btrfs_wake_function; - - if (!btrfs_spin_on_block(eb)) - goto sleep; - - while(1) { - spin_nested(eb); - - /* nobody is blocking, exit with the spinlock held */ - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 0; - - /* - * we have the spinlock, but the real owner is blocking. - * wait for them - */ - spin_unlock(&eb->lock); - - /* - * spin for a bit, and if the blocking flag goes away, - * loop around - */ - cpu_relax(); - if (btrfs_spin_on_block(eb)) - continue; -sleep: - prepare_to_wait_exclusive(&eb->lock_wq, &wait, - TASK_UNINTERRUPTIBLE); - - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - schedule(); + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; + } + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + atomic_dec(&eb->read_locks); + read_unlock(&eb->lock); +} - finish_wait(&eb->lock_wq, &wait); +/* + * drop a blocking read lock + */ +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) +{ + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; } - return 0; + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->blocking_readers) == 0); + if (atomic_dec_and_test(&eb->blocking_readers) && + waitqueue_active(&eb->read_lock_wq)) + wake_up(&eb->read_lock_wq); + atomic_dec(&eb->read_locks); } /* - * Very quick trylock, this does not spin or schedule. It returns - * 1 with the spinlock held if it was able to take the lock, or it - * returns zero if it was unable to take the lock. - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * take a spinning write lock. This will wait for both + * blocking readers or writers */ -int btrfs_try_tree_lock(struct extent_buffer *eb) +void btrfs_tree_lock(struct extent_buffer *eb) { - if (spin_trylock(&eb->lock)) { - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { - /* - * we've got the spinlock, but the real owner is - * blocking. Drop the spinlock and return failure - */ - spin_unlock(&eb->lock); - return 0; - } - return 1; +again: + wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + write_lock(&eb->lock); + if (atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + wait_event(eb->read_lock_wq, + atomic_read(&eb->blocking_readers) == 0); + goto again; } - /* someone else has the spinlock giveup */ - return 0; + if (atomic_read(&eb->blocking_writers)) { + write_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; + } + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + atomic_inc(&eb->write_locks); + eb->lock_owner = current->pid; } -int btrfs_tree_unlock(struct extent_buffer *eb) +/* + * drop a spinning or a blocking write lock. + */ +void btrfs_tree_unlock(struct extent_buffer *eb) { - /* - * if we were a blocking owner, we don't have the spinlock held - * just clear the bit and look for waiters - */ - if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - smp_mb__after_clear_bit(); - else - spin_unlock(&eb->lock); - - if (waitqueue_active(&eb->lock_wq)) - wake_up(&eb->lock_wq); - return 0; + int blockers = atomic_read(&eb->blocking_writers); + + BUG_ON(blockers > 1); + + btrfs_assert_tree_locked(eb); + eb->lock_owner = 0; + atomic_dec(&eb->write_locks); + + if (blockers) { + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_dec(&eb->blocking_writers); + smp_mb(); + if (waitqueue_active(&eb->write_lock_wq)) + wake_up(&eb->write_lock_wq); + } else { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + write_unlock(&eb->lock); + } } void btrfs_assert_tree_locked(struct extent_buffer *eb) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - assert_spin_locked(&eb->lock); + BUG_ON(!atomic_read(&eb->write_locks)); +} + +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + BUG_ON(!atomic_read(&eb->read_locks)); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 6c4ce457168..b81e0e9a489 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -19,13 +19,42 @@ #ifndef __BTRFS_LOCKING_ #define __BTRFS_LOCKING_ -int btrfs_tree_lock(struct extent_buffer *eb); -int btrfs_tree_unlock(struct extent_buffer *eb); +#define BTRFS_WRITE_LOCK 1 +#define BTRFS_READ_LOCK 2 +#define BTRFS_WRITE_LOCK_BLOCKING 3 +#define BTRFS_READ_LOCK_BLOCKING 4 -int btrfs_try_tree_lock(struct extent_buffer *eb); -int btrfs_try_spin_lock(struct extent_buffer *eb); +void btrfs_tree_lock(struct extent_buffer *eb); +void btrfs_tree_unlock(struct extent_buffer *eb); -void btrfs_set_lock_blocking(struct extent_buffer *eb); -void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_unlock(struct extent_buffer *eb); +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); void btrfs_assert_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_read_lock(struct extent_buffer *eb); +int btrfs_try_tree_write_lock(struct extent_buffer *eb); + +static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) +{ + if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) + btrfs_tree_unlock(eb); + else if (rw == BTRFS_READ_LOCK_BLOCKING) + btrfs_tree_read_unlock_blocking(eb); + else if (rw == BTRFS_READ_LOCK) + btrfs_tree_read_unlock(eb); + else + BUG(); +} + +static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); +} + +static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ + btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); +} #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c new file mode 100644 index 00000000000..dfad8514f0d --- /dev/null +++ b/fs/btrfs/lzo.c @@ -0,0 +1,429 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/lzo.h> +#include "compression.h" + +#define LZO_LEN 4 + +struct workspace { + void *mem; + void *buf; /* where decompressed data goes */ + void *cbuf; /* where compressed data goes */ + struct list_head list; +}; + +static void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->buf); + vfree(workspace->cbuf); + vfree(workspace->mem); + kfree(workspace); +} + +static struct list_head *lzo_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +static int lzo_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + size_t in_len; + size_t out_len; + char *buf; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long pg_bytes_left; + unsigned long out_offset; + unsigned long bytes; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + /* + * store the size of all chunks of compressed data in + * the first 4 bytes + */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + out_offset = LZO_LEN; + tot_out = LZO_LEN; + pages[0] = out_page; + nr_pages = 1; + pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + /* compress at most one page of data each time */ + in_len = min(len, PAGE_CACHE_SIZE); + while (tot_in < len) { + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, + &out_len, workspace->mem); + if (ret != LZO_E_OK) { + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", + ret); + ret = -EIO; + goto out; + } + + /* store the size of this chunk of compressed data */ + write_compress_length(cpage_out + out_offset, out_len); + tot_out += LZO_LEN; + out_offset += LZO_LEN; + pg_bytes_left -= LZO_LEN; + + tot_in += in_len; + tot_out += out_len; + + /* copy bytes from the working buffer into the pages */ + buf = workspace->cbuf; + while (out_len) { + bytes = min_t(unsigned long, pg_bytes_left, out_len); + + memcpy(cpage_out + out_offset, buf, bytes); + + out_len -= bytes; + pg_bytes_left -= bytes; + buf += bytes; + out_offset += bytes; + + /* + * we need another page for writing out. + * + * Note if there's less than 4 bytes left, we just + * skip to a new page. + */ + if ((out_len == 0 && pg_bytes_left < LZO_LEN) || + pg_bytes_left == 0) { + if (pg_bytes_left) { + memset(cpage_out + out_offset, 0, + pg_bytes_left); + tot_out += pg_bytes_left; + } + + /* we're done, don't allocate new page */ + if (out_len == 0 && tot_in >= len) + break; + + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages++] = out_page; + + pg_bytes_left = PAGE_CACHE_SIZE; + out_offset = 0; + } + } + + /* we're making it bigger, give up */ + if (tot_in > 8192 && tot_in < tot_out) { + ret = -E2BIG; + goto out; + } + + /* we're all done */ + if (tot_in >= len) + break; + + if (tot_out > max_out) + break; + + bytes_left = len - tot_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + in_len = min(bytes_left, PAGE_CACHE_SIZE); + } + + if (tot_out > tot_in) + goto out; + + /* store the size of all chunks of compressed data */ + cpage_out = kmap(pages[0]); + write_compress_length(cpage_out, tot_out); + + kunmap(pages[0]); + + ret = 0; + *total_out = tot_out; + *total_in = tot_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + + return ret; +} + +static int lzo_decompress_biovec(struct list_head *ws, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + char *data_in; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset = 0; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + + size_t in_len; + size_t out_len; + unsigned long in_offset; + unsigned long in_page_bytes_left; + unsigned long tot_in; + unsigned long tot_out; + unsigned long tot_len; + char *buf; + bool may_late_unmap, need_unmap; + + data_in = kmap(pages_in[0]); + tot_len = read_compress_length(data_in); + + tot_in = LZO_LEN; + in_offset = LZO_LEN; + tot_len = min_t(size_t, srclen, tot_len); + in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + tot_out = 0; + pg_offset = 0; + + while (tot_in < tot_len) { + in_len = read_compress_length(data_in + in_offset); + in_page_bytes_left -= LZO_LEN; + in_offset += LZO_LEN; + tot_in += LZO_LEN; + + tot_in += in_len; + working_bytes = in_len; + may_late_unmap = need_unmap = false; + + /* fast path: avoid using the working buffer */ + if (in_page_bytes_left >= in_len) { + buf = data_in + in_offset; + bytes = in_len; + may_late_unmap = true; + goto cont; + } + + /* copy bytes from the pages into the working buffer */ + buf = workspace->cbuf; + buf_offset = 0; + while (working_bytes) { + bytes = min(working_bytes, in_page_bytes_left); + + memcpy(buf + buf_offset, data_in + in_offset, bytes); + buf_offset += bytes; +cont: + working_bytes -= bytes; + in_page_bytes_left -= bytes; + in_offset += bytes; + + /* check if we need to pick another page */ + if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) + || in_page_bytes_left == 0) { + tot_in += in_page_bytes_left; + + if (working_bytes == 0 && tot_in >= tot_len) + break; + + if (page_in_index + 1 >= total_pages_in) { + ret = -EIO; + goto done; + } + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); + + in_page_bytes_left = PAGE_CACHE_SIZE; + in_offset = 0; + } + } + + out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, + &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "BTRFS: decompress failed\n"); + ret = -EIO; + break; + } + + buf_start = tot_out; + tot_out += out_len; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + tot_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) + break; + } +done: + kunmap(pages_in[page_in_index]); + return ret; +} + +static int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t tot_len; + int ret = 0; + char *kaddr; + unsigned long bytes; + + BUG_ON(srclen < LZO_LEN); + + tot_len = read_compress_length(data_in); + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + data_in += LZO_LEN; + + out_len = PAGE_CACHE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "BTRFS: decompress failed!\n"); + ret = -EIO; + goto out; + } + + if (out_len < start_byte) { + ret = -EIO; + goto out; + } + + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_atomic(dest_page); + memcpy(kaddr, workspace->buf + start_byte, bytes); + kunmap_atomic(kaddr); +out: + return ret; +} + +struct btrfs_compress_op btrfs_lzo_compress = { + .alloc_workspace = lzo_alloc_workspace, + .free_workspace = lzo_free_workspace, + .compress_pages = lzo_compress_pages, + .decompress_biovec = lzo_decompress_biovec, + .decompress = lzo_decompress, +}; diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h new file mode 100644 index 00000000000..b7816cefbd1 --- /dev/null +++ b/fs/btrfs/math.h @@ -0,0 +1,44 @@ + +/* + * Copyright (C) 2012 Fujitsu. All rights reserved. + * Written by Miao Xie <miaox@cn.fujitsu.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_MATH_H +#define __BTRFS_MATH_H + +#include <asm/div64.h> + +static inline u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +static inline u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + do_div(num, 100); + return num; +} + +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a127c0ebb2d..7187b14faa6 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -24,6 +24,9 @@ #include "transaction.h" #include "btrfs_inode.h" #include "extent_io.h" +#include "disk-io.h" + +static struct kmem_cache *btrfs_ordered_extent_cache; static u64 entry_end(struct btrfs_ordered_extent *entry) { @@ -59,6 +62,14 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } +static void ordered_data_tree_panic(struct inode *inode, int errno, + u64 offset) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " + "%llu", offset); +} + /* * look for a given offset in the tree, and if it can't be found return the * first lesser offset @@ -124,6 +135,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) return 1; } +static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) +{ + if (file_offset + len <= entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + /* * look find the first ordered struct that has this offset, otherwise * the first one less than this offset @@ -132,7 +152,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, u64 file_offset) { struct rb_root *root = &tree->tree; - struct rb_node *prev; + struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; @@ -161,64 +181,188 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * The tree is given a single reference on the ordered extent that was * inserted. */ -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) +static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int dio, int compress_type) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry; tree = &BTRFS_I(inode)->ordered_tree; - entry = kzalloc(sizeof(*entry), GFP_NOFS); + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); if (!entry) return -ENOMEM; entry->file_offset = file_offset; entry->start = start; entry->len = len; + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && + !(type == BTRFS_ORDERED_NOCOW)) + entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; - entry->inode = inode; + entry->inode = igrab(inode); + entry->compress_type = compress_type; + entry->truncated_len = (u64)-1; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); + if (dio) + set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + /* one ref for the tree */ atomic_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); + INIT_LIST_HEAD(&entry->work_list); + init_completion(&entry->completion); + INIT_LIST_HEAD(&entry->log_list); + + trace_btrfs_ordered_extent_add(inode, entry); - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); - BUG_ON(node); - spin_unlock(&tree->lock); + if (node) + ordered_data_tree_panic(inode, -EEXIST, file_offset); + spin_unlock_irq(&tree->lock); - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, - &BTRFS_I(inode)->root->fs_info->ordered_extents); - spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + &root->ordered_extents); + root->nr_ordered_extents++; + if (root->nr_ordered_extents == 1) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(!list_empty(&root->ordered_root)); + list_add_tail(&root->ordered_root, + &root->fs_info->ordered_roots); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); - BUG_ON(node); return 0; } +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 1, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type); +} + /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. If the list covers more than one * ordered extent, it is split across multiples. */ -int btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum) +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) { struct btrfs_ordered_inode_tree *tree; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - spin_unlock(&tree->lock); - return 0; + WARN_ON(entry->csum_bytes_left < sum->len); + entry->csum_bytes_left -= sum->len; + if (entry->csum_bytes_left == 0) + wake_up(&entry->wait); + spin_unlock_irq(&tree->lock); +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO may span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + * + * file_offset is updated to one byte past the range that is recorded as + * complete. This allows you to walk forward in the file. + */ +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size, int uptodate) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + unsigned long flags; + u64 dec_end; + u64 dec_start; + u64 to_dec; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irqsave(&tree->lock, flags); + node = tree_search(tree, *file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, *file_offset)) { + ret = 1; + goto out; + } + + dec_start = max(*file_offset, entry->file_offset); + dec_end = min(*file_offset + io_size, entry->file_offset + + entry->len); + *file_offset = dec_end; + if (dec_start > dec_end) { + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordering dec_start %llu end %llu", dec_start, dec_end); + } + to_dec = dec_end - dec_start; + if (to_dec > entry->bytes_left) { + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, to_dec); + } + entry->bytes_left -= to_dec; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + + if (entry->bytes_left == 0) { + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { + ret = 1; + } +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock_irqrestore(&tree->lock, flags); + return ret == 0; } /* @@ -232,15 +376,21 @@ int btrfs_add_ordered_sum(struct inode *inode, */ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size) + u64 file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; int ret; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); + if (cached && *cached) { + entry = *cached; + goto have_entry; + } + node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -248,78 +398,178 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +have_entry: if (!offset_in_entry(entry, file_offset)) { ret = 1; goto out; } if (io_size > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - (unsigned long long)entry->bytes_left, - (unsigned long long)io_size); + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, io_size); } entry->bytes_left -= io_size; - if (entry->bytes_left == 0) + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + + if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { ret = 1; + } out: if (!ret && cached && entry) { *cached = entry; atomic_inc(&entry->refs); } - spin_unlock(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } +/* Needs to either be called under a log transaction or the log_mutex */ +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list) +{ + struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_extent *ordered; + struct rb_node *n; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + if (!list_empty(&ordered->log_list)) + continue; + list_add_tail(&ordered->log_list, logged_list); + atomic_inc(&ordered->refs); + } + spin_unlock_irq(&tree->lock); +} + +void btrfs_put_logged_extents(struct list_head *logged_list) +{ + struct btrfs_ordered_extent *ordered; + + while (!list_empty(logged_list)) { + ordered = list_first_entry(logged_list, + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log) +{ + int index = log->log_transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + list_splice_tail(logged_list, &log->logged_list[index]); + spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + struct inode *inode = ordered->inode; + u64 start = ordered->file_offset; + u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(!inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, + &ordered->flags)); + + btrfs_put_ordered_extent(ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + btrfs_put_ordered_extent(ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + /* * used to drop a reference on an ordered extent. This will free * the extent if the last reference is dropped */ -int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { struct list_head *cur; struct btrfs_ordered_sum *sum; + trace_btrfs_ordered_extent_put(entry->inode, entry); + if (atomic_dec_and_test(&entry->refs)) { + if (entry->inode) + btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); list_del(&sum->list); kfree(sum); } - kfree(entry); + kmem_cache_free(btrfs_ordered_extent_cache, entry); } - return 0; } /* * remove an ordered extent from the tree. No references are dropped - * and you must wake_up entry->wait. You must hold the tree lock - * while you call this function. + * and waiters are woken up. */ -static int __btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); - tree->last = NULL; + if (tree->last == node) + tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_unlock_irq(&tree->lock); - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, - inode, 1); - - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); + root->nr_ordered_extents--; + + trace_btrfs_ordered_extent_remove(inode, entry); /* * we have no more ordered extents for this inode and @@ -328,86 +578,112 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, */ if (RB_EMPTY_ROOT(&tree->tree) && !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + spin_lock(&root->fs_info->ordered_root_lock); list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_root_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); - return 0; + if (!root->nr_ordered_extents) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(list_empty(&root->ordered_root)); + list_del_init(&root->ordered_root); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); + wake_up(&entry->wait); } -/* - * remove an ordered extent from the tree. No references are dropped - * but any waiters are woken. - */ -int btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +static void btrfs_run_ordered_extent_work(struct btrfs_work *work) { - struct btrfs_ordered_inode_tree *tree; - int ret; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - ret = __btrfs_remove_ordered_extent(inode, entry); - spin_unlock(&tree->lock); - wake_up(&entry->wait); + struct btrfs_ordered_extent *ordered; - return ret; + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); + btrfs_start_ordered_extent(ordered->inode, ordered, 1); + complete(&ordered->completion); } /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) { - struct list_head splice; - struct list_head *cur; - struct btrfs_ordered_extent *ordered; - struct inode *inode; + struct list_head splice, works; + struct btrfs_ordered_extent *ordered, *next; + int count = 0; INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); + + mutex_lock(&root->ordered_extent_mutex); + spin_lock(&root->ordered_extent_lock); + list_splice_init(&root->ordered_extents, &splice); + while (!list_empty(&splice) && nr) { + ordered = list_first_entry(&splice, struct btrfs_ordered_extent, + root_extent_list); + list_move_tail(&ordered->root_extent_list, + &root->ordered_extents); + atomic_inc(&ordered->refs); + spin_unlock(&root->ordered_extent_lock); - spin_lock(&root->fs_info->ordered_extent_lock); - list_splice_init(&root->fs_info->ordered_extents, &splice); - while (!list_empty(&splice)) { - cur = splice.next; - ordered = list_entry(cur, struct btrfs_ordered_extent, - root_extent_list); - if (nocow_only && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { - list_move(&ordered->root_extent_list, - &root->fs_info->ordered_extents); - cond_resched_lock(&root->fs_info->ordered_extent_lock); - continue; - } + btrfs_init_work(&ordered->flush_work, + btrfs_run_ordered_extent_work, NULL, NULL); + list_add_tail(&ordered->work_list, &works); + btrfs_queue_work(root->fs_info->flush_workers, + &ordered->flush_work); - list_del_init(&ordered->root_extent_list); - atomic_inc(&ordered->refs); + cond_resched(); + spin_lock(&root->ordered_extent_lock); + if (nr != -1) + nr--; + count++; + } + list_splice_tail(&splice, &root->ordered_extents); + spin_unlock(&root->ordered_extent_lock); - /* - * the inode may be getting freed (in sys_unlink path). - */ - inode = igrab(ordered->inode); + list_for_each_entry_safe(ordered, next, &works, work_list) { + list_del_init(&ordered->work_list); + wait_for_completion(&ordered->completion); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + mutex_unlock(&root->ordered_extent_mutex); - spin_unlock(&root->fs_info->ordered_extent_lock); + return count; +} - if (inode) { - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); - } else { - btrfs_put_ordered_extent(ordered); - } +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) +{ + struct btrfs_root *root; + struct list_head splice; + int done; + + INIT_LIST_HEAD(&splice); - spin_lock(&root->fs_info->ordered_extent_lock); + mutex_lock(&fs_info->ordered_operations_mutex); + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice) && nr) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + + done = btrfs_wait_ordered_extents(root, nr); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->ordered_root_lock); + if (nr != -1) { + nr -= done; + WARN_ON(nr < 0); + } } - spin_unlock(&root->fs_info->ordered_extent_lock); - return 0; + list_splice_tail(&splice, &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + mutex_unlock(&fs_info->ordered_operations_mutex); } /* @@ -420,23 +696,26 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, * extra check to make sure the ordered operation list really is empty * before we return */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; + struct btrfs_transaction *cur_trans = trans->transaction; struct list_head splice; + struct list_head works; + struct btrfs_delalloc_work *work, *next; + int ret = 0; INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); -again: - list_splice_init(&root->fs_info->ordered_operations, &splice); - + mutex_lock(&root->fs_info->ordered_extent_flush_mutex); + spin_lock(&root->fs_info->ordered_root_lock); + list_splice_init(&cur_trans->ordered_operations, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); - inode = &btrfs_inode->vfs_inode; list_del_init(&btrfs_inode->ordered_operations); @@ -445,31 +724,41 @@ again: * the inode may be getting freed (in sys_unlink path). */ inode = igrab(inode); + if (!inode) + continue; - if (!wait && inode) { + if (!wait) list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_extent_lock); - - if (inode) { - if (wait) - btrfs_wait_ordered_range(inode, 0, (u64)-1); - else - filemap_flush(inode->i_mapping); - btrfs_add_delayed_iput(inode); + &cur_trans->ordered_operations); + spin_unlock(&root->fs_info->ordered_root_lock); + + work = btrfs_alloc_delalloc_work(inode, wait, 1); + if (!work) { + spin_lock(&root->fs_info->ordered_root_lock); + if (list_empty(&BTRFS_I(inode)->ordered_operations)) + list_add_tail(&btrfs_inode->ordered_operations, + &splice); + list_splice_tail(&splice, + &cur_trans->ordered_operations); + spin_unlock(&root->fs_info->ordered_root_lock); + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); } - if (wait && !list_empty(&root->fs_info->ordered_operations)) - goto again; - - spin_unlock(&root->fs_info->ordered_extent_lock); - mutex_unlock(&root->fs_info->ordered_operations_mutex); - - return 0; + spin_unlock(&root->fs_info->ordered_root_lock); +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + mutex_unlock(&root->fs_info->ordered_extent_flush_mutex); + return ret; } /* @@ -486,12 +775,15 @@ void btrfs_start_ordered_extent(struct inode *inode, u64 start = entry->file_offset; u64 end = start + entry->len - 1; + trace_btrfs_ordered_extent_start(inode, entry); + /* * pages in the range can be dirty, clean or writeback. We * start IO on any dirty ones so the wait doesn't stall waiting - * for pdflush to find them + * for the flusher thread to find them */ - filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -503,11 +795,10 @@ void btrfs_start_ordered_extent(struct inode *inode, */ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { + int ret = 0; u64 end; u64 orig_end; - u64 wait_end; struct btrfs_ordered_extent *ordered; - int found; if (start + len < start) { orig_end = INT_LIMIT(loff_t); @@ -516,23 +807,39 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (orig_end > INT_LIMIT(loff_t)) orig_end = INT_LIMIT(loff_t); } - wait_end = orig_end; -again: + /* start IO across the range first to instantiate any delalloc * extents */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - /* The compression code will leave pages locked but return from - * writepage without setting the page writeback. Starting again - * with WB_SYNC_ALL will end up waiting for the IO to actually start. + ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + if (ret) + return ret; + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - filemap_fdatawait_range(inode->i_mapping, start, orig_end); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + orig_end); + if (ret) + return ret; + } + ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); + if (ret) + return ret; end = orig_end; - found = 0; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) @@ -541,24 +848,20 @@ again: btrfs_put_ordered_extent(ordered); break; } - if (ordered->file_offset + ordered->len < start) { + if (ordered->file_offset + ordered->len <= start) { btrfs_put_ordered_extent(ordered); break; } - found++; btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + ret = -EIO; btrfs_put_ordered_extent(ordered); - if (end == 0 || end == start) + if (ret || end == 0 || end == start) break; end--; } - if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_DELALLOC, 0, NULL)) { - schedule_timeout(1); - goto again; - } - return 0; + return ret; } /* @@ -573,7 +876,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -584,7 +887,48 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (entry) atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); + return entry; +} + +/* Since the DIO code tries to lock a wide area we need to look for any ordered + * extents that exist in the range, rather than just the start of the range. + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + node = tree_search(tree, file_offset + len); + if (!node) + goto out; + } + + while (1) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + break; + + if (entry->file_offset >= file_offset + len) { + entry = NULL; + break; + } + entry = NULL; + node = rb_next(node); + if (!node) + break; + } +out: + if (entry) + atomic_inc(&entry->refs); + spin_unlock_irq(&tree->lock); return entry; } @@ -600,7 +944,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -608,7 +952,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -620,22 +964,24 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered) { struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; u64 disk_i_size; u64 new_i_size; - u64 i_size_test; u64 i_size = i_size_read(inode); struct rb_node *node; struct rb_node *prev = NULL; struct btrfs_ordered_extent *test; int ret = 1; - if (ordered) + spin_lock_irq(&tree->lock); + if (ordered) { offset = entry_end(ordered); - else + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) + offset = min(offset, + ordered->file_offset + + ordered->truncated_len); + } else { offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - - spin_lock(&tree->lock); + } disk_i_size = BTRFS_I(inode)->disk_i_size; /* truncate file */ @@ -649,18 +995,17 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size == i_size || offset <= disk_i_size) { + if (disk_i_size == i_size) goto out; - } /* - * we can't update the disk_isize if there are delalloc bytes - * between disk_i_size and this ordered extent + * We still need to update disk_i_size if outstanding_isize is greater + * than disk_i_size. */ - if (test_range_bit(io_tree, disk_i_size, offset - 1, - EXTENT_DELALLOC, 0, NULL)) { + if (offset <= disk_i_size && + (!ordered || ordered->outstanding_isize <= disk_i_size)) goto out; - } + /* * walk backward from this ordered extent to disk_i_size. * if we find an ordered extent then we can't update disk i_size @@ -681,69 +1026,53 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, } node = prev; } - while (node) { + for (; node; node = rb_prev(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + /* We treat this entry as if it doesnt exist */ + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; if (test->file_offset + test->len <= disk_i_size) break; if (test->file_offset >= i_size) break; - if (test->file_offset >= disk_i_size) + if (entry_end(test) > disk_i_size) { + /* + * we don't update disk_i_size now, so record this + * undealt i_size. Or we will not know the real + * i_size. + */ + if (test->outstanding_isize < offset) + test->outstanding_isize = offset; + if (ordered && + ordered->outstanding_isize > + test->outstanding_isize) + test->outstanding_isize = + ordered->outstanding_isize; goto out; - node = rb_prev(node); + } } new_i_size = min_t(u64, offset, i_size); /* - * at this point, we know we can safely update i_size to at least - * the offset from this ordered extent. But, we need to - * walk forward and see if ios from higher up in the file have - * finished. + * Some ordered extents may completed before the current one, and + * we hold the real i_size in ->outstanding_isize. */ - if (ordered) { - node = rb_next(&ordered->rb_node); - } else { - if (prev) - node = rb_next(prev); - else - node = rb_first(&tree->tree); - } - i_size_test = 0; - if (node) { - /* - * do we have an area where IO might have finished - * between our ordered extent and the next one. - */ - test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset > offset) - i_size_test = test->file_offset; - } else { - i_size_test = i_size; - } - - /* - * i_size_test is the end of a region after this ordered - * extent where there are no ordered extents. As long as there - * are no delalloc bytes in this area, it is safe to update - * disk_i_size to the end of the region. - */ - if (i_size_test > offset && - !test_range_bit(io_tree, offset, i_size_test - 1, - EXTENT_DELALLOC, 0, NULL)) { - new_i_size = min_t(u64, i_size_test, i_size); - } + if (ordered && ordered->outstanding_isize > new_i_size) + new_i_size = min_t(u64, ordered->outstanding_isize, i_size); BTRFS_I(inode)->disk_i_size = new_i_size; ret = 0; out: /* - * we need to remove the ordered extent with the tree lock held - * so that other people calling this function don't find our fully - * processed ordered entry and skip updating the i_size + * We need to do this because we can't remove ordered extents until + * after the i_disk_size has been updated and then the inode has been + * updated to reflect the change, so we need to tell anybody who finds + * this ordered extent that we've already done all the real work, we + * just haven't completed all the other work. */ if (ordered) - __btrfs_remove_ordered_extent(inode, ordered); - spin_unlock(&tree->lock); - if (ordered) - wake_up(&ordered->wait); + set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); + spin_unlock_irq(&tree->lock); return ret; } @@ -753,39 +1082,42 @@ out: * be reclaimed before their checksum is actually put into the btree */ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum) + u32 *sum, int len) { struct btrfs_ordered_sum *ordered_sum; - struct btrfs_sector_sum *sector_sums; struct btrfs_ordered_extent *ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = BTRFS_I(inode)->root->sectorsize; - int ret = 1; + int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); if (!ordered) - return 1; + return 0; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { - if (disk_bytenr >= ordered_sum->bytenr) { - num_sectors = ordered_sum->len / sectorsize; - sector_sums = ordered_sum->sums; - for (i = 0; i < num_sectors; i++) { - if (sector_sums[i].bytenr == disk_bytenr) { - *sum = sector_sums[i].sum; - ret = 0; - goto out; - } - } + if (disk_bytenr >= ordered_sum->bytenr && + disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { + i = (disk_bytenr - ordered_sum->bytenr) >> + inode->i_sb->s_blocksize_bits; + num_sectors = ordered_sum->len >> + inode->i_sb->s_blocksize_bits; + num_sectors = min_t(int, len - index, num_sectors - i); + memcpy(sum + index, ordered_sum->sums + i, + num_sectors); + + index += (int)num_sectors; + if (index == len) + goto out; + disk_bytenr += num_sectors * sectorsize; } } out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); btrfs_put_ordered_extent(ordered); - return ret; + return index; } @@ -801,10 +1133,10 @@ out: * If trans is not null, we'll do a friendly check for a transaction that * is already flushing things and force the IO down ourselves. */ -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) +void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) { + struct btrfs_transaction *cur_trans = trans->transaction; u64 last_mod; last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); @@ -813,24 +1145,31 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, * if this file hasn't been changed since the last transaction * commit, we can safely return without doing anything */ - if (last_mod < root->fs_info->last_trans_committed) - return 0; - - /* - * the transaction is already committing. Just start the IO and - * don't bother with all of this list nonsense - */ - if (trans && root->fs_info->running_transaction->blocked) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - return 0; - } + if (last_mod <= root->fs_info->last_trans_committed) + return; - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); + &cur_trans->ordered_operations); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); +} + +int __init ordered_data_init(void) +{ + btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", + sizeof(struct btrfs_ordered_extent), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_ordered_extent_cache) + return -ENOMEM; return 0; } + +void ordered_data_exit(void) +{ + if (btrfs_ordered_extent_cache) + kmem_cache_destroy(btrfs_ordered_extent_cache); +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c82f76a9f04..246897058ef 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree { struct rb_node *last; }; -/* - * these are used to collect checksums done just before bios submission. - * They are attached via a list into the ordered extent, and - * checksum items are inserted into the tree after all the blocks in - * the ordered extent are on disk - */ -struct btrfs_sector_sum { - /* bytenr on disk */ - u64 bytenr; - u32 sum; -}; - struct btrfs_ordered_sum { /* bytenr is the start of this extent on disk */ u64 bytenr; @@ -45,10 +33,10 @@ struct btrfs_ordered_sum { /* * this is the length in bytes covered by the sums array below. */ - unsigned long len; + int len; struct list_head list; - /* last field is a variable length array of btrfs_sector_sums */ - struct btrfs_sector_sum sums[]; + /* last field is a variable length array of csums */ + u32 sums[]; }; /* @@ -68,10 +56,21 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ + +#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ + +#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent + * has done its due diligence in updating + * the isize. */ +#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered + ordered extent */ +#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -88,9 +87,28 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; + /* number of bytes that still need csumming */ + u64 csum_bytes_left; + + /* + * the end of the ordered extent which is behind it but + * didn't update disk_i_size. Please see the comment of + * btrfs_ordered_update_i_size(); + */ + u64 outstanding_isize; + + /* + * If we get truncated we need to adjust the file extent we enter for + * this ordered extent so that we do not expose stale data. + */ + u64 truncated_len; + /* flags (described above) */ unsigned long flags; + /* compression algorithm */ + int compress_type; + /* reference count */ atomic_t refs; @@ -100,6 +118,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* If we need to wait on this to be done */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -108,8 +129,13 @@ struct btrfs_ordered_extent { /* a per root list of all the pending ordered extents */ struct list_head root_extent_list; -}; + struct btrfs_work work; + + struct completion completion; + struct btrfs_work flush_work; + struct list_head work_list; +}; /* * calculates the total size you need to allocate for an ordered sum @@ -118,11 +144,8 @@ struct btrfs_ordered_extent { static inline int btrfs_ordered_sum_size(struct btrfs_root *root, unsigned long bytes) { - unsigned long num_sectors = (bytes + root->sectorsize - 1) / - root->sectorsize; - num_sectors++; - return sizeof(struct btrfs_ordered_sum) + - num_sectors * sizeof(struct btrfs_sector_sum); + int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize); + return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32); } static inline void @@ -133,17 +156,26 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } -int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -int btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size); + u64 file_offset, u64 io_size, int uptodate); +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size, + int uptodate); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int tyep); -int btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum); + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type); +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct inode *inode, @@ -151,13 +183,27 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); -int btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum, int len); +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int wait); +void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list); +void btrfs_put_logged_extents(struct list_head *logged_list); +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log); +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); +int __init ordered_data_init(void); +void ordered_data_exit(void); #endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 79cba5fbc28..65793edb38c 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret < 0) goto out; + if (ret) { /* JDM: Really? */ + ret = -ENOENT; + goto out; + } ret = btrfs_del_item(trans, root, path); @@ -65,23 +69,3 @@ out: btrfs_free_path(path); return ret; } - -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - key.type = BTRFS_ORPHAN_ITEM_KEY; - key.offset = offset; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - - btrfs_free_path(path); - return ret; -} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0d126be22b6..9626b4ad3b9 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -26,14 +26,12 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) int i; printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " "num_stripes %d\n", - (unsigned long long)btrfs_chunk_length(eb, chunk), - (unsigned long long)btrfs_chunk_owner(eb, chunk), - (unsigned long long)btrfs_chunk_type(eb, chunk), - num_stripes); + btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk), + btrfs_chunk_type(eb, chunk), num_stripes); for (i = 0 ; i < num_stripes ; i++) { printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, - (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), - (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + btrfs_stripe_devid_nr(eb, chunk, i), + btrfs_stripe_offset_nr(eb, chunk, i)); } } static void print_dev_item(struct extent_buffer *eb, @@ -41,22 +39,22 @@ static void print_dev_item(struct extent_buffer *eb, { printk(KERN_INFO "\t\tdev item devid %llu " "total_bytes %llu bytes used %llu\n", - (unsigned long long)btrfs_device_id(eb, dev_item), - (unsigned long long)btrfs_device_total_bytes(eb, dev_item), - (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); + btrfs_device_id(eb, dev_item), + btrfs_device_total_bytes(eb, dev_item), + btrfs_device_bytes_used(eb, dev_item)); } static void print_extent_data_ref(struct extent_buffer *eb, struct btrfs_extent_data_ref *ref) { printk(KERN_INFO "\t\textent data backref root %llu " "objectid %llu offset %llu count %u\n", - (unsigned long long)btrfs_extent_data_ref_root(eb, ref), - (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), - (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_root(eb, ref), + btrfs_extent_data_ref_objectid(eb, ref), + btrfs_extent_data_ref_offset(eb, ref), btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot) +static void print_extent_item(struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -65,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - int type; u32 item_size = btrfs_item_size_nr(eb, slot); u64 flags; u64 offset; @@ -87,19 +84,18 @@ static void print_extent_item(struct extent_buffer *eb, int slot) flags = btrfs_extent_flags(eb, ei); printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", - (unsigned long long)btrfs_extent_refs(eb, ei), - (unsigned long long)btrfs_extent_generation(eb, ei), - (unsigned long long)flags); + btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), + flags); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); - printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " + printk(KERN_INFO "\t\ttree block key (%llu %u %llu) " "level %d\n", - (unsigned long long)btrfs_disk_key_objectid(&key), - key.type, - (unsigned long long)btrfs_disk_key_offset(&key), + btrfs_disk_key_objectid(&key), key.type, + btrfs_disk_key_offset(&key), btrfs_tree_block_level(eb, info)); iref = (struct btrfs_extent_inline_ref *)(info + 1); } else { @@ -115,11 +111,11 @@ static void print_extent_item(struct extent_buffer *eb, int slot) switch (type) { case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref " - "root %llu\n", (unsigned long long)offset); + "root %llu\n", offset); break; case BTRFS_SHARED_BLOCK_REF_KEY: printk(KERN_INFO "\t\tshared block backref " - "parent %llu\n", (unsigned long long)offset); + "parent %llu\n", offset); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -129,8 +125,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot) sref = (struct btrfs_shared_data_ref *)(iref + 1); printk(KERN_INFO "\t\tshared data backref " "parent %llu count %u\n", - (unsigned long long)offset, - btrfs_shared_data_ref_count(eb, sref)); + offset, btrfs_shared_data_ref_count(eb, sref)); break; default: BUG(); @@ -148,18 +143,36 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); printk("\t\textent back ref root %llu gen %llu " "owner %llu num_refs %lu\n", - (unsigned long long)btrfs_ref_root_v0(eb, ref0), - (unsigned long long)btrfs_ref_generation_v0(eb, ref0), - (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), + btrfs_ref_root_v0(eb, ref0), + btrfs_ref_generation_v0(eb, ref0), + btrfs_ref_objectid_v0(eb, ref0), (unsigned long)btrfs_ref_count_v0(eb, ref0)); } #endif +static void print_uuid_item(struct extent_buffer *l, unsigned long offset, + u32 item_size) +{ + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("BTRFS: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + return; + } + while (item_size) { + __le64 subvol_id; + + read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id)); + printk(KERN_INFO "\t\tsubvol_id %llu\n", + (unsigned long long)le64_to_cpu(subvol_id)); + item_size -= sizeof(u64); + offset += sizeof(u64); + } +} + void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 type; - u32 nr = btrfs_header_nritems(l); + u32 type, nr; struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; @@ -172,45 +185,46 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) struct btrfs_key key; struct btrfs_key found_key; - printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", - (unsigned long long)btrfs_header_bytenr(l), nr, - btrfs_leaf_free_space(root, l)); + if (!l) + return; + + nr = btrfs_header_nritems(l); + + btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d", + btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); for (i = 0 ; i < nr ; i++) { - item = btrfs_item_nr(l, i); + item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); type = btrfs_key_type(&key); - printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " "itemsize %d\n", - i, - (unsigned long long)key.objectid, type, - (unsigned long long)key.offset, + i, key.objectid, type, key.offset, btrfs_item_offset(l, item), btrfs_item_size(l, item)); switch (type) { case BTRFS_INODE_ITEM_KEY: ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); printk(KERN_INFO "\t\tinode generation %llu size %llu " "mode %o\n", - (unsigned long long) btrfs_inode_generation(l, ii), - (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_size(l, ii), btrfs_inode_mode(l, ii)); break; case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(l, di, &found_key); printk(KERN_INFO "\t\tdir oid %llu type %u\n", - (unsigned long long)found_key.objectid, + found_key.objectid, btrfs_dir_type(l, di)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", - (unsigned long long) btrfs_disk_root_bytenr(l, ri), btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); break; case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref\n"); @@ -236,22 +250,17 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) BTRFS_FILE_EXTENT_INLINE) { printk(KERN_INFO "\t\tinline extent data " "size %u\n", - btrfs_file_extent_inline_len(l, fi)); + btrfs_file_extent_inline_len(l, i, fi)); break; } printk(KERN_INFO "\t\textent data disk bytenr %llu " "nr %llu\n", - (unsigned long long) btrfs_file_extent_disk_bytenr(l, fi), - (unsigned long long) btrfs_file_extent_disk_num_bytes(l, fi)); printk(KERN_INFO "\t\textent data offset %llu " "nr %llu ram %llu\n", - (unsigned long long) btrfs_file_extent_offset(l, fi), - (unsigned long long) btrfs_file_extent_num_bytes(l, fi), - (unsigned long long) btrfs_file_extent_ram_bytes(l, fi)); break; case BTRFS_EXTENT_REF_V0_KEY: @@ -260,11 +269,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) #else BUG(); #endif + break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); printk(KERN_INFO "\t\tblock group used %llu\n", - (unsigned long long) btrfs_disk_block_group_used(l, bi)); break; case BTRFS_CHUNK_ITEM_KEY: @@ -281,14 +290,22 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" "\t\tchunk objectid %llu chunk offset %llu " "length %llu\n", - (unsigned long long) btrfs_dev_extent_chunk_tree(l, dev_extent), - (unsigned long long) btrfs_dev_extent_chunk_objectid(l, dev_extent), - (unsigned long long) btrfs_dev_extent_chunk_offset(l, dev_extent), - (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); + break; + case BTRFS_DEV_STATS_KEY: + printk(KERN_INFO "\t\tdevice stats\n"); + break; + case BTRFS_DEV_REPLACE_KEY: + printk(KERN_INFO "\t\tdev replace\n"); + break; + case BTRFS_UUID_KEY_SUBVOL: + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + print_uuid_item(l, btrfs_item_ptr_offset(l, i), + btrfs_item_size_nr(l, i)); + break; }; } } @@ -307,18 +324,14 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) btrfs_print_leaf(root, c); return; } - printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", - (unsigned long long)btrfs_header_bytenr(c), - level, nr, - (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u", + btrfs_header_bytenr(c), level, nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", - i, - (unsigned long long)key.objectid, - key.type, - (unsigned long long)key.offset, - (unsigned long long)btrfs_node_blockptr(c, i)); + i, key.objectid, key.type, key.offset, + btrfs_node_blockptr(c, i)); } for (i = 0; i < nr; i++) { struct extent_buffer *next = read_tree_block(root, diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index da75efe534d..7faddfacc5b 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -19,5 +19,5 @@ #ifndef __PRINT_TREE_ #define __PRINT_TREE_ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); -void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c); #endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c new file mode 100644 index 00000000000..129b1dd2852 --- /dev/null +++ b/fs/btrfs/props.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/hashtable.h> +#include "props.h" +#include "btrfs_inode.h" +#include "hash.h" +#include "transaction.h" +#include "xattr.h" + +#define BTRFS_PROP_HANDLERS_HT_BITS 8 +static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); + +struct prop_handler { + struct hlist_node node; + const char *xattr_name; + int (*validate)(const char *value, size_t len); + int (*apply)(struct inode *inode, const char *value, size_t len); + const char *(*extract)(struct inode *inode); + int inheritable; +}; + +static int prop_compression_validate(const char *value, size_t len); +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len); +static const char *prop_compression_extract(struct inode *inode); + +static struct prop_handler prop_handlers[] = { + { + .xattr_name = XATTR_BTRFS_PREFIX "compression", + .validate = prop_compression_validate, + .apply = prop_compression_apply, + .extract = prop_compression_extract, + .inheritable = 1 + }, + { + .xattr_name = NULL + } +}; + +void __init btrfs_props_init(void) +{ + struct prop_handler *p; + + hash_init(prop_handlers_ht); + + for (p = &prop_handlers[0]; p->xattr_name; p++) { + u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); + + hash_add(prop_handlers_ht, &p->node, h); + } +} + +static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) +{ + struct hlist_head *h; + + h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)]; + if (hlist_empty(h)) + return NULL; + + return h; +} + +static const struct prop_handler * +find_prop_handler(const char *name, + const struct hlist_head *handlers) +{ + struct prop_handler *h; + + if (!handlers) { + u64 hash = btrfs_name_hash(name, strlen(name)); + + handlers = find_prop_handlers_by_hash(hash); + if (!handlers) + return NULL; + } + + hlist_for_each_entry(h, handlers, node) + if (!strcmp(h->xattr_name, name)) + return h; + + return NULL; +} + +static int __btrfs_set_prop(struct btrfs_trans_handle *trans, + struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + const struct prop_handler *handler; + int ret; + + if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) + return -EINVAL; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + + if (value_len == 0) { + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + if (ret) + return ret; + + ret = handler->apply(inode, NULL, 0); + ASSERT(ret == 0); + + return ret; + } + + ret = handler->validate(value, value_len); + if (ret) + return ret; + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + value, value_len, flags); + if (ret) + return ret; + ret = handler->apply(inode, value, value_len); + if (ret) { + __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + return ret; + } + + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); + + return 0; +} + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + return __btrfs_set_prop(NULL, inode, name, value, value_len, flags); +} + +static int iterate_object_props(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, + void (*iterator)(void *, + const struct prop_handler *, + const char *, + size_t), + void *ctx) +{ + int ret; + char *name_buf = NULL; + char *value_buf = NULL; + int name_buf_len = 0; + int value_buf_len = 0; + + while (1) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + u32 total_len, cur, this_len; + int slot; + const struct hlist_head *handlers; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid) + break; + if (key.type != BTRFS_XATTR_ITEM_KEY) + break; + + handlers = find_prop_handlers_by_hash(key.offset); + if (!handlers) + goto next_slot; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + cur = 0; + total_len = btrfs_item_size_nr(leaf, slot); + + while (cur < total_len) { + u32 name_len = btrfs_dir_name_len(leaf, di); + u32 data_len = btrfs_dir_data_len(leaf, di); + unsigned long name_ptr, data_ptr; + const struct prop_handler *handler; + + this_len = sizeof(*di) + name_len + data_len; + name_ptr = (unsigned long)(di + 1); + data_ptr = name_ptr + name_len; + + if (name_len <= XATTR_BTRFS_PREFIX_LEN || + memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, + name_ptr, + XATTR_BTRFS_PREFIX_LEN)) + goto next_dir_item; + + if (name_len >= name_buf_len) { + kfree(name_buf); + name_buf_len = name_len + 1; + name_buf = kmalloc(name_buf_len, GFP_NOFS); + if (!name_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, name_buf, name_ptr, name_len); + name_buf[name_len] = '\0'; + + handler = find_prop_handler(name_buf, handlers); + if (!handler) + goto next_dir_item; + + if (data_len > value_buf_len) { + kfree(value_buf); + value_buf_len = data_len; + value_buf = kmalloc(data_len, GFP_NOFS); + if (!value_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, value_buf, data_ptr, data_len); + + iterator(ctx, handler, value_buf, data_len); +next_dir_item: + cur += this_len; + di = (struct btrfs_dir_item *)((char *) di + this_len); + } + +next_slot: + path->slots[0]++; + } + + ret = 0; +out: + btrfs_release_path(path); + kfree(name_buf); + kfree(value_buf); + + return ret; +} + +static void inode_prop_iterator(void *ctx, + const struct prop_handler *handler, + const char *value, + size_t len) +{ + struct inode *inode = ctx; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = handler->apply(inode, value, len); + if (unlikely(ret)) + btrfs_warn(root->fs_info, + "error applying prop %s to ino %llu (root %llu): %d", + handler->xattr_name, btrfs_ino(inode), + root->root_key.objectid, ret); + else + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); +} + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino = btrfs_ino(inode); + int ret; + + ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); + + return ret; +} + +static int inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *parent) +{ + const struct prop_handler *h; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!test_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(parent)->runtime_flags)) + return 0; + + for (h = &prop_handlers[0]; h->xattr_name; h++) { + const char *value; + u64 num_bytes; + + if (!h->inheritable) + continue; + + value = h->extract(parent); + if (!value) + continue; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + num_bytes, BTRFS_RESERVE_NO_FLUSH); + if (ret) + goto out; + ret = __btrfs_set_prop(trans, inode, h->xattr_name, + value, strlen(value), 0); + btrfs_block_rsv_release(root, trans->block_rsv, num_bytes); + if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir) +{ + if (!dir) + return 0; + + return inherit_props(trans, inode, dir); +} + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root) +{ + struct btrfs_key key; + struct inode *parent_inode, *child_inode; + int ret; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, + parent_root, NULL); + if (IS_ERR(parent_inode)) + return PTR_ERR(parent_inode); + + child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + if (IS_ERR(child_inode)) { + iput(parent_inode); + return PTR_ERR(child_inode); + } + + ret = inherit_props(trans, child_inode, parent_inode); + iput(child_inode); + iput(parent_inode); + + return ret; +} + +static int prop_compression_validate(const char *value, size_t len) +{ + if (!strncmp("lzo", value, len)) + return 0; + else if (!strncmp("zlib", value, len)) + return 0; + + return -EINVAL; +} + +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len) +{ + int type; + + if (len == 0) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + + return 0; + } + + if (!strncmp("lzo", value, len)) + type = BTRFS_COMPRESS_LZO; + else if (!strncmp("zlib", value, len)) + type = BTRFS_COMPRESS_ZLIB; + else + return -EINVAL; + + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = type; + + return 0; +} + +static const char *prop_compression_extract(struct inode *inode) +{ + switch (BTRFS_I(inode)->force_compress) { + case BTRFS_COMPRESS_ZLIB: + return "zlib"; + case BTRFS_COMPRESS_LZO: + return "lzo"; + } + + return NULL; +} diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h new file mode 100644 index 00000000000..100f18829d5 --- /dev/null +++ b/fs/btrfs/props.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_PROPS_H +#define __BTRFS_PROPS_H + +#include "ctree.h" + +void __init btrfs_props_init(void); + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags); + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir); + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root); + +#endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c new file mode 100644 index 00000000000..98cb6b2630f --- /dev/null +++ b/fs/btrfs/qgroup.c @@ -0,0 +1,2651 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/btrfs.h> + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "ulist.h" +#include "backref.h" +#include "extent_io.h" +#include "qgroup.h" + +/* TODO XXX FIXME + * - subvol delete -> delete when ref goes to 0? delete limits also? + * - reorganize keys + * - compressed + * - sync + * - copy also limits on subvol creation + * - limit + * - caches fuer ulists + * - performance benchmarks + * - check all ioctl parameters + */ + +/* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + u64 reserved; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + */ + u64 old_refcnt; + u64 new_refcnt; +}; + +/* + * glue structure to represent the relations between qgroups. + */ +struct btrfs_qgroup_list { + struct list_head next_group; + struct list_head next_member; + struct btrfs_qgroup *group; + struct btrfs_qgroup *member; +}; + +#define ptr_to_u64(x) ((u64)(uintptr_t)x) +#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x) + +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags); +static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); + +/* must be called with qgroup_ioctl_lock held */ +static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node *n = fs_info->qgroup_tree.rb_node; + struct btrfs_qgroup *qgroup; + + while (n) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + if (qgroup->qgroupid < qgroupid) + n = n->rb_left; + else if (qgroup->qgroupid > qgroupid) + n = n->rb_right; + else + return qgroup; + } + return NULL; +} + +/* must be called with qgroup_lock held */ +static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node **p = &fs_info->qgroup_tree.rb_node; + struct rb_node *parent = NULL; + struct btrfs_qgroup *qgroup; + + while (*p) { + parent = *p; + qgroup = rb_entry(parent, struct btrfs_qgroup, node); + + if (qgroup->qgroupid < qgroupid) + p = &(*p)->rb_left; + else if (qgroup->qgroupid > qgroupid) + p = &(*p)->rb_right; + else + return qgroup; + } + + qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); + if (!qgroup) + return ERR_PTR(-ENOMEM); + + qgroup->qgroupid = qgroupid; + INIT_LIST_HEAD(&qgroup->groups); + INIT_LIST_HEAD(&qgroup->members); + INIT_LIST_HEAD(&qgroup->dirty); + + rb_link_node(&qgroup->node, parent, p); + rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); + + return qgroup; +} + +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) +{ + struct btrfs_qgroup_list *list; + + list_del(&qgroup->dirty); + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + + while (!list_empty(&qgroup->members)) { + list = list_first_entry(&qgroup->members, + struct btrfs_qgroup_list, next_member); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + kfree(qgroup); +} + +/* must be called with qgroup_lock held */ +static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); + + if (!qgroup) + return -ENOENT; + + rb_erase(&qgroup->node, &fs_info->qgroup_tree); + __del_qgroup_rb(qgroup); + return 0; +} + +/* must be called with qgroup_lock held */ +static int add_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + + list = kzalloc(sizeof(*list), GFP_ATOMIC); + if (!list) + return -ENOMEM; + + list->group = parent; + list->member = member; + list_add_tail(&list->next_group, &member->groups); + list_add_tail(&list->next_member, &parent->members); + + return 0; +} + +/* must be called with qgroup_lock held */ +static int del_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + return 0; + } + } + return -ENOENT; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl) +{ + struct btrfs_qgroup *qgroup; + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) + return -EINVAL; + if (qgroup->rfer != rfer || qgroup->excl != excl) + return -EINVAL; + return 0; +} +#endif + +/* + * The full config is read in one go, only called from open_ctree() + * It doesn't use any locking, as at this point we're still single-threaded + */ +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_path *path = NULL; + struct extent_buffer *l; + int slot; + int ret = 0; + u64 flags = 0; + u64 rescan_progress = 0; + + if (!fs_info->quota_enabled) + return 0; + + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* default this to quota off, in case no status key is found */ + fs_info->qgroup_flags = 0; + + /* + * pass 1: read status, all qgroup infos and limits + */ + key.objectid = 0; + key.type = 0; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); + if (ret) + goto out; + + while (1) { + struct btrfs_qgroup *qgroup; + + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { + struct btrfs_qgroup_status_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_status_item); + + if (btrfs_qgroup_status_version(l, ptr) != + BTRFS_QGROUP_STATUS_VERSION) { + btrfs_err(fs_info, + "old qgroup version, quota disabled"); + goto out; + } + if (btrfs_qgroup_status_generation(l, ptr) != + fs_info->generation) { + flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_err(fs_info, + "qgroup generation mismatch, " + "marked as inconsistent"); + } + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, + ptr); + rescan_progress = btrfs_qgroup_status_rescan(l, ptr); + goto next1; + } + + if (found_key.type != BTRFS_QGROUP_INFO_KEY && + found_key.type != BTRFS_QGROUP_LIMIT_KEY) + goto next1; + + qgroup = find_qgroup_rb(fs_info, found_key.offset); + if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || + (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { + btrfs_err(fs_info, "inconsitent qgroup config"); + flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + if (!qgroup) { + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out; + } + } + switch (found_key.type) { + case BTRFS_QGROUP_INFO_KEY: { + struct btrfs_qgroup_info_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_info_item); + qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); + qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); + qgroup->excl = btrfs_qgroup_info_excl(l, ptr); + qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); + /* generation currently unused */ + break; + } + case BTRFS_QGROUP_LIMIT_KEY: { + struct btrfs_qgroup_limit_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_limit_item); + qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); + qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); + qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); + qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); + qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); + break; + } + } +next1: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } + btrfs_release_path(path); + + /* + * pass 2: read all qgroup relations + */ + key.objectid = 0; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); + if (ret) + goto out; + while (1) { + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type != BTRFS_QGROUP_RELATION_KEY) + goto next2; + + if (found_key.objectid > found_key.offset) { + /* parent <- member, not needed to build config */ + /* FIXME should we omit the key completely? */ + goto next2; + } + + ret = add_relation_rb(fs_info, found_key.objectid, + found_key.offset); + if (ret == -ENOENT) { + btrfs_warn(fs_info, + "orphan qgroup relation 0x%llx->0x%llx", + found_key.objectid, found_key.offset); + ret = 0; /* ignore the error */ + } + if (ret) + goto out; +next2: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } +out: + fs_info->qgroup_flags |= flags; + if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && + ret >= 0) { + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); + } + btrfs_free_path(path); + + if (ret < 0) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + } + + return ret < 0 ? ret : 0; +} + +/* + * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), + * first two are in single-threaded paths.And for the third one, we have set + * quota_root to be null with qgroup_lock held before, so it is safe to clean + * up the in-memory structures without qgroup_lock held. + */ +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + + while ((n = rb_first(&fs_info->qgroup_tree))) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + rb_erase(n, &fs_info->qgroup_tree); + __del_qgroup_rb(qgroup); + } + /* + * we call btrfs_free_qgroup_config() when umounting + * filesystem and disabling quota, so we set qgroup_ulit + * to be null here to avoid double free. + */ + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; +} + +static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, + u64 src, u64 dst) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, + u64 src, u64 dst) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); +out: + btrfs_free_path(path); + return ret; +} + +static int add_qgroup_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, u64 qgroupid) +{ + int ret; + struct btrfs_path *path; + struct btrfs_qgroup_info_item *qgroup_info; + struct btrfs_qgroup_limit_item *qgroup_limit; + struct extent_buffer *leaf; + struct btrfs_key key; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, "a_root->state))) + return 0; +#endif + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_info)); + if (ret) + goto out; + + leaf = path->nodes[0]; + qgroup_info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); + + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_limit)); + if (ret) + goto out; + + leaf = path->nodes[0]; + qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); + + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, u64 qgroupid) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 qgroupid, + u64 flags, u64 max_rfer, u64 max_excl, + u64 rsv_rfer, u64 rsv_excl) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_limit_item *qgroup_limit; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_LIMIT_KEY; + key.offset = qgroupid; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); + btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); + btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); + btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer); + btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_info_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_qgroup *qgroup) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_info_item *qgroup_info; + int ret; + int slot; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroup->qgroupid; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); + btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); + btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); + btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_status_item(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_status_item *ptr; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_generation(l, ptr, trans->transid); + btrfs_set_qgroup_status_rescan(l, ptr, + fs_info->qgroup_rescan_progress.objectid); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called with qgroup_lock held + */ +static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf = NULL; + int ret; + int nr = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + key.objectid = 0; + key.offset = 0; + key.type = 0; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + leaf = path->nodes[0]; + nr = btrfs_header_nritems(leaf); + if (!nr) + break; + /* + * delete the leaf one by one + * since the whole tree is going + * to be deleted. + */ + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); + if (ret) + goto out; + + btrfs_release_path(path); + } + ret = 0; +out: + root->fs_info->pending_quota_state = 0; + btrfs_free_path(path); + return ret; +} + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_path *path = NULL; + struct btrfs_qgroup_status_item *ptr; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_qgroup *qgroup = NULL; + int ret = 0; + int slot; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (fs_info->quota_root) { + fs_info->pending_quota_state = 1; + goto out; + } + + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + + /* + * initially create the quota tree + */ + quota_root = btrfs_create_tree(trans, fs_info, + BTRFS_QUOTA_TREE_OBJECTID); + if (IS_ERR(quota_root)) { + ret = PTR_ERR(quota_root); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out_free_root; + } + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*ptr)); + if (ret) + goto out_free_path; + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); + btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_rescan(leaf, ptr, 0); + + btrfs_mark_buffer_dirty(leaf); + + key.objectid = 0; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = 0; + + btrfs_release_path(path); + ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); + if (ret > 0) + goto out_add_root; + if (ret < 0) + goto out_free_path; + + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.type == BTRFS_ROOT_REF_KEY) { + ret = add_qgroup_item(trans, quota_root, + found_key.offset); + if (ret) + goto out_free_path; + + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out_free_path; + } + } + ret = btrfs_next_item(tree_root, path); + if (ret < 0) + goto out_free_path; + if (ret) + break; + } + +out_add_root: + btrfs_release_path(path); + ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); + if (ret) + goto out_free_path; + + qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out_free_path; + } + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + fs_info->pending_quota_state = 1; + spin_unlock(&fs_info->qgroup_lock); +out_free_path: + btrfs_free_path(path); +out_free_root: + if (ret) { + free_extent_buffer(quota_root->node); + free_extent_buffer(quota_root->commit_root); + kfree(quota_root); + } +out: + if (ret) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *quota_root; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) + goto out; + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + quota_root = fs_info->quota_root; + fs_info->quota_root = NULL; + spin_unlock(&fs_info->qgroup_lock); + + btrfs_free_qgroup_config(fs_info); + + ret = btrfs_clean_quota_tree(trans, quota_root); + if (ret) + goto out; + + ret = btrfs_del_root(trans, tree_root, "a_root->root_key); + if (ret) + goto out; + + list_del("a_root->dirty_list); + + btrfs_tree_lock(quota_root->node); + clean_tree_block(trans, tree_root, quota_root->node); + btrfs_tree_unlock(quota_root->node); + btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); + + free_extent_buffer(quota_root->node); + free_extent_buffer(quota_root->commit_root); + kfree(quota_root); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +static void qgroup_dirty(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + if (list_empty(&qgroup->dirty)) + list_add(&qgroup->dirty, &fs_info->dirty_qgroups); +} + +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + if (!member || !parent) { + ret = -EINVAL; + goto out; + } + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + ret = -EEXIST; + goto out; + } + } + + ret = add_qgroup_relation_item(trans, quota_root, src, dst); + if (ret) + goto out; + + ret = add_qgroup_relation_item(trans, quota_root, dst, src); + if (ret) { + del_qgroup_relation_item(trans, quota_root, src, dst); + goto out; + } + + spin_lock(&fs_info->qgroup_lock); + ret = add_relation_rb(quota_root->fs_info, src, dst); + spin_unlock(&fs_info->qgroup_lock); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; + int ret = 0; + int err; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + if (!member || !parent) { + ret = -EINVAL; + goto out; + } + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) + goto exist; + } + ret = -ENOENT; + goto out; +exist: + ret = del_qgroup_relation_item(trans, quota_root, src, dst); + err = del_qgroup_relation_item(trans, quota_root, dst, src); + if (err && !ret) + ret = err; + + spin_lock(&fs_info->qgroup_lock); + del_relation_rb(fs_info, src, dst); + spin_unlock(&fs_info->qgroup_lock); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, char *name) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + ret = -EEXIST; + goto out; + } + + ret = add_qgroup_item(trans, quota_root, qgroupid); + if (ret) + goto out; + + spin_lock(&fs_info->qgroup_lock); + qgroup = add_qgroup_rb(fs_info, qgroupid); + spin_unlock(&fs_info->qgroup_lock); + + if (IS_ERR(qgroup)) + ret = PTR_ERR(qgroup); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto out; + } else { + /* check if there are no relations to this qgroup */ + if (!list_empty(&qgroup->groups) || + !list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; + } + } + ret = del_qgroup_item(trans, quota_root, qgroupid); + + spin_lock(&fs_info->qgroup_lock); + del_qgroup_rb(quota_root->fs_info, qgroupid); + spin_unlock(&fs_info->qgroup_lock); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto out; + } + ret = update_qgroup_limit_item(trans, quota_root, qgroupid, + limit->flags, limit->max_rfer, + limit->max_excl, limit->rsv_rfer, + limit->rsv_excl); + if (ret) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_info(fs_info, "unable to update quota limit for %llu", + qgroupid); + } + + spin_lock(&fs_info->qgroup_lock); + qgroup->lim_flags = limit->flags; + qgroup->max_rfer = limit->max_rfer; + qgroup->max_excl = limit->max_excl; + qgroup->rsv_rfer = limit->rsv_rfer; + qgroup->rsv_excl = limit->rsv_excl; + spin_unlock(&fs_info->qgroup_lock); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} +static int comp_oper(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->seq < oper2->seq) + return -1; + if (oper1->seq > oper2->seq) + return -1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + if (oper1->type < oper2->type) + return -1; + if (oper1->type > oper2->type) + return 1; + return 0; +} + +static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + p = &fs_info->qgroup_op_tree.rb_node; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct btrfs_qgroup_operation, n); + cmp = comp_oper(cur, oper); + if (cmp < 0) { + p = &(*p)->rb_right; + } else if (cmp) { + p = &(*p)->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + rb_link_node(&oper->n, parent, p); + rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} + +/* + * Record a quota operation for processing later on. + * @trans: the transaction we are adding the delayed op to. + * @fs_info: the fs_info for this fs. + * @ref_root: the root of the reference we are acting on, + * @bytenr: the bytenr we are acting on. + * @num_bytes: the number of bytes in the reference. + * @type: the type of operation this is. + * @mod_seq: do we need to get a sequence number for looking up roots. + * + * We just add it to our trans qgroup_ref_list and carry on and process these + * operations in order at some later point. If the reference root isn't a fs + * root then we don't bother with doing anything. + * + * MUST BE HOLDING THE REF LOCK. + */ +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, int mod_seq) +{ + struct btrfs_qgroup_operation *oper; + int ret; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + return 0; + + oper = kmalloc(sizeof(*oper), GFP_NOFS); + if (!oper) + return -ENOMEM; + + oper->ref_root = ref_root; + oper->bytenr = bytenr; + oper->num_bytes = num_bytes; + oper->type = type; + oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); + INIT_LIST_HEAD(&oper->elem.list); + oper->elem.seq = 0; + ret = insert_qgroup_oper(fs_info, oper); + if (ret) { + /* Shouldn't happen so have an assert for developers */ + ASSERT(0); + kfree(oper); + return ret; + } + list_add_tail(&oper->list, &trans->qgroup_ref_list); + + if (mod_seq) + btrfs_get_tree_mod_seq(fs_info, &oper->elem); + + return 0; +} + +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. + */ +static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct btrfs_qgroup *qgroup; + struct ulist *tmp; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int sign = 0; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) + goto out; + qgroup = find_qgroup_rb(fs_info, oper->ref_root); + if (!qgroup) + goto out; + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + sign = 1; + break; + case BTRFS_QGROUP_OPER_SUB_EXCL: + sign = -1; + break; + default: + ASSERT(0); + } + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); + qgroup->excl += sign * oper->num_bytes; + qgroup->excl_cmpr += sign * oper->num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + qgroup->excl += sign * oper->num_bytes; + if (sign < 0) + WARN_ON(qgroup->excl < oper->num_bytes); + qgroup->excl_cmpr += sign * oper->num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(tmp); + return ret; +} + +/* + * Walk all of the roots that pointed to our bytenr and adjust their refcnts as + * properly. + */ +static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, + u64 root_to_skip, struct ulist *tmp, + struct ulist *roots, struct ulist *qgroups, + u64 seq, int *old_roots, int rescan) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + struct btrfs_qgroup *qg; + int ret; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + /* We don't count our current root here */ + if (unode->val == root_to_skip) + continue; + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + /* + * We could have a pending removal of this same ref so we may + * not have actually found our ref root when doing + * btrfs_find_all_roots, so we need to keep track of how many + * old roots we find in case we removed ours and added a + * different one at the same time. I don't think this could + * happen in practice but that sort of thinking leads to pain + * and suffering and to the dark side. + */ + (*old_roots)++; + + ulist_reinit(tmp); + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC); + if (ret < 0) + return ret; + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(tmp_unode->aux); + /* + * We use this sequence number to keep from having to + * run the whole list and 0 out the refcnt every time. + * We basically use sequnce as the known 0 count and + * then add 1 everytime we see a qgroup. This is how we + * get how many of the roots actually point up to the + * upper level qgroups in order to determine exclusive + * counts. + * + * For rescan we want to set old_refcnt to seq so our + * exclusive calculations end up correct. + */ + if (rescan) + qg->old_refcnt = seq; + else if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + } + return 0; +} + +/* + * We need to walk forward in our operation tree and account for any roots that + * were deleted after we made this operation. + */ +static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct ulist *tmp, + struct ulist *qgroups, u64 seq, + int *old_roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct btrfs_qgroup_operation *tmp_oper; + struct rb_node *n; + int ret; + + ulist_reinit(tmp); + + /* + * We only walk forward in the tree since we're only interested in + * removals that happened _after_ our operation. + */ + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + return 0; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + while (tmp_oper->bytenr == oper->bytenr) { + /* + * If it's not a removal we don't care, additions work out + * properly with our refcnt tracking. + */ + if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && + tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) + goto next; + qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); + if (!qg) + goto next; + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret) { + if (ret < 0) + return ret; + /* + * We only want to increase old_roots if this qgroup is + * not already in the list of qgroups. If it is already + * there then that means it must have been re-added or + * the delete will be discarded because we had an + * existing ref that we haven't looked up yet. In this + * case we don't want to increase old_roots. So if ret + * == 1 then we know that this is the first time we've + * seen this qgroup and we can bump the old_roots. + */ + (*old_roots)++; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + } +next: + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&tmp_oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + break; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + } + + /* Ok now process the qgroups we found */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* Add refcnt for the newly added reference. */ +static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct btrfs_qgroup *qgroup, + struct ulist *tmp, struct ulist *qgroups, + u64 seq) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + int ret; + + ulist_reinit(tmp); + ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); + if (ret < 0) + return ret; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + } else { + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + } + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * This adjusts the counters for all referenced qgroups if need be. + */ +static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, + u64 root_to_skip, u64 num_bytes, + struct ulist *qgroups, u64 seq, + int old_roots, int new_roots, int rescan) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + u64 cur_new_count, cur_old_count; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(qgroups, &uiter))) { + bool dirty = false; + + qg = u64_to_ptr(unode->aux); + /* + * Wasn't referenced before but is now, add to the reference + * counters. + */ + if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; + dirty = true; + } + + /* + * Was referenced before but isn't now, subtract from the + * reference counters. + */ + if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + qg->rfer -= num_bytes; + qg->rfer_cmpr -= num_bytes; + dirty = true; + } + + if (qg->old_refcnt < seq) + cur_old_count = 0; + else + cur_old_count = qg->old_refcnt - seq; + if (qg->new_refcnt < seq) + cur_new_count = 0; + else + cur_new_count = qg->new_refcnt - seq; + + /* + * If our refcount was the same as the roots previously but our + * new count isn't the same as the number of roots now then we + * went from having a exclusive reference on this range to not. + */ + if (old_roots && cur_old_count == old_roots && + (cur_new_count != new_roots || new_roots == 0)) { + WARN_ON(cur_new_count != new_roots && new_roots == 0); + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + + /* + * If we didn't reference all the roots before but now we do we + * have an exclusive reference to this range. + */ + if ((!old_roots || (old_roots && cur_old_count != old_roots)) + && cur_new_count == new_roots) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + + if (dirty) + qgroup_dirty(fs_info, qg); + } + return 0; +} + +/* + * If we removed a data extent and there were other references for that bytenr + * then we need to lookup all referenced roots to make sure we still don't + * reference this bytenr. If we do then we can just discard this operation. + */ +static int check_existing_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + oper->elem.seq, &roots); + if (ret < 0) + return ret; + ret = 0; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + if (unode->val == oper->ref_root) { + ret = 1; + break; + } + } + ulist_free(roots); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); + + return ret; +} + +/* + * If we share a reference across multiple roots then we may need to adjust + * various qgroups referenced and exclusive counters. The basic premise is this + * + * 1) We have seq to represent a 0 count. Instead of looping through all of the + * qgroups and resetting their refcount to 0 we just constantly bump this + * sequence number to act as the base reference count. This means that if + * anybody is equal to or below this sequence they were never referenced. We + * jack this sequence up by the number of roots we found each time in order to + * make sure we don't have any overlap. + * + * 2) We first search all the roots that reference the area _except_ the root + * we're acting on currently. This makes up the old_refcnt of all the qgroups + * before. + * + * 3) We walk all of the qgroups referenced by the root we are currently acting + * on, and will either adjust old_refcnt in the case of a removal or the + * new_refcnt in the case of an addition. + * + * 4) Finally we walk all the qgroups that are referenced by this range + * including the root we are acting on currently. We will adjust the counters + * based on the number of roots we had and will have after this operation. + * + * Take this example as an illustration + * + * [qgroup 1/0] + * / | \ + * [qg 0/0] [qg 0/1] [qg 0/2] + * \ | / + * [ extent ] + * + * Say we are adding a reference that is covered by qg 0/0. The first step + * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with + * old_roots being 2. Because it is adding new_roots will be 1. We then go + * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's + * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we + * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a + * reference and thus must add the size to the referenced bytes. Everything + * else is the same so nothing else changes. + */ +static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist *qgroups, *tmp; + struct btrfs_qgroup *qgroup; + struct seq_list elem = {}; + u64 seq; + int old_roots = 0; + int new_roots = 0; + int ret = 0; + + if (oper->elem.seq) { + ret = check_existing_refs(trans, fs_info, oper); + if (ret < 0) + return ret; + if (ret) + return 0; + } + + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + return -ENOMEM; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) { + ulist_free(qgroups); + return -ENOMEM; + } + + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, + &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) { + ulist_free(qgroups); + ulist_free(tmp); + return ret; + } + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, oper->ref_root); + if (!qgroup) + goto out; + seq = fs_info->qgroup_seq; + + /* + * So roots is the list of all the roots currently pointing at the + * bytenr, including the ref we are adding if we are adding, or not if + * we are removing a ref. So we pass in the ref_root to skip that root + * in our calculations. We set old_refnct and new_refcnt cause who the + * hell knows what everything looked like before, and it doesn't matter + * except... + */ + ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, + seq, &old_roots, 0); + if (ret < 0) + goto out; + + /* + * Now adjust the refcounts of the qgroups that care about this + * reference, either the old_count in the case of removal or new_count + * in the case of an addition. + */ + ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, + seq); + if (ret < 0) + goto out; + + /* + * ...in the case of removals. If we had a removal before we got around + * to processing this operation then we need to find that guy and count + * his references as if they really existed so we don't end up screwing + * up the exclusive counts. Then whenever we go to process the delete + * everything will be grand and we can account for whatever exclusive + * changes need to be made there. We also have to pass in old_roots so + * we have an accurate count of the roots as it pertains to this + * operations view of the world. + */ + ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, + &old_roots); + if (ret < 0) + goto out; + + /* + * We are adding our root, need to adjust up the number of roots, + * otherwise old_roots is the number of roots we want. + */ + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + new_roots = old_roots + 1; + } else { + new_roots = old_roots; + old_roots++; + } + fs_info->qgroup_seq += old_roots + 1; + + + /* + * And now the magic happens, bless Arne for having a pretty elegant + * solution for this. + */ + qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, + qgroups, seq, old_roots, new_roots, 0); +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(qgroups); + ulist_free(roots); + ulist_free(tmp); + return ret; +} + +/* + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted + * from the fs. First, all roots referencing the extent are searched, and + * then the space is accounted accordingly to the different roots. The + * accounting algorithm works in 3 steps documented inline. + */ +static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + BUG_ON(!fs_info->quota_root); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return 0; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + ASSERT(is_fstree(oper->ref_root)); + + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + case BTRFS_QGROUP_OPER_SUB_EXCL: + ret = qgroup_excl_accounting(fs_info, oper); + break; + case BTRFS_QGROUP_OPER_ADD_SHARED: + case BTRFS_QGROUP_OPER_SUB_SHARED: + ret = qgroup_shared_accounting(trans, fs_info, oper); + break; + default: + ASSERT(0); + } + return ret; +} + +/* + * Needs to be called everytime we run delayed refs, even if there is an error + * in order to cleanup outstanding operations. + */ +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup_operation *oper; + int ret = 0; + + while (!list_empty(&trans->qgroup_ref_list)) { + oper = list_first_entry(&trans->qgroup_ref_list, + struct btrfs_qgroup_operation, list); + list_del_init(&oper->list); + if (!ret || !trans->aborted) + ret = btrfs_qgroup_account(trans, fs_info, oper); + spin_lock(&fs_info->qgroup_op_lock); + rb_erase(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); + kfree(oper); + } + return ret; +} + +/* + * called from commit_transaction. Writes all changed qgroups to disk. + */ +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root = fs_info->quota_root; + int ret = 0; + int start_rescan_worker = 0; + + if (!quota_root) + goto out; + + if (!fs_info->quota_enabled && fs_info->pending_quota_state) + start_rescan_worker = 1; + + fs_info->quota_enabled = fs_info->pending_quota_state; + + spin_lock(&fs_info->qgroup_lock); + while (!list_empty(&fs_info->dirty_qgroups)) { + struct btrfs_qgroup *qgroup; + qgroup = list_first_entry(&fs_info->dirty_qgroups, + struct btrfs_qgroup, dirty); + list_del_init(&qgroup->dirty); + spin_unlock(&fs_info->qgroup_lock); + ret = update_qgroup_info_item(trans, quota_root, qgroup); + if (ret) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + spin_lock(&fs_info->qgroup_lock); + } + if (fs_info->quota_enabled) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; + else + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + ret = update_qgroup_status_item(trans, fs_info, quota_root); + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + + if (!ret && start_rescan_worker) { + ret = qgroup_rescan_init(fs_info, 0, 1); + if (!ret) { + qgroup_rescan_zero_tracking(fs_info); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } + ret = 0; + } + +out: + + return ret; +} + +/* + * copy the acounting information between qgroups. This is necessary when a + * snapshot or a subvolume is created + */ +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit) +{ + int ret = 0; + int i; + u64 *i_qgroups; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_qgroup *srcgroup; + struct btrfs_qgroup *dstgroup; + u32 level_size = 0; + u64 nums; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_enabled) + goto out; + + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + + 2 * inherit->num_excl_copies; + for (i = 0; i < nums; ++i) { + srcgroup = find_qgroup_rb(fs_info, *i_qgroups); + if (!srcgroup) { + ret = -EINVAL; + goto out; + } + ++i_qgroups; + } + } + + /* + * create a tracking group for the subvol itself + */ + ret = add_qgroup_item(trans, quota_root, objectid); + if (ret) + goto out; + + if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { + ret = update_qgroup_limit_item(trans, quota_root, objectid, + inherit->lim.flags, + inherit->lim.max_rfer, + inherit->lim.max_excl, + inherit->lim.rsv_rfer, + inherit->lim.rsv_excl); + if (ret) + goto out; + } + + if (srcid) { + struct btrfs_root *srcroot; + struct btrfs_key srckey; + int srcroot_level; + + srckey.objectid = srcid; + srckey.type = BTRFS_ROOT_ITEM_KEY; + srckey.offset = (u64)-1; + srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey); + if (IS_ERR(srcroot)) { + ret = PTR_ERR(srcroot); + goto out; + } + + rcu_read_lock(); + srcroot_level = btrfs_header_level(srcroot->node); + level_size = btrfs_level_size(srcroot, srcroot_level); + rcu_read_unlock(); + } + + /* + * add qgroup to all inherited groups + */ + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i) { + ret = add_qgroup_relation_item(trans, quota_root, + objectid, *i_qgroups); + if (ret) + goto out; + ret = add_qgroup_relation_item(trans, quota_root, + *i_qgroups, objectid); + if (ret) + goto out; + ++i_qgroups; + } + } + + + spin_lock(&fs_info->qgroup_lock); + + dstgroup = add_qgroup_rb(fs_info, objectid); + if (IS_ERR(dstgroup)) { + ret = PTR_ERR(dstgroup); + goto unlock; + } + + if (srcid) { + srcgroup = find_qgroup_rb(fs_info, srcid); + if (!srcgroup) + goto unlock; + + /* + * We call inherit after we clone the root in order to make sure + * our counts don't go crazy, so at this point the only + * difference between the two roots should be the root node. + */ + dstgroup->rfer = srcgroup->rfer; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; + dstgroup->excl = level_size; + dstgroup->excl_cmpr = level_size; + srcgroup->excl = level_size; + srcgroup->excl_cmpr = level_size; + qgroup_dirty(fs_info, dstgroup); + qgroup_dirty(fs_info, srcgroup); + } + + if (!inherit) + goto unlock; + + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i) { + ret = add_relation_rb(quota_root->fs_info, objectid, + *i_qgroups); + if (ret) + goto unlock; + ++i_qgroups; + } + + for (i = 0; i < inherit->num_ref_copies; ++i) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->rfer = src->rfer - level_size; + dst->rfer_cmpr = src->rfer_cmpr - level_size; + i_qgroups += 2; + } + for (i = 0; i < inherit->num_excl_copies; ++i) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->excl = src->excl + level_size; + dst->excl_cmpr = src->excl_cmpr + level_size; + i_qgroups += 2; + } + +unlock: + spin_unlock(&fs_info->qgroup_lock); +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; +} + +/* + * reserve some space for a qgroup and all its parents. The reservation takes + * place with start_transaction or dealloc_reserve, similar to ENOSPC + * accounting. If not enough space is available, EDQUOT is returned. + * We assume that the requested space is new for all qgroups. + */ +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 ref_root = root->root_key.objectid; + int ret = 0; + struct ulist_node *unode; + struct ulist_iterator uiter; + + if (!is_fstree(ref_root)) + return 0; + + if (num_bytes == 0) + return 0; + + spin_lock(&fs_info->qgroup_lock); + quota_root = fs_info->quota_root; + if (!quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + /* + * in a first step, we check all affected qgroups if any limits would + * be exceeded + */ + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, + (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) + goto out; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && + qg->reserved + (s64)qg->rfer + num_bytes > + qg->max_rfer) { + ret = -EDQUOT; + goto out; + } + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && + qg->reserved + (s64)qg->excl + num_bytes > + qg->max_excl) { + ret = -EDQUOT; + goto out; + } + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; + /* + * no limits exceeded, now record the reservation into all qgroups + */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { + struct btrfs_qgroup *qg; + + qg = u64_to_ptr(unode->aux); + + qg->reserved += num_bytes; + } + +out: + spin_unlock(&fs_info->qgroup_lock); + return ret; +} + +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist_node *unode; + struct ulist_iterator uiter; + u64 ref_root = root->root_key.objectid; + int ret = 0; + + if (!is_fstree(ref_root)) + return; + + if (num_bytes == 0) + return; + + spin_lock(&fs_info->qgroup_lock); + + quota_root = fs_info->quota_root; + if (!quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, + (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) + goto out; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + + qg->reserved -= num_bytes; + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + +out: + spin_unlock(&fs_info->qgroup_lock); +} + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) +{ + if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) + return; + btrfs_err(trans->root->fs_info, + "qgroups not uptodate in trans handle %p: list is%s empty, " + "seq is %#x.%x", + trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", + (u32)(trans->delayed_ref_elem.seq >> 32), + (u32)trans->delayed_ref_elem.seq); + BUG(); +} + +/* + * returns < 0 on error, 0 when more leafs are to be scanned. + * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. + */ +static int +qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct btrfs_trans_handle *trans, struct ulist *qgroups, + struct ulist *tmp, struct extent_buffer *scratch_leaf) +{ + struct btrfs_key found; + struct ulist *roots = NULL; + struct seq_list tree_mod_seq_elem = {}; + u64 num_bytes; + u64 seq; + int new_roots; + int slot; + int ret; + + path->leave_spinning = 1; + mutex_lock(&fs_info->qgroup_rescan_lock); + ret = btrfs_search_slot_for_read(fs_info->extent_root, + &fs_info->qgroup_rescan_progress, + path, 1, 0); + + pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n", + fs_info->qgroup_rescan_progress.objectid, + fs_info->qgroup_rescan_progress.type, + fs_info->qgroup_rescan_progress.offset, ret); + + if (ret) { + /* + * The rescan is about to end, we will not be scanning any + * further blocks. We cannot unset the RESCAN flag here, because + * we want to commit the transaction if everything went well. + * To make the live accounting work in this phase, we set our + * scan progress pointer such that every real extent objectid + * will be smaller. + */ + fs_info->qgroup_rescan_progress.objectid = (u64)-1; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + return ret; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found, + btrfs_header_nritems(path->nodes[0]) - 1); + fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; + + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); + slot = path->slots[0]; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); + if (found.type != BTRFS_EXTENT_ITEM_KEY && + found.type != BTRFS_METADATA_ITEM_KEY) + continue; + if (found.type == BTRFS_METADATA_ITEM_KEY) + num_bytes = fs_info->extent_root->leafsize; + else + num_bytes = found.offset; + + ulist_reinit(qgroups); + ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, + &roots); + if (ret < 0) + goto out; + spin_lock(&fs_info->qgroup_lock); + seq = fs_info->qgroup_seq; + fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + + new_roots = 0; + ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, + seq, &new_roots, 1); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + + ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, + seq, 0, new_roots, 1); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + } +out: + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + + return ret; +} + +static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, + qgroup_rescan_work); + struct btrfs_path *path; + struct btrfs_trans_handle *trans = NULL; + struct ulist *tmp = NULL, *qgroups = NULL; + struct extent_buffer *scratch_leaf = NULL; + int err = -ENOMEM; + + path = btrfs_alloc_path(); + if (!path) + goto out; + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + goto out; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + goto out; + scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); + if (!scratch_leaf) + goto out; + + err = 0; + while (!err) { + trans = btrfs_start_transaction(fs_info->fs_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + break; + } + if (!fs_info->quota_enabled) { + err = -EINTR; + } else { + err = qgroup_rescan_leaf(fs_info, path, trans, + qgroups, tmp, scratch_leaf); + } + if (err > 0) + btrfs_commit_transaction(trans, fs_info->fs_root); + else + btrfs_end_transaction(trans, fs_info->fs_root); + } + +out: + kfree(scratch_leaf); + ulist_free(qgroups); + ulist_free(tmp); + btrfs_free_path(path); + + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + + if (err == 2 && + fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } else if (err < 0) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (err >= 0) { + btrfs_info(fs_info, "qgroup scan completed%s", + err == 2 ? " (inconsistency flag cleared)" : ""); + } else { + btrfs_err(fs_info, "qgroup scan failed with %d", err); + } + + complete_all(&fs_info->qgroup_rescan_completion); +} + +/* + * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all + * memory required for the rescan context. + */ +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags) +{ + int ret = 0; + + if (!init_flags && + (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) || + !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) { + ret = -EINVAL; + goto err; + } + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + + if (init_flags) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = -EINPROGRESS; + else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + ret = -EINVAL; + + if (ret) { + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto err; + } + + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; + } + + memset(&fs_info->qgroup_rescan_progress, 0, + sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_rescan_progress.objectid = progress_objectid; + + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + init_completion(&fs_info->qgroup_rescan_completion); + + memset(&fs_info->qgroup_rescan_work, 0, + sizeof(fs_info->qgroup_rescan_work)); + btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_worker, NULL, NULL); + + if (ret) { +err: + btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret); + return ret; + } + + return 0; +} + +static void +qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + + spin_lock(&fs_info->qgroup_lock); + /* clear all current qgroup tracking information */ + for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + qgroup->rfer = 0; + qgroup->rfer_cmpr = 0; + qgroup->excl = 0; + qgroup->excl_cmpr = 0; + } + spin_unlock(&fs_info->qgroup_lock); +} + +int +btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + struct btrfs_trans_handle *trans; + + ret = qgroup_rescan_init(fs_info, 0, 1); + if (ret) + return ret; + + /* + * We have set the rescan_progress to 0, which means no more + * delayed refs will be accounted by btrfs_qgroup_account_ref. + * However, btrfs_qgroup_account_ref may be right after its call + * to btrfs_find_all_roots, in which case it would still do the + * accounting. + * To solve this, we're committing the transaction, which will + * ensure we run all delayed refs and only after that, we are + * going to clear all tracking information for a clean start. + */ + + trans = btrfs_join_transaction(fs_info->fs_root); + if (IS_ERR(trans)) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } + + qgroup_rescan_zero_tracking(fs_info); + + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + + return 0; +} + +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +{ + int running; + int ret = 0; + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (running) + ret = wait_for_completion_interruptible( + &fs_info->qgroup_rescan_completion); + + return ret; +} + +/* + * this is only called from open_ctree where we're still single threaded, thus + * locking is omitted here. + */ +void +btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) +{ + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h new file mode 100644 index 00000000000..5952ff1fbd7 --- /dev/null +++ b/fs/btrfs/qgroup.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_QGROUP__ +#define __BTRFS_QGROUP__ + +/* + * A description of the operations, all of these operations only happen when we + * are adding the 1st reference for that subvolume in the case of adding space + * or on the last reference delete in the case of subtraction. The only + * exception is the last one, which is added for confusion. + * + * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only + * one pointing at the bytes we are adding. This is called on the first + * allocation. + * + * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be + * shared between subvols. This is called on the creation of a ref that already + * has refs from a different subvolume, so basically reflink. + * + * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only + * one referencing the range. + * + * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with + * refs with other subvolumes. + */ +enum btrfs_qgroup_operation_type { + BTRFS_QGROUP_OPER_ADD_EXCL, + BTRFS_QGROUP_OPER_ADD_SHARED, + BTRFS_QGROUP_OPER_SUB_EXCL, + BTRFS_QGROUP_OPER_SUB_SHARED, +}; + +struct btrfs_qgroup_operation { + u64 ref_root; + u64 bytenr; + u64 num_bytes; + u64 seq; + enum btrfs_qgroup_operation_type type; + struct seq_list elem; + struct rb_node n; + struct list_head list; +}; + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + char *name); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, + int mod_seq); +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); +#endif + +#endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 00000000000..4a88f073fdd --- /dev/null +++ b/fs/btrfs/raid56.c @@ -0,0 +1,2097 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/ratelimit.h> +#include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <linux/hash.h> +#include <linux/list_sort.h> +#include <linux/raid/xor.h> +#include <linux/vmalloc.h> +#include <asm/div64.h> +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" + +/* set when additional merges to this rbio are not allowed */ +#define RBIO_RMW_LOCKED_BIT 1 + +/* + * set when this rbio is sitting in the hash, but it is just a cache + * of past RMW + */ +#define RBIO_CACHE_BIT 2 + +/* + * set when it is safe to trust the stripe_pages for caching + */ +#define RBIO_CACHE_READY_BIT 3 + + +#define RBIO_CACHE_SIZE 1024 + +struct btrfs_raid_bio { + struct btrfs_fs_info *fs_info; + struct btrfs_bio *bbio; + + /* + * logical block numbers for the start of each stripe + * The last one or two are p/q. These are sorted, + * so raid_map[0] is the start of our full stripe + */ + u64 *raid_map; + + /* while we're doing rmw on a stripe + * we put it into a hash table so we can + * lock the stripe and merge more rbios + * into it. + */ + struct list_head hash_list; + + /* + * LRU list for the stripe cache + */ + struct list_head stripe_cache; + + /* + * for scheduling work in the helper threads + */ + struct btrfs_work work; + + /* + * bio list and bio_list_lock are used + * to add more bios into the stripe + * in hopes of avoiding the full rmw + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* also protected by the bio_list_lock, the + * plug list is used by the plugging code + * to collect partial bios while plugged. The + * stripe locking code also uses it to hand off + * the stripe lock to the next pending IO + */ + struct list_head plug_list; + + /* + * flags that tell us if it is safe to + * merge with this bio + */ + unsigned long flags; + + /* size of each individual stripe on disk */ + int stripe_len; + + /* number of data stripes (no p/q) */ + int nr_data; + + /* + * set if we're doing a parity rebuild + * for a read from higher up, which is handled + * differently from a parity rebuild as part of + * rmw + */ + int read_rebuild; + + /* first bad stripe */ + int faila; + + /* second bad stripe (for raid6 use) */ + int failb; + + /* + * number of pages needed to represent the full + * stripe + */ + int nr_pages; + + /* + * size of all the bios in the bio_list. This + * helps us decide if the rbio maps to a full + * stripe or not + */ + int bio_list_bytes; + + atomic_t refs; + + /* + * these are two arrays of pointers. We allocate the + * rbio big enough to hold them both and setup their + * locations when the rbio is allocated + */ + + /* pointers to pages that we allocated for + * reading/writing stripes directly from the disk (including P/Q) + */ + struct page **stripe_pages; + + /* + * pointers to the pages in the bio_list. Stored + * here for faster lookup + */ + struct page **bio_pages; +}; + +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); +static noinline void finish_rmw(struct btrfs_raid_bio *rbio); +static void rmw_work(struct btrfs_work *work); +static void read_rebuild_work(struct btrfs_work *work); +static void async_rmw_stripe(struct btrfs_raid_bio *rbio); +static void async_read_rebuild(struct btrfs_raid_bio *rbio); +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); +static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void index_rbio_pages(struct btrfs_raid_bio *rbio); +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); + +/* + * the stripe hash table is used for locking, and to collect + * bios in hopes of making a full stripe + */ +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash_table *x; + struct btrfs_stripe_hash *cur; + struct btrfs_stripe_hash *h; + int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; + int i; + int table_size; + + if (info->stripe_hash_table) + return 0; + + /* + * The table is large, starting with order 4 and can go as high as + * order 7 in case lock debugging is turned on. + * + * Try harder to allocate and fallback to vmalloc to lower the chance + * of a failing mount. + */ + table_size = sizeof(*table) + sizeof(*h) * num_entries; + table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!table) { + table = vzalloc(table_size); + if (!table) + return -ENOMEM; + } + + spin_lock_init(&table->cache_lock); + INIT_LIST_HEAD(&table->stripe_cache); + + h = table->table; + + for (i = 0; i < num_entries; i++) { + cur = h + i; + INIT_LIST_HEAD(&cur->hash_list); + spin_lock_init(&cur->lock); + init_waitqueue_head(&cur->wait); + } + + x = cmpxchg(&info->stripe_hash_table, NULL, table); + if (x) { + if (is_vmalloc_addr(x)) + vfree(x); + else + kfree(x); + } + return 0; +} + +/* + * caching an rbio means to copy anything from the + * bio_pages array into the stripe_pages array. We + * use the page uptodate bit in the stripe cache array + * to indicate if it has valid data + * + * once the caching is done, we set the cache ready + * bit. + */ +static void cache_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + char *s; + char *d; + int ret; + + ret = alloc_rbio_pages(rbio); + if (ret) + return; + + for (i = 0; i < rbio->nr_pages; i++) { + if (!rbio->bio_pages[i]) + continue; + + s = kmap(rbio->bio_pages[i]); + d = kmap(rbio->stripe_pages[i]); + + memcpy(d, s, PAGE_CACHE_SIZE); + + kunmap(rbio->bio_pages[i]); + kunmap(rbio->stripe_pages[i]); + SetPageUptodate(rbio->stripe_pages[i]); + } + set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); +} + +/* + * we hash on the first logical address of the stripe + */ +static int rbio_bucket(struct btrfs_raid_bio *rbio) +{ + u64 num = rbio->raid_map[0]; + + /* + * we shift down quite a bit. We're using byte + * addressing, and most of the lower bits are zeros. + * This tends to upset hash_64, and it consistently + * returns just one or two different values. + * + * shifting off the lower bits fixes things. + */ + return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); +} + +/* + * stealing an rbio means taking all the uptodate pages from the stripe + * array in the source rbio and putting them into the destination rbio + */ +static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) +{ + int i; + struct page *s; + struct page *d; + + if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) + return; + + for (i = 0; i < dest->nr_pages; i++) { + s = src->stripe_pages[i]; + if (!s || !PageUptodate(s)) { + continue; + } + + d = dest->stripe_pages[i]; + if (d) + __free_page(d); + + dest->stripe_pages[i] = s; + src->stripe_pages[i] = NULL; + } +} + +/* + * merging means we take the bio_list from the victim and + * splice it into the destination. The victim should + * be discarded afterwards. + * + * must be called with dest->rbio_list_lock held + */ +static void merge_rbio(struct btrfs_raid_bio *dest, + struct btrfs_raid_bio *victim) +{ + bio_list_merge(&dest->bio_list, &victim->bio_list); + dest->bio_list_bytes += victim->bio_list_bytes; + bio_list_init(&victim->bio_list); +} + +/* + * used to prune items that are in the cache. The caller + * must hold the hash table lock. + */ +static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash *h; + int freeit = 0; + + /* + * check the bit again under the hash table lock. + */ + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + h = table->table + bucket; + + /* hold the lock for the bucket because we may be + * removing it from the hash table + */ + spin_lock(&h->lock); + + /* + * hold the lock for the bio list because we need + * to make sure the bio list is empty + */ + spin_lock(&rbio->bio_list_lock); + + if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { + list_del_init(&rbio->stripe_cache); + table->cache_size -= 1; + freeit = 1; + + /* if the bio list isn't empty, this rbio is + * still involved in an IO. We take it out + * of the cache list, and drop the ref that + * was held for the list. + * + * If the bio_list was empty, we also remove + * the rbio from the hash_table, and drop + * the corresponding ref + */ + if (bio_list_empty(&rbio->bio_list)) { + if (!list_empty(&rbio->hash_list)) { + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + BUG_ON(!list_empty(&rbio->plug_list)); + } + } + } + + spin_unlock(&rbio->bio_list_lock); + spin_unlock(&h->lock); + + if (freeit) + __free_raid_bio(rbio); +} + +/* + * prune a given rbio from the cache + */ +static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + __remove_rbio_from_cache(rbio); + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove everything in the cache + */ +static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + struct btrfs_raid_bio *rbio; + + table = info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + while (!list_empty(&table->stripe_cache)) { + rbio = list_entry(table->stripe_cache.next, + struct btrfs_raid_bio, + stripe_cache); + __remove_rbio_from_cache(rbio); + } + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove all cached entries and free the hash table + * used by unmount + */ +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) +{ + if (!info->stripe_hash_table) + return; + btrfs_clear_rbio_cache(info); + if (is_vmalloc_addr(info->stripe_hash_table)) + vfree(info->stripe_hash_table); + else + kfree(info->stripe_hash_table); + info->stripe_hash_table = NULL; +} + +/* + * insert an rbio into the stripe cache. It + * must have already been prepared by calling + * cache_rbio_pages + * + * If this rbio was already cached, it gets + * moved to the front of the lru. + * + * If the size of the rbio cache is too big, we + * prune an item. + */ +static void cache_rbio(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&rbio->bio_list_lock); + + /* bump our ref if we were not in the list before */ + if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) + atomic_inc(&rbio->refs); + + if (!list_empty(&rbio->stripe_cache)){ + list_move(&rbio->stripe_cache, &table->stripe_cache); + } else { + list_add(&rbio->stripe_cache, &table->stripe_cache); + table->cache_size += 1; + } + + spin_unlock(&rbio->bio_list_lock); + + if (table->cache_size > RBIO_CACHE_SIZE) { + struct btrfs_raid_bio *found; + + found = list_entry(table->stripe_cache.prev, + struct btrfs_raid_bio, + stripe_cache); + + if (found != rbio) + __remove_rbio_from_cache(found); + } + + spin_unlock_irqrestore(&table->cache_lock, flags); + return; +} + +/* + * helper function to run the xor_blocks api. It is only + * able to do MAX_XOR_BLOCKS at a time, so we need to + * loop through. + */ +static void run_xor(void **pages, int src_cnt, ssize_t len) +{ + int src_off = 0; + int xor_src_cnt = 0; + void *dest = pages[src_cnt]; + + while(src_cnt > 0) { + xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); + xor_blocks(xor_src_cnt, len, dest, pages + src_off); + + src_cnt -= xor_src_cnt; + src_off += xor_src_cnt; + } +} + +/* + * returns true if the bio list inside this rbio + * covers an entire stripe (no rmw required). + * Must be called with the bio list lock held, or + * at a time when you know it is impossible to add + * new bios into the list + */ +static int __rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long size = rbio->bio_list_bytes; + int ret = 1; + + if (size != rbio->nr_data * rbio->stripe_len) + ret = 0; + + BUG_ON(size > rbio->nr_data * rbio->stripe_len); + return ret; +} + +static int rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + ret = __rbio_is_full(rbio); + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + return ret; +} + +/* + * returns 1 if it is safe to merge two rbios together. + * The merging is safe if the two rbios correspond to + * the same stripe and if they are both going in the same + * direction (read vs write), and if neither one is + * locked for final IO + * + * The caller is responsible for locking such that + * rmw_locked is safe to test + */ +static int rbio_can_merge(struct btrfs_raid_bio *last, + struct btrfs_raid_bio *cur) +{ + if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || + test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) + return 0; + + /* + * we can't merge with cached rbios, since the + * idea is that when we merge the destination + * rbio is going to run our IO for us. We can + * steal from cached rbio's though, other functions + * handle that. + */ + if (test_bit(RBIO_CACHE_BIT, &last->flags) || + test_bit(RBIO_CACHE_BIT, &cur->flags)) + return 0; + + if (last->raid_map[0] != + cur->raid_map[0]) + return 0; + + /* reads can't merge with writes */ + if (last->read_rebuild != + cur->read_rebuild) { + return 0; + } + + return 1; +} + +/* + * helper to index into the pstripe + */ +static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * helper to index into the qstripe, returns null + * if there is no qstripe + */ +static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + if (rbio->nr_data + 1 == rbio->bbio->num_stripes) + return NULL; + + index += ((rbio->nr_data + 1) * rbio->stripe_len) >> + PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * The first stripe in the table for a logical address + * has the lock. rbios are added in one of three ways: + * + * 1) Nobody has the stripe locked yet. The rbio is given + * the lock and 0 is returned. The caller must start the IO + * themselves. + * + * 2) Someone has the stripe locked, but we're able to merge + * with the lock owner. The rbio is freed and the IO will + * start automatically along with the existing rbio. 1 is returned. + * + * 3) Someone has the stripe locked, but we're not able to merge. + * The rbio is added to the lock owner's plug list, or merged into + * an rbio already on the plug list. When the lock owner unlocks, + * the next rbio on the list is run and the IO is started automatically. + * 1 is returned + * + * If we return 0, the caller still owns the rbio and must continue with + * IO submission. If we return 1, the caller must assume the rbio has + * already been freed. + */ +static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *pending; + unsigned long flags; + DEFINE_WAIT(wait); + struct btrfs_raid_bio *freeit = NULL; + struct btrfs_raid_bio *cache_drop = NULL; + int ret = 0; + int walk = 0; + + spin_lock_irqsave(&h->lock, flags); + list_for_each_entry(cur, &h->hash_list, hash_list) { + walk++; + if (cur->raid_map[0] == rbio->raid_map[0]) { + spin_lock(&cur->bio_list_lock); + + /* can we steal this cached rbio's pages? */ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + atomic_dec(&cur->refs); + + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); + + goto lockit; + } + + /* can we merge into the lock owner? */ + if (rbio_can_merge(cur, rbio)) { + merge_rbio(cur, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + + + /* + * we couldn't merge with the running + * rbio, see if we can merge with the + * pending ones. We don't have to + * check for rmw_locked because there + * is no way they are inside finish_rmw + * right now + */ + list_for_each_entry(pending, &cur->plug_list, + plug_list) { + if (rbio_can_merge(pending, rbio)) { + merge_rbio(pending, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + } + + /* no merging, put us on the tail of the plug list, + * our rbio will be started with the currently + * running rbio unlocks + */ + list_add_tail(&rbio->plug_list, &cur->plug_list); + spin_unlock(&cur->bio_list_lock); + ret = 1; + goto out; + } + } +lockit: + atomic_inc(&rbio->refs); + list_add(&rbio->hash_list, &h->hash_list); +out: + spin_unlock_irqrestore(&h->lock, flags); + if (cache_drop) + remove_rbio_from_cache(cache_drop); + if (freeit) + __free_raid_bio(freeit); + return ret; +} + +/* + * called as rmw or parity rebuild is completed. If the plug list has more + * rbios waiting for this stripe, the next one on the list will be started + */ +static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) +{ + int bucket; + struct btrfs_stripe_hash *h; + unsigned long flags; + int keep_cache = 0; + + bucket = rbio_bucket(rbio); + h = rbio->fs_info->stripe_hash_table->table + bucket; + + if (list_empty(&rbio->plug_list)) + cache_rbio(rbio); + + spin_lock_irqsave(&h->lock, flags); + spin_lock(&rbio->bio_list_lock); + + if (!list_empty(&rbio->hash_list)) { + /* + * if we're still cached and there is no other IO + * to perform, just leave this rbio here for others + * to steal from later + */ + if (list_empty(&rbio->plug_list) && + test_bit(RBIO_CACHE_BIT, &rbio->flags)) { + keep_cache = 1; + clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + BUG_ON(!bio_list_empty(&rbio->bio_list)); + goto done; + } + + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + + /* + * we use the plug list to hold all the rbios + * waiting for the chance to lock this stripe. + * hand the lock over to one of them. + */ + if (!list_empty(&rbio->plug_list)) { + struct btrfs_raid_bio *next; + struct list_head *head = rbio->plug_list.next; + + next = list_entry(head, struct btrfs_raid_bio, + plug_list); + + list_del_init(&rbio->plug_list); + + list_add(&next->hash_list, &h->hash_list); + atomic_inc(&next->refs); + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + + if (next->read_rebuild) + async_read_rebuild(next); + else { + steal_rbio(rbio, next); + async_rmw_stripe(next); + } + + goto done_nolock; + } else if (waitqueue_active(&h->wait)) { + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + wake_up(&h->wait); + goto done_nolock; + } + } +done: + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + +done_nolock: + if (!keep_cache) + remove_rbio_from_cache(rbio); +} + +static void __free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + WARN_ON(atomic_read(&rbio->refs) < 0); + if (!atomic_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + kfree(rbio->raid_map); + kfree(rbio->bbio); + kfree(rbio); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + unlock_stripe(rbio); + __free_raid_bio(rbio); +} + +/* + * this frees the rbio and runs through all the bios in the + * bio_list and calls end_io on them + */ +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) +{ + struct bio *cur = bio_list_get(&rbio->bio_list); + struct bio *next; + free_raid_bio(rbio); + + while (cur) { + next = cur->bi_next; + cur->bi_next = NULL; + if (uptodate) + set_bit(BIO_UPTODATE, &cur->bi_flags); + bio_endio(cur, err); + cur = next; + } +} + +/* + * end io function used by finish_rmw. When we finally + * get here, we've written a full stripe + */ +static void raid_write_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + err = 0; + + /* OK, we have read all the stripes we need to. */ + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + err = -EIO; + + rbio_orig_end_io(rbio, err, 0); + return; +} + +/* + * the read/modify/write code wants to use the original bio for + * any pages it included, and then use the rbio for everything + * else. This function decides if a given index (stripe number) + * and page number in that stripe fall inside the original bio + * or the rbio. + * + * if you set bio_list_only, you'll get a NULL back for any ranges + * that are outside the bio_list + * + * This doesn't take any refs on anything, you get a bare page pointer + * and the caller must bump refs as required. + * + * You must call index_rbio_pages once before you can trust + * the answers from this function. + */ +static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, + int index, int pagenr, int bio_list_only) +{ + int chunk_page; + struct page *p = NULL; + + chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + + spin_lock_irq(&rbio->bio_list_lock); + p = rbio->bio_pages[chunk_page]; + spin_unlock_irq(&rbio->bio_list_lock); + + if (p || bio_list_only) + return p; + + return rbio->stripe_pages[chunk_page]; +} + +/* + * number of pages we need for the entire stripe across all the + * drives + */ +static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) +{ + unsigned long nr = stripe_len * nr_stripes; + return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +/* + * allocation and initial setup for the btrfs_raid_bio. Not + * this does not allocate any pages for rbio->pages. + */ +static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + int nr_data = 0; + int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); + void *p; + + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, + GFP_NOFS); + if (!rbio) { + kfree(raid_map); + kfree(bbio); + return ERR_PTR(-ENOMEM); + } + + bio_list_init(&rbio->bio_list); + INIT_LIST_HEAD(&rbio->plug_list); + spin_lock_init(&rbio->bio_list_lock); + INIT_LIST_HEAD(&rbio->stripe_cache); + INIT_LIST_HEAD(&rbio->hash_list); + rbio->bbio = bbio; + rbio->raid_map = raid_map; + rbio->fs_info = root->fs_info; + rbio->stripe_len = stripe_len; + rbio->nr_pages = num_pages; + rbio->faila = -1; + rbio->failb = -1; + atomic_set(&rbio->refs, 1); + + /* + * the stripe_pages and bio_pages array point to the extra + * memory we allocated past the end of the rbio + */ + p = rbio + 1; + rbio->stripe_pages = p; + rbio->bio_pages = p + sizeof(struct page *) * num_pages; + + if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) + nr_data = bbio->num_stripes - 2; + else + nr_data = bbio->num_stripes - 1; + + rbio->nr_data = nr_data; + return rbio; +} + +/* allocate pages for all the stripes in the bio, including parity */ +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + ClearPageUptodate(page); + } + return 0; +} + +/* allocate pages for just the p/q stripes */ +static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + + for (; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + } + return 0; +} + +/* + * add a single page from a specific stripe into our list of bios for IO + * this will try to merge into existing bios if possible, and returns + * zero if all went well. + */ +static int rbio_add_io_page(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct page *page, + int stripe_nr, + unsigned long page_index, + unsigned long bio_max_len) +{ + struct bio *last = bio_list->tail; + u64 last_end = 0; + int ret; + struct bio *bio; + struct btrfs_bio_stripe *stripe; + u64 disk_start; + + stripe = &rbio->bbio->stripes[stripe_nr]; + disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); + + /* if the device is missing, just fail this stripe */ + if (!stripe->dev->bdev) + return fail_rbio_index(rbio, stripe_nr); + + /* see if we can add this page onto our existing bio */ + if (last) { + last_end = (u64)last->bi_iter.bi_sector << 9; + last_end += last->bi_iter.bi_size; + + /* + * we can't merge these if they are from different + * devices or if they are not contiguous + */ + if (last_end == disk_start && stripe->dev->bdev && + test_bit(BIO_UPTODATE, &last->bi_flags) && + last->bi_bdev == stripe->dev->bdev) { + ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); + if (ret == PAGE_CACHE_SIZE) + return 0; + } + } + + /* put a new bio on the list */ + bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + if (!bio) + return -ENOMEM; + + bio->bi_iter.bi_size = 0; + bio->bi_bdev = stripe->dev->bdev; + bio->bi_iter.bi_sector = disk_start >> 9; + set_bit(BIO_UPTODATE, &bio->bi_flags); + + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + bio_list_add(bio_list, bio); + return 0; +} + +/* + * while we're doing the read/modify/write cycle, we could + * have errors in reading pages off the disk. This checks + * for errors and if we're not able to read the page it'll + * trigger parity reconstruction. The rmw will be finished + * after we've reconstructed the failed stripes + */ +static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +{ + if (rbio->faila >= 0 || rbio->failb >= 0) { + BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); + __raid56_parity_recover(rbio); + } else { + finish_rmw(rbio); + } +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) +{ + int index; + index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); + index += page; + return rbio->stripe_pages[index]; +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result. This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. + * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ + struct bio *bio; + u64 start; + unsigned long stripe_offset; + unsigned long page_index; + struct page *p; + int i; + + spin_lock_irq(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) { + start = (u64)bio->bi_iter.bi_sector << 9; + stripe_offset = start - rbio->raid_map[0]; + page_index = stripe_offset >> PAGE_CACHE_SHIFT; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + rbio->bio_pages[page_index + i] = p; + } + } + spin_unlock_irq(&rbio->bio_list_lock); +} + +/* + * this is called from one of two situations. We either + * have a full stripe from the higher layers, or we've read all + * the missing bits off disk. + * + * This will calculate the parity and then send down any + * changed blocks. + */ +static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +{ + struct btrfs_bio *bbio = rbio->bbio; + void *pointers[bbio->num_stripes]; + int stripe_len = rbio->stripe_len; + int nr_data = rbio->nr_data; + int stripe; + int pagenr; + int p_stripe = -1; + int q_stripe = -1; + struct bio_list bio_list; + struct bio *bio; + int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; + int ret; + + bio_list_init(&bio_list); + + if (bbio->num_stripes - rbio->nr_data == 1) { + p_stripe = bbio->num_stripes - 1; + } else if (bbio->num_stripes - rbio->nr_data == 2) { + p_stripe = bbio->num_stripes - 2; + q_stripe = bbio->num_stripes - 1; + } else { + BUG(); + } + + /* at this point we either have a full stripe, + * or we've read the full stripe from the drive. + * recalculate the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->bbio->error, 0); + + /* + * now that we've set rmw_locked, run through the + * bio list one last time and map the page pointers + * + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + index_rbio_pages(rbio); + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *p; + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + p = page_in_rbio(rbio, stripe, pagenr, 0); + pointers[stripe] = kmap(p); + } + + /* then add the parity stripe */ + p = rbio_pstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + if (q_stripe != -1) { + + /* + * raid6, add the qstripe and call the + * library function to fill in our p/q + */ + p = rbio_qstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + } + + + for (stripe = 0; stripe < bbio->num_stripes; stripe++) + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + } + + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. + */ + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, + page, stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); + BUG_ON(atomic_read(&bbio->stripes_pending) == 0); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_end_io; + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(WRITE, bio); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * helper to find the stripe number for a given bio. Used to figure out which + * stripe has failed. This expects the bio to correspond to a physical disk, + * so it looks up based on physical sector numbers. + */ +static int find_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 physical = bio->bi_iter.bi_sector; + u64 stripe_start; + int i; + struct btrfs_bio_stripe *stripe; + + physical <<= 9; + + for (i = 0; i < rbio->bbio->num_stripes; i++) { + stripe = &rbio->bbio->stripes[i]; + stripe_start = stripe->physical; + if (physical >= stripe_start && + physical < stripe_start + rbio->stripe_len) { + return i; + } + } + return -1; +} + +/* + * helper to find the stripe number for a given + * bio (before mapping). Used to figure out which stripe has + * failed. This looks up based on logical block numbers. + */ +static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 logical = bio->bi_iter.bi_sector; + u64 stripe_start; + int i; + + logical <<= 9; + + for (i = 0; i < rbio->nr_data; i++) { + stripe_start = rbio->raid_map[i]; + if (logical >= stripe_start && + logical < stripe_start + rbio->stripe_len) { + return i; + } + } + return -1; +} + +/* + * returns -EIO if we had too many failures + */ +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + + /* we already know this stripe is bad, move on */ + if (rbio->faila == failed || rbio->failb == failed) + goto out; + + if (rbio->faila == -1) { + /* first failure on this rbio */ + rbio->faila = failed; + atomic_inc(&rbio->bbio->error); + } else if (rbio->failb == -1) { + /* second failure on this rbio */ + rbio->failb = failed; + atomic_inc(&rbio->bbio->error); + } else { + ret = -EIO; + } +out: + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + + return ret; +} + +/* + * helper to fail a stripe based on a physical disk + * bio. + */ +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + int failed = find_bio_stripe(rbio, bio); + + if (failed < 0) + return -EIO; + + return fail_rbio_index(rbio, failed); +} + +/* + * this sets each page in the bio uptodate. It should only be used on private + * rbio pages, nothing that comes in from the higher layers + */ +static void set_bio_pages_uptodate(struct bio *bio) +{ + int i; + struct page *p; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + SetPageUptodate(p); + } +} + +/* + * end io for the read phase of the rmw cycle. All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid_rmw_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + err = 0; + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + goto cleanup; + + /* + * this will normally call finish_rmw to start our write + * but if there are any failed stripes we'll reconstruct + * from parity first + */ + validate_rbio_for_rmw(rbio); + return; + +cleanup: + + rbio_orig_end_io(rbio, -EIO, 0); +} + +static void async_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); +} + +static void async_read_rebuild(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); +} + +/* + * the stripe must be locked by the caller. It will + * unlock after all the writes are done + */ +static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct btrfs_bio *bbio = rbio->bbio; + struct bio_list bio_list; + int ret; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + index_rbio_pages(rbio); + + atomic_set(&rbio->bbio->error, 0); + /* + * build a list of bios to read all the missing parts of this + * stripe + */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *page; + /* + * we want to find all the pages missing from + * the rbio and read them from the disk. If + * page_in_rbio finds a page in the bio list + * we don't need to read it off the stripe. + */ + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (page) + continue; + + page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, page, + stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * this can happen if others have merged with + * us, it means there is nothing left to read. + * But if there are missing devices it may not be + * safe to do the full stripe write yet. + */ + goto finish; + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&bbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_rmw_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } + /* the actual write will happen once the reads are done */ + return 0; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; + +finish: + validate_rbio_for_rmw(rbio); + return 0; +} + +/* + * if the upper layers pass in a full stripe, we thank them by only allocating + * enough pages to hold the parity, and sending it all down quickly. + */ +static int full_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = alloc_rbio_parity_pages(rbio); + if (ret) { + __free_raid_bio(rbio); + return ret; + } + + ret = lock_stripe_add(rbio); + if (ret == 0) + finish_rmw(rbio); + return 0; +} + +/* + * partial stripe writes get handed over to async helpers. + * We're really hoping to merge a few more writes into this + * rbio before calculating new parity + */ +static int partial_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = lock_stripe_add(rbio); + if (ret == 0) + async_rmw_stripe(rbio); + return 0; +} + +/* + * sometimes while we were reading from the drive to + * recalculate parity, enough new bios come into create + * a full stripe. So we do a check here to see if we can + * go directly to finish_rmw + */ +static int __raid56_parity_write(struct btrfs_raid_bio *rbio) +{ + /* head off into rmw land if we don't have a full stripe */ + if (!rbio_is_full(rbio)) + return partial_stripe_write(rbio); + return full_stripe_write(rbio); +} + +/* + * We use plugging call backs to collect full stripes. + * Any time we get a partial stripe write while plugged + * we collect it into a list. When the unplug comes down, + * we sort the list by logical block number and merge + * everything we can into the same rbios + */ +struct btrfs_plug_cb { + struct blk_plug_cb cb; + struct btrfs_fs_info *info; + struct list_head rbio_list; + struct btrfs_work work; +}; + +/* + * rbios on the plug list are sorted for easier merging. + */ +static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); + u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; + u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; + + if (a_sector < b_sector) + return -1; + if (a_sector > b_sector) + return 1; + return 0; +} + +static void run_plug(struct btrfs_plug_cb *plug) +{ + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *last = NULL; + + /* + * sort our plug list then try to merge + * everything we can in hopes of creating full + * stripes. + */ + list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { + cur = list_entry(plug->rbio_list.next, + struct btrfs_raid_bio, plug_list); + list_del_init(&cur->plug_list); + + if (rbio_is_full(cur)) { + /* we have a full stripe, send it down */ + full_stripe_write(cur); + continue; + } + if (last) { + if (rbio_can_merge(last, cur)) { + merge_rbio(last, cur); + __free_raid_bio(cur); + continue; + + } + __raid56_parity_write(last); + } + last = cur; + } + if (last) { + __raid56_parity_write(last); + } + kfree(plug); +} + +/* + * if the unplug comes from schedule, we have to push the + * work off to a helper thread + */ +static void unplug_work(struct btrfs_work *work) +{ + struct btrfs_plug_cb *plug; + plug = container_of(work, struct btrfs_plug_cb, work); + run_plug(plug); +} + +static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct btrfs_plug_cb *plug; + plug = container_of(cb, struct btrfs_plug_cb, cb); + + if (from_schedule) { + btrfs_init_work(&plug->work, unplug_work, NULL, NULL); + btrfs_queue_work(plug->info->rmw_workers, + &plug->work); + return; + } + run_plug(plug); +} + +/* + * our main entry point for writes from the rest of the FS. + */ +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + struct btrfs_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) + return PTR_ERR(rbio); + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_iter.bi_size; + + /* + * don't plug on full rbios, just get them out the door + * as quickly as we can + */ + if (rbio_is_full(rbio)) + return full_stripe_write(rbio); + + cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, + sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = root->fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + } else { + return __raid56_parity_write(rbio); + } + return 0; +} + +/* + * all parity reconstruction happens here. We've read in everything + * we can find from the drives and this does the heavy lifting of + * sorting the good from the bad. + */ +static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +{ + int pagenr, stripe; + void **pointers; + int faila = -1, failb = -1; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + int i; + + pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), + GFP_NOFS); + if (!pointers) { + err = -ENOMEM; + goto cleanup_io; + } + + faila = rbio->faila; + failb = rbio->failb; + + if (rbio->read_rebuild) { + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + } + + index_rbio_pages(rbio); + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + /* setup our array of pointers with pages + * from each stripe + */ + for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->read_rebuild && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + pointers[stripe] = kmap(page); + } + + /* all raid6 handling here */ + if (rbio->raid_map[rbio->bbio->num_stripes - 1] == + RAID6_Q_STRIPE) { + + /* + * single failure, rebuild from parity raid5 + * style + */ + if (failb < 0) { + if (faila == rbio->nr_data) { + /* + * Just the P stripe has failed, without + * a bad data or Q stripe. + * TODO, we should redo the xor here. + */ + err = -EIO; + goto cleanup; + } + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below + */ + goto pstripe; + } + + /* make sure our ps and qs are in order */ + if (faila > failb) { + int tmp = failb; + failb = faila; + faila = tmp; + } + + /* if the q stripe is failed, do a pstripe reconstruction + * from the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want + */ + if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->raid_map[faila] == RAID5_P_STRIPE) { + err = -EIO; + goto cleanup; + } + /* + * otherwise we have one bad data stripe and + * a good P stripe. raid5! + */ + goto pstripe; + } + + if (rbio->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->bbio->num_stripes, + PAGE_SIZE, faila, pointers); + } else { + raid6_2data_recov(rbio->bbio->num_stripes, + PAGE_SIZE, faila, failb, + pointers); + } + } else { + void *p; + + /* rebuild from P stripe here (raid5 or raid6) */ + BUG_ON(failb != -1); +pstripe: + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], + pointers[rbio->nr_data], + PAGE_CACHE_SIZE); + + /* rearrange the pointer array */ + p = pointers[faila]; + for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) + pointers[stripe] = pointers[stripe + 1]; + pointers[rbio->nr_data - 1] = p; + + /* xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); + } + /* if we're doing this rebuild as part of an rmw, go through + * and set all of our private rbio pages in the + * failed stripes as uptodate. This way finish_rmw will + * know they can be trusted. If this was a read reconstruction, + * other endio functions will fiddle the uptodate bits + */ + if (!rbio->read_rebuild) { + for (i = 0; i < nr_pages; i++) { + if (faila != -1) { + page = rbio_stripe_page(rbio, faila, i); + SetPageUptodate(page); + } + if (failb != -1) { + page = rbio_stripe_page(rbio, failb, i); + SetPageUptodate(page); + } + } + } + for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->read_rebuild && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + kunmap(page); + } + } + + err = 0; +cleanup: + kfree(pointers); + +cleanup_io: + + if (rbio->read_rebuild) { + if (err == 0) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + rbio_orig_end_io(rbio, err, err == 0); + } else if (err == 0) { + rbio->faila = -1; + rbio->failb = -1; + finish_rmw(rbio); + } else { + rbio_orig_end_io(rbio, err, 0); + } +} + +/* + * This is called only for stripes we've read from disk to + * reconstruct the parity. + */ +static void raid_recover_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + /* + * we only read stripe pages off the disk, set them + * up to date if there were no errors + */ + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + rbio_orig_end_io(rbio, -EIO, 0); + else + __raid_recover_end_io(rbio); +} + +/* + * reads everything we need off the disk to reconstruct + * the parity. endio handlers trigger final reconstruction + * when the IO is done. + * + * This is used both for reads from the higher layers and for + * parity construction required to finish a rmw cycle. + */ +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct btrfs_bio *bbio = rbio->bbio; + struct bio_list bio_list; + int ret; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + atomic_set(&rbio->bbio->error, 0); + + /* + * read everything that hasn't failed. Thanks to the + * stripe cache, it is possible that some or all of these + * pages are going to be uptodate. + */ + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + if (rbio->faila == stripe || rbio->failb == stripe) { + atomic_inc(&rbio->bbio->error); + continue; + } + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *p; + + /* + * the rmw code may have already read this + * page in + */ + p = rbio_stripe_page(rbio, stripe, pagenr); + if (PageUptodate(p)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, + rbio_stripe_page(rbio, stripe, pagenr), + stripe, pagenr, rbio->stripe_len); + if (ret < 0) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * we might have no bios to read just because the pages + * were up to date, or we might have no bios to read because + * the devices were gone. + */ + if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { + __raid_recover_end_io(rbio); + goto out; + } else { + goto cleanup; + } + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&bbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_recover_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } +out: + return 0; + +cleanup: + if (rbio->read_rebuild) + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; +} + +/* + * the main entry point for reads from the higher layers. This + * is really only called when the normal read path had a failure, + * so we assume the bio they send down corresponds to a failed part + * of the drive. + */ +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) + return PTR_ERR(rbio); + + rbio->read_rebuild = 1; + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_iter.bi_size; + + rbio->faila = find_logical_bio_stripe(rbio, bio); + if (rbio->faila == -1) { + BUG(); + kfree(raid_map); + kfree(bbio); + kfree(rbio); + return -EIO; + } + + /* + * reconstruct from the q stripe if they are + * asking for mirror 3 + */ + if (mirror_num == 3) + rbio->failb = bbio->num_stripes - 2; + + ret = lock_stripe_add(rbio); + + /* + * __raid56_parity_recover will end the bio with + * any errors it hits. We don't want to return + * its error value up the stack because our caller + * will end up calling bio_endio with any nonzero + * return + */ + if (ret == 0) + __raid56_parity_recover(rbio); + /* + * our rbio has been added to the list of + * rbios that will be handled after the + * currently lock owner is done + */ + return 0; + +} + +static void rmw_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + raid56_rmw_stripe(rbio); +} + +static void read_rebuild_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + __raid56_parity_recover(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 00000000000..ea5d73bfdfb --- /dev/null +++ b/fs/btrfs/raid56.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_RAID56__ +#define __BTRFS_RAID56__ +static inline int nr_parity_stripes(struct map_lookup *map) +{ + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 1; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return 2; + else + return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ + return map->num_stripes - nr_parity_stripes(map); +} +#define RAID5_P_STRIPE ((u64)-2) +#define RAID6_Q_STRIPE ((u64)-1) + +#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ + ((x) == RAID6_Q_STRIPE)) + +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num); +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len); + +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +#endif diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h new file mode 100644 index 00000000000..9e111e4576d --- /dev/null +++ b/fs/btrfs/rcu-string.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +struct rcu_string { + struct rcu_head rcu; + char str[0]; +}; + +static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) +{ + size_t len = strlen(src) + 1; + struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + + (len * sizeof(char)), mask); + if (!ret) + return ret; + strncpy(ret->str, src, len); + return ret; +} + +static inline void rcu_string_free(struct rcu_string *str) +{ + if (str) + kfree_rcu(str, rcu); +} + +#define printk_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define printk_ratelimited_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk_ratelimited(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define rcu_str_deref(rcu_str) ({ \ + struct rcu_string *__str = rcu_dereference(rcu_str); \ + __str->str; \ +}) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c new file mode 100644 index 00000000000..09230cf3a24 --- /dev/null +++ b/fs/btrfs/reada.c @@ -0,0 +1,994 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "transaction.h" +#include "dev-replace.h" + +#undef DEBUG + +/* + * This is the implementation for the generic read ahead framework. + * + * To trigger a readahead, btrfs_reada_add must be called. It will start + * a read ahead for the given range [start, end) on tree root. The returned + * handle can either be used to wait on the readahead to finish + * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). + * + * The read ahead works as follows: + * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. + * reada_start_machine will then search for extents to prefetch and trigger + * some reads. When a read finishes for a node, all contained node/leaf + * pointers that lie in the given range will also be enqueued. The reads will + * be triggered in sequential order, thus giving a big win over a naive + * enumeration. It will also make use of multi-device layouts. Each disk + * will have its on read pointer and all disks will by utilized in parallel. + * Also will no two disks read both sides of a mirror simultaneously, as this + * would waste seeking capacity. Instead both disks will read different parts + * of the filesystem. + * Any number of readaheads can be started in parallel. The read order will be + * determined globally, i.e. 2 parallel readaheads will normally finish faster + * than the 2 started one after another. + */ + +#define MAX_IN_FLIGHT 6 + +struct reada_extctl { + struct list_head list; + struct reada_control *rc; + u64 generation; +}; + +struct reada_extent { + u64 logical; + struct btrfs_key top; + u32 blocksize; + int err; + struct list_head extctl; + int refcnt; + spinlock_t lock; + struct reada_zone *zones[BTRFS_MAX_MIRRORS]; + int nzones; + struct btrfs_device *scheduled_for; +}; + +struct reada_zone { + u64 start; + u64 end; + u64 elems; + struct list_head list; + spinlock_t lock; + int locked; + struct btrfs_device *device; + struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl + * self */ + int ndevs; + struct kref refcnt; +}; + +struct reada_machine_work { + struct btrfs_work work; + struct btrfs_fs_info *fs_info; +}; + +static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); +static void reada_control_release(struct kref *kref); +static void reada_zone_release(struct kref *kref); +static void reada_start_machine(struct btrfs_fs_info *fs_info); +static void __reada_start_machine(struct btrfs_fs_info *fs_info); + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation); + +/* recurses */ +/* in case of err, eb might be NULL */ +static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int level = 0; + int nritems; + int i; + u64 bytenr; + u64 generation; + struct reada_extent *re; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head list; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct btrfs_device *for_dev; + + if (eb) + level = btrfs_header_level(eb); + + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + re->refcnt++; + spin_unlock(&fs_info->reada_lock); + + if (!re) + return -1; + + spin_lock(&re->lock); + /* + * just take the full list from the extent. afterwards we + * don't need the lock anymore + */ + list_replace_init(&re->extctl, &list); + for_dev = re->scheduled_for; + re->scheduled_for = NULL; + spin_unlock(&re->lock); + + if (err == 0) { + nritems = level ? btrfs_header_nritems(eb) : 0; + generation = btrfs_header_generation(eb); + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + } else { + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. + */ + nritems = 0; + generation = 0; + } + + for (i = 0; i < nritems; i++) { + struct reada_extctl *rec; + u64 n_gen; + struct btrfs_key key; + struct btrfs_key next_key; + + btrfs_node_key_to_cpu(eb, &key, i); + if (i + 1 < nritems) + btrfs_node_key_to_cpu(eb, &next_key, i + 1); + else + next_key = re->top; + bytenr = btrfs_node_blockptr(eb, i); + n_gen = btrfs_node_ptr_generation(eb, i); + + list_for_each_entry(rec, &list, list) { + struct reada_control *rc = rec->rc; + + /* + * if the generation doesn't match, just ignore this + * extctl. This will probably cut off a branch from + * prefetch. Alternatively one could start a new (sub-) + * prefetch for this branch, starting again from root. + * FIXME: move the generation check out of this loop + */ +#ifdef DEBUG + if (rec->generation != generation) { + btrfs_debug(root->fs_info, + "generation mismatch for (%llu,%d,%llu) %llu != %llu", + key.objectid, key.type, key.offset, + rec->generation, generation); + } +#endif + if (rec->generation == generation && + btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && + btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) + reada_add_block(rc, bytenr, &next_key, + level - 1, n_gen); + } + } + /* + * free extctl records + */ + while (!list_empty(&list)) { + struct reada_control *rc; + struct reada_extctl *rec; + + rec = list_first_entry(&list, struct reada_extctl, list); + list_del(&rec->list); + rc = rec->rc; + kfree(rec); + + kref_get(&rc->refcnt); + if (atomic_dec_and_test(&rc->elems)) { + kref_put(&rc->refcnt, reada_control_release); + wake_up(&rc->wait); + } + kref_put(&rc->refcnt, reada_control_release); + + reada_extent_put(fs_info, re); /* one ref for each entry */ + } + reada_extent_put(fs_info, re); /* our ref */ + if (for_dev) + atomic_dec(&for_dev->reada_in_flight); + + return 0; +} + +/* + * start is passed separately in case eb in NULL, which may be the case with + * failed I/O + */ +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int ret; + + ret = __readahead_hook(root, eb, start, err); + + reada_start_machine(root->fs_info); + + return ret; +} + +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, u64 logical, + struct btrfs_bio *bbio) +{ + int ret; + struct reada_zone *zone; + struct btrfs_block_group_cache *cache = NULL; + u64 start; + u64 end; + int i; + + zone = NULL; + spin_lock(&fs_info->reada_lock); + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (ret == 1) { + if (logical >= zone->start && logical < zone->end) + return zone; + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return NULL; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + btrfs_put_block_group(cache); + + zone = kzalloc(sizeof(*zone), GFP_NOFS); + if (!zone) + return NULL; + + zone->start = start; + zone->end = end; + INIT_LIST_HEAD(&zone->list); + spin_lock_init(&zone->lock); + zone->locked = 0; + kref_init(&zone->refcnt); + zone->elems = 0; + zone->device = dev; /* our device always sits at index 0 */ + for (i = 0; i < bbio->num_stripes; ++i) { + /* bounds have already been checked */ + zone->devs[i] = bbio->stripes[i].dev; + } + zone->ndevs = bbio->num_stripes; + + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&dev->reada_zones, + (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), + zone); + + if (ret == -EEXIST) { + kfree(zone); + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); + } + spin_unlock(&fs_info->reada_lock); + + return zone; +} + +static struct reada_extent *reada_find_extent(struct btrfs_root *root, + u64 logical, + struct btrfs_key *top, int level) +{ + int ret; + struct reada_extent *re = NULL; + struct reada_extent *re_exist = NULL; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *dev; + struct btrfs_device *prev_dev; + u32 blocksize; + u64 length; + int nzones = 0; + int i; + unsigned long index = logical >> PAGE_CACHE_SHIFT; + int dev_replace_is_ongoing; + + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + re->refcnt++; + spin_unlock(&fs_info->reada_lock); + + if (re) + return re; + + re = kzalloc(sizeof(*re), GFP_NOFS); + if (!re) + return NULL; + + blocksize = btrfs_level_size(root, level); + re->logical = logical; + re->blocksize = blocksize; + re->top = *top; + INIT_LIST_HEAD(&re->extctl); + spin_lock_init(&re->lock); + re->refcnt = 1; + + /* + * map block + */ + length = blocksize; + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, + &bbio, 0); + if (ret || !bbio || length < blocksize) + goto error; + + if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { + btrfs_err(root->fs_info, + "readahead: more than %d copies not supported", + BTRFS_MAX_MIRRORS); + goto error; + } + + for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { + struct reada_zone *zone; + + dev = bbio->stripes[nzones].dev; + zone = reada_find_zone(fs_info, dev, logical, bbio); + if (!zone) + break; + + re->zones[nzones] = zone; + spin_lock(&zone->lock); + if (!zone->elems) + kref_get(&zone->refcnt); + ++zone->elems; + spin_unlock(&zone->lock); + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + re->nzones = nzones; + if (nzones == 0) { + /* not a single zone found, error and out */ + goto error; + } + + /* insert extent in reada_tree + all per-device trees, all or nothing */ + btrfs_dev_replace_lock(&fs_info->dev_replace); + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret == -EEXIST) { + re_exist = radix_tree_lookup(&fs_info->reada_tree, index); + BUG_ON(!re_exist); + re_exist->refcnt++; + spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); + goto error; + } + if (ret) { + spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); + goto error; + } + prev_dev = NULL; + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( + &fs_info->dev_replace); + for (i = 0; i < nzones; ++i) { + dev = bbio->stripes[i].dev; + if (dev == prev_dev) { + /* + * in case of DUP, just add the first zone. As both + * are on the same device, there's nothing to gain + * from adding both. + * Also, it wouldn't work, as the tree is per device + * and adding would fail with EEXIST + */ + continue; + } + if (!dev->bdev) { + /* + * cannot read ahead on missing device, but for RAID5/6, + * REQ_GET_READ_MIRRORS return 1. So don't skip missing + * device for such case. + */ + if (nzones > 1) + continue; + } + if (dev_replace_is_ongoing && + dev == fs_info->dev_replace.tgtdev) { + /* + * as this device is selected for reading only as + * a last resort, skip it for read ahead. + */ + continue; + } + prev_dev = dev; + ret = radix_tree_insert(&dev->reada_extents, index, re); + if (ret) { + while (--i >= 0) { + dev = bbio->stripes[i].dev; + BUG_ON(dev == NULL); + /* ignore whether the entry was inserted */ + radix_tree_delete(&dev->reada_extents, index); + } + BUG_ON(fs_info == NULL); + radix_tree_delete(&fs_info->reada_tree, index); + spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); + goto error; + } + } + spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); + + kfree(bbio); + return re; + +error: + while (nzones) { + struct reada_zone *zone; + + --nzones; + zone = re->zones[nzones]; + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* + * no fs_info->reada_lock needed, as this can't be + * the last ref + */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + kfree(bbio); + kfree(re); + return re_exist; +} + +static void reada_extent_put(struct btrfs_fs_info *fs_info, + struct reada_extent *re) +{ + int i; + unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + + spin_lock(&fs_info->reada_lock); + if (--re->refcnt) { + spin_unlock(&fs_info->reada_lock); + return; + } + + radix_tree_delete(&fs_info->reada_tree, index); + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + radix_tree_delete(&zone->device->reada_extents, index); + } + + spin_unlock(&fs_info->reada_lock); + + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* no fs_info->reada_lock needed, as this can't be + * the last ref */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + if (re->scheduled_for) + atomic_dec(&re->scheduled_for->reada_in_flight); + + kfree(re); +} + +static void reada_zone_release(struct kref *kref) +{ + struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + + radix_tree_delete(&zone->device->reada_zones, + zone->end >> PAGE_CACHE_SHIFT); + + kfree(zone); +} + +static void reada_control_release(struct kref *kref) +{ + struct reada_control *rc = container_of(kref, struct reada_control, + refcnt); + + kfree(rc); +} + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation) +{ + struct btrfs_root *root = rc->root; + struct reada_extent *re; + struct reada_extctl *rec; + + re = reada_find_extent(root, logical, top, level); /* takes one ref */ + if (!re) + return -1; + + rec = kzalloc(sizeof(*rec), GFP_NOFS); + if (!rec) { + reada_extent_put(root->fs_info, re); + return -1; + } + + rec->rc = rc; + rec->generation = generation; + atomic_inc(&rc->elems); + + spin_lock(&re->lock); + list_add_tail(&rec->list, &re->extctl); + spin_unlock(&re->lock); + + /* leave the ref on the extent */ + + return 0; +} + +/* + * called with fs_info->reada_lock held + */ +static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) +{ + int i; + unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + + for (i = 0; i < zone->ndevs; ++i) { + struct reada_zone *peer; + peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); + if (peer && peer->device != zone->device) + peer->locked = lock; + } +} + +/* + * called with fs_info->reada_lock held + */ +static int reada_pick_zone(struct btrfs_device *dev) +{ + struct reada_zone *top_zone = NULL; + struct reada_zone *top_locked_zone = NULL; + u64 top_elems = 0; + u64 top_locked_elems = 0; + unsigned long index = 0; + int ret; + + if (dev->reada_curr_zone) { + reada_peer_zones_set_lock(dev->reada_curr_zone, 0); + kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); + dev->reada_curr_zone = NULL; + } + /* pick the zone with the most elements */ + while (1) { + struct reada_zone *zone; + + ret = radix_tree_gang_lookup(&dev->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + if (zone->locked) { + if (zone->elems > top_locked_elems) { + top_locked_elems = zone->elems; + top_locked_zone = zone; + } + } else { + if (zone->elems > top_elems) { + top_elems = zone->elems; + top_zone = zone; + } + } + } + if (top_zone) + dev->reada_curr_zone = top_zone; + else if (top_locked_zone) + dev->reada_curr_zone = top_locked_zone; + else + return 0; + + dev->reada_next = dev->reada_curr_zone->start; + kref_get(&dev->reada_curr_zone->refcnt); + reada_peer_zones_set_lock(dev->reada_curr_zone, 1); + + return 1; +} + +static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) +{ + struct reada_extent *re = NULL; + int mirror_num = 0; + struct extent_buffer *eb = NULL; + u64 logical; + u32 blocksize; + int ret; + int i; + int need_kick = 0; + + spin_lock(&fs_info->reada_lock); + if (dev->reada_curr_zone == NULL) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + } + /* + * FIXME currently we issue the reads one extent at a time. If we have + * a contiguous block of extents, we could also coagulate them or use + * plugging to speed things up + */ + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + re = NULL; + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + } + if (ret == 0) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + dev->reada_next = re->logical + re->blocksize; + re->refcnt++; + + spin_unlock(&fs_info->reada_lock); + + /* + * find mirror num + */ + for (i = 0; i < re->nzones; ++i) { + if (re->zones[i]->device == dev) { + mirror_num = i + 1; + break; + } + } + logical = re->logical; + blocksize = re->blocksize; + + spin_lock(&re->lock); + if (re->scheduled_for == NULL) { + re->scheduled_for = dev; + need_kick = 1; + } + spin_unlock(&re->lock); + + reada_extent_put(fs_info, re); + + if (!need_kick) + return 0; + + atomic_inc(&dev->reada_in_flight); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, + mirror_num, &eb); + if (ret) + __readahead_hook(fs_info->extent_root, NULL, logical, ret); + else if (eb) + __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + + if (eb) + free_extent_buffer(eb); + + return 1; + +} + +static void reada_start_machine_worker(struct btrfs_work *work) +{ + struct reada_machine_work *rmw; + struct btrfs_fs_info *fs_info; + int old_ioprio; + + rmw = container_of(work, struct reada_machine_work, work); + fs_info = rmw->fs_info; + + kfree(rmw); + + old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), + task_nice_ioprio(current)); + set_task_ioprio(current, BTRFS_IOPRIO_READA); + __reada_start_machine(fs_info); + set_task_ioprio(current, old_ioprio); +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 enqueued; + u64 total = 0; + int i; + + do { + enqueued = 0; + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (atomic_read(&device->reada_in_flight) < + MAX_IN_FLIGHT) + enqueued += reada_start_machine_dev(fs_info, + device); + } + total += enqueued; + } while (enqueued && total < 10000); + + if (enqueued == 0) + return; + + /* + * If everything is already in the cache, this is effectively single + * threaded. To a) not hold the caller for too long and b) to utilize + * more cores, we broke the loop above after 10000 iterations and now + * enqueue to workers to finish it. This will distribute the load to + * the cores. + */ + for (i = 0; i < 2; ++i) + reada_start_machine(fs_info); +} + +static void reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct reada_machine_work *rmw; + + rmw = kzalloc(sizeof(*rmw), GFP_NOFS); + if (!rmw) { + /* FIXME we cannot handle this properly right now */ + BUG(); + } + btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); + rmw->fs_info = fs_info; + + btrfs_queue_work(fs_info->readahead_workers, &rmw->work); +} + +#ifdef DEBUG +static void dump_devs(struct btrfs_fs_info *fs_info, int all) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + unsigned long index; + int ret; + int i; + int j; + int cnt; + + spin_lock(&fs_info->reada_lock); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, + atomic_read(&device->reada_in_flight)); + index = 0; + while (1) { + struct reada_zone *zone; + ret = radix_tree_gang_lookup(&device->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG " zone %llu-%llu elems %llu locked " + "%d devs", zone->start, zone->end, zone->elems, + zone->locked); + for (j = 0; j < zone->ndevs; ++j) { + printk(KERN_CONT " %lld", + zone->devs[j]->devid); + } + if (device->reada_curr_zone == zone) + printk(KERN_CONT " curr off %llu", + device->reada_next - zone->start); + printk(KERN_CONT "\n"); + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + } + cnt = 0; + index = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&device->reada_extents, + (void **)&re, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG + " re: logical %llu size %u empty %d for %lld", + re->logical, re->blocksize, + list_empty(&re->extctl), re->scheduled_for ? + re->scheduled_for->devid : -1); + + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + if (++cnt > 15) + break; + } + } + + index = 0; + cnt = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, + index, 1); + if (ret == 0) + break; + if (!re->scheduled_for) { + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + continue; + } + printk(KERN_DEBUG + "re: logical %llu size %u list empty %d for %lld", + re->logical, re->blocksize, list_empty(&re->extctl), + re->scheduled_for ? re->scheduled_for->devid : -1); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + } + spin_unlock(&fs_info->reada_lock); +} +#endif + +/* + * interface + */ +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *key_start, struct btrfs_key *key_end) +{ + struct reada_control *rc; + u64 start; + u64 generation; + int level; + struct extent_buffer *node; + static struct btrfs_key max_key = { + .objectid = (u64)-1, + .type = (u8)-1, + .offset = (u64)-1 + }; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return ERR_PTR(-ENOMEM); + + rc->root = root; + rc->key_start = *key_start; + rc->key_end = *key_end; + atomic_set(&rc->elems, 0); + init_waitqueue_head(&rc->wait); + kref_init(&rc->refcnt); + kref_get(&rc->refcnt); /* one ref for having elements */ + + node = btrfs_root_node(root); + start = node->start; + level = btrfs_header_level(node); + generation = btrfs_header_generation(node); + free_extent_buffer(node); + + if (reada_add_block(rc, start, &max_key, level, generation)) { + kfree(rc); + return ERR_PTR(-ENOMEM); + } + + reada_start_machine(root->fs_info); + + return rc; +} + +#ifdef DEBUG +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + 5 * HZ); + dump_devs(rc->root->fs_info, + atomic_read(&rc->elems) < 10 ? 1 : 0); + } + + dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#else +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event(rc->wait, atomic_read(&rc->elems) == 0); + } + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#endif + +void btrfs_reada_detach(void *handle) +{ + struct reada_control *rc = handle; + + kref_put(&rc->refcnt, reada_control_release); +} diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c deleted file mode 100644 index a97314cf6bd..00000000000 --- a/fs/btrfs/ref-cache.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/sort.h> -#include "ctree.h" -#include "ref-cache.h" -#include "transaction.h" - -/* - * leaf refs are used to cache the information about which extents - * a given leaf has references on. This allows us to process that leaf - * in btrfs_drop_snapshot without needing to read it back from disk. - */ - -/* - * kmalloc a leaf reference struct and update the counters for the - * total ref cache size - */ -struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, - int nr_extents) -{ - struct btrfs_leaf_ref *ref; - size_t size = btrfs_leaf_ref_size(nr_extents); - - ref = kmalloc(size, GFP_NOFS); - if (ref) { - spin_lock(&root->fs_info->ref_cache_lock); - root->fs_info->total_ref_cache_size += size; - spin_unlock(&root->fs_info->ref_cache_lock); - - memset(ref, 0, sizeof(*ref)); - atomic_set(&ref->usage, 1); - INIT_LIST_HEAD(&ref->list); - } - return ref; -} - -/* - * free a leaf reference struct and update the counters for the - * total ref cache size - */ -void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) -{ - if (!ref) - return; - WARN_ON(atomic_read(&ref->usage) == 0); - if (atomic_dec_and_test(&ref->usage)) { - size_t size = btrfs_leaf_ref_size(ref->nritems); - - BUG_ON(ref->in_tree); - kfree(ref); - - spin_lock(&root->fs_info->ref_cache_lock); - root->fs_info->total_ref_cache_size -= size; - spin_unlock(&root->fs_info->ref_cache_lock); - } -} - -static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct btrfs_leaf_ref *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); - - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; - } - - entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) -{ - struct rb_node *n = root->rb_node; - struct btrfs_leaf_ref *entry; - - while (n) { - entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); - WARN_ON(!entry->in_tree); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return n; - } - return NULL; -} - -int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, - int shared) -{ - struct btrfs_leaf_ref *ref = NULL; - struct btrfs_leaf_ref_tree *tree = root->ref_tree; - - if (shared) - tree = &root->fs_info->shared_ref_tree; - if (!tree) - return 0; - - spin_lock(&tree->lock); - while (!list_empty(&tree->list)) { - ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list); - BUG_ON(ref->tree != tree); - if (ref->root_gen > max_root_gen) - break; - if (!xchg(&ref->in_tree, 0)) { - cond_resched_lock(&tree->lock); - continue; - } - - rb_erase(&ref->rb_node, &tree->root); - list_del_init(&ref->list); - - spin_unlock(&tree->lock); - btrfs_free_leaf_ref(root, ref); - cond_resched(); - spin_lock(&tree->lock); - } - spin_unlock(&tree->lock); - return 0; -} - -/* - * find the leaf ref for a given extent. This returns the ref struct with - * a usage reference incremented - */ -struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, - u64 bytenr) -{ - struct rb_node *rb; - struct btrfs_leaf_ref *ref = NULL; - struct btrfs_leaf_ref_tree *tree = root->ref_tree; -again: - if (tree) { - spin_lock(&tree->lock); - rb = tree_search(&tree->root, bytenr); - if (rb) - ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); - if (ref) - atomic_inc(&ref->usage); - spin_unlock(&tree->lock); - if (ref) - return ref; - } - if (tree != &root->fs_info->shared_ref_tree) { - tree = &root->fs_info->shared_ref_tree; - goto again; - } - return NULL; -} - -/* - * add a fully filled in leaf ref struct - * remove all the refs older than a given root generation - */ -int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, - int shared) -{ - int ret = 0; - struct rb_node *rb; - struct btrfs_leaf_ref_tree *tree = root->ref_tree; - - if (shared) - tree = &root->fs_info->shared_ref_tree; - - spin_lock(&tree->lock); - rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node); - if (rb) { - ret = -EEXIST; - } else { - atomic_inc(&ref->usage); - ref->tree = tree; - ref->in_tree = 1; - list_add_tail(&ref->list, &tree->list); - } - spin_unlock(&tree->lock); - return ret; -} - -/* - * remove a single leaf ref from the tree. This drops the ref held by the tree - * only - */ -int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) -{ - struct btrfs_leaf_ref_tree *tree; - - if (!xchg(&ref->in_tree, 0)) - return 0; - - tree = ref->tree; - spin_lock(&tree->lock); - - rb_erase(&ref->rb_node, &tree->root); - list_del_init(&ref->list); - - spin_unlock(&tree->lock); - - btrfs_free_leaf_ref(root, ref); - return 0; -} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h deleted file mode 100644 index e2a55cb2072..00000000000 --- a/fs/btrfs/ref-cache.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ -#ifndef __REFCACHE__ -#define __REFCACHE__ - -struct btrfs_extent_info { - /* bytenr and num_bytes find the extent in the extent allocation tree */ - u64 bytenr; - u64 num_bytes; - - /* objectid and offset find the back reference for the file */ - u64 objectid; - u64 offset; -}; - -struct btrfs_leaf_ref { - struct rb_node rb_node; - struct btrfs_leaf_ref_tree *tree; - int in_tree; - atomic_t usage; - - u64 root_gen; - u64 bytenr; - u64 owner; - u64 generation; - int nritems; - - struct list_head list; - struct btrfs_extent_info extents[]; -}; - -static inline size_t btrfs_leaf_ref_size(int nr_extents) -{ - return sizeof(struct btrfs_leaf_ref) + - sizeof(struct btrfs_extent_info) * nr_extents; -} - -static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) -{ - tree->root = RB_ROOT; - INIT_LIST_HEAD(&tree->list); - spin_lock_init(&tree->lock); -} - -static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree) -{ - return RB_EMPTY_ROOT(&tree->root); -} - -void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree); -struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, - int nr_extents); -void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); -struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, - u64 bytenr); -int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, - int shared); -int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, - int shared); -int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); -#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e558dd941de..65245a07275 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -29,6 +29,8 @@ #include "locking.h" #include "btrfs_inode.h" #include "async-thread.h" +#include "free-space-cache.h" +#include "inode-map.h" /* * backref_node, mapping_node and tree_block start with this @@ -44,8 +46,12 @@ struct tree_entry { struct backref_node { struct rb_node rb_node; u64 bytenr; - /* objectid tree block owner */ + + u64 new_bytenr; + /* objectid of tree block owner, can be not uptodate */ u64 owner; + /* link to pending, changed or detached list */ + struct list_head list; /* list of upper level blocks reference this block */ struct list_head upper; /* list of child blocks in the cache */ @@ -56,9 +62,9 @@ struct backref_node { struct extent_buffer *eb; /* level of tree block */ unsigned int level:8; - /* 1 if the block is root of old snapshot */ - unsigned int old_root:1; - /* 1 if no child blocks in the cache */ + /* is the block in non-reference counted tree */ + unsigned int cowonly:1; + /* 1 if no child node in the cache */ unsigned int lowest:1; /* is the extent buffer locked */ unsigned int locked:1; @@ -66,6 +72,16 @@ struct backref_node { unsigned int processed:1; /* have backrefs of this block been checked */ unsigned int checked:1; + /* + * 1 if corresponding block has been cowed but some upper + * level block pointers may not point to the new location + */ + unsigned int pending:1; + /* + * 1 if the backref node isn't connected to any other + * backref node. + */ + unsigned int detached:1; }; /* @@ -74,18 +90,34 @@ struct backref_node { struct backref_edge { struct list_head list[2]; struct backref_node *node[2]; - u64 blockptr; }; #define LOWER 0 #define UPPER 1 +#define RELOCATION_RESERVED_NODES 256 struct backref_cache { /* red black tree of all backref nodes in the cache */ struct rb_root rb_root; - /* list of backref nodes with no child block in the cache */ + /* for passing backref nodes to btrfs_reloc_cow_block */ + struct backref_node *path[BTRFS_MAX_LEVEL]; + /* + * list of blocks that have been cowed but some block + * pointers in upper level blocks may not reflect the + * new location + */ struct list_head pending[BTRFS_MAX_LEVEL]; - spinlock_t lock; + /* list of backref nodes with no child node */ + struct list_head leaves; + /* list of blocks that have been cowed in current transaction */ + struct list_head changed; + /* list of detached backref node. */ + struct list_head detached; + + u64 last_trans; + + int nr_nodes; + int nr_edges; }; /* @@ -113,15 +145,6 @@ struct tree_block { unsigned int key_ready:1; }; -/* inode vector */ -#define INODEVEC_SIZE 16 - -struct inodevec { - struct list_head list; - struct inode *inode[INODEVEC_SIZE]; - int nr; -}; - #define MAX_EXTENTS 128 struct file_extent_cluster { @@ -138,36 +161,42 @@ struct reloc_control { struct btrfs_root *extent_root; /* inode for moving data */ struct inode *data_inode; - struct btrfs_workers workers; + + struct btrfs_block_rsv *block_rsv; + + struct backref_cache backref_cache; + + struct file_extent_cluster cluster; /* tree blocks have been processed */ struct extent_io_tree processed_blocks; /* map start of tree root to corresponding reloc tree */ struct mapping_tree reloc_root_tree; /* list of reloc trees */ struct list_head reloc_roots; + /* size of metadata reservation for merging reloc trees */ + u64 merging_rsv_size; + /* size of relocated tree nodes */ + u64 nodes_relocated; + /* reserved size for block group relocation*/ + u64 reserved_bytes; + u64 search_start; u64 extents_found; - u64 extents_skipped; - int stage; - int create_reloc_root; + + unsigned int stage:8; + unsigned int create_reloc_tree:1; + unsigned int merge_reloc_tree:1; unsigned int found_file_extent:1; - unsigned int found_old_snapshot:1; }; /* stages of data relocation */ #define MOVE_DATA_EXTENTS 0 #define UPDATE_DATA_PTRS 1 -/* - * merge reloc tree to corresponding fs tree in worker threads - */ -struct async_merge { - struct btrfs_work work; - struct reloc_control *rc; - struct btrfs_root *root; - struct completion *done; - atomic_t *num_pending; -}; +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node); +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node); static void mapping_tree_init(struct mapping_tree *tree) { @@ -181,15 +210,80 @@ static void backref_cache_init(struct backref_cache *cache) cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); +} + +static void backref_cache_cleanup(struct backref_cache *cache) +{ + struct backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct backref_node, lower); + remove_backref_node(cache, node); + } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + BUG_ON(!list_empty(&cache->pending[i])); + BUG_ON(!list_empty(&cache->changed)); + BUG_ON(!list_empty(&cache->detached)); + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + BUG_ON(cache->nr_nodes); + BUG_ON(cache->nr_edges); +} + +static struct backref_node *alloc_backref_node(struct backref_cache *cache) +{ + struct backref_node *node; + + node = kzalloc(sizeof(*node), GFP_NOFS); + if (node) { + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + } + return node; +} + +static void free_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + if (node) { + cache->nr_nodes--; + kfree(node); + } } -static void backref_node_init(struct backref_node *node) +static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) { - memset(node, 0, sizeof(*node)); - INIT_LIST_HEAD(&node->upper); - INIT_LIST_HEAD(&node->lower); - RB_CLEAR_NODE(&node->rb_node); + struct backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; +} + +static void free_backref_edge(struct backref_cache *cache, + struct backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } } static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, @@ -234,6 +328,18 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) return NULL; } +static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) +{ + + struct btrfs_fs_info *fs_info = NULL; + struct backref_node *bnode = rb_entry(rb_node, struct backref_node, + rb_node); + if (bnode->root) + fs_info = bnode->root->fs_info; + btrfs_panic(fs_info, errno, "Inconsistency in backref cache " + "found at offset %llu", bytenr); +} + /* * walk up backref nodes until reach node presents tree root */ @@ -250,6 +356,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node, edges[idx++] = edge; node = edge->node[UPPER]; } + BUG_ON(node->detached); *index = idx; return node; } @@ -281,13 +388,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[], return NULL; } +static void unlock_node_buffer(struct backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + static void drop_node_buffer(struct backref_node *node) { if (node->eb) { - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } + unlock_node_buffer(node); free_extent_buffer(node->eb); node->eb = NULL; } @@ -296,14 +408,14 @@ static void drop_node_buffer(struct backref_node *node) static void drop_backref_node(struct backref_cache *tree, struct backref_node *node) { - BUG_ON(!node->lowest); BUG_ON(!list_empty(&node->upper)); drop_node_buffer(node); + list_del(&node->list); list_del(&node->lower); - - rb_erase(&node->rb_node, &tree->rb_root); - kfree(node); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + free_backref_node(tree, node); } /* @@ -318,27 +430,122 @@ static void remove_backref_node(struct backref_cache *cache, if (!node) return; - BUG_ON(!node->lowest); + BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct backref_edge, list[LOWER]); upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); - kfree(edge); + free_backref_edge(cache, edge); + + if (RB_EMPTY_NODE(&upper->rb_node)) { + BUG_ON(!list_empty(&node->upper)); + drop_backref_node(cache, node); + node = upper; + node->lowest = 1; + continue; + } /* - * add the node to pending list if no other + * add the node to leaf node list if no other * child block cached. */ if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, - &cache->pending[upper->level]); + list_add_tail(&upper->lower, &cache->leaves); upper->lowest = 1; } } + drop_backref_node(cache, node); } +static void update_backref_node(struct backref_cache *cache, + struct backref_node *node, u64 bytenr) +{ + struct rb_node *rb_node; + rb_erase(&node->rb_node, &cache->rb_root); + node->bytenr = bytenr; + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, bytenr); +} + +/* + * update backref cache after a transaction commit + */ +static int update_backref_cache(struct btrfs_trans_handle *trans, + struct backref_cache *cache) +{ + struct backref_node *node; + int level = 0; + + if (cache->last_trans == 0) { + cache->last_trans = trans->transid; + return 0; + } + + if (cache->last_trans == trans->transid) + return 0; + + /* + * detached nodes are used to avoid unnecessary backref + * lookup. transaction commit changes the extent tree. + * so the detached nodes are no longer useful. + */ + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->changed)) { + node = list_entry(cache->changed.next, + struct backref_node, list); + list_del_init(&node->list); + BUG_ON(node->pending); + update_backref_node(cache, node, node->new_bytenr); + } + + /* + * some nodes can be left in the pending list if there were + * errors during processing the pending nodes. + */ + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + list_for_each_entry(node, &cache->pending[level], list) { + BUG_ON(!node->pending); + if (node->bytenr == node->new_bytenr) + continue; + update_backref_node(cache, node, node->new_bytenr); + } + } + + cache->last_trans = 0; + return 1; +} + + +static int should_ignore_root(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + return 0; + + reloc_root = root->reloc_root; + if (!reloc_root) + return 0; + + if (btrfs_root_last_snapshot(&reloc_root->root_item) == + root->fs_info->running_transaction->transid - 1) + return 0; + /* + * if there is reloc tree and it was created in previous + * transaction backref lookup can find the reloc tree, + * so backref node for the fs tree root is useless for + * relocation. + */ + return 1; +} /* * find reloc tree by address of tree root */ @@ -366,7 +573,9 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_CHUNK_TREE_OBJECTID || root_objectid == BTRFS_DEV_TREE_OBJECTID || root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) + root_objectid == BTRFS_CSUM_TREE_OBJECTID || + root_objectid == BTRFS_UUID_TREE_OBJECTID || + root_objectid == BTRFS_QUOTA_TREE_OBJECTID) return 1; return 0; } @@ -383,7 +592,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, else key.offset = (u64)-1; - return btrfs_read_fs_root_no_name(fs_info, &key); + return btrfs_get_fs_root(fs_info, &key, false); } #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 @@ -401,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc, root = read_fs_root(rc->extent_root->fs_info, root_objectid); BUG_ON(IS_ERR(root)); - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && generation != btrfs_root_generation(&root->root_item)) return NULL; @@ -413,10 +622,13 @@ static noinline_for_stack int find_inline_backref(struct extent_buffer *leaf, int slot, unsigned long *ptr, unsigned long *end) { + struct btrfs_key key; struct btrfs_extent_item *ei; struct btrfs_tree_block_info *bi; u32 item_size; + btrfs_item_key_to_cpu(leaf, &key, slot); + item_size = btrfs_item_size_nr(leaf, slot); #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (item_size < sizeof(*ei)) { @@ -428,13 +640,23 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, WARN_ON(!(btrfs_extent_flags(leaf, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK)); - if (item_size <= sizeof(*ei) + sizeof(*bi)) { + if (key.type == BTRFS_EXTENT_ITEM_KEY && + item_size <= sizeof(*ei) + sizeof(*bi)) { WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); return 1; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + item_size <= sizeof(*ei)) { + WARN_ON(item_size < sizeof(*ei)); + return 1; + } - bi = (struct btrfs_tree_block_info *)(ei + 1); - *ptr = (unsigned long)(bi + 1); + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + } else { + *ptr = (unsigned long)(ei + 1); + } *end = (unsigned long)ei + item_size; return 0; } @@ -453,11 +675,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, * for all upper level blocks that directly/indirectly reference the * block are also cached. */ -static struct backref_node *build_backref_tree(struct reloc_control *rc, - struct backref_cache *cache, - struct btrfs_key *node_key, - int level, u64 bytenr) +static noinline_for_stack +struct backref_node *build_backref_tree(struct reloc_control *rc, + struct btrfs_key *node_key, + int level, u64 bytenr) { + struct backref_cache *cache = &rc->backref_cache; struct btrfs_path *path1; struct btrfs_path *path2; struct extent_buffer *eb; @@ -473,8 +696,11 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc, unsigned long end; unsigned long ptr; LIST_HEAD(list); + LIST_HEAD(useless); + int cowonly; int ret; int err = 0; + bool need_check = true; path1 = btrfs_alloc_path(); path2 = btrfs_alloc_path(); @@ -482,16 +708,16 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } + path1->reada = 1; + path2->reada = 2; - node = kmalloc(sizeof(*node), GFP_NOFS); + node = alloc_backref_node(cache); if (!node) { err = -ENOMEM; goto out; } - backref_node_init(node); node->bytenr = bytenr; - node->owner = 0; node->level = level; node->lowest = 1; cur = node; @@ -499,7 +725,7 @@ again: end = 0; ptr = 0; key.objectid = cur->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; + key.type = BTRFS_METADATA_ITEM_KEY; key.offset = (u64)-1; path1->search_commit_root = 1; @@ -517,7 +743,7 @@ again: WARN_ON(cur->checked); if (!list_empty(&cur->upper)) { /* - * the backref was added previously when processsing + * the backref was added previously when processing * backref of type BTRFS_TREE_BLOCK_REF_KEY */ BUG_ON(!list_is_singular(&cur->upper)); @@ -557,7 +783,8 @@ again: break; } - if (key.type == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { ret = find_inline_backref(eb, path1->slots[0], &ptr, &end); if (ret) @@ -587,17 +814,21 @@ again: #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || key.type == BTRFS_EXTENT_REF_V0_KEY) { - if (key.objectid == key.offset && - key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.type == BTRFS_EXTENT_REF_V0_KEY) { struct btrfs_extent_ref_v0 *ref0; ref0 = btrfs_item_ptr(eb, path1->slots[0], struct btrfs_extent_ref_v0); - root = find_tree_root(rc, eb, ref0); - if (root) - cur->root = root; - else - cur->old_root = 1; - break; + if (key.objectid == key.offset) { + root = find_tree_root(rc, eb, ref0); + if (root && !should_ignore_root(root)) + cur->root = root; + else + list_add(&cur->list, &useless); + break; + } + if (is_cowonly_root(btrfs_ref_root_v0(eb, + ref0))) + cur->cowonly = 1; } #else BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); @@ -614,22 +845,20 @@ again: break; } - edge = kzalloc(sizeof(*edge), GFP_NOFS); + edge = alloc_backref_edge(cache); if (!edge) { err = -ENOMEM; goto out; } rb_node = tree_search(&cache->rb_root, key.offset); if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); + upper = alloc_backref_node(cache); if (!upper) { - kfree(edge); + free_backref_edge(cache, edge); err = -ENOMEM; goto out; } - backref_node_init(upper); upper->bytenr = key.offset; - upper->owner = 0; upper->level = cur->level + 1; /* * backrefs for the upper level block isn't @@ -639,11 +868,12 @@ again: } else { upper = rb_entry(rb_node, struct backref_node, rb_node); + BUG_ON(!upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } - list_add(&edge->list[LOWER], &cur->upper); - edge->node[UPPER] = upper; + list_add_tail(&edge->list[LOWER], &cur->upper); edge->node[LOWER] = cur; + edge->node[UPPER] = upper; goto next; } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { @@ -657,11 +887,17 @@ again: goto out; } + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + cur->cowonly = 1; + if (btrfs_root_level(&root->root_item) == cur->level) { /* tree root */ BUG_ON(btrfs_root_bytenr(&root->root_item) != cur->bytenr); - cur->root = root; + if (should_ignore_root(root)) + list_add(&cur->list, &useless); + else + cur->root = root; break; } @@ -688,15 +924,19 @@ again: cur->bytenr); lower = cur; + need_check = true; for (; level < BTRFS_MAX_LEVEL; level++) { if (!path2->nodes[level]) { BUG_ON(btrfs_root_bytenr(&root->root_item) != lower->bytenr); - lower->root = root; + if (should_ignore_root(root)) + list_add(&lower->list, &useless); + else + lower->root = root; break; } - edge = kzalloc(sizeof(*edge), GFP_NOFS); + edge = alloc_backref_edge(cache); if (!edge) { err = -ENOMEM; goto out; @@ -705,16 +945,18 @@ again: eb = path2->nodes[level]; rb_node = tree_search(&cache->rb_root, eb->start); if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); + upper = alloc_backref_node(cache); if (!upper) { - kfree(edge); + free_backref_edge(cache, edge); err = -ENOMEM; goto out; } - backref_node_init(upper); upper->bytenr = eb->start; upper->owner = btrfs_header_owner(eb); upper->level = lower->level + 1; + if (!test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) + upper->cowonly = 1; /* * if we know the block isn't shared @@ -727,14 +969,12 @@ again: /* * add the block to pending list if we - * need check its backrefs. only block - * at 'cur->level + 1' is added to the - * tail of pending list. this guarantees - * we check backrefs from lower level - * blocks to upper level blocks. + * need check its backrefs, we only do this once + * while walking up a tree as we will catch + * anything else later on. */ - if (!upper->checked && - level == cur->level + 1) { + if (!upper->checked && need_check) { + need_check = false; list_add_tail(&edge->list[UPPER], &list); } else @@ -744,17 +984,19 @@ again: rb_node); BUG_ON(!upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); } list_add_tail(&edge->list[LOWER], &lower->upper); - edge->node[UPPER] = upper; edge->node[LOWER] = lower; + edge->node[UPPER] = upper; if (rb_node) break; lower = upper; upper = NULL; } - btrfs_release_path(root, path2); + btrfs_release_path(path2); next: if (ptr < end) { ptr += btrfs_extent_inline_ref_size(key.type); @@ -767,7 +1009,7 @@ next: if (ptr >= end) path1->slots[0]++; } - btrfs_release_path(rc->extent_root, path1); + btrfs_release_path(path1); cur->checked = 1; WARN_ON(exist); @@ -785,8 +1027,14 @@ next: * into the cache. */ BUG_ON(!node->checked); - rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + cowonly = node->cowonly; + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, node->bytenr, + &node->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); + list_add_tail(&node->lower, &cache->leaves); + } list_for_each_entry(edge, &node->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); @@ -795,6 +1043,14 @@ next: edge = list_entry(list.next, struct backref_edge, list[UPPER]); list_del_init(&edge->list[UPPER]); upper = edge->node[UPPER]; + if (upper->detached) { + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + continue; + } if (!RB_EMPTY_NODE(&upper->rb_node)) { if (upper->lowest) { @@ -807,25 +1063,71 @@ next: } BUG_ON(!upper->checked); - rb_node = tree_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - BUG_ON(rb_node); + BUG_ON(cowonly != upper->cowonly); + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + upper->bytenr); + } list_add_tail(&edge->list[UPPER], &upper->lower); list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); } + /* + * process useless backref nodes. backref nodes for tree leaves + * are deleted from the cache. backref nodes for upper level + * tree blocks are left in the cache to avoid unnecessary backref + * lookup. + */ + while (!list_empty(&useless)) { + upper = list_entry(useless.next, struct backref_node, list); + list_del_init(&upper->list); + BUG_ON(!list_empty(&upper->upper)); + if (upper == node) + node = NULL; + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + while (!list_empty(&upper->lower)) { + edge = list_entry(upper->lower.next, + struct backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + } + __mark_block_processed(rc, upper); + if (upper->level > 0) { + list_add(&upper->list, &cache->detached); + upper->detached = 1; + } else { + rb_erase(&upper->rb_node, &cache->rb_root); + free_backref_node(cache, upper); + } + } out: btrfs_free_path(path1); btrfs_free_path(path2); if (err) { - INIT_LIST_HEAD(&list); + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, upper); + list_del_init(&lower->upper); + } upper = node; + INIT_LIST_HEAD(&list); while (upper) { if (RB_EMPTY_NODE(&upper->rb_node)) { list_splice_tail(&upper->upper, &list); - kfree(upper); + free_backref_node(cache, upper); } if (list_empty(&list)) @@ -833,25 +1135,119 @@ out: edge = list_entry(list.next, struct backref_edge, list[LOWER]); + list_del(&edge->list[LOWER]); upper = edge->node[UPPER]; - kfree(edge); + free_backref_edge(cache, edge); } return ERR_PTR(err); } + BUG_ON(node && node->detached); return node; } /* + * helper to add backref node for the newly created snapshot. + * the backref node is created by cloning backref node that + * corresponds to root of source tree + */ +static int clone_backref_node(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *src, + struct btrfs_root *dest) +{ + struct btrfs_root *reloc_root = src->reloc_root; + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node = NULL; + struct backref_node *new_node; + struct backref_edge *edge; + struct backref_edge *new_edge; + struct rb_node *rb_node; + + if (cache->last_trans > 0) + update_backref_cache(trans, cache); + + rb_node = tree_search(&cache->rb_root, src->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, rb_node); + if (node->detached) + node = NULL; + else + BUG_ON(node->new_bytenr != reloc_root->node->start); + } + + if (!node) { + rb_node = tree_search(&cache->rb_root, + reloc_root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(node->detached); + } + } + + if (!node) + return 0; + + new_node = alloc_backref_node(cache); + if (!new_node) + return -ENOMEM; + + new_node->bytenr = dest->node->start; + new_node->level = node->level; + new_node->lowest = node->lowest; + new_node->checked = 1; + new_node->root = dest; + + if (!node->lowest) { + list_for_each_entry(edge, &node->lower, list[UPPER]) { + new_edge = alloc_backref_edge(cache); + if (!new_edge) + goto fail; + + new_edge->node[UPPER] = new_node; + new_edge->node[LOWER] = edge->node[LOWER]; + list_add_tail(&new_edge->list[UPPER], + &new_node->lower); + } + } else { + list_add_tail(&new_node->lower, &cache->leaves); + } + + rb_node = tree_insert(&cache->rb_root, new_node->bytenr, + &new_node->rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, new_node->bytenr); + + if (!new_node->lowest) { + list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { + list_add_tail(&new_edge->list[LOWER], + &new_edge->node[LOWER]->upper); + } + } + return 0; +fail: + while (!list_empty(&new_node->lower)) { + new_edge = list_entry(new_node->lower.next, + struct backref_edge, list[UPPER]); + list_del(&new_edge->list[UPPER]); + free_backref_edge(cache, new_edge); + } + free_backref_node(cache, new_node); + return -ENOMEM; +} + +/* * helper to add 'address of tree root -> reloc tree' mapping */ -static int __add_reloc_root(struct btrfs_root *root) +static int __must_check __add_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node; struct reloc_control *rc = root->fs_info->reloc_ctl; node = kmalloc(sizeof(*node), GFP_NOFS); - BUG_ON(!node); + if (!node) + return -ENOMEM; node->bytenr = root->node->start; node->data = root; @@ -860,17 +1256,23 @@ static int __add_reloc_root(struct btrfs_root *root) rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); + if (rb_node) { + btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " + "for start=%llu while inserting into relocation " + "tree", node->bytenr); + kfree(node); + return -EEXIST; + } list_add_tail(&root->root_list, &rc->reloc_roots); return 0; } /* - * helper to update/delete the 'address of tree root -> reloc tree' + * helper to delete the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, int del) +static void __del_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node = NULL; @@ -878,72 +1280,112 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); + root->node->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); } spin_unlock(&rc->reloc_root_tree.lock); + if (!node) + return; BUG_ON((struct btrfs_root *)node->data != root); - if (!del) { - spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); - } else { - list_del_init(&root->root_list); - kfree(node); - } - return 0; + spin_lock(&root->fs_info->trans_lock); + list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); + kfree(node); } /* - * create reloc tree for a given fs tree. reloc tree is just a - * snapshot of the fs tree with special root objectid. + * helper to update the 'address of tree root -> reloc tree' + * mapping */ -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + if (!node) + return 0; + BUG_ON((struct btrfs_root *)node->data != root); + + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = new_bytenr; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); + return 0; +} + +static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) { struct btrfs_root *reloc_root; struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; + u64 last_snap = 0; int ret; - if (root->reloc_root) { - reloc_root = root->reloc_root; - reloc_root->last_trans = trans->transid; - return 0; - } - - if (!root->fs_info->reloc_ctl || - !root->fs_info->reloc_ctl->create_reloc_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - root_item = kmalloc(sizeof(*root_item), GFP_NOFS); BUG_ON(!root_item); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = root->root_key.objectid; + root_key.offset = objectid; - ret = btrfs_copy_root(trans, root, root->commit_root, &eb, - BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (root->root_key.objectid == objectid) { + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + last_snap = btrfs_root_last_snapshot(&root->root_item); + btrfs_set_root_last_snapshot(&root->root_item, + trans->transid - 1); + } else { + /* + * called by btrfs_reloc_post_snapshot_hook. + * the source tree is a reloc tree, all tree blocks + * modified after it was created have RELOC flag + * set in their headers. so it's OK to not update + * the 'last_snapshot'. + */ + ret = btrfs_copy_root(trans, root, root->node, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + } - btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); memcpy(root_item, &root->root_item, sizeof(*root_item)); - btrfs_set_root_refs(root_item, 1); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); btrfs_set_root_generation(root_item, trans->transid); - memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); - root_item->drop_level = 0; + + if (root->root_key.objectid == objectid) { + btrfs_set_root_refs(root_item, 0); + memset(&root_item->drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + /* + * abuse rtransid, it is safe because it is impossible to + * receive data into a relocation tree. + */ + btrfs_set_root_rtransid(root_item, last_snap); + btrfs_set_root_otransid(root_item, trans->transid); + } btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -953,12 +1395,46 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(root_item); - reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &root_key); + reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key); BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; + return reloc_root; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + struct btrfs_block_rsv *rsv; + int clear_rsv = 0; + int ret; - __add_reloc_root(reloc_root); + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!trans->reloc_reserved) { + rsv = trans->block_rsv; + trans->block_rsv = rc->block_rsv; + clear_rsv = 1; + } + reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + if (clear_rsv) + trans->block_rsv = rsv; + + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); root->reloc_root = reloc_root; return 0; } @@ -971,22 +1447,20 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, { struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; - int del = 0; int ret; if (!root->reloc_root) - return 0; + goto out; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; - if (btrfs_root_refs(root_item) == 0) { + if (root->fs_info->reloc_ctl->merge_reloc_tree && + btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; - del = 1; + __del_reloc_root(reloc_root); } - __update_reloc_root(reloc_root, del); - if (reloc_root->commit_root != reloc_root->node) { btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); @@ -996,6 +1470,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, root->fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); + +out: return 0; } @@ -1018,9 +1494,9 @@ again: prev = node; entry = rb_entry(node, struct btrfs_inode, rb_node); - if (objectid < entry->vfs_inode.i_ino) + if (objectid < btrfs_ino(&entry->vfs_inode)) node = node->rb_left; - else if (objectid > entry->vfs_inode.i_ino) + else if (objectid > btrfs_ino(&entry->vfs_inode)) node = node->rb_right; else break; @@ -1028,7 +1504,7 @@ again: if (!node) { while (prev) { entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= entry->vfs_inode.i_ino) { + if (objectid <= btrfs_ino(&entry->vfs_inode)) { node = prev; break; } @@ -1043,7 +1519,7 @@ again: return inode; } - objectid = entry->vfs_inode.i_ino + 1; + objectid = btrfs_ino(&entry->vfs_inode) + 1; if (cond_resched_lock(&root->inode_lock)) goto again; @@ -1079,7 +1555,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, return -ENOMEM; bytenr -= BTRFS_I(reloc_inode)->index_cnt; - ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode), bytenr, 0); if (ret < 0) goto out; @@ -1098,12 +1574,11 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, btrfs_file_extent_other_encoding(leaf, fi)); if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { - ret = 1; + ret = -EINVAL; goto out; } - if (new_bytenr) - *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); ret = 0; out: btrfs_free_path(path); @@ -1114,24 +1589,23 @@ out: * update file extent items in the tree leaf to point to * the new locations. */ -static int replace_file_extents(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root, - struct extent_buffer *leaf, - struct list_head *inode_list) +static noinline_for_stack +int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf) { struct btrfs_key key; struct btrfs_file_extent_item *fi; struct inode *inode = NULL; - struct inodevec *ivec = NULL; u64 parent; u64 bytenr; - u64 new_bytenr; + u64 new_bytenr = 0; u64 num_bytes; u64 end; u32 nritems; u32 i; - int ret; + int ret = 0; int first = 1; int dirty = 0; @@ -1166,23 +1640,14 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, * to complete and drop the extent cache */ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - if (!ivec || ivec->nr == INODEVEC_SIZE) { - ivec = kmalloc(sizeof(*ivec), GFP_NOFS); - BUG_ON(!ivec); - ivec->nr = 0; - list_add_tail(&ivec->list, inode_list); - } if (first) { inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; first = 0; - } else if (inode && inode->i_ino < key.objectid) { + } else if (inode && btrfs_ino(inode) < key.objectid) { + btrfs_add_delayed_iput(inode); inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; } - if (inode && inode->i_ino == key.objectid) { + if (inode && btrfs_ino(inode) == key.objectid) { end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); WARN_ON(!IS_ALIGNED(key.offset, @@ -1190,23 +1655,26 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, root->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, - GFP_NOFS); + key.offset, end); if (!ret) continue; btrfs_drop_extent_cache(inode, key.offset, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, GFP_NOFS); + key.offset, end); } } ret = get_new_location(rc->data_inode, &new_bytenr, bytenr, num_bytes); - if (ret > 0) - continue; - BUG_ON(ret < 0); + if (ret) { + /* + * Don't have to abort since we've not changed anything + * in the file extent yet. + */ + break; + } btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); dirty = 1; @@ -1215,17 +1683,25 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); - BUG_ON(ret); + key.objectid, key.offset, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + break; + } ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); - BUG_ON(ret); + key.objectid, key.offset, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + break; + } } if (dirty) btrfs_mark_buffer_dirty(leaf); - return 0; + if (inode) + btrfs_add_delayed_iput(inode); + return ret; } static noinline_for_stack @@ -1248,11 +1724,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot, * if no block got replaced, 0 is returned. if there are other * errors, a negative error number is returned. */ -static int replace_path(struct btrfs_trans_handle *trans, - struct btrfs_root *dest, struct btrfs_root *src, - struct btrfs_path *path, struct btrfs_key *next_key, - struct extent_buffer **leaf, - int lowest_level, int max_level) +static noinline_for_stack +int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + int lowest_level, int max_level) { struct extent_buffer *eb; struct extent_buffer *parent; @@ -1263,16 +1739,16 @@ static int replace_path(struct btrfs_trans_handle *trans, u64 new_ptr_gen; u64 last_snapshot; u32 blocksize; + int cow = 0; int level; int ret; int slot; BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(lowest_level > 1 && leaf); last_snapshot = btrfs_root_last_snapshot(&src->root_item); - +again: slot = path->slots[lowest_level]; btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); @@ -1286,8 +1762,10 @@ static int replace_path(struct btrfs_trans_handle *trans, return 0; } - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); - BUG_ON(ret); + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + } btrfs_set_lock_blocking(eb); if (next_key) { @@ -1323,32 +1801,32 @@ static int replace_path(struct btrfs_trans_handle *trans, new_ptr_gen = 0; } - if (new_bytenr > 0 && new_bytenr == old_bytenr) { - WARN_ON(1); + if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { ret = level; break; } if (new_bytenr == 0 || old_ptr_gen > last_snapshot || memcmp_node_keys(parent, slot, path, level)) { - if (level <= lowest_level && !leaf) { + if (level <= lowest_level) { ret = 0; break; } eb = read_tree_block(dest, old_bytenr, blocksize, old_ptr_gen); - btrfs_tree_lock(eb); - ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); - BUG_ON(ret); - btrfs_set_lock_blocking(eb); - - if (level <= lowest_level) { - *leaf = eb; - ret = 0; + if (!eb || !extent_buffer_uptodate(eb)) { + ret = (!eb) ? -ENOMEM : -EIO; + free_extent_buffer(eb); break; } + btrfs_tree_lock(eb); + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); + } + btrfs_set_lock_blocking(eb); btrfs_tree_unlock(parent); free_extent_buffer(parent); @@ -1357,9 +1835,16 @@ static int replace_path(struct btrfs_trans_handle *trans, continue; } + if (!cow) { + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + cow = 1; + goto again; + } + btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); - btrfs_release_path(src, path); + btrfs_release_path(path); path->lowest_level = level; ret = btrfs_search_slot(trans, src, &key, path, 0, 1); @@ -1381,21 +1866,23 @@ static int replace_path(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -1485,6 +1972,10 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, bytenr = btrfs_node_blockptr(eb, path->slots[i]); blocksize = btrfs_level_size(root, i - 1); eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } BUG_ON(btrfs_header_level(eb) != i - 1); path->nodes[i - 1] = eb; path->slots[i - 1] = 0; @@ -1503,6 +1994,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, struct inode *inode = NULL; u64 objectid; u64 start, end; + u64 ino; objectid = min_key->objectid; while (1) { @@ -1515,17 +2007,18 @@ static int invalidate_extent_cache(struct btrfs_root *root, inode = find_next_inode(root, objectid); if (!inode) break; + ino = btrfs_ino(inode); - if (inode->i_ino > max_key->objectid) { + if (ino > max_key->objectid) { iput(inode); break; } - objectid = inode->i_ino + 1; + objectid = ino + 1; if (!S_ISREG(inode->i_mode)) continue; - if (unlikely(min_key->objectid == inode->i_ino)) { + if (unlikely(min_key->objectid == ino)) { if (min_key->type > BTRFS_EXTENT_DATA_KEY) continue; if (min_key->type < BTRFS_EXTENT_DATA_KEY) @@ -1538,7 +2031,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, start = 0; } - if (unlikely(max_key->objectid == inode->i_ino)) { + if (unlikely(max_key->objectid == ino)) { if (max_key->type < BTRFS_EXTENT_DATA_KEY) continue; if (max_key->type > BTRFS_EXTENT_DATA_KEY) { @@ -1555,27 +2048,13 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for readpage to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); btrfs_drop_extent_cache(inode, start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); } return 0; } -static void put_inodes(struct list_head *list) -{ - struct inodevec *ivec; - while (!list_empty(list)) { - ivec = list_entry(list->next, struct inodevec, list); - list_del(&ivec->list); - while (ivec->nr > 0) { - ivec->nr--; - iput(ivec->inode[ivec->nr]); - } - kfree(ivec); - } -} - static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) @@ -1604,21 +2083,22 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, LIST_HEAD(inode_list); struct btrfs_key key; struct btrfs_key next_key; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; struct btrfs_path *path; - struct extent_buffer *leaf = NULL; - unsigned long nr; + struct extent_buffer *leaf; int level; int max_level; int replaced = 0; int ret; int err = 0; + u32 min_reserved; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -1648,34 +2128,25 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_unlock_up_safe(path, 0); } - if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { - trans = btrfs_start_transaction(root, 1); - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(reloc_root, path); + min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + memset(&next_key, 0, sizeof(next_key)); - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { + while (1) { + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { err = ret; goto out; } + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + goto out; + } + trans->block_rsv = rc->block_rsv; - leaf = path->nodes[0]; - btrfs_unlock_up_safe(path, 1); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - if (ret < 0) - err = ret; - goto out; - } - - memset(&next_key, 0, sizeof(next_key)); - - while (1) { - leaf = NULL; replaced = 0; - trans = btrfs_start_transaction(root, 1); max_level = level; ret = walk_down_reloc_tree(reloc_root, path, &level); @@ -1689,14 +2160,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, if (!find_next_key(path, level, &key) && btrfs_comp_cpu_keys(&next_key, &key) >= 0) { ret = 0; - } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_path(trans, root, reloc_root, - path, &next_key, &leaf, - level, max_level); } else { - ret = replace_path(trans, root, reloc_root, - path, &next_key, NULL, - level, max_level); + ret = replace_path(trans, root, reloc_root, path, + &next_key, level, max_level); } if (ret < 0) { err = ret; @@ -1708,16 +2174,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); replaced = 1; - } else if (leaf) { - /* - * no block got replaced, try replacing file extents - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - BUG_ON(ret < 0); } ret = walk_up_reloc_tree(reloc_root, path, &level); @@ -1733,15 +2189,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path->slots[level]); root_item->drop_level = level; - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - - btrfs_btree_balance_dirty(root, nr); + btrfs_end_transaction_throttle(trans, root); + trans = NULL; - /* - * put inodes outside transaction, otherwise we may deadlock. - */ - put_inodes(&inode_list); + btrfs_btree_balance_dirty(root); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -1765,14 +2216,13 @@ out: sizeof(root_item->drop_progress)); root_item->drop_level = 0; btrfs_set_root_refs(root_item, 0); + btrfs_update_reloc_root(trans, root); } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - - btrfs_btree_balance_dirty(root, nr); + if (trans) + btrfs_end_transaction_throttle(trans, root); - put_inodes(&inode_list); + btrfs_btree_balance_dirty(root); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -1780,74 +2230,174 @@ out: return err; } -/* - * callback for the work threads. - * this function merges reloc tree with corresponding fs tree, - * and then drops the reloc tree. - */ -static void merge_func(struct btrfs_work *work) +static noinline_for_stack +int prepare_to_merge(struct reloc_control *rc, int err) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root; + struct btrfs_root *root = rc->extent_root; struct btrfs_root *reloc_root; - struct async_merge *async; + struct btrfs_trans_handle *trans; + LIST_HEAD(reloc_roots); + u64 num_bytes = 0; + int ret; + + mutex_lock(&root->fs_info->reloc_mutex); + rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + rc->merging_rsv_size += rc->nodes_relocated * 2; + mutex_unlock(&root->fs_info->reloc_mutex); + +again: + if (!err) { + num_bytes = rc->merging_rsv_size; + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) + err = ret; + } + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + if (!err) + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + return PTR_ERR(trans); + } + + if (!err) { + if (num_bytes != rc->merging_rsv_size) { + btrfs_end_transaction(trans, rc->extent_root); + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + goto again; + } + } - async = container_of(work, struct async_merge, work); - reloc_root = async->root; + rc->merge_reloc_tree = 1; + + while (!list_empty(&rc->reloc_roots)) { + reloc_root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&reloc_root->root_list); - if (btrfs_root_refs(&reloc_root->root_item) > 0) { root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); - merge_reloc_root(async->rc, root); - - trans = btrfs_start_transaction(root, 1); + /* + * set reference count to 1, so btrfs_recover_relocation + * knows it should resumes merging + */ + if (!err) + btrfs_set_root_refs(&reloc_root->root_item, 1); btrfs_update_reloc_root(trans, root); - btrfs_end_transaction(trans, root); + + list_add(&reloc_root->root_list, &reloc_roots); } - btrfs_drop_snapshot(reloc_root, 0); + list_splice(&reloc_roots, &rc->reloc_roots); - if (atomic_dec_and_test(async->num_pending)) - complete(async->done); + if (!err) + btrfs_commit_transaction(trans, rc->extent_root); + else + btrfs_end_transaction(trans, rc->extent_root); + return err; +} - kfree(async); +static noinline_for_stack +void free_reloc_roots(struct list_head *list) +{ + struct btrfs_root *reloc_root; + + while (!list_empty(list)) { + reloc_root = list_entry(list->next, struct btrfs_root, + root_list); + __del_reloc_root(reloc_root); + } } -static int merge_reloc_roots(struct reloc_control *rc) +static noinline_for_stack +int merge_reloc_roots(struct reloc_control *rc) { - struct async_merge *async; struct btrfs_root *root; - struct completion done; - atomic_t num_pending; + struct btrfs_root *reloc_root; + u64 last_snap; + u64 otransid; + u64 objectid; + LIST_HEAD(reloc_roots); + int found = 0; + int ret = 0; +again: + root = rc->extent_root; - init_completion(&done); - atomic_set(&num_pending, 1); + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); - while (!list_empty(&rc->reloc_roots)) { - root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); - list_del_init(&root->root_list); + while (!list_empty(&reloc_roots)) { + found = 1; + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); - async = kmalloc(sizeof(*async), GFP_NOFS); - BUG_ON(!async); - async->work.func = merge_func; - async->work.flags = 0; - async->rc = rc; - async->root = root; - async->done = &done; - async->num_pending = &num_pending; - atomic_inc(&num_pending); - btrfs_queue_worker(&rc->workers, &async->work); + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + ret = merge_reloc_root(rc, root); + if (ret) { + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); + goto out; + } + } else { + list_del_init(&reloc_root->root_list); + } + + /* + * we keep the old last snapshod transid in rtranid when we + * created the relocation tree. + */ + last_snap = btrfs_root_rtransid(&reloc_root->root_item); + otransid = btrfs_root_otransid(&reloc_root->root_item); + objectid = reloc_root->root_key.offset; + + ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + if (ret < 0) { + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); + goto out; + } } - if (!atomic_dec_and_test(&num_pending)) - wait_for_completion(&done); + if (found) { + found = 0; + goto again; + } +out: + if (ret) { + btrfs_std_error(root->fs_info, ret); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + + /* new reloc root may be added */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); - return 0; + return ret; } static void free_block_list(struct rb_root *blocks) @@ -1876,119 +2426,175 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, return btrfs_record_root_in_trans(trans, root); } -/* - * select one tree from trees that references the block. - * for blocks in refernce counted trees, we preper reloc tree. - * if no reloc tree found and reloc_only is true, NULL is returned. - */ -static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], - int *nr, int reloc_only) +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct backref_edge *edges[]) { struct backref_node *next; struct btrfs_root *root; - int index; - int loop = 0; -again: - index = 0; + int index = 0; + next = node; while (1) { cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - if (!root) { - BUG_ON(!node->old_root); - goto skip; - } - - /* no other choice for non-refernce counted tree */ - if (!root->ref_cows) { - BUG_ON(reloc_only); - break; - } + BUG_ON(!root); + BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); break; } - if (loop) { - btrfs_record_root_in_trans(trans, root); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + + if (next->new_bytenr != root->node->start) { + BUG_ON(next->new_bytenr); + BUG_ON(!list_empty(&next->list)); + next->new_bytenr = root->node->start; + next->root = root; + list_add_tail(&next->list, + &rc->backref_cache.changed); + __mark_block_processed(rc, next); break; } - if (reloc_only || next != node) { - if (!root->reloc_root) - btrfs_record_root_in_trans(trans, root); - root = root->reloc_root; - /* - * if the reloc tree was created in current - * transation, there is no node in backref tree - * corresponds to the root of the reloc tree. - */ - if (btrfs_root_last_snapshot(&root->root_item) == - trans->transid - 1) - break; - } -skip: + WARN_ON(1); root = NULL; next = walk_down_backref(edges, &index); if (!next || next->level <= node->level) break; } + if (!root) + return NULL; - if (!root && !loop && !reloc_only) { - loop = 1; - goto again; + next = node; + /* setup backref node path for btrfs_reloc_cow_block */ + while (1) { + rc->backref_cache.path[next->level] = next; + if (--index < 0) + break; + next = edges[index]->node[UPPER]; } - - if (root) - *nr = index; - else - *nr = 0; - return root; } +/* + * select a tree root for relocation. return NULL if the block + * is reference counted. we should use do_relocation() in this + * case. return a tree root pointer if the block isn't reference + * counted. return -ENOENT if the block is root of reloc tree. + */ static noinline_for_stack struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, struct backref_node *node) { + struct backref_node *next; + struct btrfs_root *root; + struct btrfs_root *fs_root = NULL; struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int nr; - return __select_one_root(trans, node, edges, &nr, 0); + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + + /* no other choice for non-references counted tree */ + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + return root; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + fs_root = root; + + if (next != node) + return NULL; + + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!fs_root) + return ERR_PTR(-ENOENT); + return fs_root; } static noinline_for_stack -struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], int *nr) +u64 calcu_metadata_size(struct reloc_control *rc, + struct backref_node *node, int reserve) { - return __select_one_root(trans, node, edges, nr, 1); + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + u64 num_bytes = 0; + int index = 0; + + BUG_ON(reserve && node->processed); + + while (next) { + cond_resched(); + while (1) { + if (next->processed && (reserve || next != node)) + break; + + num_bytes += btrfs_level_size(rc->extent_root, + next->level); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } + return num_bytes; } -static void grab_path_buffers(struct btrfs_path *path, - struct backref_node *node, - struct backref_edge *edges[], int nr) +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node) { - int i = 0; - while (1) { - drop_node_buffer(node); - node->eb = path->nodes[node->level]; - BUG_ON(!node->eb); - if (path->locks[node->level]) - node->locked = 1; - path->nodes[node->level] = NULL; - path->locks[node->level] = 0; - - if (i >= nr) - break; - - edges[i]->blockptr = node->eb->start; - node = edges[i]->node[UPPER]; - i++; + struct btrfs_root *root = rc->extent_root; + u64 num_bytes; + int ret; + u64 tmp; + + num_bytes = calcu_metadata_size(rc, node, 1) * 2; + + trans->block_rsv = rc->block_rsv; + rc->reserved_bytes += num_bytes; + ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + if (ret == -EAGAIN) { + tmp = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return eailer in + * enospc case. + */ + rc->block_rsv->size = tmp + rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + } + return ret; } + + return 0; } /* @@ -1999,6 +2605,7 @@ static void grab_path_buffers(struct btrfs_path *path, * in that case this function just updates pointers. */ static int do_relocation(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct backref_node *node, struct btrfs_key *key, struct btrfs_path *path, int lowest) @@ -2011,7 +2618,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, u32 blocksize; u64 bytenr; u64 generation; - int nr; int slot; int ret; int err = 0; @@ -2019,18 +2625,25 @@ static int do_relocation(struct btrfs_trans_handle *trans, BUG_ON(lowest && node->eb); path->lowest_level = node->level + 1; + rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { cond_resched(); - if (node->eb && node->eb->start == edge->blockptr) - continue; upper = edge->node[UPPER]; - root = select_reloc_root(trans, upper, edges, &nr); - if (!root) - continue; - - if (upper->eb && !upper->locked) + root = select_reloc_root(trans, rc, upper, edges); + BUG_ON(!root); + + if (upper->eb && !upper->locked) { + if (!lowest) { + ret = btrfs_bin_search(upper->eb, key, + upper->level, &slot); + BUG_ON(ret); + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (node->eb->start == bytenr) + goto next; + } drop_node_buffer(upper); + } if (!upper->eb) { ret = btrfs_search_slot(trans, root, key, path, 0, 1); @@ -2040,12 +2653,18 @@ static int do_relocation(struct btrfs_trans_handle *trans, } BUG_ON(ret > 0); - slot = path->slots[upper->level]; + if (!upper->eb) { + upper->eb = path->nodes[upper->level]; + path->nodes[upper->level] = NULL; + } else { + BUG_ON(upper->eb != path->nodes[upper->level]); + } - btrfs_unlock_up_safe(path, upper->level + 1); - grab_path_buffers(path, upper, edges, nr); + upper->locked = 1; + path->locks[upper->level] = 0; - btrfs_release_path(NULL, path); + slot = path->slots[upper->level]; + btrfs_release_path(path); } else { ret = btrfs_bin_search(upper->eb, key, upper->level, &slot); @@ -2053,32 +2672,34 @@ static int do_relocation(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(upper->eb, slot); - if (!lowest) { - if (node->eb->start == bytenr) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - continue; - } + if (lowest) { + BUG_ON(bytenr != node->bytenr); } else { - BUG_ON(node->bytenr != bytenr); + if (node->eb->start == bytenr) + goto next; } blocksize = btrfs_level_size(root, node->level); generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, blocksize, generation); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + err = -EIO; + goto next; + } btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, slot, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); if (ret < 0) { err = ret; - break; + goto next; } - btrfs_set_lock_blocking(eb); - node->eb = eb; - node->locked = 1; + BUG_ON(node->eb != eb); } else { btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); @@ -2090,73 +2711,86 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0); + node->level, 0, 1); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); BUG_ON(ret); } - if (!lowest) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - } +next: + if (!upper->pending) + drop_node_buffer(upper); + else + unlock_node_buffer(upper); + if (err) + break; } + + if (!err && node->pending) { + drop_node_buffer(node); + list_move_tail(&node->list, &rc->backref_cache.changed); + node->pending = 0; + } + path->lowest_level = 0; + BUG_ON(err == -ENOSPC); return err; } static int link_to_upper(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct backref_node *node, struct btrfs_path *path) { struct btrfs_key key; - if (!node->eb || list_empty(&node->upper)) - return 0; btrfs_node_key_to_cpu(node->eb, &key, 0); - return do_relocation(trans, node, &key, path, 0); + return do_relocation(trans, rc, node, &key, path, 0); } static int finish_pending_nodes(struct btrfs_trans_handle *trans, - struct backref_cache *cache, - struct btrfs_path *path) + struct reloc_control *rc, + struct btrfs_path *path, int err) { + LIST_HEAD(list); + struct backref_cache *cache = &rc->backref_cache; struct backref_node *node; int level; int ret; - int err = 0; for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { node = list_entry(cache->pending[level].next, - struct backref_node, lower); - BUG_ON(node->level != level); + struct backref_node, list); + list_move_tail(&node->list, &list); + BUG_ON(!node->pending); - ret = link_to_upper(trans, node, path); - if (ret < 0) - err = ret; - /* - * this remove the node from the pending list and - * may add some other nodes to the level + 1 - * pending list - */ - remove_backref_node(cache, node); + if (!err) { + ret = link_to_upper(trans, rc, node, path); + if (ret < 0) + err = ret; + } } + list_splice_init(&list, &cache->pending[level]); } - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); return err; } static void mark_block_processed(struct reloc_control *rc, - struct backref_node *node) + u64 bytenr, u32 blocksize) +{ + set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, + EXTENT_DIRTY, GFP_NOFS); +} + +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node) { u32 blocksize; if (node->level == 0 || in_block_group(node->bytenr, rc->block_group)) { blocksize = btrfs_level_size(rc->extent_root, node->level); - set_extent_bits(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, - GFP_NOFS); + mark_block_processed(rc, node->bytenr, blocksize); } node->processed = 1; } @@ -2179,7 +2813,7 @@ static void update_processed_blocks(struct reloc_control *rc, if (next->processed) break; - mark_block_processed(rc, next); + __mark_block_processed(rc, next); if (list_empty(&next->upper)) break; @@ -2202,138 +2836,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, return 0; } -/* - * check if there are any file extent pointers in the leaf point to - * data require processing - */ -static int check_file_extents(struct reloc_control *rc, - u64 bytenr, u32 blocksize, u64 ptr_gen) -{ - struct btrfs_key found_key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - u32 nritems; - int i; - int ret = 0; - - leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &found_key, i); - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (bytenr == 0) - continue; - if (in_block_group(bytenr, rc->block_group)) { - ret = 1; - break; - } - } - free_extent_buffer(leaf); - return ret; -} - -/* - * scan child blocks of a given block to find blocks require processing - */ -static int add_child_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct rb_root *blocks) -{ - struct tree_block *block; - struct rb_node *rb_node; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; - u32 nritems; - int i; - int err = 0; - - nritems = btrfs_header_nritems(node->eb); - blocksize = btrfs_level_size(rc->extent_root, node->level - 1); - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - - readahead_tree_block(rc->extent_root, - bytenr, blocksize, ptr_gen); - } - - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - if (!in_block_group(bytenr, rc->block_group) && - !check_file_extents(rc, bytenr, blocksize, ptr_gen)) - continue; - - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = bytenr; - btrfs_node_key_to_cpu(node->eb, &block->key, i); - block->level = node->level - 1; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); - } - if (err) - free_block_list(blocks); - return err; -} - -/* - * find adjacent blocks require processing - */ -static noinline_for_stack -int add_adjacent_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_cache *cache, - struct rb_root *blocks, int level, - struct backref_node **upper) -{ - struct backref_node *node; - int ret = 0; - - WARN_ON(!list_empty(&cache->pending[level])); - - if (list_empty(&cache->pending[level + 1])) - return 1; - - node = list_entry(cache->pending[level + 1].next, - struct backref_node, lower); - if (node->eb) - ret = add_child_blocks(trans, rc, node, blocks); - - *upper = node; - return ret; -} - static int get_tree_block_key(struct reloc_control *rc, struct tree_block *block) { @@ -2342,6 +2844,10 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.objectid, block->key.offset); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } WARN_ON(btrfs_header_level(eb) != block->level); if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); @@ -2356,8 +2862,13 @@ static int reada_tree_block(struct reloc_control *rc, struct tree_block *block) { BUG_ON(block->key_ready); - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); + if (block->key.type == BTRFS_METADATA_ITEM_KEY) + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, + rc->extent_root->leafsize); + else + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); return 0; } @@ -2371,40 +2882,48 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_root *root; - int ret; + int ret = 0; + if (!node) + return 0; + + BUG_ON(node->processed); root = select_one_root(trans, node); - if (unlikely(!root)) { - rc->found_old_snapshot = 1; + if (root == ERR_PTR(-ENOENT)) { update_processed_blocks(rc, node); - return 0; + goto out; } - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = do_relocation(trans, node, key, path, 1); - if (ret < 0) - goto out; - if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_file_extents(trans, rc, root, - node->eb, NULL); - if (ret < 0) - goto out; - } - drop_node_buffer(node); - } else if (!root->ref_cows) { - path->lowest_level = node->level; - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(root, path); - if (ret < 0) + if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + ret = reserve_metadata_space(trans, rc, node); + if (ret) goto out; - } else if (root != node->root) { - WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); } - update_processed_blocks(rc, node); - ret = 0; + if (root) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + BUG_ON(node->new_bytenr); + BUG_ON(!list_empty(&node->list)); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + node->new_bytenr = root->node->start; + node->root = root; + list_add_tail(&node->list, &rc->backref_cache.changed); + } else { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(path); + if (ret > 0) + ret = 0; + } + if (!ret) + update_processed_blocks(rc, node); + } else { + ret = do_relocation(trans, rc, node, key, path, 1); + } out: - drop_node_buffer(node); + if (ret || node->level == 0 || node->cowonly) + remove_backref_node(&rc->backref_cache, node); return ret; } @@ -2415,34 +2934,22 @@ static noinline_for_stack int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct rb_root *blocks) { - struct backref_cache *cache; struct backref_node *node; struct btrfs_path *path; struct tree_block *block; struct rb_node *rb_node; - int level = -1; int ret; int err = 0; path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - cache = kmalloc(sizeof(*cache), GFP_NOFS); - if (!cache) { - btrfs_free_path(path); - return -ENOMEM; + if (!path) { + err = -ENOMEM; + goto out_free_blocks; } - backref_cache_init(cache); - rb_node = rb_first(blocks); while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - if (level == -1) - level = block->level; - else - BUG_ON(level != block->level); if (!block->key_ready) reada_tree_block(rc, block); rb_node = rb_next(rb_node); @@ -2451,8 +2958,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, rb_node = rb_first(blocks); while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - if (!block->key_ready) - get_tree_block_key(rc, block); + if (!block->key_ready) { + err = get_tree_block_key(rc, block); + if (err) + goto out_free_path; + } rb_node = rb_next(rb_node); } @@ -2460,7 +2970,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - node = build_backref_tree(rc, cache, &block->key, + node = build_backref_tree(rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { err = PTR_ERR(node); @@ -2470,79 +2980,64 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - err = ret; + if (ret != -EAGAIN || rb_node == rb_first(blocks)) + err = ret; goto out; } - remove_backref_node(cache, node); rb_node = rb_next(rb_node); } +out: + err = finish_pending_nodes(trans, rc, path, err); - if (level > 0) - goto out; - +out_free_path: + btrfs_free_path(path); +out_free_blocks: free_block_list(blocks); + return err; +} - /* - * now backrefs of some upper level tree blocks have been cached, - * try relocating blocks referenced by these upper level blocks. - */ - while (1) { - struct backref_node *upper = NULL; - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - break; - - ret = add_adjacent_blocks(trans, rc, cache, blocks, level, - &upper); - if (ret < 0) - err = ret; - if (ret != 0) - break; +static noinline_for_stack +int prealloc_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 alloc_hint = 0; + u64 start; + u64 end; + u64 offset = BTRFS_I(inode)->index_cnt; + u64 num_bytes; + int nr = 0; + int ret = 0; - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - goto out; - BUG_ON(!block->key_ready); - node = build_backref_tree(rc, cache, &block->key, - level, block->bytenr); - if (IS_ERR(node)) { - err = PTR_ERR(node); - goto out; - } + BUG_ON(cluster->start != cluster->boundary[0]); + mutex_lock(&inode->i_mutex); - ret = relocate_tree_block(trans, rc, node, - &block->key, path); - if (ret < 0) { - err = ret; - goto out; - } - remove_backref_node(cache, node); - rb_node = rb_next(rb_node); - } - free_block_list(blocks); + ret = btrfs_check_data_free_space(inode, cluster->end + + 1 - cluster->start); + if (ret) + goto out; - if (upper) { - ret = link_to_upper(trans, upper, path); - if (ret < 0) { - err = ret; - break; - } - remove_backref_node(cache, upper); - } + while (nr < cluster->nr) { + start = cluster->boundary[nr] - offset; + if (nr + 1 < cluster->nr) + end = cluster->boundary[nr + 1] - 1 - offset; + else + end = cluster->end - offset; + + lock_extent(&BTRFS_I(inode)->io_tree, start, end); + num_bytes = end + 1 - start; + ret = btrfs_prealloc_file_range(inode, 0, start, + num_bytes, num_bytes, + end + 1, &alloc_hint); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + if (ret) + break; + nr++; } + btrfs_free_reserved_data_space(inode, cluster->end + + 1 - cluster->start); out: - free_block_list(blocks); - - ret = finish_pending_nodes(trans, cache, path); - if (ret < 0) - err = ret; - - kfree(cache); - btrfs_free_path(path); - return err; + mutex_unlock(&inode->i_mutex); + return ret; } static noinline_for_stack @@ -2554,7 +3049,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, struct extent_map *em; int ret = 0; - em = alloc_extent_map(GFP_NOFS); + em = alloc_extent_map(); if (!em) return -ENOMEM; @@ -2565,10 +3060,10 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -2576,7 +3071,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, } btrfs_drop_extent_cache(inode, start, end, 0); } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); return ret; } @@ -2588,9 +3083,9 @@ static int relocate_file_extent_cluster(struct inode *inode, u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; - unsigned int dirty_page = 0; struct page *page; struct file_ra_state *ra; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int nr = 0; int ret = 0; @@ -2601,30 +3096,36 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!ra) return -ENOMEM; - index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; - last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + ret = prealloc_file_extent_cluster(inode, cluster); + if (ret) + goto out; - mutex_lock(&inode->i_mutex); + file_ra_state_init(ra, inode->i_mapping); - i_size_write(inode, cluster->end + 1 - offset); ret = setup_extent_mapping(inode, cluster->start - offset, cluster->end - offset, cluster->start); if (ret) - goto out_unlock; - - file_ra_state_init(ra, inode->i_mapping); + goto out; - WARN_ON(cluster->start != cluster->boundary[0]); + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { + ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + page = find_lock_page(inode->i_mapping, index); if (!page) { page_cache_sync_readahead(inode->i_mapping, ra, NULL, index, last_index + 1 - index); - page = grab_cache_page(inode->i_mapping, index); + page = find_or_create_page(inode->i_mapping, index, + mask); if (!page) { + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); ret = -ENOMEM; - goto out_unlock; + goto out; } } @@ -2640,16 +3141,17 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); ret = -EIO; - goto out_unlock; + goto out; } } - page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); set_page_extent_mapped(page); @@ -2660,31 +3162,21 @@ static int relocate_file_extent_cluster(struct inode *inode, EXTENT_BOUNDARY, GFP_NOFS); nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); set_page_dirty(page); - dirty_page++; unlock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end, GFP_NOFS); + page_start, page_end); unlock_page(page); page_cache_release(page); index++; - if (nr < cluster->nr && - page_end + 1 + offset == cluster->boundary[nr]) { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_page); - dirty_page = 0; - } - } - if (dirty_page) { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_page); + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(BTRFS_I(inode)->root); } WARN_ON(nr != cluster->nr); -out_unlock: - mutex_unlock(&inode->i_mutex); +out: kfree(ra); return ret; } @@ -2777,17 +3269,22 @@ static int add_tree_block(struct reloc_control *rc, struct rb_node *rb_node; u32 item_size; int level = -1; - int generation; + u64 generation; eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); - if (item_size >= sizeof(*ei) + sizeof(*bi)) { + if (extent_key->type == BTRFS_METADATA_ITEM_KEY || + item_size >= sizeof(*ei) + sizeof(*bi)) { ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - bi = (struct btrfs_tree_block_info *)(ei + 1); + if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { + bi = (struct btrfs_tree_block_info *)(ei + 1); + level = btrfs_tree_block_level(eb, bi); + } else { + level = (int)extent_key->offset; + } generation = btrfs_extent_generation(eb, ei); - level = btrfs_tree_block_level(eb, bi); } else { #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 u64 ref_owner; @@ -2796,6 +3293,8 @@ static int add_tree_block(struct reloc_control *rc, BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); ret = get_ref_objectid_v0(rc, path, extent_key, &ref_owner, NULL); + if (ret < 0) + return ret; BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); level = (int)ref_owner; /* FIXME: get real generation */ @@ -2805,7 +3304,7 @@ static int add_tree_block(struct reloc_control *rc, #endif } - btrfs_release_path(rc->extent_root, path); + btrfs_release_path(path); BUG_ON(level == -1); @@ -2814,13 +3313,14 @@ static int add_tree_block(struct reloc_control *rc, return -ENOMEM; block->bytenr = extent_key->objectid; - block->key.objectid = extent_key->offset; + block->key.objectid = rc->extent_root->leafsize; block->key.offset = generation; block->level = level; block->key_ready = 0; rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, block->bytenr); return 0; } @@ -2835,6 +3335,8 @@ static int __add_tree_block(struct reloc_control *rc, struct btrfs_path *path; struct btrfs_key key; int ret; + bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, + SKINNY_METADATA); if (tree_block_processed(bytenr, blocksize, rc)) return 0; @@ -2845,19 +3347,42 @@ static int __add_tree_block(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = blocksize; + if (skinny) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + } path->search_commit_root = 1; path->skip_locking = 1; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) goto out; + + if (ret > 0 && skinny) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + (key.type == BTRFS_METADATA_ITEM_KEY || + (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == blocksize))) + ret = 0; + } + + if (ret) { + skinny = false; + btrfs_release_path(path); + goto again; + } + } BUG_ON(ret); - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ret = add_tree_block(rc, &key, path, blocks); out: btrfs_free_path(path); @@ -2870,9 +3395,6 @@ out: static int block_use_full_backref(struct reloc_control *rc, struct extent_buffer *eb) { - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct btrfs_key key; u64 flags; int ret; @@ -2880,28 +3402,58 @@ static int block_use_full_backref(struct reloc_control *rc, btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) return 1; - path = btrfs_alloc_path(); - BUG_ON(!path); - - key.objectid = eb->start; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = eb->len; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, - &key, path, 0, 0); + ret = btrfs_lookup_extent_info(NULL, rc->extent_root, + eb->start, btrfs_header_level(eb), 1, + NULL, &flags); BUG_ON(ret); - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_extent_item); - flags = btrfs_extent_flags(path->nodes[0], ei); - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) ret = 1; else ret = 0; - btrfs_free_path(path); + return ret; +} + +static int delete_block_group_cache(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 ino) +{ + struct btrfs_key key; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (inode) + goto truncate; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode) || is_bad_inode(inode)) { + if (!IS_ERR(inode)) + iput(inode); + return -ENOENT; + } + +truncate: + ret = btrfs_check_trunc_cache_free_space(root, + &fs_info->global_block_rsv); + if (ret) + goto out; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_truncate_free_space_cache(root, trans, inode); + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root); +out: + iput(inode); return ret; } @@ -2931,15 +3483,28 @@ static int find_data_references(struct reloc_control *rc, int counted; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ref_root = btrfs_extent_data_ref_root(leaf, ref); ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); ref_offset = btrfs_extent_data_ref_offset(leaf, ref); ref_count = btrfs_extent_data_ref_count(leaf, ref); + /* + * This is an extent belonging to the free space cache, lets just delete + * it and redo the search. + */ + if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { + ret = delete_block_group_cache(rc->extent_root->fs_info, + NULL, ref_objectid); + if (ret != -ENOENT) + return ret; + ret = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -2947,8 +3512,11 @@ static int find_data_references(struct reloc_control *rc, } key.objectid = ref_objectid; - key.offset = ref_offset; key.type = BTRFS_EXTENT_DATA_KEY; + if (ref_offset > ((u64)-1 << 32)) + key.offset = 0; + else + key.offset = ref_offset; path->search_commit_root = 1; path->skip_locking = 1; @@ -2983,10 +3551,8 @@ static int find_data_references(struct reloc_control *rc, err = ret; goto out; } - if (ret > 0) { - WARN_ON(1); + if (WARN_ON(ret > 0)) goto out; - } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -3006,11 +3572,9 @@ static int find_data_references(struct reloc_control *rc, } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ref_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) { - WARN_ON(1); + if (WARN_ON(key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY)) break; - } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -3044,7 +3608,9 @@ static int find_data_references(struct reloc_control *rc, block->key_ready = 1; rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + block->bytenr); } if (counted) added = 1; @@ -3060,7 +3626,7 @@ out: } /* - * hepler to find all tree blocks that reference a given data extent + * helper to find all tree blocks that reference a given data extent */ static noinline_for_stack int add_data_references(struct reloc_control *rc, @@ -3074,22 +3640,10 @@ int add_data_references(struct reloc_control *rc, struct btrfs_extent_inline_ref *iref; unsigned long ptr; unsigned long end; - u32 blocksize; - int ret; + u32 blocksize = btrfs_level_size(rc->extent_root, 0); + int ret = 0; int err = 0; - ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, - extent_key->offset); - BUG_ON(ret < 0); - if (ret > 0) { - /* the relocated data is fragmented */ - rc->extents_skipped++; - btrfs_release_path(rc->extent_root, path); - return 0; - } - - blocksize = btrfs_level_size(rc->extent_root, 0); - eb = path->nodes[0]; ptr = btrfs_item_ptr_offset(eb, path->slots[0]); end = ptr + btrfs_item_size_nr(eb, path->slots[0]); @@ -3114,6 +3668,10 @@ int add_data_references(struct reloc_control *rc, } else { BUG(); } + if (ret) { + err = ret; + goto out; + } ptr += btrfs_extent_inline_ref_size(key.type); } WARN_ON(ptr > end); @@ -3159,18 +3717,20 @@ int add_data_references(struct reloc_control *rc, } path->slots[0]++; } - btrfs_release_path(rc->extent_root, path); +out: + btrfs_release_path(path); if (err) free_block_list(blocks); return err; } /* - * hepler to find next unprocessed extent + * helper to find next unprocessed extent */ static noinline_for_stack int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path) + struct reloc_control *rc, struct btrfs_path *path, + struct btrfs_key *extent_key) { struct btrfs_key key; struct extent_buffer *leaf; @@ -3210,42 +3770,62 @@ next: break; } - if (key.type != BTRFS_EXTENT_ITEM_KEY || + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) { + path->slots[0]++; + goto next; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY && key.objectid + key.offset <= rc->search_start) { path->slots[0]++; goto next; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.objectid + rc->extent_root->leafsize <= + rc->search_start) { + path->slots[0]++; + goto next; + } + ret = find_first_extent_bit(&rc->processed_blocks, key.objectid, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret == 0 && start <= key.objectid) { - btrfs_release_path(rc->extent_root, path); + btrfs_release_path(path); rc->search_start = end + 1; } else { - rc->search_start = key.objectid + key.offset; + if (key.type == BTRFS_EXTENT_ITEM_KEY) + rc->search_start = key.objectid + key.offset; + else + rc->search_start = key.objectid + + rc->extent_root->leafsize; + memcpy(extent_key, &key, sizeof(key)); return 0; } } - btrfs_release_path(rc->extent_root, path); + btrfs_release_path(path); return ret; } static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = rc; - mutex_unlock(&fs_info->trans_mutex); + mutex_unlock(&fs_info->reloc_mutex); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = NULL; - mutex_unlock(&fs_info->trans_mutex); + mutex_unlock(&fs_info->reloc_mutex); } static int check_extent_flags(u64 flags) @@ -3262,48 +3842,89 @@ static int check_extent_flags(u64 flags) return 0; } +static noinline_for_stack +int prepare_to_relocate(struct reloc_control *rc) +{ + struct btrfs_trans_handle *trans; + + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, + BTRFS_BLOCK_RSV_TEMP); + if (!rc->block_rsv) + return -ENOMEM; + + memset(&rc->cluster, 0, sizeof(rc->cluster)); + rc->search_start = rc->block_group->key.objectid; + rc->extents_found = 0; + rc->nodes_relocated = 0; + rc->merging_rsv_size = 0; + rc->reserved_bytes = 0; + rc->block_rsv->size = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + + rc->create_reloc_tree = 1; + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + /* + * extent tree is not a ref_cow tree and has no reloc_root to + * cleanup. And callers are responsible to free the above + * block rsv. + */ + return PTR_ERR(trans); + } + btrfs_commit_transaction(trans, rc->extent_root); + return 0; +} static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; struct btrfs_key key; - struct file_extent_cluster *cluster; struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; - unsigned long nr; u64 flags; u32 item_size; int ret; int err = 0; - - cluster = kzalloc(sizeof(*cluster), GFP_NOFS); - if (!cluster) - return -ENOMEM; + int progress = 0; path = btrfs_alloc_path(); - if (!path) { - kfree(cluster); + if (!path) return -ENOMEM; - } - - rc->extents_found = 0; - rc->extents_skipped = 0; - - rc->search_start = rc->block_group->key.objectid; - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); - - rc->create_reloc_root = 1; - set_reloc_control(rc); + path->reada = 1; - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + ret = prepare_to_relocate(rc); + if (ret) { + err = ret; + goto out_free; + } while (1) { - trans = btrfs_start_transaction(rc->extent_root, 1); + rc->reserved_bytes = 0; + ret = btrfs_block_rsv_refill(rc->extent_root, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + err = ret; + break; + } + progress++; + trans = btrfs_start_transaction(rc->extent_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + break; + } +restart: + if (update_backref_cache(trans, &rc->backref_cache)) { + btrfs_end_transaction(trans, rc->extent_root); + continue; + } - ret = find_next_extent(trans, rc, path); + ret = find_next_extent(trans, rc, path, &key); if (ret < 0) err = ret; if (ret != 0) @@ -3313,9 +3934,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - item_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (item_size >= sizeof(*ei)) { flags = btrfs_extent_flags(path->nodes[0], ei); ret = check_extent_flags(flags); @@ -3336,7 +3955,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) flags = BTRFS_EXTENT_FLAG_DATA; if (path_change) { - btrfs_release_path(rc->extent_root, path); + btrfs_release_path(path); path->search_commit_root = 1; path->skip_locking = 1; @@ -3356,73 +3975,99 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); } else if (rc->stage == UPDATE_DATA_PTRS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { + (flags & BTRFS_EXTENT_FLAG_DATA)) { ret = add_data_references(rc, &key, path, &blocks); } else { - btrfs_release_path(rc->extent_root, path); + btrfs_release_path(path); ret = 0; } if (ret < 0) { - err = 0; + err = ret; break; } if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { - err = ret; - break; + /* + * if we fail to relocate tree blocks, force to update + * backref cache when committing transaction. + */ + rc->backref_cache.last_trans = trans->transid - 1; + + if (ret != -EAGAIN) { + err = ret; + break; + } + rc->extents_found--; + rc->search_start = key.objectid; } } - nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root); trans = NULL; - btrfs_btree_balance_dirty(rc->extent_root, nr); if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { rc->found_file_extent = 1; ret = relocate_data_extent(rc->data_inode, - &key, cluster); + &key, &rc->cluster); if (ret < 0) { err = ret; break; } } } - btrfs_free_path(path); + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } + + btrfs_release_path(path); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); if (trans) { - nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root); } if (!err) { - ret = relocate_file_extent_cluster(rc->data_inode, cluster); + ret = relocate_file_extent_cluster(rc->data_inode, + &rc->cluster); if (ret < 0) err = ret; } - kfree(cluster); + rc->create_reloc_tree = 0; + set_reloc_control(rc); - rc->create_reloc_root = 0; - smp_mb(); + backref_cache_cleanup(&rc->backref_cache); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); - if (rc->extents_found > 0) { - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - } + err = prepare_to_merge(rc, err); merge_reloc_roots(rc); + rc->merge_reloc_tree = 0; unset_reloc_control(rc); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); /* get rid of pinned extents */ - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: + btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); + btrfs_free_path(path); return err; } @@ -3448,9 +4093,10 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(root, path); + btrfs_release_path(path); out: btrfs_free_path(path); return ret; @@ -3460,14 +4106,14 @@ out: * helper to create inode for data relocation. * the inode is in data relocation tree and its link count is 0 */ -static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) +static noinline_for_stack +struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_key key; - unsigned long nr; u64 objectid = BTRFS_FIRST_FREE_OBJECTID; int err = 0; @@ -3475,10 +4121,11 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, if (IS_ERR(root)) return ERR_CAST(root); - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return ERR_CAST(trans); - err = btrfs_find_free_objectid(trans, root, objectid, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out; @@ -3494,10 +4141,8 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, err = btrfs_orphan_add(trans, inode); out: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (err) { if (inode) iput(inode); @@ -3506,6 +4151,22 @@ out: return inode; } +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) +{ + struct reloc_control *rc; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return NULL; + + INIT_LIST_HEAD(&rc->reloc_roots); + backref_cache_init(&rc->backref_cache); + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, + fs_info->btree_inode->i_mapping); + return rc; +} + /* * function to relocate all extents in a block group. */ @@ -3513,25 +4174,49 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) { struct btrfs_fs_info *fs_info = extent_root->fs_info; struct reloc_control *rc; + struct inode *inode; + struct btrfs_path *path; int ret; + int rw = 0; int err = 0; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = alloc_reloc_control(fs_info); if (!rc) return -ENOMEM; - mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); - INIT_LIST_HEAD(&rc->reloc_roots); + rc->extent_root = extent_root; rc->block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!rc->block_group); - btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size, NULL); + if (!rc->block_group->ro) { + ret = btrfs_set_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; + } + rw = 1; + } - rc->extent_root = extent_root; - btrfs_prepare_block_group_relocation(extent_root, rc->block_group); + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group, + path); + btrfs_free_path(path); + + if (!IS_ERR(inode)) + ret = delete_block_group_cache(fs_info, inode, 0); + else + ret = PTR_ERR(inode); + + if (ret && ret != -ENOENT) { + err = ret; + goto out; + } rc->data_inode = create_reloc_inode(fs_info, rc->block_group); if (IS_ERR(rc->data_inode)) { @@ -3540,65 +4225,51 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) goto out; } - printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", - (unsigned long long)rc->block_group->key.objectid, - (unsigned long long)rc->block_group->flags); + btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", + rc->block_group->key.objectid, rc->block_group->flags); - btrfs_start_delalloc_inodes(fs_info->tree_root, 0); - btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); + ret = btrfs_start_delalloc_roots(fs_info, 0, -1); + if (ret < 0) { + err = ret; + goto out; + } + btrfs_wait_ordered_roots(fs_info, -1); while (1) { - rc->extents_found = 0; - rc->extents_skipped = 0; - mutex_lock(&fs_info->cleaner_mutex); - - btrfs_clean_old_snapshots(fs_info->tree_root); ret = relocate_block_group(rc); - mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; - break; + goto out; } if (rc->extents_found == 0) break; - printk(KERN_INFO "btrfs: found %llu extents\n", - (unsigned long long)rc->extents_found); + btrfs_info(extent_root->fs_info, "found %llu extents", + rc->extents_found); if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); + ret = btrfs_wait_ordered_range(rc->data_inode, 0, + (u64)-1); + if (ret) { + err = ret; + goto out; + } invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; - } else if (rc->stage == UPDATE_DATA_PTRS && - rc->extents_skipped >= rc->extents_found) { - iput(rc->data_inode); - rc->data_inode = create_reloc_inode(fs_info, - rc->block_group); - if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - break; - } - rc->stage = MOVE_DATA_EXTENTS; - rc->found_file_extent = 0; } } - filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); - WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); out: + if (err && rw) + btrfs_set_block_group_rw(extent_root, rc->block_group); iput(rc->data_inode); - btrfs_stop_workers(&rc->workers); btrfs_put_block_group(rc->block_group); kfree(rc); return err; @@ -3607,9 +4278,11 @@ out: static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - int ret; + int ret, err; - trans = btrfs_start_transaction(root->fs_info->tree_root, 1); + trans = btrfs_start_transaction(root->fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -3617,11 +4290,11 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) btrfs_set_root_refs(&root->root_item, 0); ret = btrfs_update_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); - BUG_ON(ret); - ret = btrfs_end_transaction(trans, root->fs_info->tree_root); - BUG_ON(ret); - return 0; + err = btrfs_end_transaction(trans, root->fs_info->tree_root); + if (err) + return err; + return ret; } /* @@ -3646,6 +4319,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = -1; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; @@ -3665,13 +4339,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(root->fs_info->tree_root, path); + btrfs_release_path(path); if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_fs_root_no_radix(root, &key); + reloc_root = btrfs_read_fs_root(root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; @@ -3688,7 +4362,11 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = ret; goto out; } - mark_garbage_root(reloc_root); + ret = mark_garbage_root(reloc_root); + if (ret < 0) { + err = ret; + goto out; + } } } @@ -3697,25 +4375,30 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.offset--; } - btrfs_release_path(root->fs_info->tree_root, path); + btrfs_release_path(path); if (list_empty(&reloc_roots)) goto out; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = alloc_reloc_control(root->fs_info); if (!rc) { err = -ENOMEM; goto out; } - mapping_tree_init(&rc->reloc_root_tree); - INIT_LIST_HEAD(&rc->reloc_roots); - btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + err = PTR_ERR(trans); + goto out_free; + } + + rc->merge_reloc_tree = 1; + while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); @@ -3729,34 +4412,35 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(root->fs_info, reloc_root->root_key.offset); - BUG_ON(IS_ERR(fs_root)); + if (IS_ERR(fs_root)) { + err = PTR_ERR(fs_root); + goto out_free; + } - __add_reloc_root(reloc_root); + err = __add_reloc_root(reloc_root); + BUG_ON(err < 0); /* -ENOMEM or logic error */ fs_root->reloc_root = reloc_root; } - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + err = btrfs_commit_transaction(trans, rc->extent_root); + if (err) + goto out_free; merge_reloc_roots(rc); unset_reloc_control(rc); - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + err = btrfs_commit_transaction(trans, rc->extent_root); +out_free: + kfree(rc); out: - if (rc) { - btrfs_stop_workers(&rc->workers); - kfree(rc); - } - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); - } + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + btrfs_free_path(path); if (err == 0) { @@ -3766,7 +4450,7 @@ out: if (IS_ERR(fs_root)) err = PTR_ERR(fs_root); else - btrfs_orphan_cleanup(fs_root); + err = btrfs_orphan_cleanup(fs_root); } return err; } @@ -3780,12 +4464,11 @@ out: int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; struct btrfs_root *root = BTRFS_I(inode)->root; - size_t offset; int ret; u64 disk_bytenr; + u64 new_bytenr; LIST_HEAD(list); ordered = btrfs_lookup_ordered_extent(inode, file_pos); @@ -3793,24 +4476,166 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, - disk_bytenr + len - 1, &list); + disk_bytenr + len - 1, &list, 0); + if (ret) + goto out; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); list_del_init(&sums->list); - sector_sum = sums->sums; - sums->bytenr = ordered->start; - - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } + /* + * We need to offset the new_bytenr based on where the csum is. + * We need to do this because we will read in entire prealloc + * extents but we may have written to say the middle of the + * prealloc extent, so we need to make sure the csum goes with + * the right disk offset. + * + * We can do this because the data reloc inode refers strictly + * to the on disk bytes, so we don't have to worry about + * disk_len vs real len like with real inodes since it's all + * disk length. + */ + new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); + sums->bytenr = new_bytenr; btrfs_add_ordered_sum(inode, ordered, sums); } +out: btrfs_put_ordered_extent(ordered); - return 0; + return ret; +} + +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow) +{ + struct reloc_control *rc; + struct backref_node *node; + int first_cow = 0; + int level; + int ret = 0; + + rc = root->fs_info->reloc_ctl; + if (!rc) + return 0; + + BUG_ON(rc->stage == UPDATE_DATA_PTRS && + root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (buf == root->node) + __update_reloc_root(root, cow->start); + } + + level = btrfs_header_level(buf); + if (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item)) + first_cow = 1; + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && + rc->create_reloc_tree) { + WARN_ON(!first_cow && level == 0); + + node = rc->backref_cache.path[level]; + BUG_ON(node->bytenr != buf->start && + node->new_bytenr != buf->start); + + drop_node_buffer(node); + extent_buffer_get(cow); + node->eb = cow; + node->new_bytenr = cow->start; + + if (!node->pending) { + list_move_tail(&node->list, + &rc->backref_cache.pending[level]); + node->pending = 1; + } + + if (first_cow) + __mark_block_processed(rc, node); + + if (first_cow && level > 0) + rc->nodes_relocated += buf->len; + } + + if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) + ret = replace_file_extents(trans, rc, root, cow); + return ret; +} + +/* + * called before creating snapshot. it calculates metadata reservation + * requried for relocating tree blocks in the snapshot + */ +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct reloc_control *rc; + + root = pending->root; + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + if (!rc->merge_reloc_tree) + return; + + root = root->reloc_root; + BUG_ON(btrfs_root_refs(&root->root_item) == 0); + /* + * relocation is in the stage of merging trees. the space + * used by merging a reloc tree is twice the size of + * relocated tree nodes in the worst case. half for cowing + * the reloc tree, half for cowing the fs tree. the space + * used by cowing the reloc tree will be freed after the + * tree is dropped. if we create snapshot, cowing the fs + * tree may use more space than it frees. so we need + * reserve extra space. + */ + *bytes_to_reserve += rc->nodes_relocated; +} + +/* + * called after snapshot is created. migrate block reservation + * and create reloc root for the newly created snapshot + */ +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *reloc_root; + struct btrfs_root *new_root; + struct reloc_control *rc; + int ret; + + if (!root->reloc_root) + return 0; + + rc = root->fs_info->reloc_ctl; + rc->merging_rsv_size += rc->nodes_relocated; + + if (rc->merge_reloc_tree) { + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + rc->block_rsv, + rc->nodes_relocated); + if (ret) + return ret; + } + + new_root = pending->snap; + reloc_root = create_reloc_root(trans, root->reloc_root, + new_root->root_key.objectid); + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); + + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); + new_root->reloc_root = reloc_root; + + if (rc->create_reloc_tree) + ret = clone_backref_node(trans, rc, root, reloc_root); + return ret; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 67fa2d29d66..360a728a639 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -16,114 +16,117 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/err.h> +#include <linux/uuid.h> #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" /* - * search forward for a root, starting with objectid 'search_start' - * if a root key is found, the objectid we find is filled into 'found_objectid' - * and 0 is returned. < 0 is returned on error, 1 if there is nothing - * left in the tree. + * Read a root item from the tree. In case we detect a root item smaller then + * sizeof(root_item), we know it's an old version of the root structure and + * initialize all new fields to zero. The same happens if we detect mismatching + * generation numbers as then we know the root was once mounted with an older + * kernel that was not aware of the root item structure change. */ -int btrfs_search_root(struct btrfs_root *root, u64 search_start, - u64 *found_objectid) +static void btrfs_read_root_item(struct extent_buffer *eb, int slot, + struct btrfs_root_item *item) { - struct btrfs_path *path; - struct btrfs_key search_key; - int ret; - - root = root->fs_info->tree_root; - search_key.objectid = search_start; - search_key.type = (u8)-1; - search_key.offset = (u64)-1; - - path = btrfs_alloc_path(); - BUG_ON(!path); -again: - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret < 0) - goto out; - if (ret == 0) { - ret = 1; - goto out; - } - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - } - btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]); - if (search_key.type != BTRFS_ROOT_ITEM_KEY) { - search_key.offset++; - btrfs_release_path(root, path); - goto again; + uuid_le uuid; + int len; + int need_reset = 0; + + len = btrfs_item_size_nr(eb, slot); + read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), + min_t(int, len, (int)sizeof(*item))); + if (len < sizeof(*item)) + need_reset = 1; + if (!need_reset && btrfs_root_generation(item) + != btrfs_root_generation_v2(item)) { + if (btrfs_root_generation_v2(item) != 0) { + printk(KERN_WARNING "BTRFS: mismatching " + "generation and generation_v2 " + "found in root item. This root " + "was probably mounted with an " + "older kernel. Resetting all " + "new fields.\n"); + } + need_reset = 1; } - ret = 0; - *found_objectid = search_key.objectid; + if (need_reset) { + memset(&item->generation_v2, 0, + sizeof(*item) - offsetof(struct btrfs_root_item, + generation_v2)); -out: - btrfs_free_path(path); - return ret; + uuid_le_gen(&uuid); + memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE); + } } /* - * lookup the root with the highest offset for a given objectid. The key we do - * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 - * on error. + * btrfs_find_root - lookup the root by the key. + * root: the root of the root tree + * search_key: the key to search + * path: the path we search + * root_item: the root item of the tree we look for + * root_key: the reak key of the tree we look for + * + * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * of the search key, just lookup the root with the highest offset for a + * given objectid. + * + * If we find something return 0, otherwise > 0, < 0 on error. */ -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, - struct btrfs_root_item *item, struct btrfs_key *key) +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key) { - struct btrfs_path *path; - struct btrfs_key search_key; struct btrfs_key found_key; struct extent_buffer *l; int ret; int slot; - search_key.objectid = objectid; - search_key.type = BTRFS_ROOT_ITEM_KEY; - search_key.offset = (u64)-1; - - path = btrfs_alloc_path(); - BUG_ON(!path); - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0); if (ret < 0) - goto out; + return ret; - BUG_ON(ret == 0); - if (path->slots[0] == 0) { - ret = 1; - goto out; + if (search_key->offset != -1ULL) { /* the search key is exact */ + if (ret > 0) + goto out; + } else { + BUG_ON(ret == 0); /* Logical error */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + ret = 0; } + l = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != objectid || + if (found_key.objectid != search_key->objectid || found_key.type != BTRFS_ROOT_ITEM_KEY) { ret = 1; goto out; } - if (item) - read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), - sizeof(*item)); - if (key) - memcpy(key, &found_key, sizeof(found_key)); - ret = 0; + + if (root_item) + btrfs_read_root_item(l, slot, root_item); + if (root_key) + memcpy(root_key, &found_key, sizeof(found_key)); out: - btrfs_free_path(path); + btrfs_release_path(path); return ret; } -int btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node) +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) { btrfs_set_root_bytenr(item, node->start); btrfs_set_root_level(item, btrfs_header_level(node)); btrfs_set_root_generation(item, btrfs_header_generation(node)); - return 0; } /* @@ -138,24 +141,67 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root int ret; int slot; unsigned long ptr; + int old_len; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); goto out; + } if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); - printk(KERN_CRIT "unable to update root key %llu %u %llu\n", - (unsigned long long)key->objectid, key->type, - (unsigned long long)key->offset); + btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu", + key->objectid, key->type, key->offset); BUG_ON(1); } l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr_offset(l, slot); + old_len = btrfs_item_size_nr(l, slot); + + /* + * If this is the first time we update the root item which originated + * from an older kernel, we need to enlarge the item size to make room + * for the added fields. + */ + if (old_len < sizeof(*item)) { + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, key, path, + -1, 1); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, + key, sizeof(*item)); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + } + + /* + * Update generation_v2 so at the next mount we know the new root + * fields are valid. + */ + btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); + write_extent_buffer(l, item, ptr, sizeof(*item)); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -163,95 +209,14 @@ out: return ret; } -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item) +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_root_item *item) { - int ret; - ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); - return ret; -} - -/* - * at mount time we want to find all the old transaction snapshots that were in - * the process of being deleted if we crashed. This is any root item with an - * offset lower than the latest root. They need to be queued for deletion to - * finish what was happening when we crashed. - */ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) -{ - struct btrfs_root *dead_root; - struct btrfs_item *item; - struct btrfs_root_item *ri; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_path *path; - int ret; - u32 nritems; - struct extent_buffer *leaf; - int slot; - - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = 0; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - -again: - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - if (slot >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } - item = btrfs_item_nr(leaf, slot); - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) - goto next; - - if (key.objectid < objectid) - goto next; - - if (key.objectid > objectid) - break; - - ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); - if (btrfs_disk_root_refs(leaf, ri) != 0) - goto next; - - memcpy(&found_key, &key, sizeof(key)); - key.offset++; - btrfs_release_path(root, path); - dead_root = - btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &found_key); - if (IS_ERR(dead_root)) { - ret = PTR_ERR(dead_root); - goto err; - } - - ret = btrfs_add_dead_root(dead_root); - if (ret) - goto err; - goto again; -next: - slot++; - path->slots[0]++; - } - ret = 0; -err: - btrfs_free_path(path); - return ret; + /* + * Make sure generation v1 and v2 match. See update_root for details. + */ + btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); + return btrfs_insert_item(trans, root, key, item, sizeof(*item)); } int btrfs_find_orphan_roots(struct btrfs_root *tree_root) @@ -259,8 +224,14 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_key key; + struct btrfs_key root_key; + struct btrfs_root *root; int err = 0; int ret; + bool can_recover = true; + + if (tree_root->fs_info->sb->s_flags & MS_RDONLY) + can_recover = false; path = btrfs_alloc_path(); if (!path) @@ -270,6 +241,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = 0; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + while (1) { ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); if (ret < 0) { @@ -288,19 +262,61 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(tree_root, path); + btrfs_release_path(path); if (key.objectid != BTRFS_ORPHAN_OBJECTID || key.type != BTRFS_ORPHAN_ITEM_KEY) break; - ret = btrfs_find_dead_roots(tree_root, key.offset); - if (ret) { - err = ret; + root_key.objectid = key.offset; + key.offset++; + + root = btrfs_read_fs_root(tree_root, &root_key); + err = PTR_ERR_OR_ZERO(root); + if (err && err != -ENOENT) { break; + } else if (err == -ENOENT) { + struct btrfs_trans_handle *trans; + + btrfs_release_path(path); + + trans = btrfs_join_transaction(tree_root); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_error(tree_root->fs_info, err, + "Failed to start trans to delete " + "orphan item"); + break; + } + err = btrfs_del_orphan_item(trans, tree_root, + root_key.objectid); + btrfs_end_transaction(trans, tree_root); + if (err) { + btrfs_error(tree_root->fs_info, err, + "Failed to delete root orphan " + "item"); + break; + } + continue; } - key.offset++; + err = btrfs_init_fs_root(root); + if (err) { + btrfs_free_fs_root(root); + break; + } + + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); + + err = btrfs_insert_fs_root(root->fs_info, root); + if (err) { + BUG_ON(err == -EEXIST); + btrfs_free_fs_root(root); + break; + } + + if (btrfs_root_refs(&root->root_item) == 0) + btrfs_add_dead_root(root); } btrfs_free_path(path); @@ -313,22 +329,16 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_path *path; int ret; - u32 refs; - struct btrfs_root_item *ri; - struct extent_buffer *leaf; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) goto out; BUG_ON(ret != 0); - leaf = path->nodes[0]; - ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); - refs = btrfs_disk_root_refs(leaf, ri); - BUG_ON(refs != 0); ret = btrfs_del_item(trans, root, path); out: btrfs_free_path(path); @@ -371,37 +381,26 @@ again: *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); - BUG_ON(ret); + if (ret) { + err = ret; + goto out; + } } else err = -ENOENT; if (key.type == BTRFS_ROOT_BACKREF_KEY) { - btrfs_release_path(tree_root, path); + btrfs_release_path(path); key.objectid = ref_id; key.type = BTRFS_ROOT_REF_KEY; key.offset = root_id; goto again; } +out: btrfs_free_path(path); return err; } -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id) -{ - struct btrfs_key key; - int ret; - - key.objectid = root_id; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = ref_id; - - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - return ret; -} - /* * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY * or BTRFS_ROOT_BACKREF_KEY. @@ -414,6 +413,8 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, * * For a back ref the root_id is the id of the subvol or snapshot and * ref_id is the id of the tree referencing it. + * + * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, @@ -437,7 +438,11 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); @@ -449,7 +454,7 @@ again: btrfs_mark_buffer_dirty(leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { - btrfs_release_path(tree_root, path); + btrfs_release_path(path); key.objectid = ref_id; key.type = BTRFS_ROOT_REF_KEY; key.offset = root_id; @@ -459,3 +464,34 @@ again: btrfs_free_path(path); return 0; } + +/* + * Old btrfs forgets to init root_item->flags and root_item->byte_limit + * for subvolumes. To work around this problem, we steal a bit from + * root_item->inode_item->flags, and use it to indicate if those fields + * have been properly initialized. + */ +void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) +{ + u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode); + + if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { + inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; + btrfs_set_stack_inode_flags(&root_item->inode, inode_flags); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); + } +} + +void btrfs_update_root_times(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root_item *item = &root->root_item; + struct timespec ct = CURRENT_TIME; + + spin_lock(&root->root_item_lock); + btrfs_set_root_ctransid(item, trans->transid); + btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); + btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); + spin_unlock(&root->root_item_lock); +} diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c new file mode 100644 index 00000000000..b6d198f5181 --- /dev/null +++ b/fs/btrfs/scrub.c @@ -0,0 +1,3489 @@ +/* + * Copyright (C) 2011, 2012 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/blkdev.h> +#include <linux/ratelimit.h> +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "ordered-data.h" +#include "transaction.h" +#include "backref.h" +#include "extent_io.h" +#include "dev-replace.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "raid56.h" + +/* + * This is only the first step towards a full-features scrub. It reads all + * extent and super block and verifies the checksums. In case a bad checksum + * is found or the extent cannot be read, good data will be written back if + * any can be found. + * + * Future enhancements: + * - In case an unrepairable extent is encountered, track which files are + * affected and report them + * - track and record media errors, throw out bad devices + * - add a mode to also read unallocated space + */ + +struct scrub_block; +struct scrub_ctx; + +/* + * the following three values only influence the performance. + * The last one configures the number of parallel and outstanding I/O + * operations. The first two values configure an upper limit for the number + * of (dynamically allocated) pages that are added to a bio. + */ +#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ +#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ + +/* + * the following value times PAGE_SIZE needs to be large enough to match the + * largest node/leaf/sector size that shall be supported. + * Values larger than BTRFS_STRIPE_LEN are not supported. + */ +#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ + +struct scrub_page { + struct scrub_block *sblock; + struct page *page; + struct btrfs_device *dev; + u64 flags; /* extent flags */ + u64 generation; + u64 logical; + u64 physical; + u64 physical_for_dev_replace; + atomic_t ref_count; + struct { + unsigned int mirror_num:8; + unsigned int have_csum:1; + unsigned int io_error:1; + }; + u8 csum[BTRFS_CSUM_SIZE]; +}; + +struct scrub_bio { + int index; + struct scrub_ctx *sctx; + struct btrfs_device *dev; + struct bio *bio; + int err; + u64 logical; + u64 physical; +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; +#else + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; +#endif + int page_count; + int next_free; + struct btrfs_work work; +}; + +struct scrub_block { + struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + int page_count; + atomic_t outstanding_pages; + atomic_t ref_count; /* free mem on transition to zero */ + struct scrub_ctx *sctx; + struct { + unsigned int header_error:1; + unsigned int checksum_error:1; + unsigned int no_io_error_seen:1; + unsigned int generation_error:1; /* also sets header_error */ + }; +}; + +struct scrub_wr_ctx { + struct scrub_bio *wr_curr_bio; + struct btrfs_device *tgtdev; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct mutex wr_lock; +}; + +struct scrub_ctx { + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; + struct btrfs_root *dev_root; + int first_free; + int curr; + atomic_t bios_in_flight; + atomic_t workers_pending; + spinlock_t list_lock; + wait_queue_head_t list_wait; + u16 csum_size; + struct list_head csum_list; + atomic_t cancel_req; + int readonly; + int pages_per_rd_bio; + u32 sectorsize; + u32 nodesize; + u32 leafsize; + + int is_dev_replace; + struct scrub_wr_ctx wr_ctx; + + /* + * statistics + */ + struct btrfs_scrub_progress stat; + spinlock_t stat_lock; +}; + +struct scrub_fixup_nodatasum { + struct scrub_ctx *sctx; + struct btrfs_device *dev; + u64 logical; + struct btrfs_root *root; + struct btrfs_work work; + int mirror_num; +}; + +struct scrub_nocow_inode { + u64 inum; + u64 offset; + u64 root; + struct list_head list; +}; + +struct scrub_copy_nocow_ctx { + struct scrub_ctx *sctx; + u64 logical; + u64 len; + int mirror_num; + u64 physical_for_dev_replace; + struct list_head inodes; + struct btrfs_work work; +}; + +struct scrub_warning { + struct btrfs_path *path; + u64 extent_item_size; + char *scratch_buf; + char *msg_buf; + const char *errstr; + sector_t sector; + u64 logical; + struct btrfs_device *dev; + int msg_bufsize; + int scratch_bufsize; +}; + + +static void scrub_pending_bio_inc(struct scrub_ctx *sctx); +static void scrub_pending_bio_dec(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, + struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, + u64 length, u64 logical, + struct scrub_block *sblocks_for_recheck); +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size); +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size); +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write); +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write); +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num); +static int scrub_checksum_data(struct scrub_block *sblock); +static int scrub_checksum_tree_block(struct scrub_block *sblock); +static int scrub_checksum_super(struct scrub_block *sblock); +static void scrub_block_get(struct scrub_block *sblock); +static void scrub_block_put(struct scrub_block *sblock); +static void scrub_page_get(struct scrub_page *spage); +static void scrub_page_put(struct scrub_page *spage); +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace); +static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace); +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); +static void scrub_wr_submit(struct scrub_ctx *sctx); +static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page); +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + struct scrub_copy_nocow_ctx *ctx); +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace); +static void copy_nocow_pages_worker(struct btrfs_work *work); +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); + + +static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +{ + atomic_inc(&sctx->bios_in_flight); +} + +static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +{ + atomic_dec(&sctx->bios_in_flight); + wake_up(&sctx->list_wait); +} + +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } +} + +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + wake_up(&fs_info->scrub_pause_wait); +} + +/* + * used for workers that require transaction commits (i.e., for the + * NOCOW case) + */ +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * increment scrubs_running to prevent cancel requests from + * completing as long as a worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. + */ + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + /* + * check if @scrubs_running=@scrubs_paused condition + * inside wait_event() is not an atomic operation. + * which means we may inc/dec @scrub_running/paused + * at any time. Let's wake up @scrub_pause_wait as + * much as we can to let commit transaction blocked less. + */ + wake_up(&fs_info->scrub_pause_wait); + + atomic_inc(&sctx->workers_pending); +} + +/* used for workers that require transaction commits */ +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * see scrub_pending_trans_workers_inc() why we're pretending + * to be paused in the scrub counters + */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sctx->workers_pending); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sctx->list_wait); +} + +static void scrub_free_csums(struct scrub_ctx *sctx) +{ + while (!list_empty(&sctx->csum_list)) { + struct btrfs_ordered_sum *sum; + sum = list_first_entry(&sctx->csum_list, + struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } +} + +static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) +{ + int i; + + if (!sctx) + return; + + scrub_free_wr_ctx(&sctx->wr_ctx); + + /* this can happen when scrub is cancelled */ + if (sctx->curr != -1) { + struct scrub_bio *sbio = sctx->bios[sctx->curr]; + + for (i = 0; i < sbio->page_count; i++) { + WARN_ON(!sbio->pagev[i]->page); + scrub_block_put(sbio->pagev[i]->sblock); + } + bio_put(sbio->bio); + } + + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { + struct scrub_bio *sbio = sctx->bios[i]; + + if (!sbio) + break; + kfree(sbio); + } + + scrub_free_csums(sctx); + kfree(sctx); +} + +static noinline_for_stack +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) +{ + struct scrub_ctx *sctx; + int i; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + int pages_per_rd_bio; + int ret; + + /* + * the setting of pages_per_rd_bio is correct for scrub but might + * be wrong for the dev_replace code where we might read from + * different devices in the initial huge bios. However, that + * code is able to correctly handle the case when adding a page + * to a bio fails. + */ + if (dev->bdev) + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, + bio_get_nr_vecs(dev->bdev)); + else + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; + sctx = kzalloc(sizeof(*sctx), GFP_NOFS); + if (!sctx) + goto nomem; + sctx->is_dev_replace = is_dev_replace; + sctx->pages_per_rd_bio = pages_per_rd_bio; + sctx->curr = -1; + sctx->dev_root = dev->dev_root; + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { + struct scrub_bio *sbio; + + sbio = kzalloc(sizeof(*sbio), GFP_NOFS); + if (!sbio) + goto nomem; + sctx->bios[i] = sbio; + + sbio->index = i; + sbio->sctx = sctx; + sbio->page_count = 0; + btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, + NULL, NULL); + + if (i != SCRUB_BIOS_PER_SCTX - 1) + sctx->bios[i]->next_free = i + 1; + else + sctx->bios[i]->next_free = -1; + } + sctx->first_free = 0; + sctx->nodesize = dev->dev_root->nodesize; + sctx->leafsize = dev->dev_root->leafsize; + sctx->sectorsize = dev->dev_root->sectorsize; + atomic_set(&sctx->bios_in_flight, 0); + atomic_set(&sctx->workers_pending, 0); + atomic_set(&sctx->cancel_req, 0); + sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); + INIT_LIST_HEAD(&sctx->csum_list); + + spin_lock_init(&sctx->list_lock); + spin_lock_init(&sctx->stat_lock); + init_waitqueue_head(&sctx->list_wait); + + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, + fs_info->dev_replace.tgtdev, is_dev_replace); + if (ret) { + scrub_free_ctx(sctx); + return ERR_PTR(ret); + } + return sctx; + +nomem: + scrub_free_ctx(sctx); + return ERR_PTR(-ENOMEM); +} + +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, + void *warn_ctx) +{ + u64 isize; + u32 nlink; + int ret; + int i; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct scrub_warning *swarn = warn_ctx; + struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key root_key; + + root_key.objectid = root; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + ret = inode_item_info(inum, 0, local_root, swarn->path); + if (ret) { + btrfs_release_path(swarn->path); + goto err; + } + + eb = swarn->path->nodes[0]; + inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], + struct btrfs_inode_item); + isize = btrfs_inode_size(eb, inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(swarn->path); + + ipath = init_ipath(4096, local_root, swarn->path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto err; + } + ret = paths_from_inode(inum, ipath); + + if (ret < 0) + goto err; + + /* + * we deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (i = 0; i < ipath->fspath->elem_cnt; ++i) + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu, " + "length %llu, links %u (path: %s)\n", swarn->errstr, + swarn->logical, rcu_str_deref(swarn->dev->name), + (unsigned long long)swarn->sector, root, inum, offset, + min(isize - offset, (u64)PAGE_SIZE), nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + + free_ipath(ipath); + return 0; + +err: + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu: path " + "resolving failed with ret=%d\n", swarn->errstr, + swarn->logical, rcu_str_deref(swarn->dev->name), + (unsigned long long)swarn->sector, root, inum, offset, ret); + + free_ipath(ipath); + return 0; +} + +static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) +{ + struct btrfs_device *dev; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct scrub_warning swarn; + unsigned long ptr = 0; + u64 extent_item_pos; + u64 flags = 0; + u64 ref_root; + u32 item_size; + u8 ref_level; + const int bufsize = 4096; + int ret; + + WARN_ON(sblock->page_count < 1); + dev = sblock->pagev[0]->dev; + fs_info = sblock->sctx->dev_root->fs_info; + + path = btrfs_alloc_path(); + + swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); + swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); + swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.logical = sblock->pagev[0]->logical; + swarn.errstr = errstr; + swarn.dev = NULL; + swarn.msg_bufsize = bufsize; + swarn.scratch_bufsize = bufsize; + + if (!path || !swarn.scratch_buf || !swarn.msg_buf) + goto out; + + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, + &flags); + if (ret < 0) + goto out; + + extent_item_pos = swarn.logical - found_key.objectid; + swarn.extent_item_size = found_key.offset; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); + printk_in_rcu(KERN_WARNING + "BTRFS: %s at logical %llu on dev %s, " + "sector %llu: metadata %s (level %d) in tree " + "%llu\n", errstr, swarn.logical, + rcu_str_deref(dev->name), + (unsigned long long)swarn.sector, + ref_level ? "node" : "leaf", + ret < 0 ? -1 : ref_level, + ret < 0 ? -1 : ref_root); + } while (ret != 1); + btrfs_release_path(path); + } else { + btrfs_release_path(path); + swarn.path = path; + swarn.dev = dev; + iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, 1, + scrub_print_warning_inode, &swarn); + } + +out: + btrfs_free_path(path); + kfree(swarn.scratch_buf); + kfree(swarn.msg_buf); +} + +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) +{ + struct page *page = NULL; + unsigned long index; + struct scrub_fixup_nodatasum *fixup = fixup_ctx; + int ret; + int corrected = 0; + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_fs_info *fs_info; + u64 end = offset + PAGE_SIZE - 1; + struct btrfs_root *local_root; + int srcu_index; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = fixup->root->fs_info; + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + return PTR_ERR(local_root); + } + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + if (PageDirty(page)) { + /* + * we need to write the data to the defect sector. the + * data that was in that sector is not in memory, + * because the page was modified. we must not write the + * modified page to that sector. + * + * TODO: what could be done here: wait for the delalloc + * runner to write out that page (might involve + * COW) and see whether the sector is still + * referenced afterwards. + * + * For the meantime, we'll treat this error + * incorrectable, although there is a chance that a + * later scrub will find the bad sector again and that + * there's no dirty page in memory, then. + */ + ret = -EIO; + goto out; + } + fs_info = BTRFS_I(inode)->root->fs_info; + ret = repair_io_failure(fs_info, offset, PAGE_SIZE, + fixup->logical, page, + fixup->mirror_num); + unlock_page(page); + corrected = !ret; + } else { + /* + * we need to get good data first. the general readpage path + * will call repair_io_failure for us, we just have to make + * sure we read the bad mirror. + */ + ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + if (ret) { + /* set_extent_bits should give proper error */ + WARN_ON(ret > 0); + if (ret > 0) + ret = -EFAULT; + goto out; + } + + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, + fixup->mirror_num); + wait_on_page_locked(page); + + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, + end, EXTENT_DAMAGED, 0, NULL); + if (!corrected) + clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + } + +out: + if (page) + put_page(page); + + iput(inode); + + if (ret < 0) + return ret; + + if (ret == 0 && corrected) { + /* + * we only need to call readpage for one of the inodes belonging + * to this extent. so make iterate_extent_inodes stop + */ + return 1; + } + + return -EIO; +} + +static void scrub_fixup_nodatasum(struct btrfs_work *work) +{ + int ret; + struct scrub_fixup_nodatasum *fixup; + struct scrub_ctx *sctx; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + int uncorrectable = 0; + + fixup = container_of(work, struct scrub_fixup_nodatasum, work); + sctx = fixup->sctx; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + ++sctx->stat.malloc_errors; + spin_unlock(&sctx->stat_lock); + uncorrectable = 1; + goto out; + } + + trans = btrfs_join_transaction(fixup->root); + if (IS_ERR(trans)) { + uncorrectable = 1; + goto out; + } + + /* + * the idea is to trigger a regular read through the standard path. we + * read a page from the (failed) logical address by specifying the + * corresponding copynum of the failed sector. thus, that readpage is + * expected to fail. + * that is the point where on-the-fly error correction will kick in + * (once it's finished) and rewrite the failed sector if a good copy + * can be found. + */ + ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, + path, scrub_fixup_readpage, + fixup); + if (ret < 0) { + uncorrectable = 1; + goto out; + } + WARN_ON(ret != 1); + + spin_lock(&sctx->stat_lock); + ++sctx->stat.corrected_errors; + spin_unlock(&sctx->stat_lock); + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fixup->root); + if (uncorrectable) { + spin_lock(&sctx->stat_lock); + ++sctx->stat.uncorrectable_errors; + spin_unlock(&sctx->stat_lock); + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info->dev_replace. + num_uncorrectable_read_errors); + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "unable to fixup (nodatasum) error at logical %llu on dev %s\n", + fixup->logical, rcu_str_deref(fixup->dev->name)); + } + + btrfs_free_path(path); + kfree(fixup); + + scrub_pending_trans_workers_dec(sctx); +} + +/* + * scrub_handle_errored_block gets called when either verification of the + * pages failed or the bio failed to read, e.g. with EIO. In the latter + * case, this function handles all pages in the bio, even though only one + * may be bad. + * The goal of this function is to repair the errored block by using the + * contents of one of the mirrors. + */ +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) +{ + struct scrub_ctx *sctx = sblock_to_check->sctx; + struct btrfs_device *dev; + struct btrfs_fs_info *fs_info; + u64 length; + u64 logical; + u64 generation; + unsigned int failed_mirror_index; + unsigned int is_metadata; + unsigned int have_csum; + u8 *csum; + struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ + struct scrub_block *sblock_bad; + int ret; + int mirror_index; + int page_num; + int success; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + BUG_ON(sblock_to_check->page_count < 1); + fs_info = sctx->dev_root->fs_info; + if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + return 0; + } + length = sblock_to_check->page_count * PAGE_SIZE; + logical = sblock_to_check->pagev[0]->logical; + generation = sblock_to_check->pagev[0]->generation; + BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0]->flags & + BTRFS_EXTENT_FLAG_DATA); + have_csum = sblock_to_check->pagev[0]->have_csum; + csum = sblock_to_check->pagev[0]->csum; + dev = sblock_to_check->pagev[0]->dev; + + if (sctx->is_dev_replace && !is_metadata && !have_csum) { + sblocks_for_recheck = NULL; + goto nodatasum_case; + } + + /* + * read all mirrors one after the other. This includes to + * re-read the extent or metadata block that failed (that was + * the cause that this fixup code is called) another time, + * page by page this time in order to know which pages + * caused I/O errors and which ones are good (for all mirrors). + * It is the goal to handle the situation when more than one + * mirror contains I/O errors, but the errors do not + * overlap, i.e. the data can be repaired by selecting the + * pages from those mirrors without I/O error on the + * particular pages. One example (with blocks >= 2 * PAGE_SIZE) + * would be that mirror #1 has an I/O error on the first page, + * the second page is good, and mirror #2 has an I/O error on + * the second page, but the first page is good. + * Then the first page of the first mirror can be repaired by + * taking the first page of the second mirror, and the + * second page of the second mirror can be repaired by + * copying the contents of the 2nd page of the 1st mirror. + * One more note: if the pages of one mirror contain I/O + * errors, the checksum cannot be verified. In order to get + * the best data for repairing, the first attempt is to find + * a mirror without I/O errors and with a validated checksum. + * Only if this is not possible, the pages are picked from + * mirrors with I/O errors without considering the checksum. + * If the latter is the case, at the end, the checksum of the + * repaired area is verified in order to correctly maintain + * the statistics. + */ + + sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * + sizeof(*sblocks_for_recheck), + GFP_NOFS); + if (!sblocks_for_recheck) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + goto out; + } + + /* setup the context, map the logical blocks and alloc the pages */ + ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, + logical, sblocks_for_recheck); + if (ret) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + goto out; + } + BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); + sblock_bad = sblocks_for_recheck + failed_mirror_index; + + /* build and submit the bios for the failed mirror, check checksums */ + scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sctx->csum_size); + + if (!sblock_bad->header_error && !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) { + /* + * the error disappeared after reading page by page, or + * the area was part of a huge bio and other parts of the + * bio caused I/O errors, or the block layer merged several + * read requests into one and the error is caused by a + * different bio (usually one of the two latter cases is + * the cause) + */ + spin_lock(&sctx->stat_lock); + sctx->stat.unverified_errors++; + spin_unlock(&sctx->stat_lock); + + if (sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock_bad); + goto out; + } + + if (!sblock_bad->no_io_error_seen) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + spin_unlock(&sctx->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sblock_to_check); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + } else if (sblock_bad->checksum_error) { + spin_lock(&sctx->stat_lock); + sctx->stat.csum_errors++; + spin_unlock(&sctx->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sblock_to_check); + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + } else if (sblock_bad->header_error) { + spin_lock(&sctx->stat_lock); + sctx->stat.verify_errors++; + spin_unlock(&sctx->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum/header error", + sblock_to_check); + if (sblock_bad->generation_error) + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_GENERATION_ERRS); + else + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + } + + if (sctx->readonly) { + ASSERT(!sctx->is_dev_replace); + goto out; + } + + if (!is_metadata && !have_csum) { + struct scrub_fixup_nodatasum *fixup_nodatasum; + +nodatasum_case: + WARN_ON(sctx->is_dev_replace); + + /* + * !is_metadata and !have_csum, this means that the data + * might not be COW'ed, that it might be modified + * concurrently. The general strategy to work on the + * commit root does not help in the case when COW is not + * used. + */ + fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); + if (!fixup_nodatasum) + goto did_not_correct_error; + fixup_nodatasum->sctx = sctx; + fixup_nodatasum->dev = dev; + fixup_nodatasum->logical = logical; + fixup_nodatasum->root = fs_info->extent_root; + fixup_nodatasum->mirror_num = failed_mirror_index + 1; + scrub_pending_trans_workers_inc(sctx); + btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, + NULL, NULL); + btrfs_queue_work(fs_info->scrub_workers, + &fixup_nodatasum->work); + goto out; + } + + /* + * now build and submit the bios for the other mirrors, check + * checksums. + * First try to pick the mirror which is completely without I/O + * errors and also does not have a checksum error. + * If one is found, and if a checksum is present, the full block + * that is known to contain an error is rewritten. Afterwards + * the block is known to be corrected. + * If a mirror is found which is completely correct, and no + * checksum is present, only those pages are rewritten that had + * an I/O error in the block to be repaired, since it cannot be + * determined, which copy of the other pages is better (and it + * could happen otherwise that a correct page would be + * overwritten by a bad one). + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other; + + if (mirror_index == failed_mirror_index) + continue; + sblock_other = sblocks_for_recheck + mirror_index; + + /* build and submit the bios, check checksums */ + scrub_recheck_block(fs_info, sblock_other, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + + if (!sblock_other->header_error && + !sblock_other->checksum_error && + sblock_other->no_io_error_seen) { + if (sctx->is_dev_replace) { + scrub_write_block_to_dev_replace(sblock_other); + } else { + int force_write = is_metadata || have_csum; + + ret = scrub_repair_block_from_good_copy( + sblock_bad, sblock_other, + force_write); + } + if (0 == ret) + goto corrected_error; + } + } + + /* + * for dev_replace, pick good pages and write to the target device. + */ + if (sctx->is_dev_replace) { + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { + int sub_success; + + sub_success = 0; + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = + sblocks_for_recheck + mirror_index; + struct scrub_page *page_other = + sblock_other->pagev[page_num]; + + if (!page_other->io_error) { + ret = scrub_write_page_to_dev_replace( + sblock_other, page_num); + if (ret == 0) { + /* succeeded for this page */ + sub_success = 1; + break; + } else { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + } + } + } + + if (!sub_success) { + /* + * did not find a mirror to fetch the page + * from. scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + success = 0; + ret = scrub_write_page_to_dev_replace( + sblock_bad, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info-> + dev_replace.num_write_errors); + } + } + + goto out; + } + + /* + * for regular scrub, repair those pages that are errored. + * In case of I/O errors in the area that is supposed to be + * repaired, continue by picking good copies of those pages. + * Select the good pages from mirrors to rewrite bad pages from + * the area to fix. Afterwards verify the checksum of the block + * that is supposed to be repaired. This verification step is + * only done for the purpose of statistic counting and for the + * final scrub report, whether errors remain. + * A perfect algorithm could make use of the checksum and try + * all possible combinations of pages from the different mirrors + * until the checksum verification succeeds. For example, when + * the 2nd page of mirror #1 faces I/O errors, and the 2nd page + * of mirror #2 is readable but the final checksum test fails, + * then the 2nd page of mirror #3 could be tried, whether now + * the final checksum succeedes. But this would be a rare + * exception and is therefore not implemented. At least it is + * avoided that the good copy is overwritten. + * A more useful improvement would be to pick the sectors + * without I/O error based on sector sizes (512 bytes on legacy + * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one + * mirror could be repaired by taking 512 byte of a different + * mirror, even if other 512 byte sectors in the same PAGE_SIZE + * area are unreadable. + */ + + /* can only fix I/O errors from here on */ + if (sblock_bad->no_io_error_seen) + goto did_not_correct_error; + + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + + if (!page_bad->io_error) + continue; + + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = sblocks_for_recheck + + mirror_index; + struct scrub_page *page_other = sblock_other->pagev[ + page_num]; + + if (!page_other->io_error) { + ret = scrub_repair_page_from_good_copy( + sblock_bad, sblock_other, page_num, 0); + if (0 == ret) { + page_bad->io_error = 0; + break; /* succeeded for this page */ + } + } + } + + if (page_bad->io_error) { + /* did not find a mirror to copy the page from */ + success = 0; + } + } + + if (success) { + if (is_metadata || have_csum) { + /* + * need to verify the checksum now that all + * sectors on disk are repaired (the write + * request for data to be repaired is on its way). + * Just be lazy and use scrub_recheck_block() + * which re-reads the data before the checksum + * is verified, but most likely the data comes out + * of the page cache. + */ + scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sctx->csum_size); + if (!sblock_bad->header_error && + !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) + goto corrected_error; + else + goto did_not_correct_error; + } else { +corrected_error: + spin_lock(&sctx->stat_lock); + sctx->stat.corrected_errors++; + spin_unlock(&sctx->stat_lock); + printk_ratelimited_in_rcu(KERN_ERR + "BTRFS: fixed up error at logical %llu on dev %s\n", + logical, rcu_str_deref(dev->name)); + } + } else { +did_not_correct_error: + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + printk_ratelimited_in_rcu(KERN_ERR + "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", + logical, rcu_str_deref(dev->name)); + } + +out: + if (sblocks_for_recheck) { + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; + mirror_index++) { + struct scrub_block *sblock = sblocks_for_recheck + + mirror_index; + int page_index; + + for (page_index = 0; page_index < sblock->page_count; + page_index++) { + sblock->pagev[page_index]->sblock = NULL; + scrub_page_put(sblock->pagev[page_index]); + } + } + kfree(sblocks_for_recheck); + } + + return 0; +} + +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, + struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, + u64 length, u64 logical, + struct scrub_block *sblocks_for_recheck) +{ + int page_index; + int mirror_index; + int ret; + + /* + * note: the two members ref_count and outstanding_pages + * are not used (and not set) in the blocks that are used for + * the recheck procedure + */ + + page_index = 0; + while (length > 0) { + u64 sublen = min_t(u64, length, PAGE_SIZE); + u64 mapped_length = sublen; + struct btrfs_bio *bbio = NULL; + + /* + * with a length of PAGE_SIZE, each returned stripe + * represents one mirror + */ + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, + &mapped_length, &bbio, 0); + if (ret || !bbio || mapped_length < sublen) { + kfree(bbio); + return -EIO; + } + + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); + for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; + mirror_index++) { + struct scrub_block *sblock; + struct scrub_page *page; + + if (mirror_index >= BTRFS_MAX_MIRRORS) + continue; + + sblock = sblocks_for_recheck + mirror_index; + sblock->sctx = sctx; + page = kzalloc(sizeof(*page), GFP_NOFS); + if (!page) { +leave_nomem: + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + kfree(bbio); + return -ENOMEM; + } + scrub_page_get(page); + sblock->pagev[page_index] = page; + page->logical = logical; + page->physical = bbio->stripes[mirror_index].physical; + BUG_ON(page_index >= original_sblock->page_count); + page->physical_for_dev_replace = + original_sblock->pagev[page_index]-> + physical_for_dev_replace; + /* for missing devices, dev->bdev is NULL */ + page->dev = bbio->stripes[mirror_index].dev; + page->mirror_num = mirror_index + 1; + sblock->page_count++; + page->page = alloc_page(GFP_NOFS); + if (!page->page) + goto leave_nomem; + } + kfree(bbio); + length -= sublen; + logical += sublen; + page_index++; + } + + return 0; +} + +/* + * this function will check the on disk data for checksum errors, header + * errors and read I/O errors. If any I/O errors happen, the exact pages + * which are errored are marked as being bad. The goal is to enable scrub + * to take those pages that are not errored from all the mirrors so that + * the pages that are errored in the just handled mirror can be repaired. + */ +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size) +{ + int page_num; + + sblock->no_io_error_seen = 1; + sblock->header_error = 0; + sblock->checksum_error = 0; + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + struct bio *bio; + struct scrub_page *page = sblock->pagev[page_num]; + + if (page->dev->bdev == NULL) { + page->io_error = 1; + sblock->no_io_error_seen = 0; + continue; + } + + WARN_ON(!page->page); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) { + page->io_error = 1; + sblock->no_io_error_seen = 0; + continue; + } + bio->bi_bdev = page->dev->bdev; + bio->bi_iter.bi_sector = page->physical >> 9; + + bio_add_page(bio, page->page, PAGE_SIZE, 0); + if (btrfsic_submit_bio_wait(READ, bio)) + sblock->no_io_error_seen = 0; + + bio_put(bio); + } + + if (sblock->no_io_error_seen) + scrub_recheck_block_checksum(fs_info, sblock, is_metadata, + have_csum, csum, generation, + csum_size); + + return; +} + +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size) +{ + int page_num; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + void *mapped_buffer; + + WARN_ON(!sblock->pagev[0]->page); + if (is_metadata) { + struct btrfs_header *h; + + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); + h = (struct btrfs_header *)mapped_buffer; + + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || + memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || + memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) { + sblock->header_error = 1; + } else if (generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + } + csum = h->csum; + } else { + if (!have_csum) + return; + + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); + } + + for (page_num = 0;;) { + if (page_num == 0 && is_metadata) + crc = btrfs_csum_data( + ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, + crc, PAGE_SIZE - BTRFS_CSUM_SIZE); + else + crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE); + + kunmap_atomic(mapped_buffer); + page_num++; + if (page_num >= sblock->page_count) + break; + WARN_ON(!sblock->pagev[page_num]->page); + + mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page); + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, csum, csum_size)) + sblock->checksum_error = 1; +} + +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write) +{ + int page_num; + int ret = 0; + + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + int ret_sub; + + ret_sub = scrub_repair_page_from_good_copy(sblock_bad, + sblock_good, + page_num, + force_write); + if (ret_sub) + ret = ret_sub; + } + + return ret; +} + +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write) +{ + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_page *page_good = sblock_good->pagev[page_num]; + + BUG_ON(page_bad->page == NULL); + BUG_ON(page_good->page == NULL); + if (force_write || sblock_bad->header_error || + sblock_bad->checksum_error || page_bad->io_error) { + struct bio *bio; + int ret; + + if (!page_bad->dev->bdev) { + printk_ratelimited(KERN_WARNING "BTRFS: " + "scrub_repair_page_from_good_copy(bdev == NULL) " + "is unexpected!\n"); + return -EIO; + } + + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_bdev = page_bad->dev->bdev; + bio->bi_iter.bi_sector = page_bad->physical >> 9; + + ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; + } + + if (btrfsic_submit_bio_wait(WRITE, bio)) { + btrfs_dev_stat_inc_and_print(page_bad->dev, + BTRFS_DEV_STAT_WRITE_ERRS); + btrfs_dev_replace_stats_inc( + &sblock_bad->sctx->dev_root->fs_info-> + dev_replace.num_write_errors); + bio_put(bio); + return -EIO; + } + bio_put(bio); + } + + return 0; +} + +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) +{ + int page_num; + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + int ret; + + ret = scrub_write_page_to_dev_replace(sblock, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sblock->sctx->dev_root->fs_info->dev_replace. + num_write_errors); + } +} + +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num) +{ + struct scrub_page *spage = sblock->pagev[page_num]; + + BUG_ON(spage->page == NULL); + if (spage->io_error) { + void *mapped_buffer = kmap_atomic(spage->page); + + memset(mapped_buffer, 0, PAGE_CACHE_SIZE); + flush_dcache_page(spage->page); + kunmap_atomic(mapped_buffer); + } + return scrub_add_page_to_wr_bio(sblock->sctx, spage); +} + +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + int ret; + + mutex_lock(&wr_ctx->wr_lock); +again: + if (!wr_ctx->wr_curr_bio) { + wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), + GFP_NOFS); + if (!wr_ctx->wr_curr_bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + wr_ctx->wr_curr_bio->sctx = sctx; + wr_ctx->wr_curr_bio->page_count = 0; + } + sbio = wr_ctx->wr_curr_bio; + if (sbio->page_count == 0) { + struct bio *bio; + + sbio->physical = spage->physical_for_dev_replace; + sbio->logical = spage->logical; + sbio->dev = wr_ctx->tgtdev; + bio = sbio->bio; + if (!bio) { + bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + if (!bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + sbio->bio = bio; + } + + bio->bi_private = sbio; + bio->bi_end_io = scrub_wr_bio_end_io; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_iter.bi_sector = sbio->physical >> 9; + sbio->err = 0; + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical_for_dev_replace || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical) { + scrub_wr_submit(sctx); + goto again; + } + + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); + return -EIO; + } + scrub_wr_submit(sctx); + goto again; + } + + sbio->pagev[sbio->page_count] = spage; + scrub_page_get(spage); + sbio->page_count++; + if (sbio->page_count == wr_ctx->pages_per_wr_bio) + scrub_wr_submit(sctx); + mutex_unlock(&wr_ctx->wr_lock); + + return 0; +} + +static void scrub_wr_submit(struct scrub_ctx *sctx) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + + if (!wr_ctx->wr_curr_bio) + return; + + sbio = wr_ctx->wr_curr_bio; + wr_ctx->wr_curr_bio = NULL; + WARN_ON(!sbio->bio->bi_bdev); + scrub_pending_bio_inc(sctx); + /* process all writes in a single worker thread. Then the block layer + * orders the requests before sending them to the driver which + * doubled the write performance on spinning disks when measured + * with Linux 3.5 */ + btrfsic_submit_bio(WRITE, sbio->bio); +} + +static void scrub_wr_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); +} + +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_ctx *sctx = sbio->sctx; + int i; + + WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); + if (sbio->err) { + struct btrfs_dev_replace *dev_replace = + &sbio->sctx->dev_root->fs_info->dev_replace; + + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + btrfs_dev_replace_stats_inc(&dev_replace-> + num_write_errors); + } + } + + for (i = 0; i < sbio->page_count; i++) + scrub_page_put(sbio->pagev[i]); + + bio_put(sbio->bio); + kfree(sbio); + scrub_pending_bio_dec(sctx); +} + +static int scrub_checksum(struct scrub_block *sblock) +{ + u64 flags; + int ret; + + WARN_ON(sblock->page_count < 1); + flags = sblock->pagev[0]->flags; + ret = 0; + if (flags & BTRFS_EXTENT_FLAG_DATA) + ret = scrub_checksum_data(sblock); + else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = scrub_checksum_tree_block(sblock); + else if (flags & BTRFS_EXTENT_FLAG_SUPER) + (void)scrub_checksum_super(sblock); + else + WARN_ON(1); + if (ret) + scrub_handle_errored_block(sblock); + + return ret; +} + +static int scrub_checksum_data(struct scrub_block *sblock) +{ + struct scrub_ctx *sctx = sblock->sctx; + u8 csum[BTRFS_CSUM_SIZE]; + u8 *on_disk_csum; + struct page *page; + void *buffer; + u32 crc = ~(u32)0; + int fail = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + if (!sblock->pagev[0]->have_csum) + return 0; + + on_disk_csum = sblock->pagev[0]->csum; + page = sblock->pagev[0]->page; + buffer = kmap_atomic(page); + + len = sctx->sectorsize; + index = 0; + for (;;) { + u64 l = min_t(u64, len, PAGE_SIZE); + + crc = btrfs_csum_data(buffer, crc, l); + kunmap_atomic(buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; + buffer = kmap_atomic(page); + } + + btrfs_csum_final(crc, csum); + if (memcmp(csum, on_disk_csum, sctx->csum_size)) + fail = 1; + + return fail; +} + +static int scrub_checksum_tree_block(struct scrub_block *sblock) +{ + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_header *h; + struct btrfs_root *root = sctx->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; + u32 crc = ~(u32)0; + int fail = 0; + int crc_fail = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0]->page; + mapped_buffer = kmap_atomic(page); + h = (struct btrfs_header *)mapped_buffer; + memcpy(on_disk_csum, h->csum, sctx->csum_size); + + /* + * we don't use the getter functions here, as we + * a) don't have an extent buffer and + * b) the page is already kmapped + */ + + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) + ++fail; + + if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) + ++fail; + + if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + ++fail; + + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) + ++fail; + + WARN_ON(sctx->nodesize != sctx->leafsize); + len = sctx->nodesize - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(p, crc, l); + kunmap_atomic(mapped_buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; + mapped_buffer = kmap_atomic(page); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) + ++crc_fail; + + return fail || crc_fail; +} + +static int scrub_checksum_super(struct scrub_block *sblock) +{ + struct btrfs_super_block *s; + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_root *root = sctx->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; + u32 crc = ~(u32)0; + int fail_gen = 0; + int fail_cor = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0]->page; + mapped_buffer = kmap_atomic(page); + s = (struct btrfs_super_block *)mapped_buffer; + memcpy(on_disk_csum, s->csum, sctx->csum_size); + + if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) + ++fail_cor; + + if (sblock->pagev[0]->generation != btrfs_super_generation(s)) + ++fail_gen; + + if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + ++fail_cor; + + len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(p, crc, l); + kunmap_atomic(mapped_buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; + mapped_buffer = kmap_atomic(page); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) + ++fail_cor; + + if (fail_cor + fail_gen) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + if (fail_cor) + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + else + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, + BTRFS_DEV_STAT_GENERATION_ERRS); + } + + return fail_cor + fail_gen; +} + +static void scrub_block_get(struct scrub_block *sblock) +{ + atomic_inc(&sblock->ref_count); +} + +static void scrub_block_put(struct scrub_block *sblock) +{ + if (atomic_dec_and_test(&sblock->ref_count)) { + int i; + + for (i = 0; i < sblock->page_count; i++) + scrub_page_put(sblock->pagev[i]); + kfree(sblock); + } +} + +static void scrub_page_get(struct scrub_page *spage) +{ + atomic_inc(&spage->ref_count); +} + +static void scrub_page_put(struct scrub_page *spage) +{ + if (atomic_dec_and_test(&spage->ref_count)) { + if (spage->page) + __free_page(spage->page); + kfree(spage); + } +} + +static void scrub_submit(struct scrub_ctx *sctx) +{ + struct scrub_bio *sbio; + + if (sctx->curr == -1) + return; + + sbio = sctx->bios[sctx->curr]; + sctx->curr = -1; + scrub_pending_bio_inc(sctx); + + if (!sbio->bio->bi_bdev) { + /* + * this case should not happen. If btrfs_map_block() is + * wrong, it could happen for dev-replace operations on + * missing devices when no mirrors are available, but in + * this case it should already fail the mount. + * This case is handled correctly (but _very_ slowly). + */ + printk_ratelimited(KERN_WARNING + "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); + bio_endio(sbio->bio, -EIO); + } else { + btrfsic_submit_bio(READ, sbio->bio); + } +} + +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) +{ + struct scrub_block *sblock = spage->sblock; + struct scrub_bio *sbio; + int ret; + +again: + /* + * grab a fresh bio or wait for one to become available + */ + while (sctx->curr == -1) { + spin_lock(&sctx->list_lock); + sctx->curr = sctx->first_free; + if (sctx->curr != -1) { + sctx->first_free = sctx->bios[sctx->curr]->next_free; + sctx->bios[sctx->curr]->next_free = -1; + sctx->bios[sctx->curr]->page_count = 0; + spin_unlock(&sctx->list_lock); + } else { + spin_unlock(&sctx->list_lock); + wait_event(sctx->list_wait, sctx->first_free != -1); + } + } + sbio = sctx->bios[sctx->curr]; + if (sbio->page_count == 0) { + struct bio *bio; + + sbio->physical = spage->physical; + sbio->logical = spage->logical; + sbio->dev = spage->dev; + bio = sbio->bio; + if (!bio) { + bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); + if (!bio) + return -ENOMEM; + sbio->bio = bio; + } + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_iter.bi_sector = sbio->physical >> 9; + sbio->err = 0; + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical || + sbio->dev != spage->dev) { + scrub_submit(sctx); + goto again; + } + + sbio->pagev[sbio->page_count] = spage; + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + return -EIO; + } + scrub_submit(sctx); + goto again; + } + + scrub_block_get(sblock); /* one for the page added to the bio */ + atomic_inc(&sblock->outstanding_pages); + sbio->page_count++; + if (sbio->page_count == sctx->pages_per_rd_bio) + scrub_submit(sctx); + + return 0; +} + +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace) +{ + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + /* one ref inside this function, plus one for each page added to + * a bio later on */ + atomic_set(&sblock->ref_count, 1); + sblock->sctx = sctx; + sblock->no_io_error_seen = 1; + + for (index = 0; len > 0; index++) { + struct scrub_page *spage; + u64 l = min_t(u64, len, PAGE_SIZE); + + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { +leave_nomem: + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + scrub_block_put(sblock); + return -ENOMEM; + } + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + scrub_page_get(spage); + sblock->pagev[index] = spage; + spage->sblock = sblock; + spage->dev = dev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->physical_for_dev_replace = physical_for_dev_replace; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sctx->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) + goto leave_nomem; + len -= l; + logical += l; + physical += l; + physical_for_dev_replace += l; + } + + WARN_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev[index]; + int ret; + + ret = scrub_add_page_to_rd_bio(sctx, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } + } + + if (force) + scrub_submit(sctx); + + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); + return 0; +} + +static void scrub_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_queue_work(fs_info->scrub_workers, &sbio->work); +} + +static void scrub_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_ctx *sctx = sbio->sctx; + int i; + + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); + if (sbio->err) { + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + spage->sblock->no_io_error_seen = 0; + } + } + + /* now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + struct scrub_block *sblock = spage->sblock; + + if (atomic_dec_and_test(&sblock->outstanding_pages)) + scrub_block_complete(sblock); + scrub_block_put(sblock); + } + + bio_put(sbio->bio); + sbio->bio = NULL; + spin_lock(&sctx->list_lock); + sbio->next_free = sctx->first_free; + sctx->first_free = sbio->index; + spin_unlock(&sctx->list_lock); + + if (sctx->is_dev_replace && + atomic_read(&sctx->wr_ctx.flush_all_writes)) { + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + } + + scrub_pending_bio_dec(sctx); +} + +static void scrub_block_complete(struct scrub_block *sblock) +{ + if (!sblock->no_io_error_seen) { + scrub_handle_errored_block(sblock); + } else { + /* + * if has checksum error, write via repair mechanism in + * dev replace case, otherwise write here in dev replace + * case. + */ + if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock); + } +} + +static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, + u8 *csum) +{ + struct btrfs_ordered_sum *sum = NULL; + unsigned long index; + unsigned long num_sectors; + + while (!list_empty(&sctx->csum_list)) { + sum = list_first_entry(&sctx->csum_list, + struct btrfs_ordered_sum, list); + if (sum->bytenr > logical) + return 0; + if (sum->bytenr + sum->len > logical) + break; + + ++sctx->stat.csum_discards; + list_del(&sum->list); + kfree(sum); + sum = NULL; + } + if (!sum) + return 0; + + index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; + num_sectors = sum->len / sctx->sectorsize; + memcpy(csum, sum->sums + index, sctx->csum_size); + if (index == num_sectors - 1) { + list_del(&sum->list); + kfree(sum); + } + return 1; +} + +/* scrub extent tries to collect up to 64 kB for each bio */ +static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u64 physical_for_dev_replace) +{ + int ret; + u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sctx->sectorsize; + spin_lock(&sctx->stat_lock); + sctx->stat.data_extents_scrubbed++; + sctx->stat.data_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + WARN_ON(sctx->nodesize != sctx->leafsize); + blocksize = sctx->nodesize; + spin_lock(&sctx->stat_lock); + sctx->stat.tree_extents_scrubbed++; + sctx->stat.tree_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); + } else { + blocksize = sctx->sectorsize; + WARN_ON(1); + } + + while (len) { + u64 l = min_t(u64, len, blocksize); + int have_csum = 0; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + /* push csums to sbio */ + have_csum = scrub_find_csum(sctx, logical, l, csum); + if (have_csum == 0) + ++sctx->stat.no_csum; + if (sctx->is_dev_replace && !have_csum) { + ret = copy_nocow_pages(sctx, logical, l, + mirror_num, + physical_for_dev_replace); + goto behind_scrub_pages; + } + } + ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, + mirror_num, have_csum ? csum : NULL, 0, + physical_for_dev_replace); +behind_scrub_pages: + if (ret) + return ret; + len -= l; + logical += l; + physical += l; + physical_for_dev_replace += l; + } + return 0; +} + +/* + * Given a physical address, this will calculate it's + * logical offset. if this is a parity stripe, it will return + * the most left data stripe's logical offset. + * + * return 0 if it is a data stripe, 1 means parity stripe. + */ +static int get_raid56_logic_offset(u64 physical, int num, + struct map_lookup *map, u64 *offset) +{ + int i; + int j = 0; + u64 stripe_nr; + u64 last_offset; + int stripe_index; + int rot; + + last_offset = (physical - map->stripes[num].physical) * + nr_data_stripes(map); + *offset = last_offset; + for (i = 0; i < nr_data_stripes(map); i++) { + *offset = last_offset + i * map->stripe_len; + + stripe_nr = *offset; + do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, nr_data_stripes(map)); + + /* Work out the disk rotation on this stripe-set */ + rot = do_div(stripe_nr, map->num_stripes); + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; + if (stripe_index == num) + return 0; + if (stripe_index < num) + j++; + } + *offset = last_offset + j * map->stripe_len; + return 1; +} + +static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + struct map_lookup *map, + struct btrfs_device *scrub_dev, + int num, u64 base, u64 length, + int is_dev_replace) +{ + struct btrfs_path *path; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_extent_item *extent; + struct blk_plug plug; + u64 flags; + int ret; + int slot; + u64 nstripes; + struct extent_buffer *l; + struct btrfs_key key; + u64 physical; + u64 logical; + u64 logic_end; + u64 physical_end; + u64 generation; + int mirror_num; + struct reada_control *reada1; + struct reada_control *reada2; + struct btrfs_key key_start; + struct btrfs_key key_end; + u64 increment = map->stripe_len; + u64 offset; + u64 extent_logical; + u64 extent_physical; + u64 extent_len; + struct btrfs_device *extent_dev; + int extent_mirror_num; + int stop_loop = 0; + + nstripes = length; + physical = map->stripes[num].physical; + offset = 0; + do_div(nstripes, map->stripe_len); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + offset = map->stripe_len * num; + increment = map->stripe_len * map->num_stripes; + mirror_num = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + offset = map->stripe_len * (num / map->sub_stripes); + increment = map->stripe_len * factor; + mirror_num = num % map->sub_stripes + 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + increment = map->stripe_len; + mirror_num = num % map->num_stripes + 1; + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + increment = map->stripe_len; + mirror_num = num % map->num_stripes + 1; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical, num, map, &offset); + increment = map->stripe_len * nr_data_stripes(map); + mirror_num = 1; + } else { + increment = map->stripe_len; + mirror_num = 1; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * work on commit root. The related disk blocks are static as + * long as COW is applied. This means, it is save to rewrite + * them to repair disk errors without any race conditions + */ + path->search_commit_root = 1; + path->skip_locking = 1; + + /* + * trigger the readahead for extent tree csum tree and wait for + * completion. During readahead, the scrub is officially paused + * to not hold off transaction commits + */ + logical = base + offset; + physical_end = physical + nstripes * map->stripe_len; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical_end, num, + map, &logic_end); + logic_end += base; + } else { + logic_end = logical + increment * nstripes; + } + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + scrub_blocked_if_needed(fs_info); + + /* FIXME it might be better to start readahead at commit root */ + key_start.objectid = logical; + key_start.type = BTRFS_EXTENT_ITEM_KEY; + key_start.offset = (u64)0; + key_end.objectid = logic_end; + key_end.type = BTRFS_METADATA_ITEM_KEY; + key_end.offset = (u64)-1; + reada1 = btrfs_reada_add(root, &key_start, &key_end); + + key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_start.type = BTRFS_EXTENT_CSUM_KEY; + key_start.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = logic_end; + reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + + if (!IS_ERR(reada1)) + btrfs_reada_wait(reada1); + if (!IS_ERR(reada2)) + btrfs_reada_wait(reada2); + + + /* + * collect all data csums for the stripe to avoid seeking during + * the scrub. This might currently (crc32) end up to be about 1MB + */ + blk_start_plug(&plug); + + /* + * now find all extents for each stripe and scrub them + */ + ret = 0; + while (physical < physical_end) { + /* for raid56, we skip parity stripe */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + ret = get_raid56_logic_offset(physical, num, + map, &logical); + logical += base; + if (ret) + goto skip; + } + /* + * canceled? + */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) { + ret = -ECANCELED; + goto out; + } + /* + * check to see if we have to pause + */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* push queued extents */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + scrub_blocked_if_needed(fs_info); + } + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = btrfs_previous_extent_item(root, path, 0); + if (ret < 0) + goto out; + if (ret > 0) { + /* there's no smaller item, so stick with the + * larger one */ + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } + } + + stop_loop = 0; + while (1) { + u64 bytes; + + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + stop_loop = 1; + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.type == BTRFS_METADATA_ITEM_KEY) + bytes = root->leafsize; + else + bytes = key.offset; + + if (key.objectid + bytes <= logical) + goto next; + + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + + if (key.objectid >= logical + map->stripe_len) { + /* out of this device extent */ + if (key.objectid >= logic_end) + stop_loop = 1; + break; + } + + extent = btrfs_item_ptr(l, slot, + struct btrfs_extent_item); + flags = btrfs_extent_flags(l, extent); + generation = btrfs_extent_generation(l, extent); + + if (key.objectid < logical && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning " + "stripes, ignored. logical=%llu", + key.objectid, logical); + goto next; + } + +again: + extent_logical = key.objectid; + extent_len = bytes; + + /* + * trim extent to this stripe + */ + if (extent_logical < logical) { + extent_len -= logical - extent_logical; + extent_logical = logical; + } + if (extent_logical + extent_len > + logical + map->stripe_len) { + extent_len = logical + map->stripe_len - + extent_logical; + } + + extent_physical = extent_logical - logical + physical; + extent_dev = scrub_dev; + extent_mirror_num = mirror_num; + if (is_dev_replace) + scrub_remap_extent(fs_info, extent_logical, + extent_len, &extent_physical, + &extent_dev, + &extent_mirror_num); + + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + + ret = scrub_extent(sctx, extent_logical, extent_len, + extent_physical, extent_dev, flags, + generation, extent_mirror_num, + extent_logical - logical + physical); + if (ret) + goto out; + + scrub_free_csums(sctx); + if (extent_logical + extent_len < + key.objectid + bytes) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + /* + * loop until we find next data stripe + * or we have finished all stripes. + */ + do { + physical += map->stripe_len; + ret = get_raid56_logic_offset( + physical, num, + map, &logical); + logical += base; + } while (physical < physical_end && ret); + } else { + physical += map->stripe_len; + logical += increment; + } + if (logical < key.objectid + bytes) { + cond_resched(); + goto again; + } + + if (physical >= physical_end) { + stop_loop = 1; + break; + } + } +next: + path->slots[0]++; + } + btrfs_release_path(path); +skip: + logical += increment; + physical += map->stripe_len; + spin_lock(&sctx->stat_lock); + if (stop_loop) + sctx->stat.last_physical = map->stripes[num].physical + + length; + else + sctx->stat.last_physical = physical; + spin_unlock(&sctx->stat_lock); + if (stop_loop) + break; + } +out: + /* push queued extents */ + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + blk_finish_plug(&plug); + btrfs_free_path(path); + return ret < 0 ? ret : 0; +} + +static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 length, + u64 dev_offset, int is_dev_replace) +{ + struct btrfs_mapping_tree *map_tree = + &sctx->dev_root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + int i; + int ret = 0; + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + read_unlock(&map_tree->map_tree.lock); + + if (!em) + return -EINVAL; + + map = (struct map_lookup *)em->bdev; + if (em->start != chunk_offset) + goto out; + + if (em->len < length) + goto out; + + for (i = 0; i < map->num_stripes; ++i) { + if (map->stripes[i].dev->bdev == scrub_dev->bdev && + map->stripes[i].physical == dev_offset) { + ret = scrub_stripe(sctx, map, scrub_dev, i, + chunk_offset, length, + is_dev_replace); + if (ret) + goto out; + } + } +out: + free_extent_map(em); + + return ret; +} + +static noinline_for_stack +int scrub_enumerate_chunks(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, u64 start, u64 end, + int is_dev_replace) +{ + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + struct btrfs_root *root = sctx->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_block_group_cache *cache; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = scrub_dev->devid; + key.offset = 0ull; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + } + + l = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.objectid != scrub_dev->devid) + break; + + if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset >= end) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (found_key.offset + length <= start) + goto skip; + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + + /* + * get a reference on the corresponding block group to prevent + * the chunk from going away while we scrub it + */ + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + + /* some chunks are removed but not committed to disk yet, + * continue scrubbing */ + if (!cache) + goto skip; + + dev_replace->cursor_right = found_key.offset + length; + dev_replace->cursor_left = found_key.offset; + dev_replace->item_needs_writeback = 1; + ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, + chunk_offset, length, found_key.offset, + is_dev_replace); + + /* + * flush, submit all pending read and write bios, afterwards + * wait for them. + * Note that in the dev replace case, a read request causes + * write requests that are submitted in the read completion + * worker. Therefore in the current situation, it is required + * that all write requests are flushed, so that all read and + * write requests are really completed when bios_in_flight + * changes to 0. + */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + /* + * must be called before we decrease @scrub_paused. + * make sure we don't block transaction commit while + * we are waiting pending workers finished. + */ + wait_event(sctx->list_wait, + atomic_read(&sctx->workers_pending) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); + + btrfs_put_block_group(cache); + if (ret) + break; + if (is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0) { + ret = -EIO; + break; + } + if (sctx->stat.malloc_errors > 0) { + ret = -ENOMEM; + break; + } + + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; +skip: + key.offset = found_key.offset + length; + btrfs_release_path(path); + } + + btrfs_free_path(path); + + /* + * ret can still be 1 from search_slot or next_leaf, + * that's not an error + */ + return ret < 0 ? ret : 0; +} + +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev) +{ + int i; + u64 bytenr; + u64 gen; + int ret; + struct btrfs_root *root = sctx->dev_root; + + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + return -EIO; + + gen = root->fs_info->last_trans_committed; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) + break; + + ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, + NULL, 1, bytenr); + if (ret) + return ret; + } + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + + return 0; +} + +/* + * get a reference count on fs_info->scrub_workers. start worker if necessary + */ +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + int is_dev_replace) +{ + int ret = 0; + int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; + + if (fs_info->scrub_workers_refcnt == 0) { + if (is_dev_replace) + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + 1, 4); + else + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + max_active, 4); + if (!fs_info->scrub_workers) { + ret = -ENOMEM; + goto out; + } + fs_info->scrub_wr_completion_workers = + btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + max_active, 2); + if (!fs_info->scrub_wr_completion_workers) { + ret = -ENOMEM; + goto out; + } + fs_info->scrub_nocow_workers = + btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + if (!fs_info->scrub_nocow_workers) { + ret = -ENOMEM; + goto out; + } + } + ++fs_info->scrub_workers_refcnt; +out: + return ret; +} + +static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) +{ + if (--fs_info->scrub_workers_refcnt == 0) { + btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); + } + WARN_ON(fs_info->scrub_workers_refcnt < 0); +} + +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace) +{ + struct scrub_ctx *sctx; + int ret; + struct btrfs_device *dev; + + if (btrfs_fs_closing(fs_info)) + return -EINVAL; + + /* + * check some assumptions + */ + if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { + btrfs_err(fs_info, + "scrub: size assumption nodesize == leafsize (%d == %d) fails", + fs_info->chunk_root->nodesize, + fs_info->chunk_root->leafsize); + return -EINVAL; + } + + if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { + /* + * in this case scrub is unable to calculate the checksum + * the way scrub is implemented. Do not handle this + * situation at all because it won't ever happen. + */ + btrfs_err(fs_info, + "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", + fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); + return -EINVAL; + } + + if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { + /* not supported for data w/o checksums */ + btrfs_err(fs_info, + "scrub: size assumption sectorsize != PAGE_SIZE " + "(%d != %lu) fails", + fs_info->chunk_root->sectorsize, PAGE_SIZE); + return -EINVAL; + } + + if (fs_info->chunk_root->nodesize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || + fs_info->chunk_root->sectorsize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { + /* + * would exhaust the array bounds of pagev member in + * struct scrub_block + */ + btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize " + "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", + fs_info->chunk_root->nodesize, + SCRUB_MAX_PAGES_PER_BLOCK, + fs_info->chunk_root->sectorsize, + SCRUB_MAX_PAGES_PER_BLOCK); + return -EINVAL; + } + + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(fs_info, devid, NULL, NULL); + if (!dev || (dev->missing && !is_dev_replace)) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return -ENODEV; + } + + mutex_lock(&fs_info->scrub_lock); + if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return -EIO; + } + + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (dev->scrub_device || + (!is_dev_replace && + btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { + btrfs_dev_replace_unlock(&fs_info->dev_replace); + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return -EINPROGRESS; + } + btrfs_dev_replace_unlock(&fs_info->dev_replace); + + ret = scrub_workers_get(fs_info, is_dev_replace); + if (ret) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return ret; + } + + sctx = scrub_setup_ctx(dev, is_dev_replace); + if (IS_ERR(sctx)) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); + return PTR_ERR(sctx); + } + sctx->readonly = readonly; + dev->scrub_device = sctx; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + /* + * checking @scrub_pause_req here, we can avoid + * race between committing transaction and scrubbing. + */ + __scrub_blocked_if_needed(fs_info); + atomic_inc(&fs_info->scrubs_running); + mutex_unlock(&fs_info->scrub_lock); + + if (!is_dev_replace) { + /* + * by holding device list mutex, we can + * kick off writing super in log tree sync. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + ret = scrub_supers(sctx, dev); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + } + + if (!ret) + ret = scrub_enumerate_chunks(sctx, dev, start, end, + is_dev_replace); + + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + atomic_dec(&fs_info->scrubs_running); + wake_up(&fs_info->scrub_pause_wait); + + wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); + + if (progress) + memcpy(progress, &sctx->stat, sizeof(*progress)); + + mutex_lock(&fs_info->scrub_lock); + dev->scrub_device = NULL; + scrub_workers_put(fs_info); + mutex_unlock(&fs_info->scrub_lock); + + scrub_free_ctx(sctx); + + return ret; +} + +void btrfs_scrub_pause(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrub_pause_req); + while (atomic_read(&fs_info->scrubs_paused) != + atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_paused) == + atomic_read(&fs_info->scrubs_running)); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); +} + +void btrfs_scrub_continue(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + atomic_dec(&fs_info->scrub_pause_req); + wake_up(&fs_info->scrub_pause_wait); +} + +int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->scrub_lock); + if (!atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + + atomic_inc(&fs_info->scrub_cancel_req); + while (atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_running) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrub_cancel_req); + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) +{ + struct scrub_ctx *sctx; + + mutex_lock(&fs_info->scrub_lock); + sctx = dev->scrub_device; + if (!sctx) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + atomic_inc(&sctx->cancel_req); + while (dev->scrub_device) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + dev->scrub_device == NULL); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, + struct btrfs_scrub_progress *progress) +{ + struct btrfs_device *dev; + struct scrub_ctx *sctx = NULL; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); + if (dev) + sctx = dev->scrub_device; + if (sctx) + memcpy(progress, &sctx->stat, sizeof(*progress)); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; +} + +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num) +{ + u64 mapped_length; + struct btrfs_bio *bbio = NULL; + int ret; + + mapped_length = extent_len; + ret = btrfs_map_block(fs_info, READ, extent_logical, + &mapped_length, &bbio, 0); + if (ret || !bbio || mapped_length < extent_len || + !bbio->stripes[0].dev->bdev) { + kfree(bbio); + return; + } + + *extent_physical = bbio->stripes[0].physical; + *extent_mirror_num = bbio->mirror_num; + *extent_dev = bbio->stripes[0].dev; + kfree(bbio); +} + +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace) +{ + WARN_ON(wr_ctx->wr_curr_bio != NULL); + + mutex_init(&wr_ctx->wr_lock); + wr_ctx->wr_curr_bio = NULL; + if (!is_dev_replace) + return 0; + + WARN_ON(!dev->bdev); + wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, + bio_get_nr_vecs(dev->bdev)); + wr_ctx->tgtdev = dev; + atomic_set(&wr_ctx->flush_all_writes, 0); + return 0; +} + +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) +{ + mutex_lock(&wr_ctx->wr_lock); + kfree(wr_ctx->wr_curr_bio); + wr_ctx->wr_curr_bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); +} + +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace) +{ + struct scrub_copy_nocow_ctx *nocow_ctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); + if (!nocow_ctx) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + scrub_pending_trans_workers_inc(sctx); + + nocow_ctx->sctx = sctx; + nocow_ctx->logical = logical; + nocow_ctx->len = len; + nocow_ctx->mirror_num = mirror_num; + nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; + btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); + INIT_LIST_HEAD(&nocow_ctx->inodes); + btrfs_queue_work(fs_info->scrub_nocow_workers, + &nocow_ctx->work); + + return 0; +} + +static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct scrub_copy_nocow_ctx *nocow_ctx = ctx; + struct scrub_nocow_inode *nocow_inode; + + nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS); + if (!nocow_inode) + return -ENOMEM; + nocow_inode->inum = inum; + nocow_inode->offset = offset; + nocow_inode->root = root; + list_add_tail(&nocow_inode->list, &nocow_ctx->inodes); + return 0; +} + +#define COPY_COMPLETE 1 + +static void copy_nocow_pages_worker(struct btrfs_work *work) +{ + struct scrub_copy_nocow_ctx *nocow_ctx = + container_of(work, struct scrub_copy_nocow_ctx, work); + struct scrub_ctx *sctx = nocow_ctx->sctx; + u64 logical = nocow_ctx->logical; + u64 len = nocow_ctx->len; + int mirror_num = nocow_ctx->mirror_num; + u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + int ret; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + struct btrfs_root *root; + int not_written = 0; + + fs_info = sctx->dev_root->fs_info; + root = fs_info->extent_root; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + not_written = 1; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + not_written = 1; + goto out; + } + + ret = iterate_inodes_from_logical(logical, fs_info, path, + record_inode_for_nocow, nocow_ctx); + if (ret != 0 && ret != -ENOENT) { + btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, " + "phys %llu, len %llu, mir %u, ret %d", + logical, physical_for_dev_replace, len, mirror_num, + ret); + not_written = 1; + goto out; + } + + btrfs_end_transaction(trans, root); + trans = NULL; + while (!list_empty(&nocow_ctx->inodes)) { + struct scrub_nocow_inode *entry; + entry = list_first_entry(&nocow_ctx->inodes, + struct scrub_nocow_inode, + list); + list_del_init(&entry->list); + ret = copy_nocow_pages_for_inode(entry->inum, entry->offset, + entry->root, nocow_ctx); + kfree(entry); + if (ret == COPY_COMPLETE) { + ret = 0; + break; + } else if (ret) { + break; + } + } +out: + while (!list_empty(&nocow_ctx->inodes)) { + struct scrub_nocow_inode *entry; + entry = list_first_entry(&nocow_ctx->inodes, + struct scrub_nocow_inode, + list); + list_del_init(&entry->list); + kfree(entry); + } + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, root); + if (not_written) + btrfs_dev_replace_stats_inc(&fs_info->dev_replace. + num_uncorrectable_read_errors); + + btrfs_free_path(path); + kfree(nocow_ctx); + + scrub_pending_trans_workers_dec(sctx); +} + +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + struct scrub_copy_nocow_ctx *nocow_ctx) +{ + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + struct btrfs_key key; + struct inode *inode; + struct page *page; + struct btrfs_root *local_root; + struct btrfs_ordered_extent *ordered; + struct extent_map *em; + struct extent_state *cached_state = NULL; + struct extent_io_tree *io_tree; + u64 physical_for_dev_replace; + u64 len = nocow_ctx->len; + u64 lockstart = offset, lockend = offset + len - 1; + unsigned long index; + int srcu_index; + int ret = 0; + int err = 0; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + return PTR_ERR(local_root); + } + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + /* Avoid truncate/dio/punch hole.. */ + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + io_tree = &BTRFS_I(inode)->io_tree; + + lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lockstart, len); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock; + } + + /* + * This extent does not actually cover the logical extent anymore, + * move on to the next inode. + */ + if (em->block_start > nocow_ctx->logical || + em->block_start + em->block_len < nocow_ctx->logical + len) { + free_extent_map(em); + goto out_unlock; + } + free_extent_map(em); + + while (len >= PAGE_CACHE_SIZE) { + index = offset >> PAGE_CACHE_SHIFT; +again: + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + btrfs_err(fs_info, "find_or_create_page() failed"); + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + if (PageDirty(page)) + goto next_page; + } else { + ClearPageError(page); + err = extent_read_full_page_nolock(io_tree, page, + btrfs_get_extent, + nocow_ctx->mirror_num); + if (err) { + ret = err; + goto next_page; + } + + lock_page(page); + /* + * If the page has been remove from the page cache, + * the data on it is meaningless, because it may be + * old one, the new data may be written into the new + * page in the page cache. + */ + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto next_page; + } + } + err = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (err) + ret = err; +next_page: + unlock_page(page); + page_cache_release(page); + + if (ret) + break; + + offset += PAGE_CACHE_SIZE; + physical_for_dev_replace += PAGE_CACHE_SIZE; + len -= PAGE_CACHE_SIZE; + } + ret = COPY_COMPLETE; +out_unlock: + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, + GFP_NOFS); +out: + mutex_unlock(&inode->i_mutex); + iput(inode); + return ret; +} + +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page) +{ + struct bio *bio; + struct btrfs_device *dev; + int ret; + + dev = sctx->wr_ctx.tgtdev; + if (!dev) + return -EIO; + if (!dev->bdev) { + printk_ratelimited(KERN_WARNING + "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + return -EIO; + } + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; + bio->bi_bdev = dev->bdev; + ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + if (ret != PAGE_CACHE_SIZE) { +leave_with_eio: + bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + return -EIO; + } + + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) + goto leave_with_eio; + + bio_put(bio); + return 0; +} diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c new file mode 100644 index 00000000000..6528aa66218 --- /dev/null +++ b/fs/btrfs/send.c @@ -0,0 +1,5791 @@ +/* + * Copyright (C) 2012 Alexander Block. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/bsearch.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/sort.h> +#include <linux/mount.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/radix-tree.h> +#include <linux/vmalloc.h> +#include <linux/string.h> + +#include "send.h" +#include "backref.h" +#include "hash.h" +#include "locking.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "transaction.h" + +static int g_verbose = 0; + +#define verbose_printk(...) if (g_verbose) printk(__VA_ARGS__) + +/* + * A fs_path is a helper to dynamically build path names with unknown size. + * It reallocates the internal buffer on demand. + * It allows fast adding of path elements on the right side (normal path) and + * fast adding to the left side (reversed path). A reversed path can also be + * unreversed if needed. + */ +struct fs_path { + union { + struct { + char *start; + char *end; + + char *buf; + unsigned short buf_len:15; + unsigned short reversed:1; + char inline_buf[]; + }; + /* + * Average path length does not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * a allocation later during send. + */ + char pad[256]; + }; +}; +#define FS_PATH_INLINE_SIZE \ + (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf)) + + +/* reused for each extent */ +struct clone_root { + struct btrfs_root *root; + u64 ino; + u64 offset; + + u64 found_refs; +}; + +#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 +#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) + +struct send_ctx { + struct file *send_filp; + loff_t send_off; + char *send_buf; + u32 send_size; + u32 send_max_size; + u64 total_send_size; + u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; + u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ + + struct btrfs_root *send_root; + struct btrfs_root *parent_root; + struct clone_root *clone_roots; + int clone_roots_cnt; + + /* current state of the compare_tree call */ + struct btrfs_path *left_path; + struct btrfs_path *right_path; + struct btrfs_key *cmp_key; + + /* + * infos of the currently processed inode. In case of deleted inodes, + * these are the values from the deleted inode. + */ + u64 cur_ino; + u64 cur_inode_gen; + int cur_inode_new; + int cur_inode_new_gen; + int cur_inode_deleted; + u64 cur_inode_size; + u64 cur_inode_mode; + u64 cur_inode_rdev; + u64 cur_inode_last_extent; + + u64 send_progress; + + struct list_head new_refs; + struct list_head deleted_refs; + + struct radix_tree_root name_cache; + struct list_head name_cache_list; + int name_cache_size; + + struct file_ra_state ra; + + char *read_buf; + + /* + * We process inodes by their increasing order, so if before an + * incremental send we reverse the parent/child relationship of + * directories such that a directory with a lower inode number was + * the parent of a directory with a higher inode number, and the one + * becoming the new parent got renamed too, we can't rename/move the + * directory with lower inode number when we finish processing it - we + * must process the directory with higher inode number first, then + * rename/move it and then rename/move the directory with lower inode + * number. Example follows. + * + * Tree state when the first send was performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * | + * | + * |-- c (ino 259) + * | |-- d (ino 260) + * | + * |-- c2 (ino 261) + * + * Tree state when the second (incremental) send is performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * |-- c2 (ino 261) + * |-- d2 (ino 260) + * |-- cc (ino 259) + * + * The sequence of steps that lead to the second state was: + * + * mv /a/b/c/d /a/b/c2/d2 + * mv /a/b/c /a/b/c2/d2/cc + * + * "c" has lower inode number, but we can't move it (2nd mv operation) + * before we move "d", which has higher inode number. + * + * So we just memorize which move/rename operations must be performed + * later when their respective parent is processed and moved/renamed. + */ + + /* Indexed by parent directory inode number. */ + struct rb_root pending_dir_moves; + + /* + * Reverse index, indexed by the inode number of a directory that + * is waiting for the move/rename of its immediate parent before its + * own move/rename can be performed. + */ + struct rb_root waiting_dir_moves; + + /* + * A directory that is going to be rm'ed might have a child directory + * which is in the pending directory moves index above. In this case, + * the directory can only be removed after the move/rename of its child + * is performed. Example: + * + * Parent snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- c/ (ino 259) + * | |-- x/ (ino 260) + * | + * |-- y/ (ino 261) + * + * Send snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- YY/ (ino 261) + * |-- x/ (ino 260) + * + * Sequence of steps that lead to the send snapshot: + * rm -f /a/b/c/foo.txt + * mv /a/b/y /a/b/YY + * mv /a/b/c/x /a/b/YY + * rmdir /a/b/c + * + * When the child is processed, its move/rename is delayed until its + * parent is processed (as explained above), but all other operations + * like update utimes, chown, chgrp, etc, are performed and the paths + * that it uses for those operations must use the orphanized name of + * its parent (the directory we're going to rm later), so we need to + * memorize that name. + * + * Indexed by the inode number of the directory to be deleted. + */ + struct rb_root orphan_dirs; +}; + +struct pending_dir_move { + struct rb_node node; + struct list_head list; + u64 parent_ino; + u64 ino; + u64 gen; + struct list_head update_refs; +}; + +struct waiting_dir_move { + struct rb_node node; + u64 ino; + /* + * There might be some directory that could not be removed because it + * was waiting for this directory inode to be moved first. Therefore + * after this directory is moved, we can try to rmdir the ino rmdir_ino. + */ + u64 rmdir_ino; +}; + +struct orphan_dir_info { + struct rb_node node; + u64 ino; + u64 gen; +}; + +struct name_cache_entry { + struct list_head list; + /* + * radix_tree has only 32bit entries but we need to handle 64bit inums. + * We use the lower 32bit of the 64bit inum to store it in the tree. If + * more then one inum would fall into the same entry, we use radix_list + * to store the additional entries. radix_list is also used to store + * entries where two entries have the same inum but different + * generations. + */ + struct list_head radix_list; + u64 ino; + u64 gen; + u64 parent_ino; + u64 parent_gen; + int ret; + int need_later_update; + int name_len; + char name[]; +}; + +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino); + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino); + +static int need_send_hole(struct send_ctx *sctx) +{ + return (sctx->parent_root && !sctx->cur_inode_new && + !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && + S_ISREG(sctx->cur_inode_mode)); +} + +static void fs_path_reset(struct fs_path *p) +{ + if (p->reversed) { + p->start = p->buf + p->buf_len - 1; + p->end = p->start; + *p->start = 0; + } else { + p->start = p->buf; + p->end = p->start; + *p->start = 0; + } +} + +static struct fs_path *fs_path_alloc(void) +{ + struct fs_path *p; + + p = kmalloc(sizeof(*p), GFP_NOFS); + if (!p) + return NULL; + p->reversed = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); + return p; +} + +static struct fs_path *fs_path_alloc_reversed(void) +{ + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return NULL; + p->reversed = 1; + fs_path_reset(p); + return p; +} + +static void fs_path_free(struct fs_path *p) +{ + if (!p) + return; + if (p->buf != p->inline_buf) + kfree(p->buf); + kfree(p); +} + +static int fs_path_len(struct fs_path *p) +{ + return p->end - p->start; +} + +static int fs_path_ensure_buf(struct fs_path *p, int len) +{ + char *tmp_buf; + int path_len; + int old_buf_len; + + len++; + + if (p->buf_len >= len) + return 0; + + if (len > PATH_MAX) { + WARN_ON(1); + return -ENOMEM; + } + + path_len = p->end - p->start; + old_buf_len = p->buf_len; + + /* + * First time the inline_buf does not suffice + */ + if (p->buf == p->inline_buf) { + tmp_buf = kmalloc(len, GFP_NOFS); + if (tmp_buf) + memcpy(tmp_buf, p->buf, old_buf_len); + } else { + tmp_buf = krealloc(p->buf, len, GFP_NOFS); + } + if (!tmp_buf) + return -ENOMEM; + p->buf = tmp_buf; + /* + * The real size of the buffer is bigger, this will let the fast path + * happen most of the time + */ + p->buf_len = ksize(p->buf); + + if (p->reversed) { + tmp_buf = p->buf + old_buf_len - path_len - 1; + p->end = p->buf + p->buf_len - 1; + p->start = p->end - path_len; + memmove(p->start, tmp_buf, path_len + 1); + } else { + p->start = p->buf; + p->end = p->start + path_len; + } + return 0; +} + +static int fs_path_prepare_for_add(struct fs_path *p, int name_len, + char **prepared) +{ + int ret; + int new_len; + + new_len = p->end - p->start + name_len; + if (p->start != p->end) + new_len++; + ret = fs_path_ensure_buf(p, new_len); + if (ret < 0) + goto out; + + if (p->reversed) { + if (p->start != p->end) + *--p->start = '/'; + p->start -= name_len; + *prepared = p->start; + } else { + if (p->start != p->end) + *p->end++ = '/'; + *prepared = p->end; + p->end += name_len; + *p->end = 0; + } + +out: + return ret; +} + +static int fs_path_add(struct fs_path *p, const char *name, int name_len) +{ + int ret; + char *prepared; + + ret = fs_path_prepare_for_add(p, name_len, &prepared); + if (ret < 0) + goto out; + memcpy(prepared, name, name_len); + +out: + return ret; +} + +static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +{ + int ret; + char *prepared; + + ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); + if (ret < 0) + goto out; + memcpy(prepared, p2->start, p2->end - p2->start); + +out: + return ret; +} + +static int fs_path_add_from_extent_buffer(struct fs_path *p, + struct extent_buffer *eb, + unsigned long off, int len) +{ + int ret; + char *prepared; + + ret = fs_path_prepare_for_add(p, len, &prepared); + if (ret < 0) + goto out; + + read_extent_buffer(eb, prepared, off, len); + +out: + return ret; +} + +static int fs_path_copy(struct fs_path *p, struct fs_path *from) +{ + int ret; + + p->reversed = from->reversed; + fs_path_reset(p); + + ret = fs_path_add_path(p, from); + + return ret; +} + + +static void fs_path_unreverse(struct fs_path *p) +{ + char *tmp; + int len; + + if (!p->reversed) + return; + + tmp = p->start; + len = p->end - p->start; + p->start = p->buf; + p->end = p->start + len; + memmove(p->start, tmp, len + 1); + p->reversed = 0; +} + +static struct btrfs_path *alloc_path_for_send(void) +{ + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return NULL; + path->search_commit_root = 1; + path->skip_locking = 1; + path->need_commit_sem = 1; + return path; +} + +static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) +{ + int ret; + mm_segment_t old_fs; + u32 pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + + while (pos < len) { + ret = vfs_write(filp, (char *)buf + pos, len - pos, off); + /* TODO handle that correctly */ + /*if (ret == -ERESTARTSYS) { + continue; + }*/ + if (ret < 0) + goto out; + if (ret == 0) { + ret = -EIO; + goto out; + } + pos += ret; + } + + ret = 0; + +out: + set_fs(old_fs); + return ret; +} + +static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) +{ + struct btrfs_tlv_header *hdr; + int total_len = sizeof(*hdr) + len; + int left = sctx->send_max_size - sctx->send_size; + + if (unlikely(left < total_len)) + return -EOVERFLOW; + + hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); + hdr->tlv_type = cpu_to_le16(attr); + hdr->tlv_len = cpu_to_le16(len); + memcpy(hdr + 1, data, len); + sctx->send_size += total_len; + + return 0; +} + +#define TLV_PUT_DEFINE_INT(bits) \ + static int tlv_put_u##bits(struct send_ctx *sctx, \ + u##bits attr, u##bits value) \ + { \ + __le##bits __tmp = cpu_to_le##bits(value); \ + return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ + } + +TLV_PUT_DEFINE_INT(64) + +static int tlv_put_string(struct send_ctx *sctx, u16 attr, + const char *str, int len) +{ + if (len == -1) + len = strlen(str); + return tlv_put(sctx, attr, str, len); +} + +static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, + const u8 *uuid) +{ + return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); +} + +static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, + struct extent_buffer *eb, + struct btrfs_timespec *ts) +{ + struct btrfs_timespec bts; + read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts)); + return tlv_put(sctx, attr, &bts, sizeof(bts)); +} + + +#define TLV_PUT(sctx, attrtype, attrlen, data) \ + do { \ + ret = tlv_put(sctx, attrtype, attrlen, data); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +#define TLV_PUT_INT(sctx, attrtype, bits, value) \ + do { \ + ret = tlv_put_u##bits(sctx, attrtype, value); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data) +#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data) +#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data) +#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data) +#define TLV_PUT_STRING(sctx, attrtype, str, len) \ + do { \ + ret = tlv_put_string(sctx, attrtype, str, len); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) +#define TLV_PUT_PATH(sctx, attrtype, p) \ + do { \ + ret = tlv_put_string(sctx, attrtype, p->start, \ + p->end - p->start); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while(0) +#define TLV_PUT_UUID(sctx, attrtype, uuid) \ + do { \ + ret = tlv_put_uuid(sctx, attrtype, uuid); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) +#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ + do { \ + ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +static int send_header(struct send_ctx *sctx) +{ + struct btrfs_stream_header hdr; + + strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); + hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); + + return write_buf(sctx->send_filp, &hdr, sizeof(hdr), + &sctx->send_off); +} + +/* + * For each command/item we want to send to userspace, we call this function. + */ +static int begin_cmd(struct send_ctx *sctx, int cmd) +{ + struct btrfs_cmd_header *hdr; + + if (WARN_ON(!sctx->send_buf)) + return -EINVAL; + + BUG_ON(sctx->send_size); + + sctx->send_size += sizeof(*hdr); + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->cmd = cpu_to_le16(cmd); + + return 0; +} + +static int send_cmd(struct send_ctx *sctx) +{ + int ret; + struct btrfs_cmd_header *hdr; + u32 crc; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); + hdr->crc = 0; + + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + hdr->crc = cpu_to_le32(crc); + + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); + + sctx->total_send_size += sctx->send_size; + sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->send_size = 0; + + return ret; +} + +/* + * Sends a move instruction to user space + */ +static int send_rename(struct send_ctx *sctx, + struct fs_path *from, struct fs_path *to) +{ + int ret; + +verbose_printk("btrfs: send_rename %s -> %s\n", from->start, to->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends a link instruction to user space + */ +static int send_link(struct send_ctx *sctx, + struct fs_path *path, struct fs_path *lnk) +{ + int ret; + +verbose_printk("btrfs: send_link %s -> %s\n", path->start, lnk->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends an unlink instruction to user space + */ +static int send_unlink(struct send_ctx *sctx, struct fs_path *path) +{ + int ret; + +verbose_printk("btrfs: send_unlink %s\n", path->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends a rmdir instruction to user space + */ +static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) +{ + int ret; + +verbose_printk("btrfs: send_rmdir %s\n", path->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Helper function to retrieve some fields from an inode item. + */ +static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, + u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, + u64 *gid, u64 *rdev) +{ + int ret; + struct btrfs_inode_item *ii; + struct btrfs_key key; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ret; + } + + ii = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + if (size) + *size = btrfs_inode_size(path->nodes[0], ii); + if (gen) + *gen = btrfs_inode_generation(path->nodes[0], ii); + if (mode) + *mode = btrfs_inode_mode(path->nodes[0], ii); + if (uid) + *uid = btrfs_inode_uid(path->nodes[0], ii); + if (gid) + *gid = btrfs_inode_gid(path->nodes[0], ii); + if (rdev) + *rdev = btrfs_inode_rdev(path->nodes[0], ii); + + return ret; +} + +static int get_inode_info(struct btrfs_root *root, + u64 ino, u64 *size, u64 *gen, + u64 *mode, u64 *uid, u64 *gid, + u64 *rdev) +{ + struct btrfs_path *path; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, + rdev); + btrfs_free_path(path); + return ret; +} + +typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, + struct fs_path *p, + void *ctx); + +/* + * Helper function to iterate the entries in ONE btrfs_inode_ref or + * btrfs_inode_extref. + * The iterate callback may return a non zero value to stop iteration. This can + * be a negative value for error codes or 1 to simply stop it. + * + * path must point to the INODE_REF or INODE_EXTREF when called. + */ +static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *found_key, int resolve, + iterate_inode_ref_t iterate, void *ctx) +{ + struct extent_buffer *eb = path->nodes[0]; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_inode_extref *extref; + struct btrfs_path *tmp_path; + struct fs_path *p; + u32 cur = 0; + u32 total; + int slot = path->slots[0]; + u32 name_len; + char *start; + int ret = 0; + int num = 0; + int index; + u64 dir; + unsigned long name_off; + unsigned long elem_size; + unsigned long ptr; + + p = fs_path_alloc_reversed(); + if (!p) + return -ENOMEM; + + tmp_path = alloc_path_for_send(); + if (!tmp_path) { + fs_path_free(p); + return -ENOMEM; + } + + + if (found_key->type == BTRFS_INODE_REF_KEY) { + ptr = (unsigned long)btrfs_item_ptr(eb, slot, + struct btrfs_inode_ref); + item = btrfs_item_nr(slot); + total = btrfs_item_size(eb, item); + elem_size = sizeof(*iref); + } else { + ptr = btrfs_item_ptr_offset(eb, slot); + total = btrfs_item_size_nr(eb, slot); + elem_size = sizeof(*extref); + } + + while (cur < total) { + fs_path_reset(p); + + if (found_key->type == BTRFS_INODE_REF_KEY) { + iref = (struct btrfs_inode_ref *)(ptr + cur); + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + index = btrfs_inode_ref_index(eb, iref); + dir = found_key->offset; + } else { + extref = (struct btrfs_inode_extref *)(ptr + cur); + name_len = btrfs_inode_extref_name_len(eb, extref); + name_off = (unsigned long)&extref->name; + index = btrfs_inode_extref_index(eb, extref); + dir = btrfs_inode_extref_parent(eb, extref); + } + + if (resolve) { + start = btrfs_ref_to_path(root, tmp_path, name_len, + name_off, eb, dir, + p->buf, p->buf_len); + if (IS_ERR(start)) { + ret = PTR_ERR(start); + goto out; + } + if (start < p->buf) { + /* overflow , try again with larger buffer */ + ret = fs_path_ensure_buf(p, + p->buf_len + p->buf - start); + if (ret < 0) + goto out; + start = btrfs_ref_to_path(root, tmp_path, + name_len, name_off, + eb, dir, + p->buf, p->buf_len); + if (IS_ERR(start)) { + ret = PTR_ERR(start); + goto out; + } + BUG_ON(start < p->buf); + } + p->start = start; + } else { + ret = fs_path_add_from_extent_buffer(p, eb, name_off, + name_len); + if (ret < 0) + goto out; + } + + cur += elem_size + name_len; + ret = iterate(num, dir, index, p, ctx); + if (ret) + goto out; + num++; + } + +out: + btrfs_free_path(tmp_path); + fs_path_free(p); + return ret; +} + +typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx); + +/* + * Helper function to iterate the entries in ONE btrfs_dir_item. + * The iterate callback may return a non zero value to stop iteration. This can + * be a negative value for error codes or 1 to simply stop it. + * + * path must point to the dir item when called. + */ +static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *found_key, + iterate_dir_item_t iterate, void *ctx) +{ + int ret = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key di_key; + char *buf = NULL; + int buf_len; + u32 name_len; + u32 data_len; + u32 cur; + u32 len; + u32 total; + int slot; + int num; + u8 type; + + if (found_key->type == BTRFS_XATTR_ITEM_KEY) + buf_len = BTRFS_MAX_XATTR_SIZE(root); + else + buf_len = PATH_MAX; + + buf = kmalloc(buf_len, GFP_NOFS); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + item = btrfs_item_nr(slot); + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + cur = 0; + len = 0; + total = btrfs_item_size(eb, item); + + num = 0; + while (cur < total) { + name_len = btrfs_dir_name_len(eb, di); + data_len = btrfs_dir_data_len(eb, di); + type = btrfs_dir_type(eb, di); + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + + if (type == BTRFS_FT_XATTR) { + if (name_len > XATTR_NAME_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + if (name_len + data_len > buf_len) { + ret = -E2BIG; + goto out; + } + } else { + /* + * Path too long + */ + if (name_len + data_len > buf_len) { + ret = -ENAMETOOLONG; + goto out; + } + } + + read_extent_buffer(eb, buf, (unsigned long)(di + 1), + name_len + data_len); + + len = sizeof(*di) + name_len + data_len; + di = (struct btrfs_dir_item *)((char *)di + len); + cur += len; + + ret = iterate(num, &di_key, buf, name_len, buf + name_len, + data_len, type, ctx); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + num++; + } + +out: + kfree(buf); + return ret; +} + +static int __copy_first_ref(int num, u64 dir, int index, + struct fs_path *p, void *ctx) +{ + int ret; + struct fs_path *pt = ctx; + + ret = fs_path_copy(pt, p); + if (ret < 0) + return ret; + + /* we want the first only */ + return 1; +} + +/* + * Retrieve the first path of an inode. If an inode has more then one + * ref/hardlink, this is ignored. + */ +static int get_inode_path(struct btrfs_root *root, + u64 ino, struct fs_path *path) +{ + int ret; + struct btrfs_key key, found_key; + struct btrfs_path *p; + + p = alloc_path_for_send(); + if (!p) + return -ENOMEM; + + fs_path_reset(path); + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 1; + goto out; + } + btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); + if (found_key.objectid != ino || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { + ret = -ENOENT; + goto out; + } + + ret = iterate_inode_ref(root, p, &found_key, 1, + __copy_first_ref, path); + if (ret < 0) + goto out; + ret = 0; + +out: + btrfs_free_path(p); + return ret; +} + +struct backref_ctx { + struct send_ctx *sctx; + + struct btrfs_path *path; + /* number of total found references */ + u64 found; + + /* + * used for clones found in send_root. clones found behind cur_objectid + * and cur_offset are not considered as allowed clones. + */ + u64 cur_objectid; + u64 cur_offset; + + /* may be truncated in case it's the last extent in a file */ + u64 extent_len; + + /* Just to check for bugs in backref resolving */ + int found_itself; +}; + +static int __clone_root_cmp_bsearch(const void *key, const void *elt) +{ + u64 root = (u64)(uintptr_t)key; + struct clone_root *cr = (struct clone_root *)elt; + + if (root < cr->root->objectid) + return -1; + if (root > cr->root->objectid) + return 1; + return 0; +} + +static int __clone_root_cmp_sort(const void *e1, const void *e2) +{ + struct clone_root *cr1 = (struct clone_root *)e1; + struct clone_root *cr2 = (struct clone_root *)e2; + + if (cr1->root->objectid < cr2->root->objectid) + return -1; + if (cr1->root->objectid > cr2->root->objectid) + return 1; + return 0; +} + +/* + * Called for every backref that is found for the current extent. + * Results are collected in sctx->clone_roots->ino/offset/found_refs + */ +static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) +{ + struct backref_ctx *bctx = ctx_; + struct clone_root *found; + int ret; + u64 i_size; + + /* First check if the root is in the list of accepted clone sources */ + found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, + bctx->sctx->clone_roots_cnt, + sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!found) + return 0; + + if (found->root == bctx->sctx->send_root && + ino == bctx->cur_objectid && + offset == bctx->cur_offset) { + bctx->found_itself = 1; + } + + /* + * There are inodes that have extents that lie behind its i_size. Don't + * accept clones from these extents. + */ + ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, + NULL, NULL, NULL); + btrfs_release_path(bctx->path); + if (ret < 0) + return ret; + + if (offset + bctx->extent_len > i_size) + return 0; + + /* + * Make sure we don't consider clones from send_root that are + * behind the current inode/offset. + */ + if (found->root == bctx->sctx->send_root) { + /* + * TODO for the moment we don't accept clones from the inode + * that is currently send. We may change this when + * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same + * file. + */ + if (ino >= bctx->cur_objectid) + return 0; +#if 0 + if (ino > bctx->cur_objectid) + return 0; + if (offset + bctx->extent_len > bctx->cur_offset) + return 0; +#endif + } + + bctx->found++; + found->found_refs++; + if (ino < found->ino) { + found->ino = ino; + found->offset = offset; + } else if (found->ino == ino) { + /* + * same extent found more then once in the same file. + */ + if (found->offset > offset + bctx->extent_len) + found->offset = offset; + } + + return 0; +} + +/* + * Given an inode, offset and extent item, it finds a good clone for a clone + * instruction. Returns -ENOENT when none could be found. The function makes + * sure that the returned clone is usable at the point where sending is at the + * moment. This means, that no clones are accepted which lie behind the current + * inode+offset. + * + * path must point to the extent item when called. + */ +static int find_extent_clone(struct send_ctx *sctx, + struct btrfs_path *path, + u64 ino, u64 data_offset, + u64 ino_size, + struct clone_root **found) +{ + int ret; + int extent_type; + u64 logical; + u64 disk_byte; + u64 num_bytes; + u64 extent_item_pos; + u64 flags = 0; + struct btrfs_file_extent_item *fi; + struct extent_buffer *eb = path->nodes[0]; + struct backref_ctx *backref_ctx = NULL; + struct clone_root *cur_clone_root; + struct btrfs_key found_key; + struct btrfs_path *tmp_path; + int compressed; + u32 i; + + tmp_path = alloc_path_for_send(); + if (!tmp_path) + return -ENOMEM; + + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); + if (!backref_ctx) { + ret = -ENOMEM; + goto out; + } + + backref_ctx->path = tmp_path; + + if (data_offset >= ino_size) { + /* + * There may be extents that lie behind the file's size. + * I at least had this in combination with snapshotting while + * writing large files. + */ + ret = 0; + goto out; + } + + fi = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -ENOENT; + goto out; + } + compressed = btrfs_file_extent_compression(eb, fi); + + num_bytes = btrfs_file_extent_num_bytes(eb, fi); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == 0) { + ret = -ENOENT; + goto out; + } + logical = disk_byte + btrfs_file_extent_offset(eb, fi); + + down_read(&sctx->send_root->fs_info->commit_root_sem); + ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, + &found_key, &flags); + up_read(&sctx->send_root->fs_info->commit_root_sem); + btrfs_release_path(tmp_path); + + if (ret < 0) + goto out; + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = -EIO; + goto out; + } + + /* + * Setup the clone roots. + */ + for (i = 0; i < sctx->clone_roots_cnt; i++) { + cur_clone_root = sctx->clone_roots + i; + cur_clone_root->ino = (u64)-1; + cur_clone_root->offset = 0; + cur_clone_root->found_refs = 0; + } + + backref_ctx->sctx = sctx; + backref_ctx->found = 0; + backref_ctx->cur_objectid = ino; + backref_ctx->cur_offset = data_offset; + backref_ctx->found_itself = 0; + backref_ctx->extent_len = num_bytes; + + /* + * The last extent of a file may be too large due to page alignment. + * We need to adjust extent_len in this case so that the checks in + * __iterate_backrefs work. + */ + if (data_offset + num_bytes >= ino_size) + backref_ctx->extent_len = ino_size - data_offset; + + /* + * Now collect all backrefs. + */ + if (compressed == BTRFS_COMPRESS_NONE) + extent_item_pos = logical - found_key.objectid; + else + extent_item_pos = 0; + ret = iterate_extent_inodes(sctx->send_root->fs_info, + found_key.objectid, extent_item_pos, 1, + __iterate_backrefs, backref_ctx); + + if (ret < 0) + goto out; + + if (!backref_ctx->found_itself) { + /* found a bug in backref code? */ + ret = -EIO; + btrfs_err(sctx->send_root->fs_info, "did not find backref in " + "send_root. inode=%llu, offset=%llu, " + "disk_byte=%llu found extent=%llu", + ino, data_offset, disk_byte, found_key.objectid); + goto out; + } + +verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " + "ino=%llu, " + "num_bytes=%llu, logical=%llu\n", + data_offset, ino, num_bytes, logical); + + if (!backref_ctx->found) + verbose_printk("btrfs: no clones found\n"); + + cur_clone_root = NULL; + for (i = 0; i < sctx->clone_roots_cnt; i++) { + if (sctx->clone_roots[i].found_refs) { + if (!cur_clone_root) + cur_clone_root = sctx->clone_roots + i; + else if (sctx->clone_roots[i].root == sctx->send_root) + /* prefer clones from send_root over others */ + cur_clone_root = sctx->clone_roots + i; + } + + } + + if (cur_clone_root) { + if (compressed != BTRFS_COMPRESS_NONE) { + /* + * Offsets given by iterate_extent_inodes() are relative + * to the start of the extent, we need to add logical + * offset from the file extent item. + * (See why at backref.c:check_extent_in_eb()) + */ + cur_clone_root->offset += btrfs_file_extent_offset(eb, + fi); + } + *found = cur_clone_root; + ret = 0; + } else { + ret = -ENOENT; + } + +out: + btrfs_free_path(tmp_path); + kfree(backref_ctx); + return ret; +} + +static int read_symlink(struct btrfs_root *root, + u64 ino, + struct fs_path *dest) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u8 type; + u8 compression; + unsigned long off; + int len; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret); + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + compression = btrfs_file_extent_compression(path->nodes[0], ei); + BUG_ON(type != BTRFS_FILE_EXTENT_INLINE); + BUG_ON(compression); + + off = btrfs_file_extent_inline_start(ei); + len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei); + + ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Helper function to generate a file name that is unique in the root of + * send_root and parent_root. This is used to generate names for orphan inodes. + */ +static int gen_unique_name(struct send_ctx *sctx, + u64 ino, u64 gen, + struct fs_path *dest) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_dir_item *di; + char tmp[64]; + int len; + u64 idx = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + while (1) { + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", + ino, gen, idx); + ASSERT(len < sizeof(tmp)); + + di = btrfs_lookup_dir_item(NULL, sctx->send_root, + path, BTRFS_FIRST_FREE_OBJECTID, + tmp, strlen(tmp), 0); + btrfs_release_path(path); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (di) { + /* not unique, try again */ + idx++; + continue; + } + + if (!sctx->parent_root) { + /* unique */ + ret = 0; + break; + } + + di = btrfs_lookup_dir_item(NULL, sctx->parent_root, + path, BTRFS_FIRST_FREE_OBJECTID, + tmp, strlen(tmp), 0); + btrfs_release_path(path); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (di) { + /* not unique, try again */ + idx++; + continue; + } + /* unique */ + break; + } + + ret = fs_path_add(dest, tmp, strlen(tmp)); + +out: + btrfs_free_path(path); + return ret; +} + +enum inode_state { + inode_state_no_change, + inode_state_will_create, + inode_state_did_create, + inode_state_will_delete, + inode_state_did_delete, +}; + +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret; + int left_ret; + int right_ret; + u64 left_gen; + u64 right_gen; + + ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, + NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + left_ret = ret; + + if (!sctx->parent_root) { + right_ret = -ENOENT; + } else { + ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, + NULL, NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + right_ret = ret; + } + + if (!left_ret && !right_ret) { + if (left_gen == gen && right_gen == gen) { + ret = inode_state_no_change; + } else if (left_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_create; + else + ret = inode_state_will_create; + } else if (right_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_delete; + else + ret = inode_state_will_delete; + } else { + ret = -ENOENT; + } + } else if (!left_ret) { + if (left_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_create; + else + ret = inode_state_will_create; + } else { + ret = -ENOENT; + } + } else if (!right_ret) { + if (right_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_delete; + else + ret = inode_state_will_delete; + } else { + ret = -ENOENT; + } + } else { + ret = -ENOENT; + } + +out: + return ret; +} + +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret; + + ret = get_cur_inode_state(sctx, ino, gen); + if (ret < 0) + goto out; + + if (ret == inode_state_no_change || + ret == inode_state_did_create || + ret == inode_state_will_delete) + ret = 1; + else + ret = 0; + +out: + return ret; +} + +/* + * Helper function to lookup a dir item in a dir. + */ +static int lookup_dir_item_inode(struct btrfs_root *root, + u64 dir, const char *name, int name_len, + u64 *found_inode, + u8 *found_type) +{ + int ret = 0; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_path *path; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(NULL, root, path, + dir, name, name_len, 0); + if (!di) { + ret = -ENOENT; + goto out; + } + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.type == BTRFS_ROOT_ITEM_KEY) { + ret = -ENOENT; + goto out; + } + *found_inode = key.objectid; + *found_type = btrfs_dir_type(path->nodes[0], di); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, + * generation of the parent dir and the name of the dir entry. + */ +static int get_first_ref(struct btrfs_root *root, u64 ino, + u64 *dir, u64 *dir_gen, struct fs_path *name) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int len; + u64 parent_dir; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (!ret) + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (ret || found_key.objectid != ino || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { + ret = -ENOENT; + goto out; + } + + if (found_key.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + iref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], iref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)(iref + 1), + len); + parent_dir = found_key.offset; + } else { + struct btrfs_inode_extref *extref; + extref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_extref); + len = btrfs_inode_extref_name_len(path->nodes[0], extref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)&extref->name, len); + parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); + } + if (ret < 0) + goto out; + btrfs_release_path(path); + + if (dir_gen) { + ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, + NULL, NULL, NULL); + if (ret < 0) + goto out; + } + + *dir = parent_dir; + +out: + btrfs_free_path(path); + return ret; +} + +static int is_first_ref(struct btrfs_root *root, + u64 ino, u64 dir, + const char *name, int name_len) +{ + int ret; + struct fs_path *tmp_name; + u64 tmp_dir; + + tmp_name = fs_path_alloc(); + if (!tmp_name) + return -ENOMEM; + + ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); + if (ret < 0) + goto out; + + if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) { + ret = 0; + goto out; + } + + ret = !memcmp(tmp_name->start, name, name_len); + +out: + fs_path_free(tmp_name); + return ret; +} + +/* + * Used by process_recorded_refs to determine if a new ref would overwrite an + * already existing ref. In case it detects an overwrite, it returns the + * inode/gen in who_ino/who_gen. + * When an overwrite is detected, process_recorded_refs does proper orphanizing + * to make sure later references to the overwritten inode are possible. + * Orphanizing is however only required for the first ref of an inode. + * process_recorded_refs does an additional is_first_ref check to see if + * orphanizing is really required. + */ +static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, + const char *name, int name_len, + u64 *who_ino, u64 *who_gen) +{ + int ret = 0; + u64 gen; + u64 other_inode = 0; + u8 other_type = 0; + + if (!sctx->parent_root) + goto out; + + ret = is_inode_existent(sctx, dir, dir_gen); + if (ret <= 0) + goto out; + + /* + * If we have a parent root we need to verify that the parent dir was + * not delted and then re-created, if it was then we have no overwrite + * and we can just unlink this entry. + */ + if (sctx->parent_root) { + ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, + NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + if (gen != dir_gen) + goto out; + } + + ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, + &other_inode, &other_type); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + + /* + * Check if the overwritten ref was already processed. If yes, the ref + * was already unlinked/moved, so we can safely assume that we will not + * overwrite anything at this point in time. + */ + if (other_inode > sctx->send_progress) { + ret = get_inode_info(sctx->parent_root, other_inode, NULL, + who_gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + + ret = 1; + *who_ino = other_inode; + } else { + ret = 0; + } + +out: + return ret; +} + +/* + * Checks if the ref was overwritten by an already processed inode. This is + * used by __get_cur_name_and_parent to find out if the ref was orphanized and + * thus the orphan name needs be used. + * process_recorded_refs also uses it to avoid unlinking of refs that were + * overwritten. + */ +static int did_overwrite_ref(struct send_ctx *sctx, + u64 dir, u64 dir_gen, + u64 ino, u64 ino_gen, + const char *name, int name_len) +{ + int ret = 0; + u64 gen; + u64 ow_inode; + u8 other_type; + + if (!sctx->parent_root) + goto out; + + ret = is_inode_existent(sctx, dir, dir_gen); + if (ret <= 0) + goto out; + + /* check if the ref was overwritten by another ref */ + ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, + &ow_inode, &other_type); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + /* was never and will never be overwritten */ + ret = 0; + goto out; + } + + ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, + NULL, NULL); + if (ret < 0) + goto out; + + if (ow_inode == ino && gen == ino_gen) { + ret = 0; + goto out; + } + + /* we know that it is or will be overwritten. check this now */ + if (ow_inode < sctx->send_progress) + ret = 1; + else + ret = 0; + +out: + return ret; +} + +/* + * Same as did_overwrite_ref, but also checks if it is the first ref of an inode + * that got overwritten. This is used by process_recorded_refs to determine + * if it has to use the path as returned by get_cur_path or the orphan name. + */ +static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret = 0; + struct fs_path *name = NULL; + u64 dir; + u64 dir_gen; + + if (!sctx->parent_root) + goto out; + + name = fs_path_alloc(); + if (!name) + return -ENOMEM; + + ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); + if (ret < 0) + goto out; + + ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, + name->start, fs_path_len(name)); + +out: + fs_path_free(name); + return ret; +} + +/* + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, + * so we need to do some special handling in case we have clashes. This function + * takes care of this with the help of name_cache_entry::radix_list. + * In case of error, nce is kfreed. + */ +static int name_cache_insert(struct send_ctx *sctx, + struct name_cache_entry *nce) +{ + int ret = 0; + struct list_head *nce_head; + + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); + if (!nce_head) { + nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); + if (!nce_head) { + kfree(nce); + return -ENOMEM; + } + INIT_LIST_HEAD(nce_head); + + ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); + if (ret < 0) { + kfree(nce_head); + kfree(nce); + return ret; + } + } + list_add_tail(&nce->radix_list, nce_head); + list_add_tail(&nce->list, &sctx->name_cache_list); + sctx->name_cache_size++; + + return ret; +} + +static void name_cache_delete(struct send_ctx *sctx, + struct name_cache_entry *nce) +{ + struct list_head *nce_head; + + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); + if (!nce_head) { + btrfs_err(sctx->send_root->fs_info, + "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", + nce->ino, sctx->name_cache_size); + } + + list_del(&nce->radix_list); + list_del(&nce->list); + sctx->name_cache_size--; + + /* + * We may not get to the final release of nce_head if the lookup fails + */ + if (nce_head && list_empty(nce_head)) { + radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); + kfree(nce_head); + } +} + +static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) +{ + struct list_head *nce_head; + struct name_cache_entry *cur; + + nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); + if (!nce_head) + return NULL; + + list_for_each_entry(cur, nce_head, radix_list) { + if (cur->ino == ino && cur->gen == gen) + return cur; + } + return NULL; +} + +/* + * Removes the entry from the list and adds it back to the end. This marks the + * entry as recently used so that name_cache_clean_unused does not remove it. + */ +static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) +{ + list_del(&nce->list); + list_add_tail(&nce->list, &sctx->name_cache_list); +} + +/* + * Remove some entries from the beginning of name_cache_list. + */ +static void name_cache_clean_unused(struct send_ctx *sctx) +{ + struct name_cache_entry *nce; + + if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) + return; + + while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { + nce = list_entry(sctx->name_cache_list.next, + struct name_cache_entry, list); + name_cache_delete(sctx, nce); + kfree(nce); + } +} + +static void name_cache_free(struct send_ctx *sctx) +{ + struct name_cache_entry *nce; + + while (!list_empty(&sctx->name_cache_list)) { + nce = list_entry(sctx->name_cache_list.next, + struct name_cache_entry, list); + name_cache_delete(sctx, nce); + kfree(nce); + } +} + +/* + * Used by get_cur_path for each ref up to the root. + * Returns 0 if it succeeded. + * Returns 1 if the inode is not existent or got overwritten. In that case, the + * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 + * is returned, parent_ino/parent_gen are not guaranteed to be valid. + * Returns <0 in case of error. + */ +static int __get_cur_name_and_parent(struct send_ctx *sctx, + u64 ino, u64 gen, + u64 *parent_ino, + u64 *parent_gen, + struct fs_path *dest) +{ + int ret; + int nce_ret; + struct name_cache_entry *nce = NULL; + + /* + * First check if we already did a call to this function with the same + * ino/gen. If yes, check if the cache entry is still up-to-date. If yes + * return the cached result. + */ + nce = name_cache_search(sctx, ino, gen); + if (nce) { + if (ino < sctx->send_progress && nce->need_later_update) { + name_cache_delete(sctx, nce); + kfree(nce); + nce = NULL; + } else { + name_cache_used(sctx, nce); + *parent_ino = nce->parent_ino; + *parent_gen = nce->parent_gen; + ret = fs_path_add(dest, nce->name, nce->name_len); + if (ret < 0) + goto out; + ret = nce->ret; + goto out; + } + } + + /* + * If the inode is not existent yet, add the orphan name and return 1. + * This should only happen for the parent dir that we determine in + * __record_new_ref + */ + ret = is_inode_existent(sctx, ino, gen); + if (ret < 0) + goto out; + + if (!ret) { + ret = gen_unique_name(sctx, ino, gen, dest); + if (ret < 0) + goto out; + ret = 1; + goto out_cache; + } + + /* + * Depending on whether the inode was already processed or not, use + * send_root or parent_root for ref lookup. + */ + if (ino < sctx->send_progress) + ret = get_first_ref(sctx->send_root, ino, + parent_ino, parent_gen, dest); + else + ret = get_first_ref(sctx->parent_root, ino, + parent_ino, parent_gen, dest); + if (ret < 0) + goto out; + + /* + * Check if the ref was overwritten by an inode's ref that was processed + * earlier. If yes, treat as orphan and return 1. + */ + ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, + dest->start, dest->end - dest->start); + if (ret < 0) + goto out; + if (ret) { + fs_path_reset(dest); + ret = gen_unique_name(sctx, ino, gen, dest); + if (ret < 0) + goto out; + ret = 1; + } + +out_cache: + /* + * Store the result of the lookup in the name cache. + */ + nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); + if (!nce) { + ret = -ENOMEM; + goto out; + } + + nce->ino = ino; + nce->gen = gen; + nce->parent_ino = *parent_ino; + nce->parent_gen = *parent_gen; + nce->name_len = fs_path_len(dest); + nce->ret = ret; + strcpy(nce->name, dest->start); + + if (ino < sctx->send_progress) + nce->need_later_update = 0; + else + nce->need_later_update = 1; + + nce_ret = name_cache_insert(sctx, nce); + if (nce_ret < 0) + ret = nce_ret; + name_cache_clean_unused(sctx); + +out: + return ret; +} + +/* + * Magic happens here. This function returns the first ref to an inode as it + * would look like while receiving the stream at this point in time. + * We walk the path up to the root. For every inode in between, we check if it + * was already processed/sent. If yes, we continue with the parent as found + * in send_root. If not, we continue with the parent as found in parent_root. + * If we encounter an inode that was deleted at this point in time, we use the + * inodes "orphan" name instead of the real name and stop. Same with new inodes + * that were not created yet and overwritten inodes/refs. + * + * When do we have have orphan inodes: + * 1. When an inode is freshly created and thus no valid refs are available yet + * 2. When a directory lost all it's refs (deleted) but still has dir items + * inside which were not processed yet (pending for move/delete). If anyone + * tried to get the path to the dir items, it would get a path inside that + * orphan directory. + * 3. When an inode is moved around or gets new links, it may overwrite the ref + * of an unprocessed inode. If in that case the first ref would be + * overwritten, the overwritten inode gets "orphanized". Later when we + * process this overwritten inode, it is restored at a new place by moving + * the orphan inode. + * + * sctx->send_progress tells this function at which point in time receiving + * would be. + */ +static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, + struct fs_path *dest) +{ + int ret = 0; + struct fs_path *name = NULL; + u64 parent_inode = 0; + u64 parent_gen = 0; + int stop = 0; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + + dest->reversed = 1; + fs_path_reset(dest); + + while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) { + ret = gen_unique_name(sctx, ino, gen, name); + if (ret < 0) + goto out; + ret = fs_path_add_path(dest, name); + break; + } + + if (is_waiting_for_move(sctx, ino)) { + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret) + stop = 1; + } + + if (ret < 0) + goto out; + + ret = fs_path_add_path(dest, name); + if (ret < 0) + goto out; + + ino = parent_inode; + gen = parent_gen; + } + +out: + fs_path_free(name); + if (!ret) + fs_path_unreverse(dest); + return ret; +} + +/* + * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace + */ +static int send_subvol_begin(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *send_root = sctx->send_root; + struct btrfs_root *parent_root = sctx->parent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + char *name = NULL; + int namelen; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS); + if (!name) { + btrfs_free_path(path); + return -ENOMEM; + } + + key.objectid = send_root->objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, + &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type != BTRFS_ROOT_BACKREF_KEY || + key.objectid != send_root->objectid) { + ret = -ENOENT; + goto out; + } + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + namelen = btrfs_root_ref_name_len(leaf, ref); + read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); + btrfs_release_path(path); + + if (parent_root) { + ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); + if (ret < 0) + goto out; + } else { + ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); + if (ret < 0) + goto out; + } + + TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, + le64_to_cpu(sctx->send_root->root_item.ctransid)); + if (parent_root) { + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + sctx->parent_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, + le64_to_cpu(sctx->parent_root->root_item.ctransid)); + } + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + btrfs_free_path(path); + kfree(name); + return ret; +} + +static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) +{ + int ret = 0; + struct fs_path *p; + +verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) +{ + int ret = 0; + struct fs_path *p; + +verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) +{ + int ret = 0; + struct fs_path *p; + +verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret = 0; + struct fs_path *p = NULL; + struct btrfs_inode_item *ii; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + struct btrfs_key key; + int slot; + +verbose_printk("btrfs: send_utimes %llu\n", ino); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + eb = path->nodes[0]; + slot = path->slots[0]; + ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + + ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, + btrfs_inode_atime(ii)); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, + btrfs_inode_mtime(ii)); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, + btrfs_inode_ctime(ii)); + /* TODO Add otime support when the otime patches get into upstream */ + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + btrfs_free_path(path); + return ret; +} + +/* + * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have + * a valid path yet because we did not process the refs yet. So, the inode + * is created as orphan. + */ +static int send_create_inode(struct send_ctx *sctx, u64 ino) +{ + int ret = 0; + struct fs_path *p; + int cmd; + u64 gen; + u64 mode; + u64 rdev; + +verbose_printk("btrfs: send_create_inode %llu\n", ino); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + if (ino != sctx->cur_ino) { + ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, + NULL, NULL, &rdev); + if (ret < 0) + goto out; + } else { + gen = sctx->cur_inode_gen; + mode = sctx->cur_inode_mode; + rdev = sctx->cur_inode_rdev; + } + + if (S_ISREG(mode)) { + cmd = BTRFS_SEND_C_MKFILE; + } else if (S_ISDIR(mode)) { + cmd = BTRFS_SEND_C_MKDIR; + } else if (S_ISLNK(mode)) { + cmd = BTRFS_SEND_C_SYMLINK; + } else if (S_ISCHR(mode) || S_ISBLK(mode)) { + cmd = BTRFS_SEND_C_MKNOD; + } else if (S_ISFIFO(mode)) { + cmd = BTRFS_SEND_C_MKFIFO; + } else if (S_ISSOCK(mode)) { + cmd = BTRFS_SEND_C_MKSOCK; + } else { + printk(KERN_WARNING "btrfs: unexpected inode type %o", + (int)(mode & S_IFMT)); + ret = -ENOTSUPP; + goto out; + } + + ret = begin_cmd(sctx, cmd); + if (ret < 0) + goto out; + + ret = gen_unique_name(sctx, ino, gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino); + + if (S_ISLNK(mode)) { + fs_path_reset(p); + ret = read_symlink(sctx->send_root, ino, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); + } else if (S_ISCHR(mode) || S_ISBLK(mode) || + S_ISFIFO(mode) || S_ISSOCK(mode)) { + TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode); + } + + ret = send_cmd(sctx); + if (ret < 0) + goto out; + + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +/* + * We need some special handling for inodes that get processed before the parent + * directory got created. See process_recorded_refs for details. + * This function does the check if we already created the dir out of order. + */ +static int did_create_dir(struct send_ctx *sctx, u64 dir) +{ + int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key di_key; + struct extent_buffer *eb; + struct btrfs_dir_item *di; + int slot; + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(sctx->send_root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + + if (di_key.type != BTRFS_ROOT_ITEM_KEY && + di_key.objectid < sctx->send_progress) { + ret = 1; + goto out; + } + + path->slots[0]++; + } + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Only creates the inode if it is: + * 1. Not a directory + * 2. Or a directory which was not created already due to out of order + * directories. See did_create_dir and process_recorded_refs for details. + */ +static int send_create_inode_if_needed(struct send_ctx *sctx) +{ + int ret; + + if (S_ISDIR(sctx->cur_inode_mode)) { + ret = did_create_dir(sctx, sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + } + + ret = send_create_inode(sctx, sctx->cur_ino); + if (ret < 0) + goto out; + +out: + return ret; +} + +struct recorded_ref { + struct list_head list; + char *dir_path; + char *name; + struct fs_path *full_path; + u64 dir; + u64 dir_gen; + int dir_path_len; + int name_len; +}; + +/* + * We need to process new refs before deleted refs, but compare_tree gives us + * everything mixed. So we first record all refs and later process them. + * This function is a helper to record one ref. + */ +static int __record_ref(struct list_head *head, u64 dir, + u64 dir_gen, struct fs_path *path) +{ + struct recorded_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->dir = dir; + ref->dir_gen = dir_gen; + ref->full_path = path; + + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; + ref->dir_path = ref->full_path->start; + if (ref->name == ref->full_path->start) + ref->dir_path_len = 0; + else + ref->dir_path_len = ref->full_path->end - + ref->full_path->start - 1 - ref->name_len; + + list_add_tail(&ref->list, head); + return 0; +} + +static int dup_ref(struct recorded_ref *ref, struct list_head *list) +{ + struct recorded_ref *new; + + new = kmalloc(sizeof(*ref), GFP_NOFS); + if (!new) + return -ENOMEM; + + new->dir = ref->dir; + new->dir_gen = ref->dir_gen; + new->full_path = NULL; + INIT_LIST_HEAD(&new->list); + list_add_tail(&new->list, list); + return 0; +} + +static void __free_recorded_refs(struct list_head *head) +{ + struct recorded_ref *cur; + + while (!list_empty(head)) { + cur = list_entry(head->next, struct recorded_ref, list); + fs_path_free(cur->full_path); + list_del(&cur->list); + kfree(cur); + } +} + +static void free_recorded_refs(struct send_ctx *sctx) +{ + __free_recorded_refs(&sctx->new_refs); + __free_recorded_refs(&sctx->deleted_refs); +} + +/* + * Renames/moves a file/dir to its orphan name. Used when the first + * ref of an unprocessed inode gets overwritten and for all non empty + * directories. + */ +static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, + struct fs_path *path) +{ + int ret; + struct fs_path *orphan; + + orphan = fs_path_alloc(); + if (!orphan) + return -ENOMEM; + + ret = gen_unique_name(sctx, ino, gen, orphan); + if (ret < 0) + goto out; + + ret = send_rename(sctx, path, orphan); + +out: + fs_path_free(orphan); + return ret; +} + +static struct orphan_dir_info * +add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node **p = &sctx->orphan_dirs.rb_node; + struct rb_node *parent = NULL; + struct orphan_dir_info *entry, *odi; + + odi = kmalloc(sizeof(*odi), GFP_NOFS); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct orphan_dir_info, node); + if (dir_ino < entry->ino) { + p = &(*p)->rb_left; + } else if (dir_ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(odi); + return entry; + } + } + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); + return odi; +} + +static struct orphan_dir_info * +get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node *n = sctx->orphan_dirs.rb_node; + struct orphan_dir_info *entry; + + while (n) { + entry = rb_entry(n, struct orphan_dir_info, node); + if (dir_ino < entry->ino) + n = n->rb_left; + else if (dir_ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino) +{ + struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino); + + return odi != NULL; +} + +static void free_orphan_dir_info(struct send_ctx *sctx, + struct orphan_dir_info *odi) +{ + if (!odi) + return; + rb_erase(&odi->node, &sctx->orphan_dirs); + kfree(odi); +} + +/* + * Returns 1 if a directory can be removed at this point in time. + * We check this by iterating all dir items and checking if the inode behind + * the dir item was already processed. + */ +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + u64 send_progress) +{ + int ret = 0; + struct btrfs_root *root = sctx->parent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key loc; + struct btrfs_dir_item *di; + + /* + * Don't try to rmdir the top/root subvolume dir. + */ + if (dir == BTRFS_FIRST_FREE_OBJECTID) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + struct waiting_dir_move *dm; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid || + found_key.type != key.type) + break; + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { + struct orphan_dir_info *odi; + + odi = add_orphan_dir_info(sctx, dir); + if (IS_ERR(odi)) { + ret = PTR_ERR(odi); + goto out; + } + odi->gen = dir_gen; + dm->rmdir_ino = dir; + ret = 0; + goto out; + } + + if (loc.objectid > send_progress) { + ret = 0; + goto out; + } + + path->slots[0]++; + } + + ret = 1; + +out: + btrfs_free_path(path); + return ret; +} + +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +{ + struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); + + return entry != NULL; +} + +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node **p = &sctx->waiting_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct waiting_dir_move *entry, *dm; + + dm = kmalloc(sizeof(*dm), GFP_NOFS); + if (!dm) + return -ENOMEM; + dm->ino = ino; + dm->rmdir_ino = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct waiting_dir_move, node); + if (ino < entry->ino) { + p = &(*p)->rb_left; + } else if (ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(dm); + return -EEXIST; + } + } + + rb_link_node(&dm->node, parent, p); + rb_insert_color(&dm->node, &sctx->waiting_dir_moves); + return 0; +} + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) + n = n->rb_left; + else if (ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static void free_waiting_dir_move(struct send_ctx *sctx, + struct waiting_dir_move *dm) +{ + if (!dm) + return; + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); +} + +static int add_pending_dir_move(struct send_ctx *sctx, + u64 ino, + u64 ino_gen, + u64 parent_ino, + struct list_head *new_refs, + struct list_head *deleted_refs) +{ + struct rb_node **p = &sctx->pending_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct pending_dir_move *entry = NULL, *pm; + struct recorded_ref *cur; + int exists = 0; + int ret; + + pm = kmalloc(sizeof(*pm), GFP_NOFS); + if (!pm) + return -ENOMEM; + pm->parent_ino = parent_ino; + pm->ino = ino; + pm->gen = ino_gen; + INIT_LIST_HEAD(&pm->list); + INIT_LIST_HEAD(&pm->update_refs); + RB_CLEAR_NODE(&pm->node); + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) { + p = &(*p)->rb_left; + } else if (parent_ino > entry->parent_ino) { + p = &(*p)->rb_right; + } else { + exists = 1; + break; + } + } + + list_for_each_entry(cur, deleted_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + list_for_each_entry(cur, new_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + + ret = add_waiting_dir_move(sctx, pm->ino); + if (ret) + goto out; + + if (exists) { + list_add_tail(&pm->list, &entry->list); + } else { + rb_link_node(&pm->node, parent, p); + rb_insert_color(&pm->node, &sctx->pending_dir_moves); + } + ret = 0; +out: + if (ret) { + __free_recorded_refs(&pm->update_refs); + kfree(pm); + } + return ret; +} + +static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, + u64 parent_ino) +{ + struct rb_node *n = sctx->pending_dir_moves.rb_node; + struct pending_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) + n = n->rb_left; + else if (parent_ino > entry->parent_ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int path_loop(struct send_ctx *sctx, struct fs_path *name, + u64 ino, u64 gen, u64 *ancestor_ino) +{ + int ret = 0; + u64 parent_inode = 0; + u64 parent_gen = 0; + u64 start_ino = ino; + + *ancestor_ino = 0; + while (ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) + break; + if (is_waiting_for_move(sctx, ino)) { + if (*ancestor_ino == 0) + *ancestor_ino = ino; + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret > 0) { + ret = 0; + break; + } + } + if (ret < 0) + break; + if (parent_inode == start_ino) { + ret = 1; + if (*ancestor_ino == 0) + *ancestor_ino = ino; + break; + } + ino = parent_inode; + gen = parent_gen; + } + return ret; +} + +static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) +{ + struct fs_path *from_path = NULL; + struct fs_path *to_path = NULL; + struct fs_path *name = NULL; + u64 orig_progress = sctx->send_progress; + struct recorded_ref *cur; + u64 parent_ino, parent_gen; + struct waiting_dir_move *dm = NULL; + u64 rmdir_ino = 0; + int ret; + u64 ancestor = 0; + + name = fs_path_alloc(); + from_path = fs_path_alloc(); + if (!name || !from_path) { + ret = -ENOMEM; + goto out; + } + + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + rmdir_ino = dm->rmdir_ino; + free_waiting_dir_move(sctx, dm); + + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + if (ret < 0) + goto out; + + sctx->send_progress = sctx->cur_ino + 1; + ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); + if (ret) { + LIST_HEAD(deleted_refs); + ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); + ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, + &pm->update_refs, &deleted_refs); + if (ret < 0) + goto out; + if (rmdir_ino) { + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + dm->rmdir_ino = rmdir_ino; + } + goto out; + } + fs_path_reset(name); + to_path = name; + name = NULL; + ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); + if (ret < 0) + goto out; + + ret = send_rename(sctx, from_path, to_path); + if (ret < 0) + goto out; + + if (rmdir_ino) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, rmdir_ino); + if (!odi) { + /* already deleted */ + goto finish; + } + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + if (ret < 0) + goto out; + if (!ret) + goto finish; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, name); + if (ret < 0) + goto out; + free_orphan_dir_info(sctx, odi); + } + +finish: + ret = send_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + + /* + * After rename/move, need to update the utimes of both new parent(s) + * and old parent(s). + */ + list_for_each_entry(cur, &pm->update_refs, list) { + if (cur->dir == rmdir_ino) + continue; + ret = send_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } + +out: + fs_path_free(name); + fs_path_free(from_path); + fs_path_free(to_path); + sctx->send_progress = orig_progress; + + return ret; +} + +static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) +{ + if (!list_empty(&m->list)) + list_del(&m->list); + if (!RB_EMPTY_NODE(&m->node)) + rb_erase(&m->node, &sctx->pending_dir_moves); + __free_recorded_refs(&m->update_refs); + kfree(m); +} + +static void tail_append_pending_moves(struct pending_dir_move *moves, + struct list_head *stack) +{ + if (list_empty(&moves->list)) { + list_add_tail(&moves->list, stack); + } else { + LIST_HEAD(list); + list_splice_init(&moves->list, &list); + list_add_tail(&moves->list, stack); + list_splice_tail(&list, stack); + } +} + +static int apply_children_dir_moves(struct send_ctx *sctx) +{ + struct pending_dir_move *pm; + struct list_head stack; + u64 parent_ino = sctx->cur_ino; + int ret = 0; + + pm = get_pending_dir_moves(sctx, parent_ino); + if (!pm) + return 0; + + INIT_LIST_HEAD(&stack); + tail_append_pending_moves(pm, &stack); + + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + parent_ino = pm->ino; + ret = apply_dir_move(sctx, pm); + free_pending_move(sctx, pm); + if (ret) + goto out; + pm = get_pending_dir_moves(sctx, parent_ino); + if (pm) + tail_append_pending_moves(pm, &stack); + } + return 0; + +out: + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + free_pending_move(sctx, pm); + } + return ret; +} + +static int wait_for_parent_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref) +{ + int ret = 0; + u64 ino = parent_ref->dir; + u64 parent_ino_before, parent_ino_after; + struct fs_path *path_before = NULL; + struct fs_path *path_after = NULL; + int len1, len2; + + path_after = fs_path_alloc(); + path_before = fs_path_alloc(); + if (!path_after || !path_before) { + ret = -ENOMEM; + goto out; + } + + /* + * Our current directory inode may not yet be renamed/moved because some + * ancestor (immediate or not) has to be renamed/moved first. So find if + * such ancestor exists and make sure our own rename/move happens after + * that ancestor is processed. + */ + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + if (is_waiting_for_move(sctx, ino)) { + ret = 1; + break; + } + + fs_path_reset(path_before); + fs_path_reset(path_after); + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + NULL, path_after); + if (ret < 0) + goto out; + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret < 0 && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 1; + break; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if (ino > sctx->cur_ino && + (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { + ret = 1; + break; + } + ino = parent_ino_after; + } + +out: + fs_path_free(path_before); + fs_path_free(path_after); + + if (ret == 1) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + ino, + &sctx->new_refs, + &sctx->deleted_refs); + if (!ret) + ret = 1; + } + + return ret; +} + +/* + * This does all the move/link/unlink/rmdir magic. + */ +static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) +{ + int ret = 0; + struct recorded_ref *cur; + struct recorded_ref *cur2; + struct list_head check_dirs; + struct fs_path *valid_path = NULL; + u64 ow_inode = 0; + u64 ow_gen; + int did_overwrite = 0; + int is_orphan = 0; + u64 last_dir_ino_rm = 0; + +verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); + + /* + * This should never happen as the root dir always has the same ref + * which is always '..' + */ + BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + INIT_LIST_HEAD(&check_dirs); + + valid_path = fs_path_alloc(); + if (!valid_path) { + ret = -ENOMEM; + goto out; + } + + /* + * First, check if the first ref of the current inode was overwritten + * before. If yes, we know that the current inode was already orphanized + * and thus use the orphan name. If not, we can use get_cur_path to + * get the path of the first ref as it would like while receiving at + * this point in time. + * New inodes are always orphan at the beginning, so force to use the + * orphan name in this case. + * The first ref is stored in valid_path and will be updated if it + * gets moved around. + */ + if (!sctx->cur_inode_new) { + ret = did_overwrite_first_ref(sctx, sctx->cur_ino, + sctx->cur_inode_gen); + if (ret < 0) + goto out; + if (ret) + did_overwrite = 1; + } + if (sctx->cur_inode_new || did_overwrite) { + ret = gen_unique_name(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; + is_orphan = 1; + } else { + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + valid_path); + if (ret < 0) + goto out; + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * the the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + + /* + * Check if this new ref would overwrite the first ref of + * another unprocessed inode. If yes, orphanize the + * overwritten inode. If we find an overwritten ref that is + * not the first ref, simply unlink it. + */ + ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, + cur->name, cur->name_len, + &ow_inode, &ow_gen); + if (ret < 0) + goto out; + if (ret) { + ret = is_first_ref(sctx->parent_root, + ow_inode, cur->dir, cur->name, + cur->name_len); + if (ret < 0) + goto out; + if (ret) { + ret = orphanize_inode(sctx, ow_inode, ow_gen, + cur->full_path); + if (ret < 0) + goto out; + } else { + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; + } + } + + /* + * link/move the ref to the new place. If we have an orphan + * inode, move it and update valid_path. If not, link or move + * it depending on the inode mode. + */ + if (is_orphan) { + ret = send_rename(sctx, valid_path, cur->full_path); + if (ret < 0) + goto out; + is_orphan = 0; + ret = fs_path_copy(valid_path, cur->full_path); + if (ret < 0) + goto out; + } else { + if (S_ISDIR(sctx->cur_inode_mode)) { + /* + * Dirs can't be linked, so move it. For moved + * dirs, we always have one new and one deleted + * ref. The deleted ref is ignored later. + */ + ret = wait_for_parent_move(sctx, cur); + if (ret < 0) + goto out; + if (ret) { + *pending_move = 1; + } else { + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); + } + if (ret < 0) + goto out; + } else { + ret = send_link(sctx, cur->full_path, + valid_path); + if (ret < 0) + goto out; + } + } + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } + + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) { + /* + * Check if we can already rmdir the directory. If not, + * orphanize it. For every dir item inside that gets deleted + * later, we do this check again and rmdir it then if possible. + * See the use of check_dirs for more details. + */ + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = send_rmdir(sctx, valid_path); + if (ret < 0) + goto out; + } else if (!is_orphan) { + ret = orphanize_inode(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; + is_orphan = 1; + } + + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } + } else if (S_ISDIR(sctx->cur_inode_mode) && + !list_empty(&sctx->deleted_refs)) { + /* + * We have a moved dir. Add the old parent to check_dirs + */ + cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, + list); + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } else if (!S_ISDIR(sctx->cur_inode_mode)) { + /* + * We have a non dir inode. Go through all deleted refs and + * unlink them if they were not already overwritten by other + * inodes. + */ + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino, sctx->cur_inode_gen, + cur->name, cur->name_len); + if (ret < 0) + goto out; + if (!ret) { + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; + } + ret = dup_ref(cur, &check_dirs); + if (ret < 0) + goto out; + } + /* + * If the inode is still orphan, unlink the orphan. This may + * happen when a previous inode did overwrite the first ref + * of this inode and no new refs were added for the current + * inode. Unlinking does not mean that the inode is deleted in + * all cases. There may still be links to this inode in other + * places. + */ + if (is_orphan) { + ret = send_unlink(sctx, valid_path); + if (ret < 0) + goto out; + } + } + + /* + * We did collect all parent dirs where cur_inode was once located. We + * now go through all these dirs and check if they are pending for + * deletion and if it's finally possible to perform the rmdir now. + * We also update the inode stats of the parent dirs here. + */ + list_for_each_entry(cur, &check_dirs, list) { + /* + * In case we had refs into dirs that were not processed yet, + * we don't need to do the utime and rmdir logic for these dirs. + * The dir will be processed later. + */ + if (cur->dir > sctx->cur_ino) + continue; + + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + + if (ret == inode_state_did_create || + ret == inode_state_no_change) { + /* TODO delayed utimes */ + ret = send_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { + ret = can_rmdir(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = get_cur_path(sctx, cur->dir, + cur->dir_gen, valid_path); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, valid_path); + if (ret < 0) + goto out; + last_dir_ino_rm = cur->dir; + } + } + } + + ret = 0; + +out: + __free_recorded_refs(&check_dirs); + free_recorded_refs(sctx); + fs_path_free(valid_path); + return ret; +} + +static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, + struct fs_path *name, void *ctx, struct list_head *refs) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + struct fs_path *p; + u64 gen; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, + NULL, NULL); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, dir, gen, p); + if (ret < 0) + goto out; + ret = fs_path_add_path(p, name); + if (ret < 0) + goto out; + + ret = __record_ref(refs, dir, gen, p); + +out: + if (ret) + fs_path_free(p); + return ret; +} + +static int __record_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + struct send_ctx *sctx = ctx; + return record_ref(sctx->send_root, num, dir, index, name, + ctx, &sctx->new_refs); +} + + +static int __record_deleted_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + struct send_ctx *sctx = ctx; + return record_ref(sctx->parent_root, num, dir, index, name, + ctx, &sctx->deleted_refs); +} + +static int record_new_ref(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_new_ref, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +static int record_deleted_ref(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_deleted_ref, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +struct find_ref_ctx { + u64 dir; + u64 dir_gen; + struct btrfs_root *root; + struct fs_path *name; + int found_idx; +}; + +static int __find_iref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx_) +{ + struct find_ref_ctx *ctx = ctx_; + u64 dir_gen; + int ret; + + if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && + strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { + /* + * To avoid doing extra lookups we'll only do this if everything + * else matches. + */ + ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + if (dir_gen != ctx->dir_gen) + return 0; + ctx->found_idx = num; + return 1; + } + return 0; +} + +static int find_iref(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 dir, u64 dir_gen, struct fs_path *name) +{ + int ret; + struct find_ref_ctx ctx; + + ctx.dir = dir; + ctx.name = name; + ctx.dir_gen = dir_gen; + ctx.found_idx = -1; + ctx.root = root; + + ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); + if (ret < 0) + return ret; + + if (ctx.found_idx == -1) + return -ENOENT; + + return ctx.found_idx; +} + +static int __record_changed_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + u64 dir_gen; + int ret; + struct send_ctx *sctx = ctx; + + ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + ret = find_iref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, dir, dir_gen, name); + if (ret == -ENOENT) + ret = __record_new_ref(num, dir, index, name, sctx); + else if (ret > 0) + ret = 0; + + return ret; +} + +static int __record_changed_deleted_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + u64 dir_gen; + int ret; + struct send_ctx *sctx = ctx; + + ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, + dir, dir_gen, name); + if (ret == -ENOENT) + ret = __record_deleted_ref(num, dir, index, name, sctx); + else if (ret > 0) + ret = 0; + + return ret; +} + +static int record_changed_ref(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_changed_new_ref, sctx); + if (ret < 0) + goto out; + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +/* + * Record and process all refs at once. Needed when an inode changes the + * generation number, which means that it was deleted and recreated. + */ +static int process_all_refs(struct send_ctx *sctx, + enum btrfs_compare_tree_result cmd) +{ + int ret; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + int slot; + iterate_inode_ref_t cb; + int pending_move = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + if (cmd == BTRFS_COMPARE_TREE_NEW) { + root = sctx->send_root; + cb = __record_new_ref; + } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { + root = sctx->parent_root; + cb = __record_deleted_ref; + } else { + btrfs_err(sctx->send_root->fs_info, + "Wrong command %d in process_all_refs", cmd); + ret = -EINVAL; + goto out; + } + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(eb, &found_key, slot); + + if (found_key.objectid != key.objectid || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) + break; + + ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); + if (ret < 0) + goto out; + + path->slots[0]++; + } + btrfs_release_path(path); + + ret = process_recorded_refs(sctx, &pending_move); + /* Only applicable to an incremental send. */ + ASSERT(pending_move == 0); + +out: + btrfs_free_path(path); + return ret; +} + +static int send_set_xattr(struct send_ctx *sctx, + struct fs_path *path, + const char *name, int name_len, + const char *data, int data_len) +{ + int ret = 0; + + ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); + TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int send_remove_xattr(struct send_ctx *sctx, + struct fs_path *path, + const char *name, int name_len) +{ + int ret = 0; + + ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int __process_new_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + struct fs_path *p; + posix_acl_xattr_header dummy_acl; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + /* + * This hack is needed because empty acl's are stored as zero byte + * data in xattrs. Problem with that is, that receiving these zero byte + * acl's will fail later. To fix this, we send a dummy acl list that + * only contains the version number and no entries. + */ + if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || + !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) { + if (data_len == 0) { + dummy_acl.a_version = + cpu_to_le32(POSIX_ACL_XATTR_VERSION); + data = (char *)&dummy_acl; + data_len = sizeof(dummy_acl); + } + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + ret = send_set_xattr(sctx, p, name, name_len, data, data_len); + +out: + fs_path_free(p); + return ret; +} + +static int __process_deleted_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + ret = send_remove_xattr(sctx, p, name, name_len); + +out: + fs_path_free(p); + return ret; +} + +static int process_new_xattr(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_new_xattr, sctx); + + return ret; +} + +static int process_deleted_xattr(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_deleted_xattr, sctx); + + return ret; +} + +struct find_xattr_ctx { + const char *name; + int name_len; + int found_idx; + char *found_data; + int found_data_len; +}; + +static int __find_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *vctx) +{ + struct find_xattr_ctx *ctx = vctx; + + if (name_len == ctx->name_len && + strncmp(name, ctx->name, name_len) == 0) { + ctx->found_idx = num; + ctx->found_data_len = data_len; + ctx->found_data = kmemdup(data, data_len, GFP_NOFS); + if (!ctx->found_data) + return -ENOMEM; + return 1; + } + return 0; +} + +static int find_xattr(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + const char *name, int name_len, + char **data, int *data_len) +{ + int ret; + struct find_xattr_ctx ctx; + + ctx.name = name; + ctx.name_len = name_len; + ctx.found_idx = -1; + ctx.found_data = NULL; + ctx.found_data_len = 0; + + ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); + if (ret < 0) + return ret; + + if (ctx.found_idx == -1) + return -ENOENT; + if (data) { + *data = ctx.found_data; + *data_len = ctx.found_data_len; + } else { + kfree(ctx.found_data); + } + return ctx.found_idx; +} + + +static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + char *found_data = NULL; + int found_data_len = 0; + + ret = find_xattr(sctx->parent_root, sctx->right_path, + sctx->cmp_key, name, name_len, &found_data, + &found_data_len); + if (ret == -ENOENT) { + ret = __process_new_xattr(num, di_key, name, name_len, data, + data_len, type, ctx); + } else if (ret >= 0) { + if (data_len != found_data_len || + memcmp(data, found_data, data_len)) { + ret = __process_new_xattr(num, di_key, name, name_len, + data, data_len, type, ctx); + } else { + ret = 0; + } + } + + kfree(found_data); + return ret; +} + +static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + + ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, + name, name_len, NULL, NULL); + if (ret == -ENOENT) + ret = __process_deleted_xattr(num, di_key, name, name_len, data, + data_len, type, ctx); + else if (ret >= 0) + ret = 0; + + return ret; +} + +static int process_changed_xattr(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_changed_new_xattr, sctx); + if (ret < 0) + goto out; + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_changed_deleted_xattr, sctx); + +out: + return ret; +} + +static int process_all_new_xattrs(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + int slot; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + root = sctx->send_root; + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + ret = iterate_dir_item(root, path, &found_key, + __process_new_xattr, sctx); + if (ret < 0) + goto out; + + path->slots[0]++; + } + +out: + btrfs_free_path(path); + return ret; +} + +static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct page *page; + char *addr; + struct btrfs_key key; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + pgoff_t last_index; + unsigned pg_offset = offset & ~PAGE_CACHE_MASK; + ssize_t ret = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (offset + len > i_size_read(inode)) { + if (offset > i_size_read(inode)) + len = 0; + else + len = offset - i_size_read(inode); + } + if (len == 0) + goto out; + + last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + + /* initial readahead */ + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, inode->i_mapping); + btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index, + last_index - index + 1); + + while (index <= last_index) { + unsigned cur_len = min_t(unsigned, len, + PAGE_CACHE_SIZE - pg_offset); + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + break; + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + break; + } + } + + addr = kmap(page); + memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + kunmap(page); + unlock_page(page); + page_cache_release(page); + index++; + pg_offset = 0; + len -= cur_len; + ret += cur_len; + } +out: + iput(inode); + return ret; +} + +/* + * Read some bytes from the current inode/file and send a write command to + * user space. + */ +static int send_write(struct send_ctx *sctx, u64 offset, u32 len) +{ + int ret = 0; + struct fs_path *p; + ssize_t num_read = 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + +verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); + + num_read = fill_read_buf(sctx, offset, len); + if (num_read <= 0) { + if (num_read < 0) + ret = num_read; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + if (ret < 0) + return ret; + return num_read; +} + +/* + * Send a clone command to user space. + */ +static int send_clone(struct send_ctx *sctx, + u64 offset, u32 len, + struct clone_root *clone_root) +{ + int ret = 0; + struct fs_path *p; + u64 gen; + +verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " + "clone_inode=%llu, clone_offset=%llu\n", offset, len, + clone_root->root->objectid, clone_root->ino, + clone_root->offset); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + + if (clone_root->root == sctx->send_root) { + ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, + &gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, clone_root->ino, gen, p); + } else { + ret = get_inode_path(clone_root->root, clone_root->ino, p); + } + if (ret < 0) + goto out; + + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, + le64_to_cpu(clone_root->root->root_item.ctransid)); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, + clone_root->offset); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +/* + * Send an update extent command to user space. + */ +static int send_update_extent(struct send_ctx *sctx, + u64 offset, u32 len) +{ + int ret = 0; + struct fs_path *p; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + +static int send_hole(struct send_ctx *sctx, u64 end) +{ + struct fs_path *p = NULL; + u64 offset = sctx->cur_inode_last_extent; + u64 len; + int ret = 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto tlv_put_failure; + memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); + while (offset < end) { + len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + break; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = send_cmd(sctx); + if (ret < 0) + break; + offset += len; + } +tlv_put_failure: + fs_path_free(p); + return ret; +} + +static int send_write_or_clone(struct send_ctx *sctx, + struct btrfs_path *path, + struct btrfs_key *key, + struct clone_root *clone_root) +{ + int ret = 0; + struct btrfs_file_extent_item *ei; + u64 offset = key->offset; + u64 pos = 0; + u64 len; + u32 l; + u8 type; + u64 bs = sctx->send_root->fs_info->sb->s_blocksize; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (type == BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], ei); + /* + * it is possible the inline item won't cover the whole page, + * but there may be items after this page. Make + * sure to send the whole thing + */ + len = PAGE_CACHE_ALIGN(len); + } else { + len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + } + + if (offset + len > sctx->cur_inode_size) + len = sctx->cur_inode_size - offset; + if (len == 0) { + ret = 0; + goto out; + } + + if (clone_root && IS_ALIGNED(offset + len, bs)) { + ret = send_clone(sctx, offset, len, clone_root); + } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { + ret = send_update_extent(sctx, offset, len); + } else { + while (pos < len) { + l = len - pos; + if (l > BTRFS_SEND_READ_SIZE) + l = BTRFS_SEND_READ_SIZE; + ret = send_write(sctx, pos + offset, l); + if (ret < 0) + goto out; + if (!ret) + break; + pos += ret; + } + ret = 0; + } +out: + return ret; +} + +static int is_extent_unchanged(struct send_ctx *sctx, + struct btrfs_path *left_path, + struct btrfs_key *ekey) +{ + int ret = 0; + struct btrfs_key key; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + int slot; + struct btrfs_key found_key; + struct btrfs_file_extent_item *ei; + u64 left_disknr; + u64 right_disknr; + u64 left_offset; + u64 right_offset; + u64 left_offset_fixed; + u64 left_len; + u64 right_len; + u64 left_gen; + u64 right_gen; + u8 left_type; + u8 right_type; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + eb = left_path->nodes[0]; + slot = left_path->slots[0]; + ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + left_type = btrfs_file_extent_type(eb, ei); + + if (left_type != BTRFS_FILE_EXTENT_REG) { + ret = 0; + goto out; + } + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + left_len = btrfs_file_extent_num_bytes(eb, ei); + left_offset = btrfs_file_extent_offset(eb, ei); + left_gen = btrfs_file_extent_generation(eb, ei); + + /* + * Following comments will refer to these graphics. L is the left + * extents which we are checking at the moment. 1-8 are the right + * extents that we iterate. + * + * |-----L-----| + * |-1-|-2a-|-3-|-4-|-5-|-6-| + * + * |-----L-----| + * |--1--|-2b-|...(same as above) + * + * Alternative situation. Happens on files where extents got split. + * |-----L-----| + * |-----------7-----------|-6-| + * + * Alternative situation. Happens on files which got larger. + * |-----L-----| + * |-8-| + * Nothing follows after 8. + */ + + key.objectid = ekey->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ekey->offset; + ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + /* + * Handle special case where the right side has no extents at all. + */ + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + /* If we're a hole then just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; + goto out; + } + + /* + * We're now on 2a, 2b or 7. + */ + key = found_key; + while (key.offset < ekey->offset + left_len) { + ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + right_type = btrfs_file_extent_type(eb, ei); + if (right_type != BTRFS_FILE_EXTENT_REG) { + ret = 0; + goto out; + } + + right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + right_len = btrfs_file_extent_num_bytes(eb, ei); + right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); + + /* + * Are we at extent 8? If yes, we know the extent is changed. + * This may only happen on the first iteration. + */ + if (found_key.offset + right_len <= ekey->offset) { + /* If we're a hole just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; + goto out; + } + + left_offset_fixed = left_offset; + if (key.offset < ekey->offset) { + /* Fix the right offset for 2a and 7. */ + right_offset += ekey->offset - key.offset; + } else { + /* Fix the left offset for all behind 2a and 2b */ + left_offset_fixed += key.offset - ekey->offset; + } + + /* + * Check if we have the same extent. + */ + if (left_disknr != right_disknr || + left_offset_fixed != right_offset || + left_gen != right_gen) { + ret = 0; + goto out; + } + + /* + * Go to the next extent. + */ + ret = btrfs_next_item(sctx->parent_root, path); + if (ret < 0) + goto out; + if (!ret) { + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + } + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + key.offset += right_len; + break; + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; + } + key = found_key; + } + + /* + * We're now behind the left extent (treat as unchanged) or at the end + * of the right side (treat as changed). + */ + if (key.offset >= ekey->offset + left_len) + ret = 1; + else + ret = 0; + + +out: + btrfs_free_path(path); + return ret; +} + +static int get_last_extent(struct send_ctx *sctx, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_root *root = sctx->send_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + u8 type; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + sctx->cur_inode_last_extent = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); + if (ret < 0) + goto out; + ret = 0; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key.offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key.offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + sctx->cur_inode_last_extent = extent_end; +out: + btrfs_free_path(path); + return ret; +} + +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 type; + int ret = 0; + + if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) + return 0; + + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key->offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key->offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + + if (path->slots[0] == 0 && + sctx->cur_inode_last_extent < key->offset) { + /* + * We might have skipped entire leafs that contained only + * file extent items for our current inode. These leafs have + * a generation number smaller (older) than the one in the + * current leaf and the leaf our last extent came from, and + * are located between these 2 leafs. + */ + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + if (sctx->cur_inode_last_extent < key->offset) + ret = send_hole(sctx, key->offset); + sctx->cur_inode_last_extent = extent_end; + return ret; +} + +static int process_extent(struct send_ctx *sctx, + struct btrfs_path *path, + struct btrfs_key *key) +{ + struct clone_root *found_clone = NULL; + int ret = 0; + + if (S_ISLNK(sctx->cur_inode_mode)) + return 0; + + if (sctx->parent_root && !sctx->cur_inode_new) { + ret = is_extent_unchanged(sctx, path, key); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out_hole; + } + } else { + struct btrfs_file_extent_item *ei; + u8 type; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (type == BTRFS_FILE_EXTENT_PREALLOC || + type == BTRFS_FILE_EXTENT_REG) { + /* + * The send spec does not have a prealloc command yet, + * so just leave a hole for prealloc'ed extents until + * we have enough commands queued up to justify rev'ing + * the send spec. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = 0; + goto out; + } + + /* Have a hole, just skip it. */ + if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { + ret = 0; + goto out; + } + } + } + + ret = find_extent_clone(sctx, path, key->objectid, key->offset, + sctx->cur_inode_size, &found_clone); + if (ret != -ENOENT && ret < 0) + goto out; + + ret = send_write_or_clone(sctx, path, key, found_clone); + if (ret) + goto out; +out_hole: + ret = maybe_send_hole(sctx, path, key); +out: + return ret; +} + +static int process_all_extents(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + int slot; + + root = sctx->send_root; + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(eb, &found_key, slot); + + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + ret = process_extent(sctx, path, &found_key); + if (ret < 0) + goto out; + + path->slots[0]++; + } + +out: + btrfs_free_path(path); + return ret; +} + +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, + int *pending_move, + int *refs_processed) +{ + int ret = 0; + + if (sctx->cur_ino == 0) + goto out; + if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && + sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) + goto out; + if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) + goto out; + + ret = process_recorded_refs(sctx, pending_move); + if (ret < 0) + goto out; + + *refs_processed = 1; +out: + return ret; +} + +static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) +{ + int ret = 0; + u64 left_mode; + u64 left_uid; + u64 left_gid; + u64 right_mode; + u64 right_uid; + u64 right_gid; + int need_chmod = 0; + int need_chown = 0; + int pending_move = 0; + int refs_processed = 0; + + ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, + &refs_processed); + if (ret < 0) + goto out; + + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. + * + * On the other hand, if our current inode is a directory and couldn't + * be moved/renamed because its parent was renamed/moved too and it has + * a higher inode number, we can only move/rename our current inode + * after we moved/renamed its parent. Therefore in this case operate on + * the old path (pre move/rename) of our current inode, and the + * move/rename will be performed later. + */ + if (refs_processed && !pending_move) + sctx->send_progress = sctx->cur_ino + 1; + + if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) + goto out; + if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) + goto out; + + ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, + &left_mode, &left_uid, &left_gid, NULL); + if (ret < 0) + goto out; + + if (!sctx->parent_root || sctx->cur_inode_new) { + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode)) + need_chmod = 1; + } else { + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, + NULL, NULL, &right_mode, &right_uid, + &right_gid, NULL); + if (ret < 0) + goto out; + + if (left_uid != right_uid || left_gid != right_gid) + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) + need_chmod = 1; + } + + if (S_ISREG(sctx->cur_inode_mode)) { + if (need_send_hole(sctx)) { + if (sctx->cur_inode_last_extent == (u64)-1 || + sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = get_last_extent(sctx, (u64)-1); + if (ret) + goto out; + } + if (sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret) + goto out; + } + } + ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_inode_size); + if (ret < 0) + goto out; + } + + if (need_chown) { + ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_uid, left_gid); + if (ret < 0) + goto out; + } + if (need_chmod) { + ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_mode); + if (ret < 0) + goto out; + } + + /* + * If other directory inodes depended on our current directory + * inode's move/rename, now do their move/rename operations. + */ + if (!is_waiting_for_move(sctx, sctx->cur_ino)) { + ret = apply_children_dir_moves(sctx); + if (ret) + goto out; + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. If our inode is a directory and it's + * waiting to be moved/renamed, we will send its utimes when + * it's moved/renamed, therefore we don't need to do it here. + */ + sctx->send_progress = sctx->cur_ino + 1; + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; + } + +out: + return ret; +} + +static int changed_inode(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + struct btrfs_key *key = sctx->cmp_key; + struct btrfs_inode_item *left_ii = NULL; + struct btrfs_inode_item *right_ii = NULL; + u64 left_gen = 0; + u64 right_gen = 0; + + sctx->cur_ino = key->objectid; + sctx->cur_inode_new_gen = 0; + sctx->cur_inode_last_extent = (u64)-1; + + /* + * Set send_progress to current inode. This will tell all get_cur_xxx + * functions that the current inode's refs are not updated yet. Later, + * when process_recorded_refs is finished, it is set to cur_ino + 1. + */ + sctx->send_progress = sctx->cur_ino; + + if (result == BTRFS_COMPARE_TREE_NEW || + result == BTRFS_COMPARE_TREE_CHANGED) { + left_ii = btrfs_item_ptr(sctx->left_path->nodes[0], + sctx->left_path->slots[0], + struct btrfs_inode_item); + left_gen = btrfs_inode_generation(sctx->left_path->nodes[0], + left_ii); + } else { + right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], + sctx->right_path->slots[0], + struct btrfs_inode_item); + right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], + right_ii); + } + if (result == BTRFS_COMPARE_TREE_CHANGED) { + right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], + sctx->right_path->slots[0], + struct btrfs_inode_item); + + right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], + right_ii); + + /* + * The cur_ino = root dir case is special here. We can't treat + * the inode as deleted+reused because it would generate a + * stream that tries to delete/mkdir the root dir. + */ + if (left_gen != right_gen && + sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) + sctx->cur_inode_new_gen = 1; + } + + if (result == BTRFS_COMPARE_TREE_NEW) { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = 1; + sctx->cur_inode_deleted = 0; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); + if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) + ret = send_create_inode_if_needed(sctx); + } else if (result == BTRFS_COMPARE_TREE_DELETED) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = 0; + sctx->cur_inode_deleted = 1; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + } else if (result == BTRFS_COMPARE_TREE_CHANGED) { + /* + * We need to do some special handling in case the inode was + * reported as changed with a changed generation number. This + * means that the original inode was deleted and new inode + * reused the same inum. So we have to treat the old inode as + * deleted and the new one as new. + */ + if (sctx->cur_inode_new_gen) { + /* + * First, process the inode as if it was deleted. + */ + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = 0; + sctx->cur_inode_deleted = 1; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + ret = process_all_refs(sctx, + BTRFS_COMPARE_TREE_DELETED); + if (ret < 0) + goto out; + + /* + * Now process the inode as if it was new. + */ + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = 1; + sctx->cur_inode_deleted = 0; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); + ret = send_create_inode_if_needed(sctx); + if (ret < 0) + goto out; + + ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); + if (ret < 0) + goto out; + /* + * Advance send_progress now as we did not get into + * process_recorded_refs_if_needed in the new_gen case. + */ + sctx->send_progress = sctx->cur_ino + 1; + + /* + * Now process all extents and xattrs of the inode as if + * they were all new. + */ + ret = process_all_extents(sctx); + if (ret < 0) + goto out; + ret = process_all_new_xattrs(sctx); + if (ret < 0) + goto out; + } else { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = 0; + sctx->cur_inode_new_gen = 0; + sctx->cur_inode_deleted = 0; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + } + } + +out: + return ret; +} + +/* + * We have to process new refs before deleted refs, but compare_trees gives us + * the new and deleted refs mixed. To fix this, we record the new/deleted refs + * first and later process them in process_recorded_refs. + * For the cur_inode_new_gen case, we skip recording completely because + * changed_inode did already initiate processing of refs. The reason for this is + * that in this case, compare_tree actually compares the refs of 2 different + * inodes. To fix this, process_all_refs is used in changed_inode to handle all + * refs of the right tree as deleted and all refs of the left tree as new. + */ +static int changed_ref(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + + if (!sctx->cur_inode_new_gen && + sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (result == BTRFS_COMPARE_TREE_NEW) + ret = record_new_ref(sctx); + else if (result == BTRFS_COMPARE_TREE_DELETED) + ret = record_deleted_ref(sctx); + else if (result == BTRFS_COMPARE_TREE_CHANGED) + ret = record_changed_ref(sctx); + } + + return ret; +} + +/* + * Process new/deleted/changed xattrs. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of xattrs. The reason is the same as in changed_ref + */ +static int changed_xattr(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result == BTRFS_COMPARE_TREE_NEW) + ret = process_new_xattr(sctx); + else if (result == BTRFS_COMPARE_TREE_DELETED) + ret = process_deleted_xattr(sctx); + else if (result == BTRFS_COMPARE_TREE_CHANGED) + ret = process_changed_xattr(sctx); + } + + return ret; +} + +/* + * Process new/deleted/changed extents. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of extents. The reason is the same as in changed_ref + */ +static int changed_extent(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result != BTRFS_COMPARE_TREE_DELETED) + ret = process_extent(sctx, sctx->left_path, + sctx->cmp_key); + } + + return ret; +} + +static int dir_changed(struct send_ctx *sctx, u64 dir) +{ + u64 orig_gen, new_gen; + int ret; + + ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, + NULL, NULL); + if (ret) + return ret; + + ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + return (orig_gen != new_gen) ? 1 : 0; +} + +static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u64 dirid = 0, last_dirid = 0; + unsigned long ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + int ret = 0; + + /* Easy case, just check this one dirid */ + if (key->type == BTRFS_INODE_REF_KEY) { + dirid = key->offset; + + ret = dir_changed(sctx, dirid); + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + dirid = btrfs_inode_extref_parent(leaf, extref); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + cur_offset += ref_name_len + sizeof(*extref); + if (dirid == last_dirid) + continue; + ret = dir_changed(sctx, dirid); + if (ret) + break; + last_dirid = dirid; + } +out: + return ret; +} + +/* + * Updates compare related fields in sctx and simply forwards to the actual + * changed_xxx functions. + */ +static int changed_cb(struct btrfs_root *left_root, + struct btrfs_root *right_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + + if (result == BTRFS_COMPARE_TREE_SAME) { + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) { + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + } else if (key->type == BTRFS_EXTENT_DATA_KEY) { + return maybe_send_hole(sctx, left_path, key); + } else { + return 0; + } + result = BTRFS_COMPARE_TREE_CHANGED; + ret = 0; + } + + sctx->left_path = left_path; + sctx->right_path = right_path; + sctx->cmp_key = key; + + ret = finish_inode_if_needed(sctx, 0); + if (ret < 0) + goto out; + + /* Ignore non-FS objects */ + if (key->objectid == BTRFS_FREE_INO_OBJECTID || + key->objectid == BTRFS_FREE_SPACE_OBJECTID) + goto out; + + if (key->type == BTRFS_INODE_ITEM_KEY) + ret = changed_inode(sctx, result); + else if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) + ret = changed_ref(sctx, result); + else if (key->type == BTRFS_XATTR_ITEM_KEY) + ret = changed_xattr(sctx, result); + else if (key->type == BTRFS_EXTENT_DATA_KEY) + ret = changed_extent(sctx, result); + +out: + return ret; +} + +static int full_send_tree(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *send_root = sctx->send_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct extent_buffer *eb; + int slot; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) + goto out_finish; + + while (1) { + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + + ret = changed_cb(send_root, NULL, path, NULL, + &found_key, BTRFS_COMPARE_TREE_NEW, sctx); + if (ret < 0) + goto out; + + key.objectid = found_key.objectid; + key.type = found_key.type; + key.offset = found_key.offset + 1; + + ret = btrfs_next_item(send_root, path); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + break; + } + } + +out_finish: + ret = finish_inode_if_needed(sctx, 1); + +out: + btrfs_free_path(path); + return ret; +} + +static int send_subvol(struct send_ctx *sctx) +{ + int ret; + + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) { + ret = send_header(sctx); + if (ret < 0) + goto out; + } + + ret = send_subvol_begin(sctx); + if (ret < 0) + goto out; + + if (sctx->parent_root) { + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, + changed_cb, sctx); + if (ret < 0) + goto out; + ret = finish_inode_if_needed(sctx, 1); + if (ret < 0) + goto out; + } else { + ret = full_send_tree(sctx); + if (ret < 0) + goto out; + } + +out: + free_recorded_refs(sctx); + return ret; +} + +static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) +{ + spin_lock(&root->root_item_lock); + root->send_in_progress--; + /* + * Not much left to do, we don't know why it's unbalanced and + * can't blindly reset it to 0. + */ + if (root->send_in_progress < 0) + btrfs_err(root->fs_info, + "send_in_progres unbalanced %d root %llu", + root->send_in_progress, root->root_key.objectid); + spin_unlock(&root->root_item_lock); +} + +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) +{ + int ret = 0; + struct btrfs_root *send_root; + struct btrfs_root *clone_root; + struct btrfs_fs_info *fs_info; + struct btrfs_ioctl_send_args *arg = NULL; + struct btrfs_key key; + struct send_ctx *sctx = NULL; + u32 i; + u64 *clone_sources_tmp = NULL; + int clone_sources_to_rollback = 0; + int sort_clone_roots = 0; + int index; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + send_root = BTRFS_I(file_inode(mnt_file))->root; + fs_info = send_root->fs_info; + + /* + * The subvolume must remain read-only during send, protect against + * making it RW. This also protects against deletion. + */ + spin_lock(&send_root->root_item_lock); + send_root->send_in_progress++; + spin_unlock(&send_root->root_item_lock); + + /* + * This is done when we lookup the root, it should already be complete + * by the time we get here. + */ + WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); + + /* + * Userspace tools do the checks and warn the user if it's + * not RO. + */ + if (!btrfs_root_readonly(send_root)) { + ret = -EPERM; + goto out; + } + + arg = memdup_user(arg_, sizeof(*arg)); + if (IS_ERR(arg)) { + ret = PTR_ERR(arg); + arg = NULL; + goto out; + } + + if (!access_ok(VERIFY_READ, arg->clone_sources, + sizeof(*arg->clone_sources) * + arg->clone_sources_count)) { + ret = -EFAULT; + goto out; + } + + if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { + ret = -EINVAL; + goto out; + } + + sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); + if (!sctx) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&sctx->new_refs); + INIT_LIST_HEAD(&sctx->deleted_refs); + INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); + INIT_LIST_HEAD(&sctx->name_cache_list); + + sctx->flags = arg->flags; + + sctx->send_filp = fget(arg->send_fd); + if (!sctx->send_filp) { + ret = -EBADF; + goto out; + } + + sctx->send_root = send_root; + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started + */ + if (btrfs_root_dead(sctx->send_root)) { + ret = -EPERM; + goto out; + } + + sctx->clone_roots_cnt = arg->clone_sources_count; + + sctx->send_max_size = BTRFS_SEND_BUF_SIZE; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + + sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + if (!sctx->read_buf) { + ret = -ENOMEM; + goto out; + } + + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + + sctx->clone_roots = vzalloc(sizeof(struct clone_root) * + (arg->clone_sources_count + 1)); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } + + if (arg->clone_sources_count) { + clone_sources_tmp = vmalloc(arg->clone_sources_count * + sizeof(*arg->clone_sources)); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(clone_sources_tmp, arg->clone_sources, + arg->clone_sources_count * + sizeof(*arg->clone_sources)); + if (ret) { + ret = -EFAULT; + goto out; + } + + for (i = 0; i < arg->clone_sources_count; i++) { + key.objectid = clone_sources_tmp[i]; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + clone_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(clone_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = PTR_ERR(clone_root); + goto out; + } + clone_sources_to_rollback = i + 1; + spin_lock(&clone_root->root_item_lock); + clone_root->send_in_progress++; + if (!btrfs_root_readonly(clone_root)) { + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + + sctx->clone_roots[i].root = clone_root; + } + vfree(clone_sources_tmp); + clone_sources_tmp = NULL; + } + + if (arg->parent_root) { + key.objectid = arg->parent_root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(sctx->parent_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = PTR_ERR(sctx->parent_root); + goto out; + } + + spin_lock(&sctx->parent_root->root_item_lock); + sctx->parent_root->send_in_progress++; + if (!btrfs_root_readonly(sctx->parent_root) || + btrfs_root_dead(sctx->parent_root)) { + spin_unlock(&sctx->parent_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + spin_unlock(&sctx->parent_root->root_item_lock); + + srcu_read_unlock(&fs_info->subvol_srcu, index); + } + + /* + * Clones from send_root are allowed, but only if the clone source + * is behind the current send position. This is checked while searching + * for possible clone sources. + */ + sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root; + + /* We do a bsearch later */ + sort(sctx->clone_roots, sctx->clone_roots_cnt, + sizeof(*sctx->clone_roots), __clone_root_cmp_sort, + NULL); + sort_clone_roots = 1; + + current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; + ret = send_subvol(sctx); + current->journal_info = NULL; + if (ret < 0) + goto out; + + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { + ret = begin_cmd(sctx, BTRFS_SEND_C_END); + if (ret < 0) + goto out; + ret = send_cmd(sctx); + if (ret < 0) + goto out; + } + +out: + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { + struct rb_node *n; + struct pending_dir_move *pm; + + n = rb_first(&sctx->pending_dir_moves); + pm = rb_entry(n, struct pending_dir_move, node); + while (!list_empty(&pm->list)) { + struct pending_dir_move *pm2; + + pm2 = list_first_entry(&pm->list, + struct pending_dir_move, list); + free_pending_move(sctx, pm2); + } + free_pending_move(sctx, pm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { + struct rb_node *n; + struct waiting_dir_move *dm; + + n = rb_first(&sctx->waiting_dir_moves); + dm = rb_entry(n, struct waiting_dir_move, node); + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); + while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { + struct rb_node *n; + struct orphan_dir_info *odi; + + n = rb_first(&sctx->orphan_dirs); + odi = rb_entry(n, struct orphan_dir_info, node); + free_orphan_dir_info(sctx, odi); + } + + if (sort_clone_roots) { + for (i = 0; i < sctx->clone_roots_cnt; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + } else { + for (i = 0; sctx && i < clone_sources_to_rollback; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + + btrfs_root_dec_send_in_progress(send_root); + } + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + btrfs_root_dec_send_in_progress(sctx->parent_root); + + kfree(arg); + vfree(clone_sources_tmp); + + if (sctx) { + if (sctx->send_filp) + fput(sctx->send_filp); + + vfree(sctx->clone_roots); + vfree(sctx->send_buf); + vfree(sctx->read_buf); + + name_cache_free(sctx); + + kfree(sctx); + } + + return ret; +} diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h new file mode 100644 index 00000000000..48d425aef05 --- /dev/null +++ b/fs/btrfs/send.h @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2012 Alexander Block. All rights reserved. + * Copyright (C) 2012 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" + +#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" +#define BTRFS_SEND_STREAM_VERSION 1 + +#define BTRFS_SEND_BUF_SIZE (1024 * 64) +#define BTRFS_SEND_READ_SIZE (1024 * 48) + +enum btrfs_tlv_type { + BTRFS_TLV_U8, + BTRFS_TLV_U16, + BTRFS_TLV_U32, + BTRFS_TLV_U64, + BTRFS_TLV_BINARY, + BTRFS_TLV_STRING, + BTRFS_TLV_UUID, + BTRFS_TLV_TIMESPEC, +}; + +struct btrfs_stream_header { + char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)]; + __le32 version; +} __attribute__ ((__packed__)); + +struct btrfs_cmd_header { + /* len excluding the header */ + __le32 len; + __le16 cmd; + /* crc including the header with zero crc field */ + __le32 crc; +} __attribute__ ((__packed__)); + +struct btrfs_tlv_header { + __le16 tlv_type; + /* len excluding the header */ + __le16 tlv_len; +} __attribute__ ((__packed__)); + +/* commands */ +enum btrfs_send_cmd { + BTRFS_SEND_C_UNSPEC, + + BTRFS_SEND_C_SUBVOL, + BTRFS_SEND_C_SNAPSHOT, + + BTRFS_SEND_C_MKFILE, + BTRFS_SEND_C_MKDIR, + BTRFS_SEND_C_MKNOD, + BTRFS_SEND_C_MKFIFO, + BTRFS_SEND_C_MKSOCK, + BTRFS_SEND_C_SYMLINK, + + BTRFS_SEND_C_RENAME, + BTRFS_SEND_C_LINK, + BTRFS_SEND_C_UNLINK, + BTRFS_SEND_C_RMDIR, + + BTRFS_SEND_C_SET_XATTR, + BTRFS_SEND_C_REMOVE_XATTR, + + BTRFS_SEND_C_WRITE, + BTRFS_SEND_C_CLONE, + + BTRFS_SEND_C_TRUNCATE, + BTRFS_SEND_C_CHMOD, + BTRFS_SEND_C_CHOWN, + BTRFS_SEND_C_UTIMES, + + BTRFS_SEND_C_END, + BTRFS_SEND_C_UPDATE_EXTENT, + __BTRFS_SEND_C_MAX, +}; +#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) + +/* attributes in send stream */ +enum { + BTRFS_SEND_A_UNSPEC, + + BTRFS_SEND_A_UUID, + BTRFS_SEND_A_CTRANSID, + + BTRFS_SEND_A_INO, + BTRFS_SEND_A_SIZE, + BTRFS_SEND_A_MODE, + BTRFS_SEND_A_UID, + BTRFS_SEND_A_GID, + BTRFS_SEND_A_RDEV, + BTRFS_SEND_A_CTIME, + BTRFS_SEND_A_MTIME, + BTRFS_SEND_A_ATIME, + BTRFS_SEND_A_OTIME, + + BTRFS_SEND_A_XATTR_NAME, + BTRFS_SEND_A_XATTR_DATA, + + BTRFS_SEND_A_PATH, + BTRFS_SEND_A_PATH_TO, + BTRFS_SEND_A_PATH_LINK, + + BTRFS_SEND_A_FILE_OFFSET, + BTRFS_SEND_A_DATA, + + BTRFS_SEND_A_CLONE_UUID, + BTRFS_SEND_A_CLONE_CTRANSID, + BTRFS_SEND_A_CLONE_PATH, + BTRFS_SEND_A_CLONE_OFFSET, + BTRFS_SEND_A_CLONE_LEN, + + __BTRFS_SEND_A_MAX, +}; +#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) + +#ifdef __KERNEL__ +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +#endif diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index c0f7ecaf1e7..b976597b072 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -17,15 +17,27 @@ */ #include <linux/highmem.h> +#include <asm/unaligned.h> -/* this is some deeply nasty code. ctree.h has a different - * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef +#include "ctree.h" + +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + +/* + * this is some deeply nasty code. * * The end result is that anyone who #includes ctree.h gets a - * declaration for the btrfs_set_foo functions and btrfs_foo functions - * - * This file declares the macros and then #includes ctree.h, which results - * in cpp creating the function here based on the template below. + * declaration for the btrfs_set_foo functions and btrfs_foo functions, + * which are wappers of btrfs_set_token_#bits functions and + * btrfs_get_token_#bits functions, which are defined in this file. * * These setget functions do all the extent_buffer related mapping * required to efficiently read and write specific fields in the extent @@ -33,107 +45,98 @@ * an unsigned long offset into the extent buffer which has been * cast to a specific type. This gives us all the gcc type checking. * - * The extent buffer api is used to do all the kmapping and page - * spanning work required to get extent buffers in highmem and have - * a metadata blocksize different from the page size. - * - * The macro starts with a simple function prototype declaration so that - * sparse won't complain about it being static. + * The extent buffer api is used to do the page spanning work required to + * have a metadata blocksize different from the page size. */ -#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ -void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ -u##bits btrfs_##name(struct extent_buffer *eb, \ - type *s) \ +#define DEFINE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, \ + struct btrfs_map_token *token) \ { \ - unsigned long part_offset = (unsigned long)s; \ - unsigned long offset = part_offset + offsetof(type, member); \ - type *p; \ - /* ugly, but we want the fast path here */ \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - p = (type *)(eb->kaddr + part_offset - eb->map_start); \ - return le##bits##_to_cpu(p->member); \ - } \ - { \ - int err; \ - char *map_token; \ - char *kaddr; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long map_start; \ - unsigned long map_len; \ - u##bits res; \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - if (err) { \ - __le##bits leres; \ - read_eb_member(eb, s, type, member, &leres); \ - return le##bits##_to_cpu(leres); \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - res = le##bits##_to_cpu(p->member); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + u##bits res; \ + \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + kaddr = token->kaddr; \ + p = kaddr + part_offset - token->offset; \ + res = get_unaligned_le##bits(p + off); \ return res; \ } \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits leres; \ + \ + read_extent_buffer(eb, &leres, offset, size); \ + return le##bits##_to_cpu(leres); \ + } \ + p = kaddr + part_offset - map_start; \ + res = get_unaligned_le##bits(p + off); \ + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ + return res; \ } \ -void btrfs_set_##name(struct extent_buffer *eb, \ - type *s, u##bits val) \ +void btrfs_set_token_##bits(struct extent_buffer *eb, \ + void *ptr, unsigned long off, u##bits val, \ + struct btrfs_map_token *token) \ { \ - unsigned long part_offset = (unsigned long)s; \ - unsigned long offset = part_offset + offsetof(type, member); \ - type *p; \ - /* ugly, but we want the fast path here */ \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - p = (type *)(eb->kaddr + part_offset - eb->map_start); \ - p->member = cpu_to_le##bits(val); \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + kaddr = token->kaddr; \ + p = kaddr + part_offset - token->offset; \ + put_unaligned_le##bits(val, p + off); \ + return; \ + } \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits val2; \ + \ + val2 = cpu_to_le##bits(val); \ + write_extent_buffer(eb, &val2, offset, size); \ return; \ } \ - { \ - int err; \ - char *map_token; \ - char *kaddr; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long map_start; \ - unsigned long map_len; \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - if (err) { \ - __le##bits val2; \ - val2 = cpu_to_le##bits(val); \ - write_eb_member(eb, s, type, member, &val2); \ - return; \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - p->member = cpu_to_le##bits(val); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ + p = kaddr + part_offset - map_start; \ + put_unaligned_le##bits(val, p + off); \ + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ } \ } -#include "ctree.h" +DEFINE_BTRFS_SETGET_BITS(8) +DEFINE_BTRFS_SETGET_BITS(16) +DEFINE_BTRFS_SETGET_BITS(32) +DEFINE_BTRFS_SETGET_BITS(64) void btrfs_node_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr = btrfs_node_key_ptr_offset(nr); - if (eb->map_token && ptr >= eb->map_start && - ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { - memcpy(disk_key, eb->kaddr + ptr - eb->map_start, - sizeof(*disk_key)); - return; - } else if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, KM_USER1); - eb->map_token = NULL; - } read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1866dff0538..8e16bca69c5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -39,76 +39,375 @@ #include <linux/miscdevice.h> #include <linux/magic.h> #include <linux/slab.h> -#include "compat.h" +#include <linux/cleancache.h> +#include <linux/ratelimit.h> +#include <linux/btrfs.h> +#include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" +#include "hash.h" +#include "props.h" #include "xattr.h" #include "volumes.h" -#include "version.h" #include "export.h" #include "compression.h" +#include "rcu-string.h" +#include "dev-replace.h" +#include "free-space-cache.h" +#include "backref.h" +#include "tests/btrfs-tests.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; +static struct file_system_type btrfs_fs_type; -static void btrfs_put_super(struct super_block *sb) +static int btrfs_remount(struct super_block *sb, int *flags, char *data); + +static const char *btrfs_decode_error(int errno) { - struct btrfs_root *root = btrfs_sb(sb); - int ret; + char *errstr = "unknown"; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + errstr = "Readonly filesystem"; + break; + case -EEXIST: + errstr = "Object already exists"; + break; + case -ENOSPC: + errstr = "No space left"; + break; + case -ENOENT: + errstr = "No such entry"; + break; + } + + return errstr; +} + +static void save_error_info(struct btrfs_fs_info *fs_info) +{ + /* + * today we only save the error info into ram. Long term we'll + * also send it down to the disk + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); +} + +/* btrfs handle error by forcing the filesystem readonly */ +static void btrfs_handle_error(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + if (sb->s_flags & MS_RDONLY) + return; + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + sb->s_flags |= MS_RDONLY; + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not + * canceled here although there is no way to update + * the progress. It would add the risk of a deadlock, + * therefore the canceling is ommited. The only penalty + * is that some I/O remains active until the procedure + * completes. The next time when the filesystem is + * mounted writeable again, the device replace + * operation continues. + */ + } +} + +#ifdef CONFIG_PRINTK +/* + * __btrfs_std_error decodes expected errors from the caller and + * invokes the approciate error response. + */ +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + const char *errstr; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(errno); + if (fmt) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT + "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, function, line, errno, errstr, &vaf); + va_end(args); + } else { + printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", + sb->s_id, function, line, errno, errstr); + } + + /* Don't go through full error handling during mount */ + save_error_info(fs_info); + if (sb->s_flags & MS_BORN) + btrfs_handle_error(fs_info); +} + +static const char * const logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + char lvl[4]; + struct va_format vaf; + va_list args; + const char *type = logtypes[4]; + int kern_level; + + va_start(args, fmt); + + kern_level = printk_get_level(fmt); + if (kern_level) { + size_t size = printk_skip_level(fmt) - fmt; + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + fmt += size; + type = logtypes[kern_level - '0']; + } else + *lvl = '\0'; + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + + va_end(args); +} + +#else + +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + /* Don't go through full error handling during mount */ + if (sb->s_flags & MS_BORN) { + save_error_info(fs_info); + btrfs_handle_error(fs_info); + } +} +#endif + +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *function, + unsigned int line, int errno) +{ + /* + * Report first abort since mount + */ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, + &root->fs_info->fs_state)) { + WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", + errno); + } + trans->aborted = errno; + /* Nothing used. The other threads that have joined this + * transaction may be able to continue. */ + if (!trans->blocks_used) { + const char *errstr; + + errstr = btrfs_decode_error(errno); + btrfs_warn(root->fs_info, + "%s:%d: Aborting unused transaction(%s).", + function, line, errstr); + return; + } + ACCESS_ONCE(trans->transaction->aborted) = errno; + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&root->fs_info->transaction_wait); + wake_up(&root->fs_info->transaction_blocked_wait); + __btrfs_std_error(root->fs_info, function, line, errno, NULL); +} +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, + * issues an alert, and either panics or BUGs, depending on mount options. + */ +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char *s_id = "<unknown>"; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(errno); + if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); + + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); + va_end(args); + /* Caller calls BUG() */ +} - ret = close_ctree(root); - sb->s_fs_info = NULL; +static void btrfs_put_super(struct super_block *sb) +{ + (void)close_ctree(btrfs_sb(sb)->tree_root); + /* FIXME: need to fix VFS to return error? */ + /* AV: return it _where_? ->put_super() can be triggered by any number + * of async events, up to and including delivery of SIGKILL to the + * last process that kept it busy. Or segfault in the aforementioned + * process... Whom would you report that to? + */ } enum { Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, - Opt_discard, Opt_err, + Opt_compress_type, Opt_compress_force, Opt_compress_force_type, + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, + Opt_no_space_cache, Opt_recovery, Opt_skip_balance, + Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, + Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, + Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, + Opt_datasum, Opt_treelog, Opt_noinode_cache, + Opt_err, }; static match_table_t tokens = { {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, - {Opt_subvolid, "subvolid=%d"}, + {Opt_subvolid, "subvolid=%s"}, {Opt_device, "device=%s"}, {Opt_nodatasum, "nodatasum"}, + {Opt_datasum, "datasum"}, {Opt_nodatacow, "nodatacow"}, + {Opt_datacow, "datacow"}, {Opt_nobarrier, "nobarrier"}, + {Opt_barrier, "barrier"}, {Opt_max_inline, "max_inline=%s"}, {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, + {Opt_compress_type, "compress=%s"}, {Opt_compress_force, "compress-force"}, + {Opt_compress_force_type, "compress-force=%s"}, {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, + {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, + {Opt_treelog, "treelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_noflushoncommit, "noflushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_space_cache, "space_cache"}, + {Opt_clear_cache, "clear_cache"}, + {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, + {Opt_noenospc_debug, "noenospc_debug"}, + {Opt_subvolrootid, "subvolrootid=%d"}, + {Opt_defrag, "autodefrag"}, + {Opt_nodefrag, "noautodefrag"}, + {Opt_inode_cache, "inode_cache"}, + {Opt_noinode_cache, "noinode_cache"}, + {Opt_no_space_cache, "nospace_cache"}, + {Opt_recovery, "recovery"}, + {Opt_skip_balance, "skip_balance"}, + {Opt_check_integrity, "check_int"}, + {Opt_check_integrity_including_extent_data, "check_int_data"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, + {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, + {Opt_fatal_errors, "fatal_errors=%s"}, + {Opt_commit_interval, "commit=%d"}, {Opt_err, NULL}, }; /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. + * XXX JDM: This needs to be cleaned up for remount. */ int btrfs_parse_options(struct btrfs_root *root, char *options) { struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig; + char *p, *num, *orig = NULL; + u64 cache_gen; int intarg; int ret = 0; + char *compress_type; + bool compress_force = false; + bool compress = false; + + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (cache_gen) + btrfs_set_opt(info->mount_opt, SPACE_CACHE); if (!options) - return 0; + goto out; /* * strsep changes the string, duplicate it because parse_options @@ -128,11 +427,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) token = match_token(p, tokens, args); switch (token) { case Opt_degraded: - printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_info(root->fs_info, "allowing degraded mounts"); btrfs_set_opt(info->mount_opt, DEGRADED); break; case Opt_subvol: case Opt_subvolid: + case Opt_subvolrootid: case Opt_device: /* * These are parsed by btrfs_parse_early_options @@ -140,51 +440,112 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatasum\n"); - btrfs_set_opt(info->mount_opt, NODATASUM); + btrfs_set_and_info(root, NODATASUM, + "setting nodatasum"); + break; + case Opt_datasum: + if (btrfs_test_opt(root, NODATASUM)) { + if (btrfs_test_opt(root, NODATACOW)) + btrfs_info(root->fs_info, "setting datasum, datacow enabled"); + else + btrfs_info(root->fs_info, "setting datasum"); + } + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - printk(KERN_INFO "btrfs: setting nodatacow\n"); + if (!btrfs_test_opt(root, NODATACOW)) { + if (!btrfs_test_opt(root, COMPRESS) || + !btrfs_test_opt(root, FORCE_COMPRESS)) { + btrfs_info(root->fs_info, + "setting nodatacow, compression disabled"); + } else { + btrfs_info(root->fs_info, "setting nodatacow"); + } + } + btrfs_clear_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; - case Opt_compress: - printk(KERN_INFO "btrfs: use compression\n"); - btrfs_set_opt(info->mount_opt, COMPRESS); + case Opt_datacow: + btrfs_clear_and_info(root, NODATACOW, + "setting datacow"); break; case Opt_compress_force: - printk(KERN_INFO "btrfs: forcing compression\n"); - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - btrfs_set_opt(info->mount_opt, COMPRESS); + case Opt_compress_force_type: + compress_force = true; + /* Fallthrough */ + case Opt_compress: + case Opt_compress_type: + compress = true; + if (token == Opt_compress || + token == Opt_compress_force || + strcmp(args[0].from, "zlib") == 0) { + compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; + btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + } else if (strcmp(args[0].from, "lzo") == 0) { + compress_type = "lzo"; + info->compress_type = BTRFS_COMPRESS_LZO; + btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + btrfs_set_fs_incompat(info, COMPRESS_LZO); + } else if (strncmp(args[0].from, "no", 2) == 0) { + compress_type = "no"; + btrfs_clear_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); + compress_force = false; + } else { + ret = -EINVAL; + goto out; + } + + if (compress_force) { + btrfs_set_and_info(root, FORCE_COMPRESS, + "force %s compression", + compress_type); + } else if (compress) { + if (!btrfs_test_opt(root, COMPRESS)) + btrfs_info(root->fs_info, + "btrfs: use %s compression", + compress_type); + } break; case Opt_ssd: - printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_and_info(root, SSD, + "use ssd allocation scheme"); break; case Opt_ssd_spread: - printk(KERN_INFO "btrfs: use spread ssd " - "allocation scheme\n"); + btrfs_set_and_info(root, SSD_SPREAD, + "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); - btrfs_set_opt(info->mount_opt, SSD_SPREAD); break; case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation " - "scheme\n"); - btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_set_and_info(root, NOSSD, + "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_barrier: + btrfs_clear_and_info(root, NOBARRIER, + "turning on barriers"); break; case Opt_nobarrier: - printk(KERN_INFO "btrfs: turning off barriers\n"); - btrfs_set_opt(info->mount_opt, NOBARRIER); + btrfs_set_and_info(root, NOBARRIER, + "turning off barriers"); break; case Opt_thread_pool: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg > 0) { info->thread_pool_size = intarg; - printk(KERN_INFO "btrfs: thread pool %d\n", - info->thread_pool_size); + } else { + ret = -EINVAL; + goto out; } break; case Opt_max_inline: @@ -194,50 +555,196 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); if (info->max_inline) { - info->max_inline = max_t(u64, + info->max_inline = min_t(u64, info->max_inline, root->sectorsize); } - printk(KERN_INFO "btrfs: max_inline at %llu\n", - (unsigned long long)info->max_inline); + btrfs_info(root->fs_info, "max_inline at %llu", + info->max_inline); + } else { + ret = -ENOMEM; + goto out; } break; case Opt_alloc_start: num = match_strdup(&args[0]); if (num) { + mutex_lock(&info->chunk_mutex); info->alloc_start = memparse(num, NULL); + mutex_unlock(&info->chunk_mutex); kfree(num); - printk(KERN_INFO - "btrfs: allocations start at %llu\n", - (unsigned long long)info->alloc_start); + btrfs_info(root->fs_info, "allocations start at %llu", + info->alloc_start); + } else { + ret = -ENOMEM; + goto out; } break; + case Opt_acl: +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + root->fs_info->sb->s_flags |= MS_POSIXACL; + break; +#else + btrfs_err(root->fs_info, + "support for ACL not compiled in!"); + ret = -EINVAL; + goto out; +#endif case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - printk(KERN_INFO "btrfs: disabling tree log\n"); - btrfs_set_opt(info->mount_opt, NOTREELOG); + btrfs_set_and_info(root, NOTREELOG, + "disabling tree log"); + break; + case Opt_treelog: + btrfs_clear_and_info(root, NOTREELOG, + "enabling tree log"); break; case Opt_flushoncommit: - printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); - btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + btrfs_set_and_info(root, FLUSHONCOMMIT, + "turning on flush-on-commit"); + break; + case Opt_noflushoncommit: + btrfs_clear_and_info(root, FLUSHONCOMMIT, + "turning off flush-on-commit"); break; case Opt_ratio: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg >= 0) { info->metadata_ratio = intarg; - printk(KERN_INFO "btrfs: metadata ratio %d\n", + btrfs_info(root->fs_info, "metadata ratio %d", info->metadata_ratio); + } else { + ret = -EINVAL; + goto out; } break; case Opt_discard: - btrfs_set_opt(info->mount_opt, DISCARD); + btrfs_set_and_info(root, DISCARD, + "turning on discard"); + break; + case Opt_nodiscard: + btrfs_clear_and_info(root, DISCARD, + "turning off discard"); + break; + case Opt_space_cache: + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); + break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); + break; + case Opt_no_space_cache: + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); + break; + case Opt_inode_cache: + btrfs_set_and_info(root, CHANGE_INODE_CACHE, + "enabling inode map caching"); + break; + case Opt_noinode_cache: + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); + break; + case Opt_clear_cache: + btrfs_set_and_info(root, CLEAR_CACHE, + "force clearing of disk cache"); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; + case Opt_noenospc_debug: + btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + btrfs_set_and_info(root, AUTO_DEFRAG, + "enabling auto defrag"); + break; + case Opt_nodefrag: + btrfs_clear_and_info(root, AUTO_DEFRAG, + "disabling auto defrag"); + break; + case Opt_recovery: + btrfs_info(root->fs_info, "enabling auto recovery"); + btrfs_set_opt(info->mount_opt, RECOVERY); + break; + case Opt_skip_balance: + btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + break; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + case Opt_check_integrity_including_extent_data: + btrfs_info(root->fs_info, + "enabling check integrity including extent data"); + btrfs_set_opt(info->mount_opt, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity: + btrfs_info(root->fs_info, "enabling check integrity"); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity_print_mask: + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg >= 0) { + info->check_integrity_print_mask = intarg; + btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x", + info->check_integrity_print_mask); + } else { + ret = -EINVAL; + goto out; + } + break; +#else + case Opt_check_integrity_including_extent_data: + case Opt_check_integrity: + case Opt_check_integrity_print_mask: + btrfs_err(root->fs_info, + "support for check_integrity* not compiled in!"); + ret = -EINVAL; + goto out; +#endif + case Opt_fatal_errors: + if (strcmp(args[0].from, "panic") == 0) + btrfs_set_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else if (strcmp(args[0].from, "bug") == 0) + btrfs_clear_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else { + ret = -EINVAL; + goto out; + } + break; + case Opt_commit_interval: + intarg = 0; + ret = match_int(&args[0], &intarg); + if (ret < 0) { + btrfs_err(root->fs_info, "invalid commit interval"); + ret = -EINVAL; + goto out; + } + if (intarg > 0) { + if (intarg > 300) { + btrfs_warn(root->fs_info, "excessive commit interval %d", + intarg); + } + info->commit_interval = intarg; + } else { + btrfs_info(root->fs_info, "using default commit interval %ds", + BTRFS_DEFAULT_COMMIT_INTERVAL); + info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + } break; case Opt_err: - printk(KERN_INFO "btrfs: unrecognized mount option " - "'%s'\n", p); + btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; goto out; default: @@ -245,6 +752,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + btrfs_info(root->fs_info, "disk space caching is enabled"); kfree(orig); return ret; } @@ -260,12 +769,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *p; + char *device_name, *opts, *orig, *p; + char *num = NULL; int error = 0; - int intarg; if (!options) - goto out; + return 0; /* * strsep changes the string, duplicate it because parse_options @@ -274,6 +783,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, opts = kstrdup(options, GFP_KERNEL); if (!opts) return -ENOMEM; + orig = opts; while ((p = strsep(&opts, ",")) != NULL) { int token; @@ -283,51 +793,59 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, token = match_token(p, tokens, args); switch (token) { case Opt_subvol: + kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); + if (!*subvol_name) { + error = -ENOMEM; + goto out; + } break; case Opt_subvolid: - intarg = 0; - error = match_int(&args[0], &intarg); - if (!error) { + num = match_strdup(&args[0]); + if (num) { + *subvol_objectid = memparse(num, NULL); + kfree(num); /* we want the original fs_tree */ - if (!intarg) + if (!*subvol_objectid) *subvol_objectid = BTRFS_FS_TREE_OBJECTID; - else - *subvol_objectid = intarg; + } else { + error = -EINVAL; + goto out; } break; + case Opt_subvolrootid: + printk(KERN_WARNING + "BTRFS: 'subvolrootid' mount option is deprecated and has " + "no effect\n"); + break; case Opt_device: - error = btrfs_scan_one_device(match_strdup(&args[0]), + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, flags, holder, fs_devices); + kfree(device_name); if (error) - goto out_free_opts; + goto out; break; default: break; } } - out_free_opts: - kfree(opts); - out: - /* - * If no subvolume name is specified we use the default one. Allocate - * a copy of the string "." here so that code later in the - * mount path doesn't care if it's the default volume or another one. - */ - if (!*subvol_name) { - *subvol_name = kstrdup(".", GFP_KERNEL); - if (!*subvol_name) - return -ENOMEM; - } +out: + kfree(orig); return error; } static struct dentry *get_default_root(struct super_block *sb, u64 subvol_objectid) { - struct btrfs_root *root = sb->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_path *path; @@ -358,8 +876,12 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + if (IS_ERR(di)) { + btrfs_free_path(path); + return ERR_CAST(di); + } if (!di) { /* * Ok the default dir item isn't there. This is weird since @@ -368,7 +890,7 @@ static struct dentry *get_default_root(struct super_block *sb, */ btrfs_free_path(path); dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = root->fs_info->fs_root; + new_root = fs_info->fs_root; goto setup_root; } @@ -376,12 +898,9 @@ static struct dentry *get_default_root(struct super_block *sb, btrfs_free_path(path); find_root: - new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + new_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(new_root)) - return ERR_PTR(PTR_ERR(new_root)); - - if (btrfs_root_refs(&new_root->root_item) == 0) - return ERR_PTR(-ENOENT); + return ERR_CAST(new_root); dir_id = btrfs_root_dirid(&new_root->root_item); setup_root: @@ -390,8 +909,8 @@ setup_root: location.offset = 0; inode = btrfs_iget(sb, &location, new_root, &new); - if (!inode) - return ERR_PTR(-ENOMEM); + if (IS_ERR(inode)) + return ERR_CAST(inode); /* * If we're just mounting the root most subvol put the inode and return @@ -403,28 +922,12 @@ setup_root: return dget(sb->s_root); } - if (new) { - const struct qstr name = { .name = "/", .len = 1 }; - - /* - * New inode, we need to make the dentry a sibling of s_root so - * everything gets cleaned up properly on unmount. - */ - dentry = d_alloc(sb->s_root, &name); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - d_splice_alias(inode, dentry); - } else { - /* - * We found the inode in cache, just find a dentry for it and - * put the reference to the inode we just got. - */ - dentry = d_find_alias(inode); - iput(inode); + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) { + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&dentry->d_lock); } - return dentry; } @@ -433,80 +936,82 @@ static int btrfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct dentry *root_dentry; - struct btrfs_super_block *disk_super; - struct btrfs_root *tree_root; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_key key; int err; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; + sb->s_d_op = &btrfs_dentry_operations; sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; #ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif - - tree_root = open_ctree(sb, fs_devices, (char *)data); - - if (IS_ERR(tree_root)) { - printk("btrfs: open_ctree failed\n"); - return PTR_ERR(tree_root); + sb->s_flags |= MS_I_VERSION; + err = open_ctree(sb, fs_devices, (char *)data); + if (err) { + printk(KERN_ERR "BTRFS: open_ctree failed\n"); + return err; } - sb->s_fs_info = tree_root; - disk_super = &tree_root->fs_info->super_copy; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); + inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; } - root_dentry = d_alloc_root(inode); - if (!root_dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) { err = -ENOMEM; goto fail_close; } - sb->s_root = root_dentry; - save_mount_options(sb, data); + cleancache_init_fs(sb); + sb->s_flags |= MS_ACTIVE; return 0; fail_close: - close_ctree(tree_root); + close_ctree(fs_info->tree_root); return err; } int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root = btrfs_sb(sb); - int ret; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; + + trace_btrfs_sync_fs(wait); if (!wait) { - filemap_flush(root->fs_info->btree_inode->i_mapping); + filemap_flush(fs_info->btree_inode->i_mapping); return 0; } - btrfs_start_delalloc_inodes(root, 0); - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_roots(fs_info, -1); - trans = btrfs_start_transaction(root, 1); - ret = btrfs_commit_transaction(trans, root); - return ret; + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + return btrfs_commit_transaction(trans, root); } -static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); - struct btrfs_fs_info *info = root->fs_info; + struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); + struct btrfs_root *root = info->tree_root; + char *compress_type; if (btrfs_test_opt(root, DEGRADED)) seq_puts(seq, ",degraded"); @@ -517,16 +1022,22 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", - (unsigned long long)info->max_inline); + seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", - (unsigned long long)info->alloc_start); + seq_printf(seq, ",alloc_start=%llu", info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); - if (btrfs_test_opt(root, COMPRESS)) - seq_puts(seq, ",compress"); + if (btrfs_test_opt(root, COMPRESS)) { + if (info->compress_type == BTRFS_COMPRESS_ZLIB) + compress_type = "zlib"; + else + compress_type = "lzo"; + if (btrfs_test_opt(root, FORCE_COMPRESS)) + seq_printf(seq, ",compress-force=%s", compress_type); + else + seq_printf(seq, ",compress=%s", compress_type); + } if (btrfs_test_opt(root, NOSSD)) seq_puts(seq, ",nossd"); if (btrfs_test_opt(root, SSD_SPREAD)) @@ -541,15 +1052,174 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); + if (btrfs_test_opt(root, SPACE_CACHE)) + seq_puts(seq, ",space_cache"); + else + seq_puts(seq, ",nospace_cache"); + if (btrfs_test_opt(root, RESCAN_UUID_TREE)) + seq_puts(seq, ",rescan_uuid_tree"); + if (btrfs_test_opt(root, CLEAR_CACHE)) + seq_puts(seq, ",clear_cache"); + if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + seq_puts(seq, ",user_subvol_rm_allowed"); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) + seq_puts(seq, ",enospc_debug"); + if (btrfs_test_opt(root, AUTO_DEFRAG)) + seq_puts(seq, ",autodefrag"); + if (btrfs_test_opt(root, INODE_MAP_CACHE)) + seq_puts(seq, ",inode_cache"); + if (btrfs_test_opt(root, SKIP_BALANCE)) + seq_puts(seq, ",skip_balance"); + if (btrfs_test_opt(root, RECOVERY)) + seq_puts(seq, ",recovery"); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + seq_puts(seq, ",check_int_data"); + else if (btrfs_test_opt(root, CHECK_INTEGRITY)) + seq_puts(seq, ",check_int"); + if (info->check_integrity_print_mask) + seq_printf(seq, ",check_int_print_mask=%d", + info->check_integrity_print_mask); +#endif + if (info->metadata_ratio) + seq_printf(seq, ",metadata_ratio=%d", + info->metadata_ratio); + if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + seq_puts(seq, ",fatal_errors=panic"); + if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) + seq_printf(seq, ",commit=%d", info->commit_interval); return 0; } static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_fs_devices *test_fs_devices = data; - struct btrfs_root *root = btrfs_sb(s); + struct btrfs_fs_info *p = data; + struct btrfs_fs_info *fs_info = btrfs_sb(s); + + return fs_info->fs_devices == p->fs_devices; +} + +static int btrfs_set_super(struct super_block *s, void *data) +{ + int err = set_anon_super(s, data); + if (!err) + s->s_fs_info = data; + return err; +} - return root->fs_info->fs_devices == test_fs_devices; +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ + if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + +/* + * This will strip out the subvol=%s argument for an argument string and add + * subvolid=0 to make sure we get the actual tree root for path walking to the + * subvol we want. + */ +static char *setup_root_args(char *args) +{ + unsigned len = strlen(args) + 2 + 1; + char *src, *dst, *buf; + + /* + * We need the same args as before, but with this substitution: + * s!subvol=[^,]+!subvolid=0! + * + * Since the replacement string is up to 2 bytes longer than the + * original, allocate strlen(args) + 2 + 1 bytes. + */ + + src = strstr(args, "subvol="); + /* This shouldn't happen, but just in case.. */ + if (!src) + return NULL; + + buf = dst = kmalloc(len, GFP_NOFS); + if (!buf) + return NULL; + + /* + * If the subvol= arg is not at the start of the string, + * copy whatever precedes it into buf. + */ + if (src != args) { + *src++ = '\0'; + strcpy(buf, args); + dst += strlen(args); + } + + strcpy(dst, "subvolid=0"); + dst += strlen("subvolid=0"); + + /* + * If there is a "," after the original subvol=... string, + * copy that suffix into our buffer. Otherwise, we're done. + */ + src = strchr(src, ','); + if (src) + strcpy(dst, src); + + return buf; +} + +static struct dentry *mount_subvol(const char *subvol_name, int flags, + const char *device_name, char *data) +{ + struct dentry *root; + struct vfsmount *mnt; + char *newargs; + + newargs = setup_root_args(data); + if (!newargs) + return ERR_PTR(-ENOMEM); + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, + newargs); + + if (PTR_RET(mnt) == -EBUSY) { + if (flags & MS_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, + newargs); + } else { + int r; + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, + newargs); + if (IS_ERR(mnt)) { + kfree(newargs); + return ERR_CAST(mnt); + } + + r = btrfs_remount(mnt->mnt_sb, &flags, NULL); + if (r < 0) { + /* FIXME: release vfsmount mnt ??*/ + kfree(newargs); + return ERR_PTR(r); + } + } + } + + kfree(newargs); + + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + root = mount_subtree(mnt, subvol_name); + + if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + struct super_block *s = root->d_sb; + dput(root); + root = ERR_PTR(-EINVAL); + deactivate_locked_super(s); + printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", + subvol_name); + } + + return root; } /* @@ -558,18 +1228,18 @@ static int btrfs_test_super(struct super_block *s, void *data) * Note: This is based on get_sb_bdev from fs/super.c with a few additions * for multiple device setup. Make sure to keep it in sync. */ -static int btrfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *device_name, void *data) { struct block_device *bdev = NULL; struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; int error = 0; - int found = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; @@ -577,16 +1247,43 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, &fs_devices); - if (error) - return error; + if (error) { + kfree(subvol_name); + return ERR_PTR(error); + } + + if (subvol_name) { + root = mount_subvol(subvol_name, flags, device_name, data); + kfree(subvol_name); + return root; + } - error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); if (error) - goto error_free_subvol_name; + return ERR_PTR(error); + + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * it for searching for existing supers, so this lets us do that and + * then open_ctree will properly initialize everything later. + */ + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + if (!fs_info) + return ERR_PTR(-ENOMEM); + + fs_info->fs_devices = fs_devices; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_fs_info; + } error = btrfs_open_devices(fs_devices, mode, fs_type); if (error) - goto error_free_subvol_name; + goto error_fs_info; if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { error = -EACCES; @@ -594,156 +1291,427 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); - if (IS_ERR(s)) - goto error_s; + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, + fs_info); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto error_close_devices; + } if (s->s_root) { - if ((flags ^ s->s_flags) & MS_RDONLY) { - deactivate_locked_super(s); - error = -EBUSY; - goto error_close_devices; - } - - found = 1; btrfs_close_devices(fs_devices); + free_fs_info(fs_info); + if ((flags ^ s->s_flags) & MS_RDONLY) + error = -EBUSY; } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - goto error_free_subvol_name; - } - - btrfs_sb(s)->fs_info->bdev_holder = fs_type; - s->s_flags |= MS_ACTIVE; } - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { - error = PTR_ERR(root); + root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); + if (IS_ERR(root)) deactivate_locked_super(s); - goto error; - } - /* if they gave us a subvolume name bind mount into that */ - if (strcmp(subvol_name, ".")) { - struct dentry *new_root; - mutex_lock(&root->d_inode->i_mutex); - new_root = lookup_one_len(subvol_name, root, - strlen(subvol_name)); - mutex_unlock(&root->d_inode->i_mutex); - - if (IS_ERR(new_root)) { - deactivate_locked_super(s); - error = PTR_ERR(new_root); - dput(root); - goto error_close_devices; - } - if (!new_root->d_inode) { - dput(root); - dput(new_root); - deactivate_locked_super(s); - error = -ENXIO; - goto error_close_devices; - } - dput(root); - root = new_root; - } - mnt->mnt_sb = s; - mnt->mnt_root = root; + return root; - kfree(subvol_name); - return 0; - -error_s: - error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); -error_free_subvol_name: - kfree(subvol_name); -error: - return error; +error_fs_info: + free_fs_info(fs_info); + return ERR_PTR(error); +} + +static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, + int new_pool_size, int old_pool_size) +{ + if (new_pool_size == old_pool_size) + return; + + fs_info->thread_pool_size = new_pool_size; + + btrfs_info(fs_info, "resize thread pool %d -> %d", + old_pool_size, new_pool_size); + + btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, + new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); + btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, + new_pool_size); +} + +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) +{ + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); +} + +static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (flags & MS_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + if (flags & MS_RDONLY) + sync_filesystem(fs_info->sb); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, + unsigned long old_opts) +{ + /* + * We need cleanup all defragable inodes if the autodefragment is + * close or the fs is R/O. + */ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (fs_info->sb->s_flags & MS_RDONLY))) { + btrfs_cleanup_defrag_inodes(fs_info); + } + + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) { - struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; + unsigned old_flags = sb->s_flags; + unsigned long old_opts = fs_info->mount_opt; + unsigned long old_compress_type = fs_info->compress_type; + u64 old_max_inline = fs_info->max_inline; + u64 old_alloc_start = fs_info->alloc_start; + int old_thread_pool_size = fs_info->thread_pool_size; + unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; + sync_filesystem(sb); + btrfs_remount_prepare(fs_info); + ret = btrfs_parse_options(root, data); - if (ret) - return -EINVAL; + if (ret) { + ret = -EINVAL; + goto restore; + } + + btrfs_remount_begin(fs_info, old_opts, *flags); + btrfs_resize_thread_pool(fs_info, + fs_info->thread_pool_size, old_thread_pool_size); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) - return 0; + goto out; if (*flags & MS_RDONLY) { + /* + * this also happens on 'umount -rf' or on shutdown, when + * the filesystem is busy. + */ + cancel_work_sync(&fs_info->async_reclaim_work); + + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al. */ + up(&fs_info->uuid_tree_rescan_sem); + sb->s_flags |= MS_RDONLY; - ret = btrfs_commit_super(root); - WARN_ON(ret); + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); + + ret = btrfs_commit_super(root); + if (ret) + goto restore; } else { - if (root->fs_info->fs_devices->rw_devices == 0) - return -EACCES; + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + btrfs_err(fs_info, + "Remounting read-write after error is not allowed"); + ret = -EINVAL; + goto restore; + } + if (fs_info->fs_devices->rw_devices == 0) { + ret = -EACCES; + goto restore; + } - if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) - return -EINVAL; + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(*flags & MS_RDONLY)) { + btrfs_warn(fs_info, + "too many missing devices, writeable remount is not allowed"); + ret = -EACCES; + goto restore; + } + + if (btrfs_super_log_root(fs_info->super_copy) != 0) { + ret = -EINVAL; + goto restore; + } + + ret = btrfs_cleanup_fs_roots(fs_info); + if (ret) + goto restore; /* recover relocation */ + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(root); - WARN_ON(ret); - - ret = btrfs_cleanup_fs_roots(root->fs_info); - WARN_ON(ret); + mutex_unlock(&fs_info->cleaner_mutex); + if (ret) + goto restore; + + ret = btrfs_resume_balance_async(fs_info); + if (ret) + goto restore; + + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + btrfs_warn(fs_info, "failed to resume dev_replace"); + goto restore; + } + if (!fs_info->uuid_root) { + btrfs_info(fs_info, "creating UUID tree"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, "failed to create the UUID tree %d", ret); + goto restore; + } + } sb->s_flags &= ~MS_RDONLY; } +out: + wake_up_process(fs_info->transaction_kthread); + btrfs_remount_cleanup(fs_info, old_opts); + return 0; + +restore: + /* We've hit an error - don't reset MS_RDONLY */ + if (sb->s_flags & MS_RDONLY) + old_flags |= MS_RDONLY; + sb->s_flags = old_flags; + fs_info->mount_opt = old_opts; + fs_info->compress_type = old_compress_type; + fs_info->max_inline = old_max_inline; + mutex_lock(&fs_info->chunk_mutex); + fs_info->alloc_start = old_alloc_start; + mutex_unlock(&fs_info->chunk_mutex); + btrfs_resize_thread_pool(fs_info, + old_thread_pool_size, fs_info->thread_pool_size); + fs_info->metadata_ratio = old_metadata_ratio; + btrfs_remount_cleanup(fs_info, old_opts); + return ret; +} + +/* Used to sort the devices by max_avail(descending sort) */ +static int btrfs_cmp_device_free_bytes(const void *dev_info1, + const void *dev_info2) +{ + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} + +/* + * sort the devices by max_avail, in which max free extent size of each device + * is stored.(Descending Sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +/* + * The helper to calc the free space on the devices that can be used to store + * file data. + */ +static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 skip_space; + u64 type; + u64 avail_space; + u64 used_space; + u64 min_stripe_size; + int min_stripes = 1, num_stripes = 1; + int i = 0, nr_devices; + int ret; + + nr_devices = fs_info->fs_devices->open_devices; + BUG_ON(!nr_devices); + + devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + /* calc min stripe number for data space alloction */ + type = btrfs_get_alloc_profile(root, 1); + if (type & BTRFS_BLOCK_GROUP_RAID0) { + min_stripes = 2; + num_stripes = nr_devices; + } else if (type & BTRFS_BLOCK_GROUP_RAID1) { + min_stripes = 2; + num_stripes = 2; + } else if (type & BTRFS_BLOCK_GROUP_RAID10) { + min_stripes = 4; + num_stripes = 4; + } + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_stripe_size = 2 * BTRFS_STRIPE_LEN; + else + min_stripe_size = BTRFS_STRIPE_LEN; + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->in_fs_metadata || !device->bdev || + device->is_tgtdev_for_dev_replace) + continue; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space *= BTRFS_STRIPE_LEN; + + /* + * In order to avoid overwritting the superblock on the drive, + * btrfs starts at an offset of at least 1MB when doing chunk + * allocation. + */ + skip_space = 1024 * 1024; + + /* user can set the offset in fs_info->alloc_start. */ + if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) + skip_space = max(fs_info->alloc_start, skip_space); + + /* + * btrfs can not use the free space in [0, skip_space - 1], + * we must subtract it from the total. In order to implement + * it, we account the used space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + return ret; + } + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + + /* + * we can use the free space in [0, skip_space - 1], subtract + * it from the total. + */ + if (avail_space && avail_space >= skip_space) + avail_space -= skip_space; + else + avail_space = 0; + + if (avail_space < min_stripe_size) + continue; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + + nr_devices = i; + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= min_stripes) { + if (num_stripes > nr_devices) + num_stripes = nr_devices; + + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * num_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - num_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; return 0; } static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; - struct list_head *head = &root->fs_info->space_info; + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct list_head *head = &fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 data_used = 0; + u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)root->fs_info->fsid; + __be32 *fsid = (__be32 *)fs_info->fsid; + int ret; + /* holding chunk_muext to avoid allocating new chunks */ + mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_DUP| - BTRFS_BLOCK_GROUP_RAID10| - BTRFS_BLOCK_GROUP_RAID1)) { - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + total_free_data += found->disk_total - found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); } - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; + total_used += found->disk_used; } rcu_read_unlock(); buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (data_used >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bavail = total_free_data; + ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); + if (ret) { + mutex_unlock(&fs_info->chunk_mutex); + return ret; + } + buf->f_bavail += total_free_data; + buf->f_bavail = buf->f_bavail >> bits; + mutex_unlock(&fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -757,13 +1725,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static void btrfs_kill_super(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + kill_anon_super(sb); + free_fs_info(fs_info); +} + static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", - .get_sb = btrfs_get_sb, - .kill_sb = kill_anon_super, + .mount = btrfs_mount, + .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("btrfs"); /* * used by btrfsctl to scan devices when no FS is mounted @@ -787,6 +1763,13 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = btrfs_scan_one_device(vol->name, FMODE_READ, &btrfs_fs_type, &fs_devices); break; + case BTRFS_IOC_DEVICES_READY: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + if (ret) + break; + ret = !(fs_devices->num_devices == fs_devices->total_devices); + break; } kfree(vol); @@ -795,28 +1778,67 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_lock(&root->fs_info->transaction_kthread_mutex); - mutex_lock(&root->fs_info->cleaner_mutex); - return 0; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_sb(sb)->tree_root; + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + return btrfs_commit_transaction(trans, root); } static int btrfs_unfreeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_unlock(&root->fs_info->cleaner_mutex); - mutex_unlock(&root->fs_info->transaction_kthread_mutex); + return 0; +} + +static int btrfs_show_devname(struct seq_file *m, struct dentry *root) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); + struct btrfs_fs_devices *cur_devices; + struct btrfs_device *dev, *first_dev = NULL; + struct list_head *head; + struct rcu_string *name; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + cur_devices = fs_info->fs_devices; + while (cur_devices) { + head = &cur_devices->devices; + list_for_each_entry(dev, head, dev_list) { + if (dev->missing) + continue; + if (!dev->name) + continue; + if (!first_dev || dev->devid < first_dev->devid) + first_dev = dev; + } + cur_devices = cur_devices->seed; + } + + if (first_dev) { + rcu_read_lock(); + name = rcu_dereference(first_dev->name); + seq_escape(m, name->str, " \t\n\\"); + rcu_read_unlock(); + } else { + WARN_ON(1); + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); return 0; } static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, - .delete_inode = btrfs_delete_inode, + .evict_inode = btrfs_evict_inode, .put_super = btrfs_put_super, .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, + .show_devname = btrfs_show_devname, .write_inode = btrfs_write_inode, - .dirty_inode = btrfs_dirty_inode, .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_destroy_inode, .statfs = btrfs_statfs, @@ -829,14 +1851,18 @@ static const struct file_operations btrfs_ctl_fops = { .unlocked_ioctl = btrfs_control_ioctl, .compat_ioctl = btrfs_control_ioctl, .owner = THIS_MODULE, + .llseek = noop_llseek, }; static struct miscdevice btrfs_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = BTRFS_MINOR, .name = "btrfs-control", .fops = &btrfs_ctl_fops }; +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + static int btrfs_interface_init(void) { return misc_register(&btrfs_misc); @@ -845,20 +1871,69 @@ static int btrfs_interface_init(void) static void btrfs_interface_exit(void) { if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "misc_deregister failed for control device"); + printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n"); +} + +static void btrfs_print_info(void) +{ + printk(KERN_INFO "Btrfs loaded" +#ifdef CONFIG_BTRFS_DEBUG + ", debug=on" +#endif +#ifdef CONFIG_BTRFS_ASSERT + ", assert=on" +#endif +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + ", integrity-checker=on" +#endif + "\n"); +} + +static int btrfs_run_sanity_tests(void) +{ + int ret; + + ret = btrfs_init_test_fs(); + if (ret) + return ret; + + ret = btrfs_test_free_space_cache(); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(); + if (ret) + goto out; + ret = btrfs_test_extent_io(); + if (ret) + goto out; + ret = btrfs_test_inodes(); + if (ret) + goto out; + ret = btrfs_test_qgroups(); +out: + btrfs_destroy_test_fs(); + return ret; } static int __init init_btrfs_fs(void) { int err; - err = btrfs_init_sysfs(); + err = btrfs_hash_init(); if (err) return err; + btrfs_props_init(); + + err = btrfs_init_sysfs(); + if (err) + goto free_hash; + + btrfs_init_compress(); + err = btrfs_init_cachep(); if (err) - goto free_sysfs; + goto free_compress; err = extent_io_init(); if (err) @@ -868,43 +1943,89 @@ static int __init init_btrfs_fs(void) if (err) goto free_extent_io; - err = btrfs_interface_init(); + err = ordered_data_init(); if (err) goto free_extent_map; + err = btrfs_delayed_inode_init(); + if (err) + goto free_ordered_data; + + err = btrfs_auto_defrag_init(); + if (err) + goto free_delayed_inode; + + err = btrfs_delayed_ref_init(); + if (err) + goto free_auto_defrag; + + err = btrfs_prelim_ref_init(); + if (err) + goto free_prelim_ref; + + err = btrfs_interface_init(); + if (err) + goto free_delayed_ref; + + btrfs_init_lockdep(); + + btrfs_print_info(); + + err = btrfs_run_sanity_tests(); + if (err) + goto unregister_ioctl; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; - printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); return 0; unregister_ioctl: btrfs_interface_exit(); +free_prelim_ref: + btrfs_prelim_ref_exit(); +free_delayed_ref: + btrfs_delayed_ref_exit(); +free_auto_defrag: + btrfs_auto_defrag_exit(); +free_delayed_inode: + btrfs_delayed_inode_exit(); +free_ordered_data: + ordered_data_exit(); free_extent_map: extent_map_exit(); free_extent_io: extent_io_exit(); free_cachep: btrfs_destroy_cachep(); -free_sysfs: +free_compress: + btrfs_exit_compress(); btrfs_exit_sysfs(); +free_hash: + btrfs_hash_exit(); return err; } static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); + btrfs_delayed_ref_exit(); + btrfs_auto_defrag_exit(); + btrfs_delayed_inode_exit(); + btrfs_prelim_ref_exit(); + ordered_data_exit(); extent_map_exit(); extent_io_exit(); btrfs_interface_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); - btrfs_zlib_exit(); + btrfs_exit_compress(); + btrfs_hash_exit(); } -module_init(init_btrfs_fs) +late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4ce16ef702a..78699364f53 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,249 +21,733 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> -#include <linux/module.h> #include <linux/kobject.h> +#include <linux/bug.h> +#include <linux/genhd.h> +#include <linux/debugfs.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "sysfs.h" +#include "volumes.h" -static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); + +static u64 get_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set) { - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_used(&root->root_item)); + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + return btrfs_super_compat_flags(disk_super); + else if (set == FEAT_COMPAT_RO) + return btrfs_super_compat_ro_flags(disk_super); + else + return btrfs_super_incompat_flags(disk_super); } -static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) +static void set_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set, u64 features) { - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_limit(&root->root_item)); + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + btrfs_set_super_compat_flags(disk_super, features); + else if (set == FEAT_COMPAT_RO) + btrfs_set_super_compat_ro_flags(disk_super, features); + else + btrfs_set_super_incompat_flags(disk_super, features); } -static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) +static int can_modify_feature(struct btrfs_feature_attr *fa) { + int val = 0; + u64 set, clear; + switch (fa->feature_set) { + case FEAT_COMPAT: + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + break; + case FEAT_COMPAT_RO: + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + break; + case FEAT_INCOMPAT: + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + break; + default: + printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n", + fa->feature_set); + return 0; + } + + if (set & fa->feature_bit) + val |= 1; + if (clear & fa->feature_bit) + val |= 2; - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); + return val; } -static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) +static ssize_t btrfs_feature_attr_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); + int val = 0; + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + if (fs_info) { + u64 features = get_features(fs_info, fa->feature_set); + if (features & fa->feature_bit) + val = 1; + } else + val = can_modify_feature(fa); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); } -static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) +static ssize_t btrfs_feature_attr_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t count) { - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); + struct btrfs_fs_info *fs_info; + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + struct btrfs_trans_handle *trans; + u64 features, set, clear; + unsigned long val; + int ret; + + fs_info = to_fs_info(kobj); + if (!fs_info) + return -EPERM; + + ret = kstrtoul(skip_spaces(buf), 0, &val); + if (ret) + return ret; + + if (fa->feature_set == FEAT_COMPAT) { + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + } else if (fa->feature_set == FEAT_COMPAT_RO) { + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + } else { + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + } + + features = get_features(fs_info, fa->feature_set); + + /* Nothing to do */ + if ((val && (features & fa->feature_bit)) || + (!val && !(features & fa->feature_bit))) + return count; + + if ((val && !(set & fa->feature_bit)) || + (!val && !(clear & fa->feature_bit))) { + btrfs_info(fs_info, + "%sabling feature %s on mounted fs is not supported.", + val ? "En" : "Dis", fa->kobj_attr.attr.name); + return -EPERM; + } + + btrfs_info(fs_info, "%s %s feature flag", + val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); + + trans = btrfs_start_transaction(fs_info->fs_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&fs_info->super_lock); + features = get_features(fs_info, fa->feature_set); + if (val) + features |= fa->feature_bit; + else + features &= ~fa->feature_bit; + set_features(fs_info, fa->feature_set, features); + spin_unlock(&fs_info->super_lock); + + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) + return ret; + + return count; } -/* this is for root attrs (subvols/snapshots) */ -struct btrfs_root_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_root *, char *); - ssize_t (*store)(struct btrfs_root *, const char *, size_t); -}; +static umode_t btrfs_feature_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + umode_t mode = attr->mode; -#define ROOT_ATTR(name, mode, show, store) \ -static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ - show, store) + if (fs_info) { + struct btrfs_feature_attr *fa; + u64 features; -ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); -ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); + fa = attr_to_btrfs_feature_attr(attr); + features = get_features(fs_info, fa->feature_set); -static struct attribute *btrfs_root_attrs[] = { - &btrfs_root_attr_blocks_used.attr, - &btrfs_root_attr_block_limit.attr, - NULL, + if (can_modify_feature(fa)) + mode |= S_IWUSR; + else if (!(features & fa->feature_bit)) + mode = 0; + } + + return mode; +} + +BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); +BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); +BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); +BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); +BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); +BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); + +static struct attribute *btrfs_supported_feature_attrs[] = { + BTRFS_FEAT_ATTR_PTR(mixed_backref), + BTRFS_FEAT_ATTR_PTR(default_subvol), + BTRFS_FEAT_ATTR_PTR(mixed_groups), + BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(big_metadata), + BTRFS_FEAT_ATTR_PTR(extended_iref), + BTRFS_FEAT_ATTR_PTR(raid56), + BTRFS_FEAT_ATTR_PTR(skinny_metadata), + BTRFS_FEAT_ATTR_PTR(no_holes), + NULL }; -/* this is for super attrs (actual full fs) */ -struct btrfs_super_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_fs_info *, char *); - ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); +static const struct attribute_group btrfs_feature_attr_group = { + .name = "features", + .is_visible = btrfs_feature_visible, + .attrs = btrfs_supported_feature_attrs, }; -#define SUPER_ATTR(name, mode, show, store) \ -static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ - show, store) +static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) +{ + u64 val; + if (lock) + spin_lock(lock); + val = *value_ptr; + if (lock) + spin_unlock(lock); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} -SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); -SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); -SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); +static ssize_t global_rsv_size_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); -static struct attribute *btrfs_super_attrs[] = { - &btrfs_super_attr_blocks_used.attr, - &btrfs_super_attr_total_blocks.attr, - &btrfs_super_attr_blocksize.attr, - NULL, +static ssize_t global_rsv_reserved_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); + +#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) +#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); +BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) + +{ + struct btrfs_space_info *sinfo = to_space_info(kobj->parent); + struct btrfs_block_group_cache *block_group; + int index = to_raid_kobj(kobj)->raid_type; + u64 val = 0; + + down_read(&sinfo->groups_sem); + list_for_each_entry(block_group, &sinfo->block_groups[index], list) { + if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + val += block_group->key.offset; + else + val += btrfs_block_group_used(&block_group->item); + } + up_read(&sinfo->groups_sem); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static struct attribute *raid_attributes[] = { + BTRFS_RAID_ATTR_PTR(total_bytes), + BTRFS_RAID_ATTR_PTR(used_bytes), + NULL }; -static ssize_t btrfs_super_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) +static void release_raid_kobj(struct kobject *kobj) { - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); + kfree(to_raid_kobj(kobj)); +} - return a->show ? a->show(fs, buf) : 0; +struct kobj_type btrfs_raid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = release_raid_kobj, + .default_attrs = raid_attributes, +}; + +#define SPACE_INFO_ATTR(field) \ +static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_space_info *sinfo = to_space_info(kobj); \ + return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ +} \ +BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) + +static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); + return snprintf(buf, PAGE_SIZE, "%lld\n", val); } -static ssize_t btrfs_super_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) +SPACE_INFO_ATTR(flags); +SPACE_INFO_ATTR(total_bytes); +SPACE_INFO_ATTR(bytes_used); +SPACE_INFO_ATTR(bytes_pinned); +SPACE_INFO_ATTR(bytes_reserved); +SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(disk_used); +SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); + +static struct attribute *space_info_attrs[] = { + BTRFS_ATTR_PTR(flags), + BTRFS_ATTR_PTR(total_bytes), + BTRFS_ATTR_PTR(bytes_used), + BTRFS_ATTR_PTR(bytes_pinned), + BTRFS_ATTR_PTR(bytes_reserved), + BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(disk_used), + BTRFS_ATTR_PTR(disk_total), + BTRFS_ATTR_PTR(total_bytes_pinned), + NULL, +}; + +static void space_info_release(struct kobject *kobj) { - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); + struct btrfs_space_info *sinfo = to_space_info(kobj); + percpu_counter_destroy(&sinfo->total_bytes_pinned); + kfree(sinfo); +} + +struct kobj_type space_info_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = space_info_release, + .default_attrs = space_info_attrs, +}; - return a->store ? a->store(fs, buf, len) : 0; +static const struct attribute *allocation_attrs[] = { + BTRFS_ATTR_PTR(global_rsv_reserved), + BTRFS_ATTR_PTR(global_rsv_size), + NULL, +}; + +static ssize_t btrfs_label_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); } -static ssize_t btrfs_root_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) +static ssize_t btrfs_label_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) { - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->fs_root; + int ret; + + if (len >= BTRFS_LABEL_SIZE) + return -EINVAL; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + strcpy(fs_info->super_copy->label, buf); + spin_unlock(&root->fs_info->super_lock); + ret = btrfs_commit_transaction(trans, root); + + if (!ret) + return len; + + return ret; +} +BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); - return a->show ? a->show(root, buf) : 0; +static ssize_t btrfs_no_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + return -EPERM; } -static ssize_t btrfs_root_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) +static ssize_t btrfs_nodesize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) { - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - return a->store ? a->store(root, buf, len) : 0; + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } -static void btrfs_super_release(struct kobject *kobj) +BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); + +static ssize_t btrfs_sectorsize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) { - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - complete(&fs->kobj_unregister); + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); } -static void btrfs_root_release(struct kobject *kobj) +BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); + +static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) { - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - complete(&root->kobj_unregister); + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); } -static const struct sysfs_ops btrfs_super_attr_ops = { - .show = btrfs_super_attr_show, - .store = btrfs_super_attr_store, -}; +BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); -static const struct sysfs_ops btrfs_root_attr_ops = { - .show = btrfs_root_attr_show, - .store = btrfs_root_attr_store, +static struct attribute *btrfs_attrs[] = { + BTRFS_ATTR_PTR(label), + BTRFS_ATTR_PTR(nodesize), + BTRFS_ATTR_PTR(sectorsize), + BTRFS_ATTR_PTR(clone_alignment), + NULL, }; -static struct kobj_type btrfs_root_ktype = { - .default_attrs = btrfs_root_attrs, - .sysfs_ops = &btrfs_root_attr_ops, - .release = btrfs_root_release, +static void btrfs_release_super_kobj(struct kobject *kobj) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + complete(&fs_info->kobj_unregister); +} + +static struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_super_kobj, + .default_attrs = btrfs_attrs, }; -static struct kobj_type btrfs_super_ktype = { - .default_attrs = btrfs_super_attrs, - .sysfs_ops = &btrfs_super_attr_ops, - .release = btrfs_super_release, +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_info, super_kobj); +} + +#define NUM_FEATURE_BITS 64 +static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; +static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; + +static u64 supported_feature_masks[3] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, + [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, }; -/* /sys/fs/btrfs/ entry */ -static struct kset *btrfs_kset; +static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) +{ + int set; + + for (set = 0; set < FEAT_MAX; set++) { + int i; + struct attribute *attrs[2]; + struct attribute_group agroup = { + .name = "features", + .attrs = attrs, + }; + u64 features = get_features(fs_info, set); + features &= ~supported_feature_masks[set]; + + if (!features) + continue; + + attrs[1] = NULL; + for (i = 0; i < NUM_FEATURE_BITS; i++) { + struct btrfs_feature_attr *fa; + + if (!(features & (1ULL << i))) + continue; + + fa = &btrfs_feature_attrs[set][i]; + attrs[0] = &fa->kobj_attr.attr; + if (add) { + int ret; + ret = sysfs_merge_group(&fs_info->super_kobj, + &agroup); + if (ret) + return ret; + } else + sysfs_unmerge_group(&fs_info->super_kobj, + &agroup); + } + + } + return 0; +} -int btrfs_sysfs_add_super(struct btrfs_fs_info *fs) +static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { - int error; - char *name; - char c; - int len = strlen(fs->sb->s_id) + 1; + kobject_del(&fs_info->super_kobj); + kobject_put(&fs_info->super_kobj); + wait_for_completion(&fs_info->kobj_unregister); +} + +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + if (fs_info->space_info_kobj) { + sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); + kobject_del(fs_info->space_info_kobj); + kobject_put(fs_info->space_info_kobj); + } + kobject_del(fs_info->device_dir_kobj); + kobject_put(fs_info->device_dir_kobj); + addrm_unknown_feature_attrs(fs_info, false); + sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); + __btrfs_sysfs_remove_one(fs_info); +} + +const char * const btrfs_feature_set_names[3] = { + [FEAT_COMPAT] = "compat", + [FEAT_COMPAT_RO] = "compat_ro", + [FEAT_INCOMPAT] = "incompat", +}; + +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) +{ + size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ + int len = 0; int i; + char *str; - name = kmalloc(len, GFP_NOFS); - if (!name) { - error = -ENOMEM; - goto fail; + str = kmalloc(bufsize, GFP_KERNEL); + if (!str) + return str; + + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + const char *name; + + if (!(flags & (1ULL << i))) + continue; + + name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; + len += snprintf(str + len, bufsize - len, "%s%s", + len ? "," : "", name); } - for (i = 0; i < len; i++) { - c = fs->sb->s_id[i]; - if (c == '/' || c == '\\') - c = '!'; - name[i] = c; + return str; +} + +static void init_feature_attrs(void) +{ + struct btrfs_feature_attr *fa; + int set, i; + + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != + ARRAY_SIZE(btrfs_feature_attrs)); + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != + ARRAY_SIZE(btrfs_feature_attrs[0])); + + memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); + memset(btrfs_unknown_feature_names, 0, + sizeof(btrfs_unknown_feature_names)); + + for (i = 0; btrfs_supported_feature_attrs[i]; i++) { + struct btrfs_feature_attr *sfa; + struct attribute *a = btrfs_supported_feature_attrs[i]; + int bit; + sfa = attr_to_btrfs_feature_attr(a); + bit = ilog2(sfa->feature_bit); + fa = &btrfs_feature_attrs[sfa->feature_set][bit]; + + fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; } - name[len] = '\0'; - fs->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype, - NULL, "%s", name); - kfree(name); - if (error) - goto fail; + for (set = 0; set < FEAT_MAX; set++) { + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + char *name = btrfs_unknown_feature_names[set][i]; + fa = &btrfs_feature_attrs[set][i]; + + if (fa->kobj_attr.attr.name) + continue; + + snprintf(name, 13, "%s:%u", + btrfs_feature_set_names[set], i); + + fa->kobj_attr.attr.name = name; + fa->kobj_attr.attr.mode = S_IRUGO; + fa->feature_set = set; + fa->feature_bit = 1ULL << i; + } + } +} + +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!fs_info->device_dir_kobj) + return -EINVAL; + + if (one_device) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_info->device_dir_kobj, + disk_kobj->name); + } return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + int error = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *dev; + + if (!fs_info->device_dir_kobj) + fs_info->device_dir_kobj = kobject_create_and_add("devices", + &fs_info->super_kobj); + + if (!fs_info->device_dir_kobj) + return -ENOMEM; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!dev->bdev) + continue; + + if (one_device && one_device != dev) + continue; + + disk = dev->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + error = sysfs_create_link(fs_info->device_dir_kobj, + disk_kobj, disk_kobj->name); + if (error) + break; + } -fail: - printk(KERN_ERR "btrfs: sysfs creation for super failed\n"); return error; } -int btrfs_sysfs_add_root(struct btrfs_root *root) +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +/* /sys/kernel/debug/btrfs */ +static struct dentry *btrfs_debugfs_root_dentry; + +/* Debugging tunables and exported data */ +u64 btrfs_debugfs_test; + +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; - error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype, - &root->fs_info->super_kobj, - "%s", root->name); + init_completion(&fs_info->kobj_unregister); + fs_info->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, + "%pU", fs_info->fsid); if (error) - goto fail; + return error; - return 0; + error = sysfs_create_group(&fs_info->super_kobj, + &btrfs_feature_attr_group); + if (error) { + __btrfs_sysfs_remove_one(fs_info); + return error; + } + + error = addrm_unknown_feature_attrs(fs_info, true); + if (error) + goto failure; + + error = btrfs_kobj_add_device(fs_info, NULL); + if (error) + goto failure; -fail: - printk(KERN_ERR "btrfs: sysfs creation for root failed\n"); + fs_info->space_info_kobj = kobject_create_and_add("allocation", + &fs_info->super_kobj); + if (!fs_info->space_info_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (error) + goto failure; + + return 0; +failure: + btrfs_sysfs_remove_one(fs_info); return error; } -void btrfs_sysfs_del_root(struct btrfs_root *root) +static int btrfs_init_debugfs(void) { - kobject_put(&root->root_kobj); - wait_for_completion(&root->kobj_unregister); -} +#ifdef CONFIG_DEBUG_FS + btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL); + if (!btrfs_debugfs_root_dentry) + return -ENOMEM; -void btrfs_sysfs_del_super(struct btrfs_fs_info *fs) -{ - kobject_put(&fs->super_kobj); - wait_for_completion(&fs->kobj_unregister); + debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry, + &btrfs_debugfs_test); +#endif + return 0; } int btrfs_init_sysfs(void) { + int ret; + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; - return 0; + + ret = btrfs_init_debugfs(); + if (ret) + return ret; + + init_feature_attrs(); + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + + return ret; } void btrfs_exit_sysfs(void) { + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); + debugfs_remove_recursive(btrfs_debugfs_root_dentry); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h new file mode 100644 index 00000000000..ac46df37504 --- /dev/null +++ b/fs/btrfs/sysfs.h @@ -0,0 +1,73 @@ +#ifndef _BTRFS_SYSFS_H_ +#define _BTRFS_SYSFS_H_ + +/* + * Data exported through sysfs + */ +extern u64 btrfs_debugfs_test; + +enum btrfs_feature_set { + FEAT_COMPAT, + FEAT_COMPAT_RO, + FEAT_INCOMPAT, + FEAT_MAX +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ +static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, _mode, _show, _store) +#define BTRFS_ATTR(_name, _mode, _show) \ + BTRFS_ATTR_RW(_name, _mode, _show, NULL) +#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) + +#define BTRFS_RAID_ATTR(_name, _show) \ +static struct kobj_attribute btrfs_raid_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) +#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) + + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ + .feature_set = _feature_set, \ + .feature_bit = _prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) + +/* convert from attribute */ +#define to_btrfs_feature_attr(a) \ + container_of(a, struct btrfs_feature_attr, kobj_attr) +#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) +#define attr_to_btrfs_feature_attr(a) \ + to_btrfs_feature_attr(attr_to_btrfs_attr(a)) +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); +extern const char * const btrfs_feature_set_names[3]; +extern struct kobj_type space_info_ktype; +extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +#endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c new file mode 100644 index 00000000000..9626252ee6b --- /dev/null +++ b/fs/btrfs/tests/btrfs-tests.c @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/magic.h> +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static struct vfsmount *test_mnt = NULL; + +static const struct super_operations btrfs_test_super_ops = { + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_test_destroy_inode, +}; + +static struct dentry *btrfs_test_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops, + NULL, BTRFS_TEST_MAGIC); +} + +static struct file_system_type test_type = { + .name = "btrfs_test_fs", + .mount = btrfs_test_mount, + .kill_sb = kill_anon_super, +}; + +struct inode *btrfs_new_test_inode(void) +{ + return new_inode(test_mnt->mnt_sb); +} + +int btrfs_init_test_fs(void) +{ + int ret; + + ret = register_filesystem(&test_type); + if (ret) { + printk(KERN_ERR "btrfs: cannot register test file system\n"); + return ret; + } + + test_mnt = kern_mount(&test_type); + if (IS_ERR(test_mnt)) { + printk(KERN_ERR "btrfs: cannot mount test file system\n"); + unregister_filesystem(&test_type); + return ret; + } + return 0; +} + +void btrfs_destroy_test_fs(void) +{ + kern_unmount(test_mnt); + unregister_filesystem(&test_type); +} + +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) +{ + struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), + GFP_NOFS); + + if (!fs_info) + return fs_info; + fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), + GFP_NOFS); + if (!fs_info->fs_devices) { + kfree(fs_info); + return NULL; + } + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), + GFP_NOFS); + if (!fs_info->super_copy) { + kfree(fs_info->fs_devices); + kfree(fs_info); + return NULL; + } + + if (init_srcu_struct(&fs_info->subvol_srcu)) { + kfree(fs_info->fs_devices); + kfree(fs_info->super_copy); + kfree(fs_info); + return NULL; + } + + spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->qgroup_lock); + spin_lock_init(&fs_info->qgroup_op_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + mutex_init(&fs_info->qgroup_rescan_lock); + rwlock_init(&fs_info->tree_mod_log_lock); + fs_info->running_transaction = NULL; + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_ulist = NULL; + atomic64_set(&fs_info->tree_mod_seq, 0); + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + return fs_info; +} + +static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +{ + struct radix_tree_iter iter; + void **slot; + + spin_lock(&fs_info->buffer_lock); +restart: + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + goto restart; + continue; + } + spin_unlock(&fs_info->buffer_lock); + free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); + } + spin_unlock(&fs_info->buffer_lock); + + btrfs_free_qgroup_config(fs_info); + btrfs_free_fs_roots(fs_info); + cleanup_srcu_struct(&fs_info->subvol_srcu); + kfree(fs_info->super_copy); + kfree(fs_info->fs_devices); + kfree(fs_info); +} + +void btrfs_free_dummy_root(struct btrfs_root *root) +{ + if (!root) + return; + if (root->node) + free_extent_buffer(root->node); + if (root->fs_info) + btrfs_free_dummy_fs_info(root->fs_info); + kfree(root); +} + diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h new file mode 100644 index 00000000000..fd395422448 --- /dev/null +++ b/fs/btrfs/tests/btrfs-tests.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TESTS +#define __BTRFS_TESTS + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) + +struct btrfs_root; + +int btrfs_test_free_space_cache(void); +int btrfs_test_extent_buffer_operations(void); +int btrfs_test_extent_io(void); +int btrfs_test_inodes(void); +int btrfs_test_qgroups(void); +int btrfs_init_test_fs(void); +void btrfs_destroy_test_fs(void); +struct inode *btrfs_new_test_inode(void); +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_root(struct btrfs_root *root); +#else +static inline int btrfs_test_free_space_cache(void) +{ + return 0; +} +static inline int btrfs_test_extent_buffer_operations(void) +{ + return 0; +} +static inline int btrfs_init_test_fs(void) +{ + return 0; +} +static inline void btrfs_destroy_test_fs(void) +{ +} +static inline int btrfs_test_extent_io(void) +{ + return 0; +} +static inline int btrfs_test_inodes(void) +{ + return 0; +} +static inline int btrfs_test_qgroups(void) +{ + return 0; +} +#endif + +#endif diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c new file mode 100644 index 00000000000..cc286ce97d1 --- /dev/null +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/slab.h> +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../extent_io.h" +#include "../disk-io.h" + +static int test_btrfs_split_item(void) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct extent_buffer *eb; + struct btrfs_item *item; + char *value = "mary had a little lamb"; + char *split1 = "mary had a little"; + char *split2 = " lamb"; + char *split3 = "mary"; + char *split4 = " had a little"; + char buf[32]; + struct btrfs_key key; + u32 value_len = strlen(value); + int ret = 0; + + test_msg("Running btrfs_split_item tests\n"); + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Could not allocate root\n"); + return PTR_ERR(root); + } + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Could not allocate path\n"); + kfree(root); + return -ENOMEM; + } + + path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096); + if (!eb) { + test_msg("Could not allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + path->slots[0] = 0; + + key.objectid = 0; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = 0; + + setup_items_for_insert(root, path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); + item = btrfs_item_nr(0); + write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), + value_len); + + key.offset = 3; + + /* + * Passing NULL trans here should be safe because we have plenty of + * space in this leaf to split the item without having to split the + * leaf. + */ + ret = btrfs_split_item(NULL, root, path, &key, 17); + if (ret) { + test_msg("Split item failed %d\n", ret); + goto out; + } + + /* + * Read the first slot, it should have the original key and contain only + * 'mary had a little' + */ + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_msg("Invalid key at slot 0\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(0); + if (btrfs_item_size(eb, item) != strlen(split1)) { + test_msg("Invalid len in the first split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split1)); + if (memcmp(buf, split1, strlen(split1))) { + test_msg("Data in the buffer doesn't match what it should " + "in the first split have='%.*s' want '%s'\n", + (int)strlen(split1), buf, split1); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_msg("Invalid key at slot 1\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(1); + if (btrfs_item_size(eb, item) != strlen(split2)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_msg("Data in the buffer doesn't match what it should " + "in the second split\n"); + ret = -EINVAL; + goto out; + } + + key.offset = 1; + /* Do it again so we test memmoving the other items in the leaf */ + ret = btrfs_split_item(NULL, root, path, &key, 4); + if (ret) { + test_msg("Second split item failed %d\n", ret); + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 0); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 0) { + test_msg("Invalid key at slot 0\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(0); + if (btrfs_item_size(eb, item) != strlen(split3)) { + test_msg("Invalid len in the first split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), + strlen(split3)); + if (memcmp(buf, split3, strlen(split3))) { + test_msg("Data in the buffer doesn't match what it should " + "in the third split"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 1); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 1) { + test_msg("Invalid key at slot 1\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(1); + if (btrfs_item_size(eb, item) != strlen(split4)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), + strlen(split4)); + if (memcmp(buf, split4, strlen(split4))) { + test_msg("Data in the buffer doesn't match what it should " + "in the fourth split\n"); + ret = -EINVAL; + goto out; + } + + btrfs_item_key_to_cpu(eb, &key, 2); + if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset != 3) { + test_msg("Invalid key at slot 2\n"); + ret = -EINVAL; + goto out; + } + + item = btrfs_item_nr(2); + if (btrfs_item_size(eb, item) != strlen(split2)) { + test_msg("Invalid len in the second split\n"); + ret = -EINVAL; + goto out; + } + + read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2), + strlen(split2)); + if (memcmp(buf, split2, strlen(split2))) { + test_msg("Data in the buffer doesn't match what it should " + "in the last chunk\n"); + ret = -EINVAL; + goto out; + } +out: + btrfs_free_path(path); + kfree(root); + return ret; +} + +int btrfs_test_extent_buffer_operations(void) +{ + test_msg("Running extent buffer operation tests"); + return test_btrfs_split_item(); +} diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c new file mode 100644 index 00000000000..7e99c2f98dd --- /dev/null +++ b/fs/btrfs/tests/extent-io-tests.c @@ -0,0 +1,276 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/pagemap.h> +#include <linux/sched.h> +#include "btrfs-tests.h" +#include "../extent_io.h" + +#define PROCESS_UNLOCK (1 << 0) +#define PROCESS_RELEASE (1 << 1) +#define PROCESS_TEST_LOCKED (1 << 2) + +static noinline int process_page_range(struct inode *inode, u64 start, u64 end, + unsigned long flags) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int count = 0; + int loops = 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (flags & PROCESS_TEST_LOCKED && + !PageLocked(pages[i])) + count++; + if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) + unlock_page(pages[i]); + page_cache_release(pages[i]); + if (flags & PROCESS_RELEASE) + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + loops++; + if (loops > 100000) { + printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret); + break; + } + } + return count; +} + +static int test_find_delalloc(void) +{ + struct inode *inode; + struct extent_io_tree tmp; + struct page *page; + struct page *locked_page = NULL; + unsigned long index = 0; + u64 total_dirty = 256 * 1024 * 1024; + u64 max_bytes = 128 * 1024 * 1024; + u64 start, end, test_start; + u64 found; + int ret = -EINVAL; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Failed to allocate test inode\n"); + return -ENOMEM; + } + + extent_io_tree_init(&tmp, &inode->i_data); + + /* + * First go through and create and mark all of our pages dirty, we pin + * everything to make sure our pages don't get evicted and screw up our + * test. + */ + for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + test_msg("Failed to allocate test page\n"); + ret = -ENOMEM; + goto out; + } + SetPageDirty(page); + if (index) { + unlock_page(page); + } else { + page_cache_get(page); + locked_page = page; + } + } + + /* Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS); + start = 0; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Should have found at least one delalloc\n"); + goto out_bits; + } + if (start != 0 || end != 4095) { + test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n", + start, end); + goto out_bits; + } + unlock_extent(&tmp, start, end); + unlock_page(locked_page); + page_cache_release(locked_page); + + /* + * Test this scenario + * + * |--- delalloc ---| + * |--- search ---| + */ + test_start = 64 * 1024 * 1024; + locked_page = find_lock_page(inode->i_mapping, + test_start >> PAGE_CACHE_SHIFT); + if (!locked_page) { + test_msg("Couldn't find the locked page\n"); + goto out_bits; + } + set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS); + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Couldn't find delalloc in our range\n"); + goto out_bits; + } + if (start != test_start || end != max_bytes - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu, end " + "%Lu\n", test_start, max_bytes - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_msg("There were unlocked pages in the range\n"); + goto out_bits; + } + unlock_extent(&tmp, start, end); + /* locked_page was unlocked above */ + page_cache_release(locked_page); + + /* + * Test this scenario + * |--- delalloc ---| + * |--- search ---| + */ + test_start = max_bytes + 4096; + locked_page = find_lock_page(inode->i_mapping, test_start >> + PAGE_CACHE_SHIFT); + if (!locked_page) { + test_msg("Could'nt find the locked page\n"); + goto out_bits; + } + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (found) { + test_msg("Found range when we shouldn't have\n"); + goto out_bits; + } + if (end != (u64)-1) { + test_msg("Did not return the proper end offset\n"); + goto out_bits; + } + + /* + * Test this scenario + * [------- delalloc -------| + * [max_bytes]|-- search--| + * + * We are re-using our test_start from above since it works out well. + */ + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS); + start = test_start; + end = 0; + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Didn't find our range\n"); + goto out_bits; + } + if (start != test_start || end != total_dirty - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_start, total_dirty - 1, start, end); + goto out_bits; + } + if (process_page_range(inode, start, end, + PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { + test_msg("Pages in range were not all locked\n"); + goto out_bits; + } + unlock_extent(&tmp, start, end); + + /* + * Now to test where we run into a page that is no longer dirty in the + * range we want to find. + */ + page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024)) + >> PAGE_CACHE_SHIFT); + if (!page) { + test_msg("Couldn't find our page\n"); + goto out_bits; + } + ClearPageDirty(page); + page_cache_release(page); + + /* We unlocked it in the previous test */ + lock_page(locked_page); + start = test_start; + end = 0; + /* + * Currently if we fail to find dirty pages in the delalloc range we + * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If + * this changes at any point in the future we will need to fix this + * tests expected behavior. + */ + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end, max_bytes); + if (!found) { + test_msg("Didn't find our range\n"); + goto out_bits; + } + if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) { + test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_start, test_start + PAGE_CACHE_SIZE - 1, start, + end); + goto out_bits; + } + if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | + PROCESS_UNLOCK)) { + test_msg("Pages in range were not all locked\n"); + goto out_bits; + } + ret = 0; +out_bits: + clear_extent_bits(&tmp, 0, total_dirty - 1, + (unsigned long)-1, GFP_NOFS); +out: + if (locked_page) + page_cache_release(locked_page); + process_page_range(inode, 0, total_dirty - 1, + PROCESS_UNLOCK | PROCESS_RELEASE); + iput(inode); + return ret; +} + +int btrfs_test_extent_io(void) +{ + test_msg("Running find delalloc tests\n"); + return test_find_delalloc(); +} diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c new file mode 100644 index 00000000000..c8d9ddf84c6 --- /dev/null +++ b/fs/btrfs/tests/free-space-tests.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/slab.h> +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../free-space-cache.h" + +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +static struct btrfs_block_group_cache *init_test_block_group(void) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = 1024 * 1024 * 1024; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; + + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + + btrfs_init_free_space_ctl(cache); + + return cache; +} + +/* + * This test just does basic sanity checking, making sure we can add an exten + * entry and remove space from either end and the middle, and make sure we can + * remove space that covers adjacent extent entries. + */ +static int test_extents(struct btrfs_block_group_cache *cache) +{ + int ret = 0; + + test_msg("Running extent only tests\n"); + + /* First just make sure we can remove an entire entry */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding initial extents %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing extent %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Full remove left some lingering space\n"); + return -1; + } + + /* Ok edge and middle cases now */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding half extent %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing tail end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing front end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + if (ret) { + test_msg("Error removing middle piece %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Still have space at the front\n"); + return -1; + } + + if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) { + test_msg("Still have space in the middle\n"); + return -1; + } + + if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Still have space at the end\n"); + return -1; + } + + /* Cleanup */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +static int test_bitmaps(struct btrfs_block_group_cache *cache) +{ + u64 next_bitmap_offset; + int ret; + + test_msg("Running bitmap only tests\n"); + + ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create a bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap full range %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Left some space in bitmap\n"); + return -1; + } + + ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to our bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove middle chunk %d\n", ret); + return ret; + } + + /* + * The first bitmap we have starts at offset 0 so the next one is just + * at the end of the first bitmap. + */ + next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + + /* Test a bit straddling two bitmaps */ + ret = test_add_free_space_entry(cache, next_bitmap_offset - + (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space that straddles two bitmaps %d\n", + ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, next_bitmap_offset - + (1 * 1024 * 1024), 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), + 2 * 1024 * 1024)) { + test_msg("Left some space when removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +/* This is the high grade jackassery */ +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +{ + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + int ret; + + test_msg("Running bitmap and extent tests\n"); + + /* + * First let's do something simple, an extent at the same offset as the + * bitmap, but the free space completely in the extent and then + * completely in the bitmap. + */ + ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create bitmap entry %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove extent entry %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Left remnants after our remove\n"); + return -1; + } + + /* Now to add back the extent entry and remove from the bitmap */ + ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't re-add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove from bitmap %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Left remnants in the bitmap\n"); + return -1; + } + + /* + * Ok so a little more evil, extent entry and bitmap at the same offset, + * removing an overlapping chunk. + */ + ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to a bitmap %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + test_msg("Left over pieces after removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* Now with the extent entry offset into the bitmap */ + ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space to the bitmap %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent to the cache %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + if (ret) { + test_msg("Problem removing overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + test_msg("Left something behind when removing space"); + return -1; + } + + /* + * This has blown up in the past, the extent entry starts before the + * bitmap entry, but we're trying to remove an offset that falls + * completely within the bitmap range and is in both the extent entry + * and the bitmap entry, looks like this + * + * [ extent ] + * [ bitmap ] + * [ del ] + */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, + 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, + 5 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024); + if (ret) { + test_msg("Failed to free our space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024)) { + test_msg("Left stuff over\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* + * This blew up before, we have part of the free space in a bitmap and + * then the entirety of the rest of the space in an extent. This used + * to return -EAGAIN back from btrfs_remove_extent, make sure this + * doesn't happen. + */ + ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap and extent overlapping %d\n", ret); + return ret; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + +int btrfs_test_free_space_cache(void) +{ + struct btrfs_block_group_cache *cache; + int ret; + + test_msg("Running btrfs free space cache tests\n"); + + cache = init_test_block_group(); + if (!cache) { + test_msg("Couldn't run the tests\n"); + return 0; + } + + ret = test_extents(cache); + if (ret) + goto out; + ret = test_bitmaps(cache); + if (ret) + goto out; + ret = test_bitmaps_and_extents(cache); + if (ret) + goto out; +out: + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); + test_msg("Free space cache tests finished\n"); + return ret; +} diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c new file mode 100644 index 00000000000..3ae0f5b8bb8 --- /dev/null +++ b/fs/btrfs/tests/inode-tests.c @@ -0,0 +1,928 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../btrfs_inode.h" +#include "../disk-io.h" +#include "../extent_io.h" +#include "../volumes.h" + +static void insert_extent(struct btrfs_root *root, u64 start, u64 len, + u64 ram_bytes, u64 offset, u64 disk_bytenr, + u64 disk_len, u32 type, u8 compression, int slot) +{ + struct btrfs_path path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = sizeof(struct btrfs_file_extent_item); + + if (type == BTRFS_FILE_EXTENT_INLINE) + value_len += len; + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = slot; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + setup_items_for_insert(root, &path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, 1); + btrfs_set_file_extent_type(leaf, fi, type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len); + btrfs_set_file_extent_offset(leaf, fi, offset); + btrfs_set_file_extent_num_bytes(leaf, fi, len); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); +} + +static void insert_inode_item_key(struct btrfs_root *root) +{ + struct btrfs_path path; + struct extent_buffer *leaf = root->node; + struct btrfs_key key; + u32 value_len = 0; + + memset(&path, 0, sizeof(path)); + + path.nodes[0] = leaf; + path.slots[0] = 0; + + key.objectid = BTRFS_INODE_ITEM_KEY; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + setup_items_for_insert(root, &path, &key, &value_len, value_len, + value_len + sizeof(struct btrfs_item), 1); +} + +/* + * Build the most complicated map of extents the earth has ever seen. We want + * this so we can test all of the corner cases of btrfs_get_extent. Here is a + * diagram of how the extents will look though this may not be possible we still + * want to make sure everything acts normally (the last number is not inclusive) + * + * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288] + * [hole ][inline][ hole ][ regular ][regular1 split][ hole ] + * + * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056] + * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ] + * + * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ] + * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent] + * + * [81920-86016] + * [ regular ] + */ +static void setup_file_extents(struct btrfs_root *root) +{ + int slot = 0; + u64 disk_bytenr = 1 * 1024 * 1024; + u64 offset = 0; + + /* First we want a hole */ + insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += 5; + + /* + * Now we want an inline extent, I don't think this is possible but hey + * why not? Also keep in mind if we have an inline extent it counts as + * the whole first page. If we were to expand it we would have to cow + * and we wouldn't have an inline extent anymore. + */ + insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + slot); + slot++; + offset = 4096; + + /* Now another hole */ + insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, + slot); + slot++; + offset += 4; + + /* Now for a regular extent */ + insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + disk_bytenr += 4096; + offset += 4095; + + /* + * Now for 3 extents that were split from a hole punch so we test + * offsets properly. + */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG, + 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 8192; + disk_bytenr += 16384; + + /* Now for a unwritten prealloc extent */ + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 4096; + + /* + * We want to jack up disk_bytenr a little more so the em stuff doesn't + * merge our records. + */ + disk_bytenr += 8192; + + /* + * Now for a partially written prealloc extent, basically the same as + * the hole punch example above. Ram_bytes never changes when you mark + * extents written btw. + */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + slot++; + offset += 8192; + disk_bytenr += 16384; + + /* Now a normal compressed extent */ + insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 8192; + /* No merges */ + disk_bytenr += 8192; + + /* Now a split compressed extent */ + insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 4096; + insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + slot++; + offset += 8192; + disk_bytenr += 8192; + + /* Now extents that have a hole but no hole extent */ + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); + slot++; + offset += 16384; + disk_bytenr += 4096; + insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, + BTRFS_FILE_EXTENT_REG, 0, slot); +} + +static unsigned long prealloc_only = 0; +static unsigned long compressed_only = 0; +static unsigned long vacancy_only = 0; + +static noinline int test_btrfs_get_extent(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + u64 orig_start; + u64 disk_bytenr; + u64 offset; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + /* + * We do this since btrfs_get_extent wants to assign em->bdev to + * root->fs_info->fs_devices->latest_bdev. + */ + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + root->node = alloc_dummy_extent_buffer(0, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + goto out; + } + + /* + * We will just free a dummy node if it's ref count is 2 so we need an + * extra ref so our searches don't accidently release our page. + */ + extent_buffer_get(root->node); + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + ret = -EINVAL; + + /* First with no extents */ + BTRFS_I(inode)->root = root; + em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0); + if (IS_ERR(em)) { + em = NULL; + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + test_msg("Vacancy flag wasn't set properly\n"); + goto out; + } + free_extent_map(em); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + + /* + * All of the magic numbers are based on the mapping setup in + * setup_file_extents, so if you change anything there you need to + * update the comment and update the expected values below. + */ + setup_file_extents(root); + + em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != 0 || em->len != 5) { + test_msg("Unexpected extent wanted start 0 len 5, got start " + "%llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_INLINE) { + test_msg("Expected an inline, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4091) { + test_msg("Unexpected extent wanted start %llu len 1, got start " + "%llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + /* + * We don't test anything else for inline since it doesn't get set + * unless we have a page for it to write into. Maybe we should change + * this? + */ + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4) { + test_msg("Unexpected extent wanted start %llu len 4, got start " + "%llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Regular extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4095) { + test_msg("Unexpected extent wanted start %llu len 4095, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are split extents */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + orig_start, em->orig_start); + goto out; + } + disk_bytenr += (em->start - orig_start); + if (em->block_start != disk_bytenr) { + test_msg("Wrong block start, want %llu, have %llu\n", + disk_bytenr, em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Prealloc extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* The next 3 are a half written prealloc extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_HOLE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Unexpected orig offset, wanted %llu, have %llu\n", + orig_start, em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_msg("Unexpected block start, wanted %llu, have %llu\n", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != prealloc_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + prealloc_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start, + em->orig_start); + goto out; + } + if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { + test_msg("Unexpected block start, wanted %llu, have %llu\n", + disk_bytenr + (em->start - em->orig_start), + em->block_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Now for the compressed extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* Split compressed extent */ + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, em->orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + disk_bytenr = em->block_start; + orig_start = em->start; + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != disk_bytenr) { + test_msg("Block start does not match, want %llu got %llu\n", + disk_bytenr, em->block_start); + goto out; + } + if (em->start != offset || em->len != 8192) { + test_msg("Unexpected extent wanted start %llu len 8192, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != compressed_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + compressed_only, em->flags); + goto out; + } + if (em->orig_start != orig_start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", + em->start, orig_start); + goto out; + } + if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + test_msg("Unexpected compress type, wanted %d, got %d\n", + BTRFS_COMPRESS_ZLIB, em->compress_type); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + /* A hole between regular extents but no hole extent */ + em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole extent, got %llu\n", em->block_start); + goto out; + } + /* + * Currently we just return a length that we requested rather than the + * length of the actual hole, if this changes we'll have to change this + * test. + */ + if (em->start != offset || em->len != 12288) { + test_msg("Unexpected extent wanted start %llu len 12288, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_msg("Unexpected flags set, want %lu have %lu\n", + vacancy_only, em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + offset = em->start + em->len; + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != offset || em->len != 4096) { + test_msg("Unexpected extent wanted start %llu len 4096, got " + "start %llu len %llu\n", offset, em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + goto out; + } + if (em->orig_start != em->start) { + test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + em->orig_start); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + +static int test_hole_first(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + struct extent_map *em = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + root->node = alloc_dummy_extent_buffer(0, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + goto out; + } + + extent_buffer_get(root->node); + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + BTRFS_I(inode)->root = root; + ret = -EINVAL; + + /* + * Need a blank inode item here just so we don't confuse + * btrfs_get_extent. + */ + insert_inode_item_key(root); + insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096, + BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != EXTENT_MAP_HOLE) { + test_msg("Expected a hole, got %llu\n", em->block_start); + goto out; + } + if (em->start != 0 || em->len != 4096) { + test_msg("Unexpected extent wanted start 0 len 4096, got start " + "%llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != vacancy_only) { + test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only, + em->flags); + goto out; + } + free_extent_map(em); + + em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0); + if (IS_ERR(em)) { + test_msg("Got an error when we shouldn't have\n"); + goto out; + } + if (em->block_start != 4096) { + test_msg("Expected a real extent, got %llu\n", em->block_start); + goto out; + } + if (em->start != 4096 || em->len != 4096) { + test_msg("Unexpected extent wanted start 4096 len 4096, got " + "start %llu len %llu\n", em->start, em->len); + goto out; + } + if (em->flags != 0) { + test_msg("Unexpected flags set, wanted 0 got %lu\n", + em->flags); + goto out; + } + ret = 0; +out: + if (!IS_ERR(em)) + free_extent_map(em); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + +int btrfs_test_inodes(void) +{ + int ret; + + set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); + set_bit(EXTENT_FLAG_VACANCY, &vacancy_only); + set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); + + test_msg("Running btrfs_get_extent tests\n"); + ret = test_btrfs_get_extent(); + if (ret) + return ret; + test_msg("Running hole first btrfs_get_extent test\n"); + return test_hole_first(); +} diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c new file mode 100644 index 00000000000..ec3dcb20235 --- /dev/null +++ b/fs/btrfs/tests/qgroup-tests.c @@ -0,0 +1,470 @@ +/* + * Copyright (C) 2013 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../transaction.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static void init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} + +static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_tree_block_info *block_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); + int ret; + + init_dummy_trans(&trans); + + ins.objectid = bytenr; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); + if (ret) { + test_msg("Couldn't insert ref %d\n", ret); + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, 1); + btrfs_set_extent_generation(leaf, item, 1); + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(item + 1); + btrfs_set_tree_block_level(leaf, block_info, 1); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_free_path(path); + return 0; +} + +static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs + 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); + if (ret) + test_msg("Failed to insert backref\n"); + btrfs_free_path(path); + return ret; +} + +static int remove_extent_item(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_trans_handle trans; + struct btrfs_key key; + struct btrfs_path *path; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + path->leave_spinning = 1; + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Didn't find our key %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return 0; +} + +static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs - 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Couldn't find backref %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return ret; +} + +static int test_no_shared_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup basic add\n"); + ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_item(root, 4096, 4096); + if (ret) + return -EINVAL; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_EXCL, 0); + if (ret) { + test_msg("Couldn't remove space from the qgroup %d\n", ret); + return -EINVAL; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +/* + * Add a ref for two different roots to make sure the shared value comes out + * right, also remove one of the roots and make sure the exclusive count is + * adjusted properly. + */ +static int test_multiple_refs(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup multiple refs test\n"); + + /* We have 5 created already from the previous test */ + ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = add_tree_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +int btrfs_test_qgroups(void) +{ + struct btrfs_root *root; + struct btrfs_root *tmp_root; + int ret = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + return PTR_ERR(root); + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + + /* + * Can't use bytenr 0, some things freak out + * *cough*backref walking code*cough* + */ + root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 8192; + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 5; + root->fs_info->fs_root = tmp_root; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 256; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to addt he root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; + + test_msg("Running qgroup tests\n"); + ret = test_no_shared_qgroup(root); + if (ret) + goto out; + ret = test_multiple_refs(root); +out: + btrfs_free_dummy_root(root); + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2cb116099b9..5f379affdf2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -22,75 +22,214 @@ #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/blkdev.h> +#include <linux/uuid.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "locking.h" #include "tree-log.h" +#include "inode-map.h" +#include "volumes.h" +#include "dev-replace.h" +#include "qgroup.h" #define BTRFS_ROOT_TRANS_TAG 0 -static noinline void put_transaction(struct btrfs_transaction *transaction) +static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { + [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | + __TRANS_START), + [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH), + [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN), + [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), + [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), +}; + +void btrfs_put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(transaction->use_count == 0); - transaction->use_count--; - if (transaction->use_count == 0) { - list_del_init(&transaction->list); - memset(transaction, 0, sizeof(*transaction)); + WARN_ON(atomic_read(&transaction->use_count) == 0); + if (atomic_dec_and_test(&transaction->use_count)) { + BUG_ON(!list_empty(&transaction->list)); + WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); + while (!list_empty(&transaction->pending_chunks)) { + struct extent_map *em; + + em = list_first_entry(&transaction->pending_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } kmem_cache_free(btrfs_transaction_cachep, transaction); } } -static noinline void switch_commit_root(struct btrfs_root *root) +static noinline void switch_commit_roots(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root, *tmp; + + down_write(&fs_info->commit_root_sem); + list_for_each_entry_safe(root, tmp, &trans->switch_commits, + dirty_list) { + list_del_init(&root->dirty_list); + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + if (is_fstree(root->objectid)) + btrfs_unpin_free_ino(root); + } + up_write(&fs_info->commit_root_sem); +} + +static inline void extwriter_counter_inc(struct btrfs_transaction *trans, + unsigned int type) +{ + if (type & TRANS_EXTWRITERS) + atomic_inc(&trans->num_extwriters); +} + +static inline void extwriter_counter_dec(struct btrfs_transaction *trans, + unsigned int type) { - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + if (type & TRANS_EXTWRITERS) + atomic_dec(&trans->num_extwriters); +} + +static inline void extwriter_counter_init(struct btrfs_transaction *trans, + unsigned int type) +{ + atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0)); +} + +static inline int extwriter_counter_read(struct btrfs_transaction *trans) +{ + return atomic_read(&trans->num_extwriters); } /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root) +static noinline int join_transaction(struct btrfs_root *root, unsigned int type) { struct btrfs_transaction *cur_trans; - cur_trans = root->fs_info->running_transaction; - if (!cur_trans) { - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, - GFP_NOFS); - BUG_ON(!cur_trans); - root->fs_info->generation++; - cur_trans->num_writers = 1; - cur_trans->num_joined = 0; - cur_trans->transid = root->fs_info->generation; - init_waitqueue_head(&cur_trans->writer_wait); - init_waitqueue_head(&cur_trans->commit_wait); - cur_trans->in_commit = 0; - cur_trans->blocked = 0; - cur_trans->use_count = 1; - cur_trans->commit_done = 0; - cur_trans->start_time = get_seconds(); - - cur_trans->delayed_refs.root = RB_ROOT; - cur_trans->delayed_refs.num_entries = 0; - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - spin_lock_init(&cur_trans->delayed_refs.lock); - - INIT_LIST_HEAD(&cur_trans->pending_snapshots); - list_add_tail(&cur_trans->list, &root->fs_info->trans_list); - extent_io_tree_init(&cur_trans->dirty_pages, - root->fs_info->btree_inode->i_mapping, - GFP_NOFS); - spin_lock(&root->fs_info->new_trans_lock); - root->fs_info->running_transaction = cur_trans; - spin_unlock(&root->fs_info->new_trans_lock); - } else { - cur_trans->num_writers++; - cur_trans->num_joined++; + struct btrfs_fs_info *fs_info = root->fs_info; + + spin_lock(&fs_info->trans_lock); +loop: + /* The file system has been taken offline. No new transactions. */ + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + spin_unlock(&fs_info->trans_lock); + return -EROFS; } + cur_trans = fs_info->running_transaction; + if (cur_trans) { + if (cur_trans->aborted) { + spin_unlock(&fs_info->trans_lock); + return cur_trans->aborted; + } + if (btrfs_blocked_trans_types[cur_trans->state] & type) { + spin_unlock(&fs_info->trans_lock); + return -EBUSY; + } + atomic_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + extwriter_counter_inc(cur_trans, type); + spin_unlock(&fs_info->trans_lock); + return 0; + } + spin_unlock(&fs_info->trans_lock); + + /* + * If we are ATTACH, we just want to catch the current transaction, + * and commit it. If there is no transaction, just return ENOENT. + */ + if (type == TRANS_ATTACH) + return -ENOENT; + + /* + * JOIN_NOLOCK only happens during the transaction commit, so + * it is impossible that ->running_transaction is NULL + */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + if (!cur_trans) + return -ENOMEM; + + spin_lock(&fs_info->trans_lock); + if (fs_info->running_transaction) { + /* + * someone started a transaction after we unlocked. Make sure + * to redo the checks above + */ + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + goto loop; + } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + spin_unlock(&fs_info->trans_lock); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + return -EROFS; + } + + atomic_set(&cur_trans->num_writers, 1); + extwriter_counter_init(cur_trans, type); + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->state = TRANS_STATE_RUNNING; + /* + * One for this trans handle, one so it will live on until we + * commit the transaction. + */ + atomic_set(&cur_trans->use_count, 2); + cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.href_root = RB_ROOT; + atomic_set(&cur_trans->delayed_refs.num_entries, 0); + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + + /* + * although the tree mod log is per file system and not per transaction, + * the log must never go across transaction boundaries. + */ + smp_mb(); + if (!list_empty(&fs_info->tree_mod_seq_list)) + WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when " + "creating a fresh transaction\n"); + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) + WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when " + "creating a fresh transaction\n"); + atomic64_set(&fs_info->tree_mod_seq, 0); + + spin_lock_init(&cur_trans->delayed_refs.lock); + + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + INIT_LIST_HEAD(&cur_trans->ordered_operations); + INIT_LIST_HEAD(&cur_trans->pending_chunks); + INIT_LIST_HEAD(&cur_trans->switch_commits); + list_add_tail(&cur_trans->list, &fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + fs_info->btree_inode->i_mapping); + fs_info->generation++; + cur_trans->transid = fs_info->generation; + fs_info->running_transaction = cur_trans; + cur_trans->aborted = 0; + spin_unlock(&fs_info->trans_lock); + return 0; } @@ -100,39 +239,93 @@ static noinline int join_transaction(struct btrfs_root *root) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - if (root->ref_cows && root->last_trans < trans->transid) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); + /* + * see below for IN_TRANS_SETUP usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); + + /* make sure readers find IN_TRANS_SETUP before + * they find our root->last_trans update + */ + smp_wmb(); + + spin_lock(&root->fs_info->fs_roots_radix_lock); + if (root->last_trans == trans->transid) { + spin_unlock(&root->fs_info->fs_roots_radix_lock); + return 0; + } radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + spin_unlock(&root->fs_info->fs_roots_radix_lock); root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root IN_TRANS_SETUP. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ btrfs_init_reloc_root(trans, root); + smp_mb__before_atomic(); + clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } return 0; } + int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; - mutex_lock(&root->fs_info->trans_mutex); - if (root->last_trans == trans->transid) { - mutex_unlock(&root->fs_info->trans_mutex); + /* + * see record_root_in_trans for comments about IN_TRANS_SETUP usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; - } + mutex_lock(&root->fs_info->reloc_mutex); record_root_in_trans(trans, root); - mutex_unlock(&root->fs_info->trans_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + return 0; } +static inline int is_transaction_blocked(struct btrfs_transaction *trans) +{ + return (trans->state >= TRANS_STATE_BLOCKED && + trans->state < TRANS_STATE_UNBLOCKED && + !trans->aborted); +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -141,198 +334,464 @@ static void wait_current_trans(struct btrfs_root *root) { struct btrfs_transaction *cur_trans; + spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; - if (cur_trans && cur_trans->blocked) { - DEFINE_WAIT(wait); - cur_trans->use_count++; - while (1) { - prepare_to_wait(&root->fs_info->transaction_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (!cur_trans->blocked) - break; - mutex_unlock(&root->fs_info->trans_mutex); - schedule(); - mutex_lock(&root->fs_info->trans_mutex); - } - finish_wait(&root->fs_info->transaction_wait, &wait); - put_transaction(cur_trans); + if (cur_trans && is_transaction_blocked(cur_trans)) { + atomic_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + wait_event(root->fs_info->transaction_wait, + cur_trans->state >= TRANS_STATE_UNBLOCKED || + cur_trans->aborted); + btrfs_put_transaction(cur_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); } } -enum btrfs_trans_type { - TRANS_START, - TRANS_JOIN, - TRANS_USERSPACE, -}; +static int may_wait_transaction(struct btrfs_root *root, int type) +{ + if (root->fs_info->log_root_recovering) + return 0; + + if (type == TRANS_USERSPACE) + return 1; -static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - int num_blocks, int type) + if (type == TRANS_START && + !atomic_read(&root->fs_info->open_ioctl_trans)) + return 1; + + return 0; +} + +static inline bool need_reserve_reloc_root(struct btrfs_root *root) { - struct btrfs_trans_handle *h = - kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + if (!root->fs_info->reloc_ctl || + !test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + root->reloc_root) + return false; + + return true; +} + +static struct btrfs_trans_handle * +start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_trans_handle *h; + struct btrfs_transaction *cur_trans; + u64 num_bytes = 0; + u64 qgroup_reserved = 0; + bool reloc_reserved = false; int ret; - mutex_lock(&root->fs_info->trans_mutex); - if (!root->fs_info->log_root_recovering && - ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || - type == TRANS_USERSPACE)) + /* Send isn't supposed to start transactions. */ + ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + return ERR_PTR(-EROFS); + + if (current->journal_info) { + WARN_ON(type & TRANS_EXTWRITERS); + h = current->journal_info; + h->use_count++; + WARN_ON(h->use_count > 2); + h->orig_rsv = h->block_rsv; + h->block_rsv = NULL; + goto got_it; + } + + /* + * Do the reservation before we join the transaction so we can do all + * the appropriate flushing if need be. + */ + if (num_items > 0 && root != root->fs_info->chunk_root) { + if (root->fs_info->quota_enabled && + is_fstree(root->root_key.objectid)) { + qgroup_reserved = num_items * root->leafsize; + ret = btrfs_qgroup_reserve(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); + } + + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + /* + * Do the reservation for the relocation root creation + */ + if (unlikely(need_reserve_reloc_root(root))) { + num_bytes += root->nodesize; + reloc_reserved = true; + } + + ret = btrfs_block_rsv_add(root, + &root->fs_info->trans_block_rsv, + num_bytes, flush); + if (ret) + goto reserve_fail; + } +again: + h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + if (!h) { + ret = -ENOMEM; + goto alloc_fail; + } + + /* + * If we are JOIN_NOLOCK we're already committing a transaction and + * waiting on this guy, so we don't need to do the sb_start_intwrite + * because we're already holding a ref. We need this because we could + * have raced in and did an fsync() on a file which can kick a commit + * and then we deadlock with somebody doing a freeze. + * + * If we are ATTACH, it means we just want to catch the current + * transaction and commit it, so we needn't do sb_start_intwrite(). + */ + if (type & __TRANS_FREEZABLE) + sb_start_intwrite(root->fs_info->sb); + + if (may_wait_transaction(root, type)) wait_current_trans(root); - ret = join_transaction(root); - BUG_ON(ret); - h->transid = root->fs_info->running_transaction->transid; - h->transaction = root->fs_info->running_transaction; - h->blocks_reserved = num_blocks; + do { + ret = join_transaction(root, type); + if (ret == -EBUSY) { + wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } + } while (ret == -EBUSY); + + if (ret < 0) { + /* We must get the transaction if we are JOIN_NOLOCK. */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + goto join_fail; + } + + cur_trans = root->fs_info->running_transaction; + + h->transid = cur_trans->transid; + h->transaction = cur_trans; h->blocks_used = 0; - h->block_group = 0; - h->alloc_exclude_nr = 0; - h->alloc_exclude_start = 0; + h->bytes_reserved = 0; + h->root = root; h->delayed_ref_updates = 0; + h->use_count = 1; + h->adding_csums = 0; + h->block_rsv = NULL; + h->orig_rsv = NULL; + h->aborted = 0; + h->qgroup_reserved = 0; + h->delayed_ref_elem.seq = 0; + h->type = type; + h->allocating_chunk = false; + h->reloc_reserved = false; + h->sync = false; + INIT_LIST_HEAD(&h->qgroup_ref_list); + INIT_LIST_HEAD(&h->new_bgs); - if (!current->journal_info && type != TRANS_USERSPACE) + smp_mb(); + if (cur_trans->state >= TRANS_STATE_BLOCKED && + may_wait_transaction(root, type)) { current->journal_info = h; + btrfs_commit_transaction(h, root); + goto again; + } + + if (num_bytes) { + trace_btrfs_space_reservation(root->fs_info, "transaction", + h->transid, num_bytes, 1); + h->block_rsv = &root->fs_info->trans_block_rsv; + h->bytes_reserved = num_bytes; + h->reloc_reserved = reloc_reserved; + } + h->qgroup_reserved = qgroup_reserved; + +got_it: + btrfs_record_root_in_trans(h, root); - root->fs_info->running_transaction->use_count++; - record_root_in_trans(h, root); - mutex_unlock(&root->fs_info->trans_mutex); + if (!current->journal_info && type != TRANS_USERSPACE) + current->journal_info = h; return h; + +join_fail: + if (type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); + kmem_cache_free(btrfs_trans_handle_cachep, h); +alloc_fail: + if (num_bytes) + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + num_bytes); +reserve_fail: + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); + return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_blocks) + int num_items) { - return start_transaction(root, num_blocks, TRANS_START); + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_ALL); } -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks) + +struct btrfs_trans_handle *btrfs_start_transaction_lflush( + struct btrfs_root *root, int num_items) { - return start_transaction(root, num_blocks, TRANS_JOIN); + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_LIMIT); } -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks) +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { - return start_transaction(r, num_blocks, TRANS_USERSPACE); + return start_transaction(root, 0, TRANS_JOIN, 0); } -/* wait for a transaction commit to be fully complete */ -static noinline int wait_for_commit(struct btrfs_root *root, - struct btrfs_transaction *commit) +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { - DEFINE_WAIT(wait); - mutex_lock(&root->fs_info->trans_mutex); - while (!commit->commit_done) { - prepare_to_wait(&commit->commit_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (commit->commit_done) - break; - mutex_unlock(&root->fs_info->trans_mutex); - schedule(); - mutex_lock(&root->fs_info->trans_mutex); - } - mutex_unlock(&root->fs_info->trans_mutex); - finish_wait(&commit->commit_wait, &wait); - return 0; + return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0); +} + +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_USERSPACE, 0); } -#if 0 /* - * rate limit against the drop_snapshot code. This helps to slow down new - * operations if the drop_snapshot code isn't able to keep up. + * btrfs_attach_transaction() - catch the running transaction + * + * It is used when we want to commit the current the transaction, but + * don't want to start a new one. + * + * Note: If this function return -ENOENT, it just means there is no + * running transaction. But it is possible that the inactive transaction + * is still in the memory, not fully on disk. If you hope there is no + * inactive transaction in the fs when -ENOENT is returned, you should + * invoke + * btrfs_attach_transaction_barrier() */ -static void throttle_on_drops(struct btrfs_root *root) +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { - struct btrfs_fs_info *info = root->fs_info; - int harder_count = 0; - -harder: - if (atomic_read(&info->throttles)) { - DEFINE_WAIT(wait); - int thr; - thr = atomic_read(&info->throttle_gen); - - do { - prepare_to_wait(&info->transaction_throttle, - &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&info->throttles)) { - finish_wait(&info->transaction_throttle, &wait); - break; - } - schedule(); - finish_wait(&info->transaction_throttle, &wait); - } while (thr == atomic_read(&info->throttle_gen)); - harder_count++; + return start_transaction(root, 0, TRANS_ATTACH, 0); +} + +/* + * btrfs_attach_transaction_barrier() - catch the running transaction + * + * It is similar to the above function, the differentia is this one + * will wait for all the inactive transactions until they fully + * complete. + */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; - if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 && - harder_count < 2) - goto harder; + trans = start_transaction(root, 0, TRANS_ATTACH, 0); + if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) + btrfs_wait_for_commit(root, 0); - if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 && - harder_count < 10) - goto harder; + return trans; +} + +/* wait for a transaction commit to be fully complete */ +static noinline void wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); +} + +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) +{ + struct btrfs_transaction *cur_trans = NULL, *t; + int ret = 0; - if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 && - harder_count < 20) - goto harder; + if (transid) { + if (transid <= root->fs_info->last_trans_committed) + goto out; + + ret = -EINVAL; + /* find specified transaction */ + spin_lock(&root->fs_info->trans_lock); + list_for_each_entry(t, &root->fs_info->trans_list, list) { + if (t->transid == transid) { + cur_trans = t; + atomic_inc(&cur_trans->use_count); + ret = 0; + break; + } + if (t->transid > transid) { + ret = 0; + break; + } + } + spin_unlock(&root->fs_info->trans_lock); + /* The specified transaction doesn't exist */ + if (!cur_trans) + goto out; + } else { + /* find newest transaction that is committing | committed */ + spin_lock(&root->fs_info->trans_lock); + list_for_each_entry_reverse(t, &root->fs_info->trans_list, + list) { + if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state == TRANS_STATE_COMPLETED) + break; + cur_trans = t; + atomic_inc(&cur_trans->use_count); + break; + } + } + spin_unlock(&root->fs_info->trans_lock); + if (!cur_trans) + goto out; /* nothing committing|committed */ } + + wait_for_commit(root, cur_trans); + btrfs_put_transaction(cur_trans); +out: + return ret; } -#endif void btrfs_throttle(struct btrfs_root *root) { - mutex_lock(&root->fs_info->trans_mutex); - if (!root->fs_info->open_ioctl_trans) + if (!atomic_read(&root->fs_info->open_ioctl_trans)) wait_current_trans(root); - mutex_unlock(&root->fs_info->trans_mutex); +} + +static int should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (root->fs_info->global_block_rsv.space_info->full && + btrfs_check_space_for_delayed_refs(trans, root)) + return 1; + + return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); +} + +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + int updates; + int err; + + smp_mb(); + if (cur_trans->state >= TRANS_STATE_BLOCKED || + cur_trans->delayed_refs.flushing) + return 1; + + updates = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (updates) { + err = btrfs_run_delayed_refs(trans, root, updates); + if (err) /* Error code will also eval true */ + return err; + } + + return should_end_transaction(trans, root); } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, int throttle) { - struct btrfs_transaction *cur_trans; + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; - int count = 0; - - while (count < 4) { - unsigned long cur = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (cur && - trans->transaction->delayed_refs.num_heads_ready > 64) { - trans->delayed_ref_updates = 0; - - /* - * do a full flush if the transaction is trying - * to close - */ - if (trans->transaction->delayed_refs.flushing) - cur = 0; - btrfs_run_delayed_refs(trans, root, cur); - } else { - break; - } - count++; + unsigned long cur = trans->delayed_ref_updates; + int lock = (trans->type != TRANS_JOIN_NOLOCK); + int err = 0; + int must_run_delayed_refs = 0; + + if (trans->use_count > 1) { + trans->use_count--; + trans->block_rsv = trans->orig_rsv; + return 0; } - mutex_lock(&info->trans_mutex); - cur_trans = info->running_transaction; - WARN_ON(cur_trans != trans->transaction); - WARN_ON(cur_trans->num_writers < 1); - cur_trans->num_writers--; + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + trans->delayed_ref_updates = 0; + if (!trans->sync) { + must_run_delayed_refs = + btrfs_should_throttle_delayed_refs(trans, root); + cur = max_t(unsigned long, cur, 32); + + /* + * don't make the caller wait if they are from a NOLOCK + * or ATTACH transaction, it will deadlock with commit + */ + if (must_run_delayed_refs == 1 && + (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) + must_run_delayed_refs = 2; + } + + if (trans->qgroup_reserved) { + /* + * the same root has to be passed here between start_transaction + * and end_transaction. Subvolume quota depends on this. + */ + btrfs_qgroup_free(trans->root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } + + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && + should_end_transaction(trans, root) && + ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { + spin_lock(&info->trans_lock); + if (cur_trans->state == TRANS_STATE_RUNNING) + cur_trans->state = TRANS_STATE_BLOCKED; + spin_unlock(&info->trans_lock); + } + + if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { + if (throttle) + return btrfs_commit_transaction(trans, root); + else + wake_up_process(info->transaction_kthread); + } + + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); + WARN_ON(cur_trans != info->running_transaction); + WARN_ON(atomic_read(&cur_trans->num_writers) < 1); + atomic_dec(&cur_trans->num_writers); + extwriter_counter_dec(cur_trans, trans->type); + + smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); - put_transaction(cur_trans); - mutex_unlock(&info->trans_mutex); + btrfs_put_transaction(cur_trans); if (current->journal_info == trans) current->journal_info = NULL; - memset(trans, 0, sizeof(*trans)); - kmem_cache_free(btrfs_trans_handle_cachep, trans); if (throttle) btrfs_run_delayed_iputs(root); - return 0; + if (trans->aborted || + test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + wake_up_process(info->transaction_kthread); + err = -EIO; + } + assert_qgroups_uptodate(trans); + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (must_run_delayed_refs) { + btrfs_async_run_delayed_refs(root, cur, + must_run_delayed_refs == 1); + } + return err; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -355,50 +814,23 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; u64 start = 0; u64 end; - unsigned long index; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - while (start <= end) { - cond_resched(); - - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - - btree_lock_page_hook(page); - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - continue; - } - - if (PageWriteback(page)) { - if (PageDirty(page)) - wait_on_page_writeback(page); - else { - unlock_page(page); - page_cache_release(page); - continue; - } - } - err = write_one_page(page, 0); - if (err) - werr = err; - page_cache_release(page); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { + convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, + mark, &cached_state, GFP_NOFS); + cached_state = NULL; + err = filemap_fdatawrite_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -414,39 +846,22 @@ int btrfs_write_marked_extents(struct btrfs_root *root, int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; u64 start = 0; u64 end; - unsigned long index; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - if (PageDirty(page)) { - btree_lock_page_hook(page); - wait_on_page_writeback(page); - err = write_one_page(page, 0); - if (err) - werr = err; - } - wait_on_page_writeback(page); - page_cache_release(page); - cond_resched(); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { + clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, + 0, 0, &cached_state, GFP_NOFS); + err = filemap_fdatawait_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -458,15 +873,23 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, * them in one of two extent_io trees. This is used to make sure all of * those extents are on disk for transaction or log commit */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { int ret; int ret2; + struct blk_plug plug; + blk_start_plug(&plug); ret = btrfs_write_marked_extents(root, dirty_pages, mark); + blk_finish_plug(&plug); ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); - return ret || ret2; + + if (ret) + return ret; + if (ret2) + return ret2; + return 0; } int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, @@ -513,21 +936,24 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); - BUG_ON(ret); + if (ret) + return ret; old_root_used = btrfs_root_used(&root->root_item); ret = btrfs_write_dirty_block_groups(trans, root); - BUG_ON(ret); + if (ret) + return ret; } - if (root != root->fs_info->extent_root) - switch_commit_root(root); - return 0; } /* * update all the cowonly tree roots on disk + * + * The error handling in this function may not be obvious. Any of the + * failures will cause the file system to go offline. We still need + * to clean up the delayed refs. */ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -538,27 +964,53 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) + return ret; eb = btrfs_lock_root_node(fs_info->tree_root); - btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, + 0, &eb); btrfs_tree_unlock(eb); free_extent_buffer(eb); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) + return ret; + + ret = btrfs_run_dev_stats(trans, root->fs_info); + if (ret) + return ret; + ret = btrfs_run_dev_replace(trans, root->fs_info); + if (ret) + return ret; + ret = btrfs_run_qgroups(trans, root->fs_info); + if (ret) + return ret; + + /* run_qgroups might have added some more refs */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); - update_cowonly_root(trans, root); + if (root != fs_info->extent_root) + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + ret = update_cowonly_root(trans, root); + if (ret) + return ret; } - down_write(&fs_info->extent_commit_sem); - switch_commit_root(fs_info->extent_root); - up_write(&fs_info->extent_commit_sem); + list_add_tail(&fs_info->extent_root->dirty_list, + &trans->transaction->switch_commits); + btrfs_after_dev_replace_commit(fs_info); return 0; } @@ -568,12 +1020,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root) +void btrfs_add_dead_root(struct btrfs_root *root) { - mutex_lock(&root->fs_info->trans_mutex); - list_add(&root->root_list, &root->fs_info->dead_roots); - mutex_unlock(&root->fs_info->trans_mutex); - return 0; + spin_lock(&root->fs_info->trans_lock); + if (list_empty(&root->root_list)) + list_add_tail(&root->root_list, &root->fs_info->dead_roots); + spin_unlock(&root->fs_info->trans_lock); } /* @@ -588,6 +1040,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, int ret; int err = 0; + spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, (void **)gang, 0, @@ -600,12 +1053,21 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); + btrfs_orphan_commit_root(trans, root); + + btrfs_save_ino_cache(root, trans); + + /* see comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); if (root->commit_root != root->node) { - switch_commit_root(root); + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); btrfs_set_root_node(&root->root_item, root->node); } @@ -613,140 +1075,60 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, err = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; } } + spin_unlock(&fs_info->fs_roots_radix_lock); return err; } /* - * defrag a given btree. If cacheonly == 1, this won't read from the disk, - * otherwise every leaf in the btree is read and defragged. + * defrag a given btree. + * Every leaf in the btree is read and defragged. */ -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +int btrfs_defrag_root(struct btrfs_root *root) { struct btrfs_fs_info *info = root->fs_info; - int ret; struct btrfs_trans_handle *trans; - unsigned long nr; + int ret; - smp_mb(); - if (root->defrag_running) + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) return 0; - trans = btrfs_start_transaction(root, 1); - while (1) { - root->defrag_running = 1; - ret = btrfs_defrag_leaves(trans, root, cacheonly); - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(info->tree_root, nr); - cond_resched(); - - trans = btrfs_start_transaction(root, 1); - if (root->fs_info->closing || ret != -EAGAIN) - break; - } - root->defrag_running = 0; - smp_mb(); - btrfs_end_transaction(trans, root); - return 0; -} - -#if 0 -/* - * when dropping snapshots, we generate a ton of delayed refs, and it makes - * sense not to join the transaction while it is trying to flush the current - * queue of delayed refs out. - * - * This is used by the drop snapshot code only - */ -static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) -{ - DEFINE_WAIT(wait); - - mutex_lock(&info->trans_mutex); - while (info->running_transaction && - info->running_transaction->delayed_refs.flushing) { - prepare_to_wait(&info->transaction_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&info->trans_mutex); - - schedule(); - - mutex_lock(&info->trans_mutex); - finish_wait(&info->transaction_wait, &wait); - } - mutex_unlock(&info->trans_mutex); - return 0; -} - -/* - * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on - * all of them - */ -int btrfs_drop_dead_root(struct btrfs_root *root) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = root->fs_info->tree_root; - unsigned long nr; - int ret; while (1) { - /* - * we don't want to jump in and create a bunch of - * delayed refs if the transaction is starting to close - */ - wait_transaction_pre_flush(tree_root->fs_info); - trans = btrfs_start_transaction(tree_root, 1); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); - /* - * we've joined a transaction, make sure it isn't - * closing right now - */ - if (trans->transaction->delayed_refs.flushing) { - btrfs_end_transaction(trans, tree_root); - continue; - } + ret = btrfs_defrag_leaves(trans, root); - ret = btrfs_drop_snapshot(trans, root); - if (ret != -EAGAIN) - break; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(info->tree_root); + cond_resched(); - ret = btrfs_update_root(trans, tree_root, - &root->root_key, - &root->root_item); - if (ret) + if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) break; - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, tree_root); - BUG_ON(ret); - - btrfs_btree_balance_dirty(tree_root, nr); - cond_resched(); + if (btrfs_defrag_cancelled(root->fs_info)) { + pr_debug("BTRFS: defrag_root cancelled\n"); + ret = -EAGAIN; + break; + } } - BUG_ON(ret); - - ret = btrfs_del_root(trans, tree_root, &root->root_key); - BUG_ON(ret); - - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, tree_root); - BUG_ON(ret); - - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root); - - btrfs_btree_balance_dirty(tree_root, nr); + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); return ret; } -#endif /* * new snapshots need to be created at a very specific time in the - * transaction commit. This does the actual creation + * transaction commit. This does the actual creation. + * + * Note: + * If the error which may affect the commitment of the current transaction + * happens, we should return the error number. If the error which just affect + * the creation of the pending snapshots, just return 0. */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -757,79 +1139,253 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; + struct btrfs_block_rsv *rsv; struct inode *parent_inode; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; - int ret; - u64 objectid; - int namelen; + struct timespec cur_time = CURRENT_TIME; + int ret = 0; + u64 to_reserve = 0; u64 index = 0; + u64 objectid; + u64 root_flags; + uuid_le new_uuid; - parent_inode = pending->dentry->d_parent->d_inode; - parent_root = BTRFS_I(parent_inode)->root; + path = btrfs_alloc_path(); + if (!path) { + pending->error = -ENOMEM; + return 0; + } new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { - ret = -ENOMEM; - goto fail; + pending->error = -ENOMEM; + goto root_item_alloc_fail; + } + + pending->error = btrfs_find_free_objectid(tree_root, &objectid); + if (pending->error) + goto no_free_objectid; + + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + + if (to_reserve > 0) { + pending->error = btrfs_block_rsv_add(root, + &pending->block_rsv, + to_reserve, + BTRFS_RESERVE_NO_FLUSH); + if (pending->error) + goto no_free_objectid; } - ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); - if (ret) - goto fail; key.objectid = objectid; - /* record when the snapshot was created in key.offset */ - key.offset = trans->transid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; - memcpy(&pending->root_key, &key, sizeof(key)); - pending->root_key.offset = (u64)-1; + rsv = trans->block_rsv; + trans->block_rsv = &pending->block_rsv; + trans->bytes_reserved = trans->block_rsv->reserved; + dentry = pending->dentry; + parent_inode = pending->dir; + parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); + /* * insert the directory item */ - namelen = strlen(pending->name); ret = btrfs_set_inode_index(parent_inode, &index); - BUG_ON(ret); - ret = btrfs_insert_dir_item(trans, parent_root, - pending->name, namelen, - parent_inode->i_ino, - &pending->root_key, BTRFS_FT_DIR, index); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ + + /* check if there is a file/dir which has the same name. */ + dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, + btrfs_ino(parent_inode), + dentry->d_name.name, + dentry->d_name.len, 0); + if (dir_item != NULL && !IS_ERR(dir_item)) { + pending->error = -EEXIST; + goto dir_item_existed; + } else if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + btrfs_release_path(path); - btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); - ret = btrfs_update_inode(trans, parent_root, parent_inode); - BUG_ON(ret); + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans, root); + if (ret) { /* Transaction aborted */ + btrfs_abort_transaction(trans, root, ret); + goto fail; + } record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + btrfs_check_and_init_root_item(new_root_item); + + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); + + btrfs_set_root_generation_v2(new_root_item, + trans->transid); + uuid_le_gen(&new_uuid); + memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + memcpy(new_root_item->parent_uuid, root->root_item.uuid, + BTRFS_UUID_SIZE); + if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { + memset(new_root_item->received_uuid, 0, + sizeof(new_root_item->received_uuid)); + memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); + memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); + btrfs_set_root_stransid(new_root_item, 0); + btrfs_set_root_rtransid(new_root_item, 0); + } + btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); + btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); - btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + if (ret) { + btrfs_tree_unlock(old); + free_extent_buffer(old); + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + btrfs_set_lock_blocking(old); - btrfs_copy_root(trans, root, old, &tmp, objectid); + ret = btrfs_copy_root(trans, root, old, &tmp, objectid); + /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * We need to flush delayed refs in order to make sure all of our quota + * operations have been done before we call btrfs_qgroup_inherit. + */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_wmb(); btrfs_set_root_node(new_root_item, tmp); - ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - new_root_item); - BUG_ON(ret); + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; + ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } - ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, - pending->root_key.objectid, + /* + * insert root back/forward references + */ + ret = btrfs_add_root_ref(trans, tree_root, objectid, parent_root->root_key.objectid, - parent_inode->i_ino, index, pending->name, - namelen); - BUG_ON(ret); + btrfs_ino(parent_inode), index, + dentry->d_name.name, dentry->d_name.len); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + key.offset = (u64)-1; + pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); + if (IS_ERR(pending->snap)) { + ret = PTR_ERR(pending->snap); + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_reloc_post_snapshot(trans, pending); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_insert_dir_item(trans, parent_root, + dentry->d_name.name, dentry->d_name.len, + parent_inode, &key, + BTRFS_FT_DIR, index); + /* We have check then name at the beginning, so it is impossible. */ + BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + btrfs_i_size_write(parent_inode, parent_inode->i_size + + dentry->d_name.len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, + BTRFS_UUID_KEY_SUBVOL, objectid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + new_root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + objectid); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + } fail: + pending->error = ret; +dir_item_existed: + trans->block_rsv = rsv; + trans->bytes_reserved = 0; +no_free_objectid: kfree(new_root_item); +root_item_alloc_fail: + btrfs_free_path(path); return ret; } @@ -839,15 +1395,17 @@ fail: static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - struct btrfs_pending_snapshot *pending; + struct btrfs_pending_snapshot *pending, *next; struct list_head *head = &trans->transaction->pending_snapshots; - int ret; + int ret = 0; - list_for_each_entry(pending, head, list) { + list_for_each_entry_safe(pending, next, head, list) { + list_del(&pending->list); ret = create_pending_snapshot(trans, fs_info, pending); - BUG_ON(ret); + if (ret) + break; } - return 0; + return ret; } static void update_super_roots(struct btrfs_root *root) @@ -855,7 +1413,7 @@ static void update_super_roots(struct btrfs_root *root) struct btrfs_root_item *root_item; struct btrfs_super_block *super; - super = &root->fs_info->super_copy; + super = root->fs_info->super_copy; root_item = &root->fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; @@ -866,130 +1424,398 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; + if (btrfs_test_opt(root, SPACE_CACHE)) + super->cache_generation = root_item->generation; + if (root->fs_info->update_uuid_tree_gen) + super->uuid_tree_generation = root_item->generation; } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) { + struct btrfs_transaction *trans; + int ret = 0; + + spin_lock(&info->trans_lock); + trans = info->running_transaction; + if (trans) + ret = (trans->state >= TRANS_STATE_COMMIT_START); + spin_unlock(&info->trans_lock); + return ret; +} + +int btrfs_transaction_blocked(struct btrfs_fs_info *info) +{ + struct btrfs_transaction *trans; int ret = 0; - spin_lock(&info->new_trans_lock); - if (info->running_transaction) - ret = info->running_transaction->in_commit; - spin_unlock(&info->new_trans_lock); + + spin_lock(&info->trans_lock); + trans = info->running_transaction; + if (trans) + ret = is_transaction_blocked(trans); + spin_unlock(&info->trans_lock); return ret; } +/* + * wait for the current transaction commit to start and block subsequent + * transaction joins + */ +static void wait_current_trans_commit_start(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + wait_event(root->fs_info->transaction_blocked_wait, + trans->state >= TRANS_STATE_COMMIT_START || + trans->aborted); +} + +/* + * wait for the current transaction to start and then become unblocked. + * caller holds ref. + */ +static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + wait_event(root->fs_info->transaction_wait, + trans->state >= TRANS_STATE_UNBLOCKED || + trans->aborted); +} + +/* + * commit transactions asynchronously. once btrfs_commit_transaction_async + * returns, any subsequent transaction will not be allowed to join. + */ +struct btrfs_async_commit { + struct btrfs_trans_handle *newtrans; + struct btrfs_root *root; + struct work_struct work; +}; + +static void do_async_commit(struct work_struct *work) +{ + struct btrfs_async_commit *ac = + container_of(work, struct btrfs_async_commit, work); + + /* + * We've got freeze protection passed with the transaction. + * Tell lockdep about it. + */ + if (ac->newtrans->type & __TRANS_FREEZABLE) + rwsem_acquire_read( + &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + + current->journal_info = ac->newtrans; + + btrfs_commit_transaction(ac->newtrans, ac->root); + kfree(ac); +} + +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock) +{ + struct btrfs_async_commit *ac; + struct btrfs_transaction *cur_trans; + + ac = kmalloc(sizeof(*ac), GFP_NOFS); + if (!ac) + return -ENOMEM; + + INIT_WORK(&ac->work, do_async_commit); + ac->root = root; + ac->newtrans = btrfs_join_transaction(root); + if (IS_ERR(ac->newtrans)) { + int err = PTR_ERR(ac->newtrans); + kfree(ac); + return err; + } + + /* take transaction reference */ + cur_trans = trans->transaction; + atomic_inc(&cur_trans->use_count); + + btrfs_end_transaction(trans, root); + + /* + * Tell lockdep we've released the freeze rwsem, since the + * async commit thread will be the one to unlock it. + */ + if (ac->newtrans->type & __TRANS_FREEZABLE) + rwsem_release( + &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + + schedule_work(&ac->work); + + /* wait for transaction to start and unblock */ + if (wait_for_unblock) + wait_current_trans_commit_start_and_unblock(root, cur_trans); + else + wait_current_trans_commit_start(root, cur_trans); + + if (current->journal_info == trans) + current->journal_info = NULL; + + btrfs_put_transaction(cur_trans); + return 0; +} + + +static void cleanup_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int err) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + DEFINE_WAIT(wait); + + WARN_ON(trans->use_count > 1); + + btrfs_abort_transaction(trans, root, err); + + spin_lock(&root->fs_info->trans_lock); + + /* + * If the transaction is removed from the list, it means this + * transaction has been committed successfully, so it is impossible + * to call the cleanup function. + */ + BUG_ON(list_empty(&cur_trans->list)); + + list_del_init(&cur_trans->list); + if (cur_trans == root->fs_info->running_transaction) { + cur_trans->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + spin_lock(&root->fs_info->trans_lock); + } + spin_unlock(&root->fs_info->trans_lock); + + btrfs_cleanup_one_transaction(trans->transaction, root); + + spin_lock(&root->fs_info->trans_lock); + if (cur_trans == root->fs_info->running_transaction) + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->trans_lock); + + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); + + trace_btrfs_transaction_commit(root); + + if (current->journal_info == trans) + current->journal_info = NULL; + btrfs_scrub_cancel(root->fs_info); + + kmem_cache_free(btrfs_trans_handle_cachep, trans); +} + +static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + ret = btrfs_run_delayed_items(trans, root); + if (ret) + return ret; + + /* + * rename don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and no for sure that nothing new will be added + * to the list + */ + ret = btrfs_run_ordered_operations(trans, root, 1); + + return ret; +} + +static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + return btrfs_start_delalloc_roots(fs_info, 1, -1); + return 0; +} + +static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + btrfs_wait_ordered_roots(fs_info, -1); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - unsigned long joined = 0; - unsigned long timeout = 1; - struct btrfs_transaction *cur_trans; + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; - DEFINE_WAIT(wait); int ret; - int should_grow = 0; - unsigned long now = get_seconds(); - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - btrfs_run_ordered_operations(root, 0); + ret = btrfs_run_ordered_operations(trans, root, 0); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + return ret; + } + + /* Stop the commit early if ->aborted is set */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + btrfs_end_transaction(trans, root); + return ret; + } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } cur_trans = trans->transaction; + /* * set the flushing flag so procs in this transaction have to * start sending their work down. */ cur_trans->delayed_refs.flushing = 1; + smp_wmb(); - ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); - mutex_lock(&root->fs_info->trans_mutex); - if (cur_trans->in_commit) { - cur_trans->use_count++; - mutex_unlock(&root->fs_info->trans_mutex); + ret = btrfs_run_delayed_refs(trans, root, 0); + if (ret) { btrfs_end_transaction(trans, root); + return ret; + } - ret = wait_for_commit(root, cur_trans); - BUG_ON(ret); + spin_lock(&root->fs_info->trans_lock); + if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + spin_unlock(&root->fs_info->trans_lock); + atomic_inc(&cur_trans->use_count); + ret = btrfs_end_transaction(trans, root); - mutex_lock(&root->fs_info->trans_mutex); - put_transaction(cur_trans); - mutex_unlock(&root->fs_info->trans_mutex); + wait_for_commit(root, cur_trans); - return 0; + btrfs_put_transaction(cur_trans); + + return ret; } - trans->transaction->in_commit = 1; - trans->transaction->blocked = 1; + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&root->fs_info->transaction_blocked_wait); + if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); - if (!prev_trans->commit_done) { - prev_trans->use_count++; - mutex_unlock(&root->fs_info->trans_mutex); + if (prev_trans->state != TRANS_STATE_COMPLETED) { + atomic_inc(&prev_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); wait_for_commit(root, prev_trans); - mutex_lock(&root->fs_info->trans_mutex); - put_transaction(prev_trans); + btrfs_put_transaction(prev_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); } + } else { + spin_unlock(&root->fs_info->trans_lock); } - if (now < cur_trans->start_time || now - cur_trans->start_time < 1) - should_grow = 1; + extwriter_counter_dec(cur_trans, trans->type); - do { - int snap_pending = 0; - joined = cur_trans->num_joined; - if (!list_empty(&trans->transaction->pending_snapshots)) - snap_pending = 1; - - WARN_ON(cur_trans != trans->transaction); - prepare_to_wait(&cur_trans->writer_wait, &wait, - TASK_UNINTERRUPTIBLE); - - if (cur_trans->num_writers > 1) - timeout = MAX_SCHEDULE_TIMEOUT; - else if (should_grow) - timeout = 1; - - mutex_unlock(&root->fs_info->trans_mutex); - - if (flush_on_commit || snap_pending) { - btrfs_start_delalloc_inodes(root, 1); - ret = btrfs_wait_ordered_extents(root, 0, 1); - BUG_ON(ret); - } + ret = btrfs_start_delalloc_flush(root->fs_info); + if (ret) + goto cleanup_transaction; - /* - * rename don't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and no for sure that nothing new will be added - * to the list - */ - btrfs_run_ordered_operations(root, 1); + ret = btrfs_flush_all_pending_stuffs(trans, root); + if (ret) + goto cleanup_transaction; - smp_mb(); - if (cur_trans->num_writers > 1 || should_grow) - schedule_timeout(timeout); + wait_event(cur_trans->writer_wait, + extwriter_counter_read(cur_trans) == 0); - mutex_lock(&root->fs_info->trans_mutex); - finish_wait(&cur_trans->writer_wait, &wait); - } while (cur_trans->num_writers > 1 || - (should_grow && cur_trans->num_joined != joined)); + /* some pending stuffs might be added after the previous flush. */ + ret = btrfs_flush_all_pending_stuffs(trans, root); + if (ret) + goto cleanup_transaction; + + btrfs_wait_delalloc_flush(root->fs_info); + + btrfs_scrub_pause(root); + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * COMMIT_DOING so make sure to wait for num_writers to == 1 again. + */ + spin_lock(&root->fs_info->trans_lock); + cur_trans->state = TRANS_STATE_COMMIT_DOING; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + /* ->aborted might be set after the previous check, so check it */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + goto scrub_continue; + } + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&root->fs_info->reloc_mutex); + /* + * We needn't worry about the delayed items because we will + * deal with them in create_pending_snapshot(), which is the + * core function of the snapshot creation. + */ ret = create_pending_snapshots(trans, root->fs_info); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + /* + * We insert the dir indexes of the snapshots and update the inode + * of the snapshots' parents after the snapshot creation, so there + * are some delayed items which are not dealt with. Now deal with + * them. + * + * We needn't worry that this operation will corrupt the snapshots, + * because all the tree which are snapshoted will be forced to COW + * the nodes and leaves. + */ + ret = btrfs_run_delayed_items(trans, root); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(root); WARN_ON(cur_trans != trans->transaction); @@ -1009,7 +1835,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->tree_log_mutex); ret = commit_fs_roots(trans, root); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + /* + * Since the transaction is done, we should set the inode map cache flag + * before any other comming transaction. + */ + if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) + btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + else + btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots @@ -1017,41 +1856,68 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_free_log_root_tree(trans, root->fs_info); ret = commit_cowonly_roots(trans, root); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; - spin_lock(&root->fs_info->new_trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->new_trans_lock); btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); - switch_commit_root(root->fs_info->tree_root); + list_add_tail(&root->fs_info->tree_root->dirty_list, + &cur_trans->switch_commits); btrfs_set_root_node(&root->fs_info->chunk_root->root_item, root->fs_info->chunk_root->node); - switch_commit_root(root->fs_info->chunk_root); + list_add_tail(&root->fs_info->chunk_root->dirty_list, + &cur_trans->switch_commits); - update_super_roots(root); + switch_commit_roots(cur_trans, root->fs_info); - if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(&root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); - } + assert_qgroups_uptodate(trans); + update_super_roots(root); - memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, - sizeof(root->fs_info->super_copy)); + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); + memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, + sizeof(*root->fs_info->super_copy)); - trans->transaction->blocked = 0; + spin_lock(&root->fs_info->trans_lock); + cur_trans->state = TRANS_STATE_UNBLOCKED; + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); wake_up(&root->fs_info->transaction_wait); - mutex_unlock(&root->fs_info->trans_mutex); ret = btrfs_write_and_wait_transaction(trans, root); - BUG_ON(ret); - write_ctree_super(trans, root, 0); + if (ret) { + btrfs_error(root->fs_info, ret, + "Error while writing out transaction"); + mutex_unlock(&root->fs_info->tree_log_mutex); + goto scrub_continue; + } + + ret = write_ctree_super(trans, root, 0); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + goto scrub_continue; + } /* * the super is written, we can safely allow the tree-loggers @@ -1061,18 +1927,27 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->last_trans_committed = cur_trans->transid; + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_COMPLETED; + wake_up(&cur_trans->commit_wait); - cur_trans->commit_done = 1; + spin_lock(&root->fs_info->trans_lock); + list_del_init(&cur_trans->list); + spin_unlock(&root->fs_info->trans_lock); - root->fs_info->last_trans_committed = cur_trans->transid; + btrfs_put_transaction(cur_trans); + btrfs_put_transaction(cur_trans); - wake_up(&cur_trans->commit_wait); + if (trans->type & __TRANS_FREEZABLE) + sb_end_intwrite(root->fs_info->sb); - put_transaction(cur_trans); - put_transaction(cur_trans); + trace_btrfs_transaction_commit(root); - mutex_unlock(&root->fs_info->trans_mutex); + btrfs_scrub_continue(root); if (current->journal_info == trans) current->journal_info = NULL; @@ -1083,29 +1958,61 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_delayed_iputs(root); return ret; + +scrub_continue: + btrfs_scrub_continue(root); +cleanup_transaction: + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } + btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); + if (current->journal_info == trans) + current->journal_info = NULL; + cleanup_transaction(trans, root, ret); + + return ret; } /* - * interface function to delete all the snapshots we have scheduled for deletion + * return < 0 if error + * 0 if there are no more dead_roots at the time of call + * 1 there are more to be processed, call me again + * + * The return value indicates there are certainly more snapshots to delete, but + * if there comes a new one during processing, it may return 0. We don't mind, + * because btrfs_commit_super will poke cleaner thread and it will process it a + * few seconds later. */ -int btrfs_clean_old_snapshots(struct btrfs_root *root) +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) { - LIST_HEAD(list); + int ret; struct btrfs_fs_info *fs_info = root->fs_info; - mutex_lock(&fs_info->trans_mutex); - list_splice_init(&fs_info->dead_roots, &list); - mutex_unlock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); + if (list_empty(&fs_info->dead_roots)) { + spin_unlock(&fs_info->trans_lock); + return 0; + } + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + spin_unlock(&fs_info->trans_lock); - while (!list_empty(&list)) { - root = list_entry(list.next, struct btrfs_root, root_list); - list_del(&root->root_list); + pr_debug("BTRFS: cleaner removing %llu\n", root->objectid); - if (btrfs_header_backref_rev(root->node) < - BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, 0); - else - btrfs_drop_snapshot(root, 1); - } - return 0; + btrfs_kill_all_delayed_nodes(root); + + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + ret = btrfs_drop_snapshot(root, NULL, 0, 0); + else + ret = btrfs_drop_snapshot(root, NULL, 1, 0); + /* + * If we encounter a transaction abort during snapshot cleaning, we + * don't want to crash here + */ + return (ret < 0) ? 0 : 1; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 93c7ccb3311..7dd558ed071 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -20,97 +20,155 @@ #define __BTRFS_TRANSACTION__ #include "btrfs_inode.h" #include "delayed-ref.h" +#include "ctree.h" + +enum btrfs_trans_state { + TRANS_STATE_RUNNING = 0, + TRANS_STATE_BLOCKED = 1, + TRANS_STATE_COMMIT_START = 2, + TRANS_STATE_COMMIT_DOING = 3, + TRANS_STATE_UNBLOCKED = 4, + TRANS_STATE_COMPLETED = 5, + TRANS_STATE_MAX = 6, +}; struct btrfs_transaction { u64 transid; /* + * total external writers(USERSPACE/START/ATTACH) in this + * transaction, it must be zero before the transaction is + * being committed + */ + atomic_t num_extwriters; + /* * total writers in this transaction, it must be zero before the * transaction can end */ - unsigned long num_writers; + atomic_t num_writers; + atomic_t use_count; - unsigned long num_joined; - int in_commit; - int use_count; - int commit_done; - int blocked; + /* Be protected by fs_info->trans_lock when we want to change it. */ + enum btrfs_trans_state state; struct list_head list; struct extent_io_tree dirty_pages; unsigned long start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; + struct list_head ordered_operations; + struct list_head pending_chunks; + struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; + int aborted; }; +#define __TRANS_FREEZABLE (1U << 0) + +#define __TRANS_USERSPACE (1U << 8) +#define __TRANS_START (1U << 9) +#define __TRANS_ATTACH (1U << 10) +#define __TRANS_JOIN (1U << 11) +#define __TRANS_JOIN_NOLOCK (1U << 12) +#define __TRANS_DUMMY (1U << 13) + +#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) +#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) +#define TRANS_ATTACH (__TRANS_ATTACH) +#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) +#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) + +#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ + __TRANS_ATTACH) + +#define BTRFS_SEND_TRANS_STUB 1 + struct btrfs_trans_handle { u64 transid; + u64 bytes_reserved; + u64 qgroup_reserved; + unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; - struct btrfs_transaction *transaction; - u64 block_group; - u64 alloc_exclude_start; - u64 alloc_exclude_nr; unsigned long delayed_ref_updates; + struct btrfs_transaction *transaction; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *orig_rsv; + short aborted; + short adding_csums; + bool allocating_chunk; + bool reloc_reserved; + bool sync; + unsigned int type; + /* + * this root is only needed to validate that the root passed to + * start_transaction is the same as the one passed to end_transaction. + * Subvolume quota depends on this + */ + struct btrfs_root *root; + struct seq_list delayed_ref_elem; + struct list_head qgroup_ref_list; + struct list_head new_bgs; }; struct btrfs_pending_snapshot { struct dentry *dentry; + struct inode *dir; struct btrfs_root *root; - char *name; - struct btrfs_key root_key; + struct btrfs_root *snap; + struct btrfs_qgroup_inherit *inherit; + /* block reservation for the operation */ + struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; + /* extra metadata reseration for relocation */ + int error; + bool readonly; struct list_head list; }; -static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - trans->block_group = BTRFS_I(inode)->block_group; -} - -static inline void btrfs_update_inode_block_group( - struct btrfs_trans_handle *trans, - struct inode *inode) -{ - BTRFS_I(inode)->block_group = trans->block_group; -} - static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { BTRFS_I(inode)->last_trans = trans->transaction->transid; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_blocks); -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks); -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks); + int num_items); +struct btrfs_trans_handle *btrfs_start_transaction_lflush( + struct btrfs_root *root, int num_items); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction_barrier( + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_drop_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); -int btrfs_clean_old_snapshots(struct btrfs_root *root); +void btrfs_add_dead_root(struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root); +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark); int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); +int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +void btrfs_put_transaction(struct btrfs_transaction *transaction); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b10eacdb162..a63719cc957 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -23,28 +23,24 @@ #include "transaction.h" #include "locking.h" -/* defrag all the leaves in a given btree. If cache_only == 1, don't read - * things from disk, otherwise read all the leaves and try to get key order to +/* + * Defrag all the leaves in a given btree. + * Read all the leaves and try to get key order to * better reflect disk order */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only) + struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; int ret = 0; int wret; int level; - int orig_level; - int is_extent = 0; int next_key_ret = 0; u64 last_ret = 0; u64 min_trans = 0; - if (cache_only) - goto out; - if (root->fs_info->extent_root == root) { /* * there's recursion here right now in the tree locking, @@ -53,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } - if (root->ref_cows == 0 && !is_extent) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; if (btrfs_test_opt(root, SSD)) @@ -64,7 +60,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, return -ENOMEM; level = btrfs_header_level(root->node); - orig_level = level; if (level == 0) goto out; @@ -88,18 +83,15 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } path->keep_locks = 1; - if (cache_only) - min_trans = root->defrag_trans_start; - ret = btrfs_search_forward(root, &key, NULL, path, - cache_only, min_trans); + ret = btrfs_search_forward(root, &key, path, min_trans); if (ret < 0) goto out; if (ret > 0) { ret = 0; goto out; } - btrfs_release_path(root, path); + btrfs_release_path(path); wret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (wret < 0) { @@ -111,19 +103,20 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + next_key_ret = btrfs_find_next_key(root, path, &key, 1, min_trans); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, - cache_only, &last_ret, + &last_ret, &root->defrag_progress); - WARN_ON(ret && ret != -EAGAIN); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; } - - btrfs_release_path(root, path); out: if (path) btrfs_free_path(path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index af57dd2b43d..9e1f2cd5e67 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,13 +18,14 @@ #include <linux/sched.h> #include <linux/slab.h> -#include "ctree.h" -#include "transaction.h" +#include <linux/blkdev.h> +#include <linux/list_sort.h> +#include "tree-log.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" -#include "compat.h" -#include "tree-log.h" +#include "backref.h" +#include "hash.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -89,7 +90,8 @@ */ #define LOG_WALK_PIN_ONLY 0 #define LOG_WALK_REPLAY_INODES 1 -#define LOG_WALK_REPLAY_ALL 2 +#define LOG_WALK_REPLAY_DIR_INDEX 2 +#define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, @@ -132,40 +134,61 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, * syncing the tree wait for us to finish */ static int start_log_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) { + int index; int ret; mutex_lock(&root->log_mutex); if (root->log_root) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { + ret = -EAGAIN; + goto out; + } if (!root->log_start_pid) { root->log_start_pid = current->pid; - root->log_multiple_pids = false; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } else if (root->log_start_pid != current->pid) { - root->log_multiple_pids = true; + set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } - root->log_batch++; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } mutex_unlock(&root->log_mutex); return 0; } - root->log_multiple_pids = false; - root->log_start_pid = current->pid; + + ret = 0; mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) { + if (!root->fs_info->log_root_tree) ret = btrfs_init_log_root_tree(trans, root->fs_info); - BUG_ON(ret); - } + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; + if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); - BUG_ON(ret); + if (ret) + goto out; } - mutex_unlock(&root->fs_info->tree_log_mutex); - root->log_batch++; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } +out: mutex_unlock(&root->log_mutex); - return 0; + return ret; } /* @@ -209,14 +232,13 @@ int btrfs_pin_log_trans(struct btrfs_root *root) * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ -int btrfs_end_log_trans(struct btrfs_root *root) +void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { smp_mb(); if (waitqueue_active(&root->log_writer_wait)) wake_up(&root->log_writer_wait); } - return 0; } @@ -272,17 +294,31 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { + int ret = 0; + + /* + * If this fs is mixed then we need to be able to process the leaves to + * pin down any logged extents, so we have to read the block. + */ + if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) { + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; + } + if (wc->pin) - btrfs_pin_extent(log->fs_info->extent_root, - eb->start, eb->len, 0); + ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, + eb->start, eb->len); - if (btrfs_buffer_uptodate(eb, gen)) { + if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { + if (wc->pin && btrfs_header_level(eb) == 0) + ret = btrfs_exclude_logged_extents(log, eb); if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) btrfs_wait_tree_block_writeback(eb); } - return 0; + return ret; } /* @@ -312,6 +348,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, unsigned long src_ptr; unsigned long dst_ptr; int overwrite_root = 0; + bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) overwrite_root = 1; @@ -321,6 +358,9 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, /* look for the key in the destination tree */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + if (ret == 0) { char *src_copy; char *dst_copy; @@ -330,11 +370,17 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, goto insert; if (item_size == 0) { - btrfs_release_path(root, path); + btrfs_release_path(path); return 0; } dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); + if (!dst_copy || !src_copy) { + btrfs_release_path(path); + kfree(dst_copy); + kfree(src_copy); + return -ENOMEM; + } read_extent_buffer(eb, src_copy, src_ptr, item_size); @@ -352,13 +398,57 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, * sync */ if (ret == 0) { - btrfs_release_path(root, path); + btrfs_release_path(path); return 0; } + /* + * We need to load the old nbytes into the inode so when we + * replay the extents we've logged we get the right nbytes. + */ + if (inode_item) { + struct btrfs_inode_item *item; + u64 nbytes; + u32 mode; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + nbytes = btrfs_inode_nbytes(path->nodes[0], item); + item = btrfs_item_ptr(eb, slot, + struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, nbytes); + + /* + * If this is a directory we need to reset the i_size to + * 0 so that we can set it up properly when replaying + * the rest of the items in this log. + */ + mode = btrfs_inode_mode(eb, item); + if (S_ISDIR(mode)) + btrfs_set_inode_size(eb, item, 0); + } + } else if (inode_item) { + struct btrfs_inode_item *item; + u32 mode; + + /* + * New inode, set nbytes to 0 so that the nbytes comes out + * properly when we replay the extents. + */ + item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, 0); + + /* + * If this is a directory we need to reset the i_size to 0 so + * that we can set it up properly when replaying the rest of + * the items in this log. + */ + mode = btrfs_inode_mode(eb, item); + if (S_ISDIR(mode)) + btrfs_set_inode_size(eb, item, 0); } insert: - btrfs_release_path(root, path); + btrfs_release_path(path); /* try to insert the key into the destination tree */ ret = btrfs_insert_empty_item(trans, root, path, key, item_size); @@ -368,15 +458,13 @@ insert: u32 found_size; found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - if (found_size > item_size) { - btrfs_truncate_item(trans, root, path, item_size, 1); - } else if (found_size < item_size) { - ret = btrfs_extend_item(trans, root, path, - item_size - found_size); - BUG_ON(ret); - } + if (found_size > item_size) + btrfs_truncate_item(root, path, item_size, 1); + else if (found_size < item_size) + btrfs_extend_item(root, path, + item_size - found_size); } else if (ret) { - BUG(); + return ret; } dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); @@ -429,7 +517,7 @@ insert: } no_copy: btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(root, path); + btrfs_release_path(path); return 0; } @@ -475,11 +563,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_key *key) { int found_type; - u64 mask = root->sectorsize - 1; u64 extent_end; - u64 alloc_hint; u64 start = key->offset; - u64 saved_nbytes; + u64 nbytes = 0; struct btrfs_file_extent_item *item; struct inode *inode = NULL; unsigned long size; @@ -489,11 +575,20 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(eb, item); if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) - extent_end = start + btrfs_file_extent_num_bytes(eb, item); - else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, item); - extent_end = (start + size + mask) & ~mask; + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + nbytes = btrfs_file_extent_num_bytes(eb, item); + extent_end = start + nbytes; + + /* + * We don't add to the inodes nbytes if we are prealloc or a + * hole. + */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + nbytes = 0; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, slot, item); + nbytes = btrfs_file_extent_ram_bytes(eb, item); + extent_end = ALIGN(start + size, root->sectorsize); } else { ret = 0; goto out; @@ -510,7 +605,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. */ - ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); if (ret == 0 && @@ -535,17 +630,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * we don't have to do anything */ if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { - btrfs_release_path(root, path); + btrfs_release_path(path); goto out; } } - btrfs_release_path(root, path); + btrfs_release_path(path); - saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, inode, start, extent_end, - &alloc_hint, 1); - BUG_ON(ret); + ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); + if (ret) + goto out; if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -555,7 +649,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - BUG_ON(ret); + if (ret) + goto out; dest_offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); copy_extent_buffer(path->nodes[0], eb, dest_offset, @@ -580,7 +675,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset); + key->objectid, offset, 0); + if (ret) + goto out; } else { /* * insert the extent pointer in the extent @@ -589,9 +686,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_alloc_logged_file_extent(trans, root, root->root_key.objectid, key->objectid, offset, &ins); - BUG_ON(ret); + if (ret) + goto out; } - btrfs_release_path(root, path); + btrfs_release_path(path); if (btrfs_file_extent_compression(eb, item)) { csum_start = ins.objectid; @@ -605,31 +703,35 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_range(root->log_root, csum_start, csum_end - 1, - &ordered_sums); - BUG_ON(ret); + &ordered_sums, 0); + if (ret) + goto out; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); - ret = btrfs_csum_file_blocks(trans, + if (!ret) + ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums); - BUG_ON(ret); list_del(&sums->list); kfree(sums); } + if (ret) + goto out; } else { - btrfs_release_path(root, path); + btrfs_release_path(path); } } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { /* inline extents are easy, we just overwrite them */ ret = overwrite_item(trans, root, path, eb, slot, key); - BUG_ON(ret); + if (ret) + goto out; } - inode_set_bytes(inode, saved_nbytes); - btrfs_update_inode(trans, root, inode); + inode_add_bytes(inode, nbytes); + ret = btrfs_update_inode(trans, root, inode); out: if (inode) iput(inode); @@ -662,19 +764,29 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &location); name_len = btrfs_dir_name_len(leaf, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); - btrfs_release_path(root, path); + btrfs_release_path(path); inode = read_one_inode(root, location.objectid); - BUG_ON(!inode); + if (!inode) { + ret = -EIO; + goto out; + } ret = link_to_fixup_dir(trans, root, path, location.objectid); - BUG_ON(ret); + if (ret) + goto out; ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); - BUG_ON(ret); + if (ret) + goto out; + else + ret = btrfs_run_delayed_items(trans, root); +out: kfree(name); - iput(inode); return ret; } @@ -701,7 +813,7 @@ static noinline int inode_in_dir(struct btrfs_root *root, goto out; } else goto out; - btrfs_release_path(root, path); + btrfs_release_path(path); di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); if (di && !IS_ERR(di)) { @@ -712,7 +824,7 @@ static noinline int inode_in_dir(struct btrfs_root *root, goto out; match = 1; out: - btrfs_release_path(root, path); + btrfs_release_path(path); return match; } @@ -728,6 +840,7 @@ out: */ static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, + u64 ref_objectid, char *name, int namelen) { struct btrfs_path *path; @@ -741,12 +854,24 @@ static noinline int backref_in_log(struct btrfs_root *log, int match = 0; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); if (ret != 0) goto out; - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (btrfs_find_name_in_ext_backref(path, ref_objectid, + name, namelen, NULL)) + match = 1; + + goto out; + } + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ptr_end = ptr + item_size; while (ptr < ptr_end) { ref = (struct btrfs_inode_ref *)ptr; @@ -767,89 +892,42 @@ out: return match; } - -/* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function. (it should be released on return). - */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, +static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_root *log, struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) + struct btrfs_root *log_root, + struct inode *dir, struct inode *inode, + struct extent_buffer *eb, + u64 inode_objectid, u64 parent_objectid, + u64 ref_index, char *name, int namelen, + int *search_done) { - struct inode *dir; int ret; - struct btrfs_key location; - struct btrfs_inode_ref *ref; + char *victim_name; + int victim_name_len; + struct extent_buffer *leaf; struct btrfs_dir_item *di; - struct inode *inode; - char *name; - int namelen; - unsigned long ref_ptr; - unsigned long ref_end; - - location.objectid = key->objectid; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - - /* - * it is possible that we didn't log all the parent directories - * for a given inode. If we don't find the dir, just don't - * copy the back ref in. The link count fixup code will take - * care of the rest - */ - dir = read_one_inode(root, key->offset); - if (!dir) - return -ENOENT; - - inode = read_one_inode(root, key->objectid); - BUG_ON(!inode); - - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + struct btrfs_key search_key; + struct btrfs_inode_extref *extref; again: - ref = (struct btrfs_inode_ref *)ref_ptr; - - namelen = btrfs_inode_ref_name_len(eb, ref); - name = kmalloc(namelen, GFP_NOFS); - BUG_ON(!name); - - read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); - - /* if we already have a perfect match, we're done */ - if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, - btrfs_inode_ref_index(eb, ref), - name, namelen)) { - goto out; - } - - /* - * look for a conflicting back reference in the metadata. - * if we find one we have to unlink that name of the file - * before we add our new link. Later on, we overwrite any - * existing back reference, and we don't want to create - * dangling pointers in the directory. - */ -conflict_again: - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + /* Search old style refs */ + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = parent_objectid; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret == 0) { - char *victim_name; - int victim_name_len; struct btrfs_inode_ref *victim_ref; unsigned long ptr; unsigned long ptr_end; - struct extent_buffer *leaf = path->nodes[0]; + + leaf = path->nodes[0]; /* are we trying to overwrite a back ref for the root directory * if so, just jump out, we're done */ - if (key->objectid == key->offset) - goto out_nowrite; + if (search_key.objectid == search_key.offset) + return 1; /* check all the names in this back reference to see * if they are in the log. if so, we allow them to stay @@ -862,114 +940,385 @@ conflict_again: victim_name_len = btrfs_inode_ref_name_len(leaf, victim_ref); victim_name = kmalloc(victim_name_len, GFP_NOFS); - BUG_ON(!victim_name); + if (!victim_name) + return -ENOMEM; read_extent_buffer(leaf, victim_name, (unsigned long)(victim_ref + 1), victim_name_len); - if (!backref_in_log(log, key, victim_name, + if (!backref_in_log(log_root, &search_key, + parent_objectid, + victim_name, victim_name_len)) { - btrfs_inc_nlink(inode); - btrfs_release_path(root, path); + inc_nlink(inode); + btrfs_release_path(path); ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); kfree(victim_name); - btrfs_release_path(root, path); - goto conflict_again; + if (ret) + return ret; + ret = btrfs_run_delayed_items(trans, root); + if (ret) + return ret; + *search_done = 1; + goto again; } kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } - BUG_ON(ret); + + /* + * NOTE: we have searched root tree and checked the + * coresponding ref, it does not need to check again. + */ + *search_done = 1; } - btrfs_release_path(root, path); + btrfs_release_path(path); + + /* Same search but for extended refs */ + extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, + inode_objectid, parent_objectid, 0, + 0); + if (!IS_ERR_OR_NULL(extref)) { + u32 item_size; + u32 cur_offset = 0; + unsigned long base; + struct inode *victim_parent; + + leaf = path->nodes[0]; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + base = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)base + cur_offset; + + victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + goto next; + + victim_name = kmalloc(victim_name_len, GFP_NOFS); + if (!victim_name) + return -ENOMEM; + read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, + victim_name_len); + + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(parent_objectid, + victim_name, + victim_name_len); + ret = 0; + if (!backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len)) { + ret = -ENOENT; + victim_parent = read_one_inode(root, + parent_objectid); + if (victim_parent) { + inc_nlink(inode); + btrfs_release_path(path); + + ret = btrfs_unlink_inode(trans, root, + victim_parent, + inode, + victim_name, + victim_name_len); + if (!ret) + ret = btrfs_run_delayed_items( + trans, root); + } + iput(victim_parent); + kfree(victim_name); + if (ret) + return ret; + *search_done = 1; + goto again; + } + kfree(victim_name); + if (ret) + return ret; +next: + cur_offset += victim_name_len + sizeof(*extref); + } + *search_done = 1; + } + btrfs_release_path(path); /* look for a conflicting sequence number */ - di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, - btrfs_inode_ref_index(eb, ref), - name, namelen, 0); + di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + ref_index, name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); + if (ret) + return ret; } - btrfs_release_path(root, path); + btrfs_release_path(path); - - /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + /* look for a conflicing name */ + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); + if (ret) + return ret; } - btrfs_release_path(root, path); + btrfs_release_path(path); - /* insert our name */ - ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, - btrfs_inode_ref_index(eb, ref)); - BUG_ON(ret); + return 0; +} - btrfs_update_inode(trans, root, inode); +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index, + u64 *parent_objectid) +{ + struct btrfs_inode_extref *extref; -out: - ref_ptr = (unsigned long)(ref + 1) + namelen; - kfree(name); - if (ref_ptr < ref_end) - goto again; + extref = (struct btrfs_inode_extref *)ref_ptr; + + *namelen = btrfs_inode_extref_name_len(eb, extref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)&extref->name, + *namelen); + + *index = btrfs_inode_extref_index(eb, extref); + if (parent_objectid) + *parent_objectid = btrfs_inode_extref_parent(eb, extref); + + return 0; +} + +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index) +{ + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ref_ptr; + + *namelen = btrfs_inode_ref_name_len(eb, ref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + + *index = btrfs_inode_ref_index(eb, ref); + + return 0; +} + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir = NULL; + struct inode *inode = NULL; + unsigned long ref_ptr; + unsigned long ref_end; + char *name = NULL; + int namelen; + int ret; + int search_done = 0; + int log_ref_ver = 0; + u64 parent_objectid; + u64 inode_objectid; + u64 ref_index = 0; + int ref_struct_size; + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *r; + + ref_struct_size = sizeof(struct btrfs_inode_extref); + log_ref_ver = 1; + r = (struct btrfs_inode_extref *)ref_ptr; + parent_objectid = btrfs_inode_extref_parent(eb, r); + } else { + ref_struct_size = sizeof(struct btrfs_inode_ref); + parent_objectid = key->offset; + } + inode_objectid = key->objectid; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, parent_objectid); + if (!dir) { + ret = -ENOENT; + goto out; + } + + inode = read_one_inode(root, inode_objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + while (ref_ptr < ref_end) { + if (log_ref_ver) { + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index, &parent_objectid); + /* + * parent object can change from one array + * item to another. + */ + if (!dir) + dir = read_one_inode(root, parent_objectid); + if (!dir) { + ret = -ENOENT; + goto out; + } + } else { + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index); + } + if (ret) + goto out; + + /* if we already have a perfect match, we're done */ + if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, name, namelen)) { + /* + * look for a conflicting back reference in the + * metadata. if we find one we have to unlink that name + * of the file before we add our new link. Later on, we + * overwrite any existing back reference, and we don't + * want to create dangling pointers in the directory. + */ + + if (!search_done) { + ret = __add_inode_ref(trans, root, path, log, + dir, inode, eb, + inode_objectid, + parent_objectid, + ref_index, name, namelen, + &search_done); + if (ret) { + if (ret == 1) + ret = 0; + goto out; + } + } + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, + 0, ref_index); + if (ret) + goto out; + + btrfs_update_inode(trans, root, inode); + } + + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; + kfree(name); + name = NULL; + if (log_ref_ver) { + iput(dir); + dir = NULL; + } + } /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); - BUG_ON(ret); - -out_nowrite: - btrfs_release_path(root, path); +out: + btrfs_release_path(path); + kfree(name); iput(dir); iput(inode); - return 0; + return ret; } static int insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { int ret; - ret = btrfs_find_orphan_item(root, offset); + ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, + offset, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret > 0) ret = btrfs_insert_orphan_item(trans, root, offset); return ret; } +static int count_inode_extrefs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) +{ + int ret = 0; + int name_len; + unsigned int nlink = 0; + u32 item_size; + u32 cur_offset = 0; + u64 inode_objectid = btrfs_ino(inode); + u64 offset = 0; + unsigned long ptr; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; -/* - * There are a few corners where the link count of the file can't - * be properly maintained during replay. So, instead of adding - * lots of complexity to the log code, we just scan the backrefs - * for any file that has been through replay. - * - * The scan will update the link count on the inode to reflect the - * number of back refs found. If it goes down to zero, the iput - * will free the inode. - */ -static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) + while (1) { + ret = btrfs_find_one_extref(root, inode_objectid, offset, path, + &extref, &offset); + if (ret) + break; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_len = btrfs_inode_extref_name_len(leaf, extref); + + nlink++; + + cur_offset += name_len + sizeof(*extref); + } + + offset++; + btrfs_release_path(path); + } + btrfs_release_path(path); + + if (ret < 0) + return ret; + return nlink; +} + +static int count_inode_refs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) { - struct btrfs_path *path; int ret; struct btrfs_key key; - u64 nlink = 0; + unsigned int nlink = 0; unsigned long ptr; unsigned long ptr_end; int name_len; + u64 ino = btrfs_ino(inode); - key.objectid = inode->i_ino; + key.objectid = ino; key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; - path = btrfs_alloc_path(); - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -979,9 +1328,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, break; path->slots[0]--; } +process_slot: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid != inode->i_ino || + if (key.objectid != ino || key.type != BTRFS_INODE_REF_KEY) break; ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); @@ -999,12 +1349,60 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (key.offset == 0) break; + if (path->slots[0] > 0) { + path->slots[0]--; + goto process_slot; + } key.offset--; - btrfs_release_path(root, path); + btrfs_release_path(path); } - btrfs_release_path(root, path); + btrfs_release_path(path); + + return nlink; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. + */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + u64 nlink = 0; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = count_inode_refs(root, inode, path); + if (ret < 0) + goto out; + + nlink = ret; + + ret = count_inode_extrefs(root, inode, path); + if (ret == -ENOENT) + ret = 0; + + if (ret < 0) + goto out; + + nlink += ret; + + ret = 0; + if (nlink != inode->i_nlink) { - inode->i_nlink = nlink; + set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1012,15 +1410,16 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (inode->i_nlink == 0) { if (S_ISDIR(inode->i_mode)) { ret = replay_dir_deletes(trans, root, NULL, path, - inode->i_ino, 1); - BUG_ON(ret); + ino, 1); + if (ret) + goto out; } - ret = insert_orphan_item(trans, root, inode->i_ino); - BUG_ON(ret); + ret = insert_orphan_item(trans, root, ino); } - btrfs_free_path(path); - return 0; +out: + btrfs_free_path(path); + return ret; } static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, @@ -1051,16 +1450,18 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; ret = btrfs_del_item(trans, root, path); - BUG_ON(ret); + if (ret) + goto out; - btrfs_release_path(root, path); + btrfs_release_path(path); inode = read_one_inode(root, key.offset); - BUG_ON(!inode); + if (!inode) + return -EIO; ret = fixup_inode_link_count(trans, root, inode); - BUG_ON(ret); - iput(inode); + if (ret) + goto out; /* * fixup on a directory may create new entries, @@ -1069,8 +1470,10 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, */ key.offset = (u64)-1; } - btrfs_release_path(root, path); - return 0; + ret = 0; +out: + btrfs_release_path(path); + return ret; } @@ -1089,7 +1492,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct inode *inode; inode = read_one_inode(root, objectid); - BUG_ON(!inode); + if (!inode) + return -EIO; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); @@ -1097,14 +1501,17 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - btrfs_release_path(root, path); + btrfs_release_path(path); if (ret == 0) { - btrfs_inc_nlink(inode); - btrfs_update_inode(trans, root, inode); + if (!inode->i_nlink) + set_nlink(inode, 1); + else + inc_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; } else { - BUG(); + BUG(); /* Logic Error */ } iput(inode); @@ -1136,6 +1543,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, iput(inode); return -EIO; } + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); /* FIXME, put inode into FIXUP list */ @@ -1173,13 +1581,20 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct inode *dir; u8 log_type; int exists; - int ret; + int ret = 0; + bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); dir = read_one_inode(root, key->objectid); - BUG_ON(!dir); + if (!dir) + return -EIO; name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + log_type = btrfs_dir_type(eb, di); read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); @@ -1190,7 +1605,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, exists = 1; else exists = 0; - btrfs_release_path(root, path); + btrfs_release_path(path); if (key->type == BTRFS_DIR_ITEM_KEY) { dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, @@ -1201,9 +1616,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, key->offset, name, name_len, 1); } else { - BUG(); + /* Corruption */ + ret = -EINVAL; + goto out; } - if (!dst_di || IS_ERR(dst_di)) { + if (IS_ERR_OR_NULL(dst_di)) { /* we need a sequence number to insert, so we only * do inserts for the BTRFS_DIR_INDEX_KEY types */ @@ -1229,22 +1646,29 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, goto out; ret = drop_one_dir_item(trans, root, path, dir, dst_di); - BUG_ON(ret); + if (ret) + goto out; if (key->type == BTRFS_DIR_INDEX_KEY) goto insert; out: - btrfs_release_path(root, path); + btrfs_release_path(path); + if (!ret && update_size) { + btrfs_i_size_write(dir, dir->i_size + name_len * 2); + ret = btrfs_update_inode(trans, root, dir); + } kfree(name); iput(dir); - return 0; + return ret; insert: - btrfs_release_path(root, path); + btrfs_release_path(path); ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - - BUG_ON(ret && ret != -ENOENT); + if (ret && ret != -ENOENT) + goto out; + update_size = false; + ret = 0; goto out; } @@ -1271,9 +1695,12 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) + return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); - BUG_ON(ret); + if (ret) + return ret; ptr = (unsigned long)(di + 1); ptr += name_len; } @@ -1359,7 +1786,7 @@ next: *end_ret = found_end; ret = 0; out: - btrfs_release_path(root, path); + btrfs_release_path(path); return ret; } @@ -1397,6 +1824,11 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) { + ret = -EIO; + goto out; + } + name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); if (!name) { @@ -1417,22 +1849,33 @@ again: dir_key->offset, name, name_len, 0); } - if (!log_di || IS_ERR(log_di)) { + if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(root, path); - btrfs_release_path(log, log_path); + btrfs_release_path(path); + btrfs_release_path(log_path); inode = read_one_inode(root, location.objectid); - BUG_ON(!inode); + if (!inode) { + kfree(name); + return -EIO; + } ret = link_to_fixup_dir(trans, root, path, location.objectid); - BUG_ON(ret); - btrfs_inc_nlink(inode); + if (ret) { + kfree(name); + iput(inode); + goto out; + } + + inc_nlink(inode); ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); - BUG_ON(ret); + if (!ret) + ret = btrfs_run_delayed_items(trans, root); kfree(name); iput(inode); + if (ret) + goto out; /* there might still be more names under this key * check and repeat if required @@ -1443,8 +1886,11 @@ again: goto again; ret = 0; goto out; + } else if (IS_ERR(log_di)) { + kfree(name); + return PTR_ERR(log_di); } - btrfs_release_path(log, log_path); + btrfs_release_path(log_path); kfree(name); ptr = (unsigned long)(di + 1); @@ -1452,8 +1898,8 @@ again: } ret = 0; out: - btrfs_release_path(root, path); - btrfs_release_path(log, log_path); + btrfs_release_path(path); + btrfs_release_path(log_path); return ret; } @@ -1536,12 +1982,13 @@ again: ret = check_item_in_log(trans, root, log, path, log_path, dir, &found_key); - BUG_ON(ret); + if (ret) + goto out; if (found_key.offset == (u64)-1) break; dir_key.offset = found_key.offset + 1; } - btrfs_release_path(root, path); + btrfs_release_path(path); if (range_end == (u64)-1) break; range_start = range_end + 1; @@ -1552,11 +1999,11 @@ next_type: if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { key_type = BTRFS_DIR_LOG_INDEX_KEY; dir_key.type = BTRFS_DIR_INDEX_KEY; - btrfs_release_path(root, path); + btrfs_release_path(path); goto again; } out: - btrfs_release_path(root, path); + btrfs_release_path(path); btrfs_free_path(log_path); iput(dir); return ret; @@ -1580,12 +2027,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; - u32 item_size; int level; int i; int ret; - btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; level = btrfs_header_level(eb); @@ -1593,12 +2041,12 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, return 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; nritems = btrfs_header_nritems(eb); for (i = 0; i < nritems; i++) { btrfs_item_key_to_cpu(eb, &key, i); - item_size = btrfs_item_size_nr(eb, i); /* inode keys are done during the first stage */ if (key.type == BTRFS_INODE_ITEM_KEY && @@ -1612,11 +2060,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, root, log, path, key.objectid, 0); - BUG_ON(ret); + if (ret) + break; } ret = overwrite_item(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); + if (ret) + break; /* for regular files, make sure corresponding * orhpan item exist. extents past the new EOF @@ -1625,13 +2075,24 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (S_ISREG(mode)) { ret = insert_orphan_item(wc->trans, root, key.objectid); - BUG_ON(ret); + if (ret) + break; } ret = link_to_fixup_dir(wc->trans, root, path, key.objectid); - BUG_ON(ret); + if (ret) + break; } + + if (key.type == BTRFS_DIR_INDEX_KEY && + wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + } + if (wc->stage < LOG_WALK_REPLAY_ALL) continue; @@ -1639,24 +2100,29 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (key.type == BTRFS_XATTR_ITEM_KEY) { ret = overwrite_item(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); - } else if (key.type == BTRFS_INODE_REF_KEY) { + if (ret) + break; + } else if (key.type == BTRFS_INODE_REF_KEY || + key.type == BTRFS_INODE_EXTREF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); - BUG_ON(ret && ret != -ENOENT); + if (ret && ret != -ENOENT) + break; + ret = 0; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); - } else if (key.type == BTRFS_DIR_ITEM_KEY || - key.type == BTRFS_DIR_INDEX_KEY) { + if (ret) + break; + } else if (key.type == BTRFS_DIR_ITEM_KEY) { ret = replay_one_dir_item(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); + if (ret) + break; } } btrfs_free_path(path); - return 0; + return ret; } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, @@ -1665,7 +2131,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { u64 root_owner; - u64 root_gen; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; @@ -1682,8 +2147,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; - if (btrfs_header_level(cur) != *level) - WARN_ON(1); + WARN_ON(btrfs_header_level(cur) != *level); if (path->slots[*level] >= btrfs_header_nritems(cur)) @@ -1695,33 +2159,51 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); next = btrfs_find_create_tree_block(root, bytenr, blocksize); - - wc->process_func(root, next, wc, ptr_gen); + if (!next) + return -ENOMEM; if (*level == 1) { + ret = wc->process_func(root, next, wc, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } + path->slots[*level]++; if (wc->free) { - btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } - btrfs_tree_lock(next); - clean_tree_block(trans, root, next); - btrfs_set_lock_blocking(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); - BUG_ON(ret); + if (ret) { + free_extent_buffer(next); + return ret; + } } free_extent_buffer(next); continue; } - btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } WARN_ON(*level <= 0); if (path->nodes[*level-1]) @@ -1734,35 +2216,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - bytenr = path->nodes[*level]->start; - - blocksize = btrfs_level_size(root, *level); - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - wc->process_func(root, path->nodes[*level], wc, - btrfs_header_generation(path->nodes[*level])); - - if (wc->free) { - next = path->nodes[*level]; - btrfs_tree_lock(next); - clean_tree_block(trans, root, next); - btrfs_set_lock_blocking(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, bytenr, blocksize); - BUG_ON(ret); - } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - *level += 1; + path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); cond_resched(); return 0; @@ -1774,16 +2228,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { u64 root_owner; - u64 root_gen; int i; int slot; int ret; for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { - struct extent_buffer *node; - node = path->nodes[i]; + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { path->slots[i]++; *level = i; WARN_ON(*level == 0); @@ -1796,25 +2247,30 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level + 1]; root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - wc->process_func(root, path->nodes[*level], wc, + ret = wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level])); + if (ret) + return ret; + if (wc->free) { struct extent_buffer *next; next = path->nodes[*level]; - btrfs_tree_lock(next); - clean_tree_block(trans, root, next); - btrfs_set_lock_blocking(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, path->nodes[*level]->start, path->nodes[*level]->len); - BUG_ON(ret); + if (ret) + return ret; } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -1836,11 +2292,11 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, int wret; int level; struct btrfs_path *path; - int i; int orig_level; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; level = btrfs_header_level(log->node); orig_level = level; @@ -1852,45 +2308,49 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, wret = walk_down_log_tree(trans, log, path, &level, wc); if (wret > 0) break; - if (wret < 0) + if (wret < 0) { ret = wret; + goto out; + } wret = walk_up_log_tree(trans, log, path, &level, wc); if (wret > 0) break; - if (wret < 0) + if (wret < 0) { ret = wret; + goto out; + } } /* was the root node processed? if not, catch it here */ if (path->nodes[orig_level]) { - wc->process_func(log, path->nodes[orig_level], wc, + ret = wc->process_func(log, path->nodes[orig_level], wc, btrfs_header_generation(path->nodes[orig_level])); + if (ret) + goto out; if (wc->free) { struct extent_buffer *next; next = path->nodes[orig_level]; - btrfs_tree_lock(next); - clean_tree_block(trans, log, next); - btrfs_set_lock_blocking(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); + if (trans) { + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + clean_tree_block(trans, log, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(log, next->start, + ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); - BUG_ON(ret); + if (ret) + goto out; } } - for (i = 0; i <= orig_level; i++) { - if (path->nodes[i]) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - } +out: btrfs_free_path(path); return ret; } @@ -1915,8 +2375,8 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) +static void wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -1931,33 +2391,61 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && + if (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->log_transid < transid + 2 && + } while (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])); - return 0; } -static int wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static void wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { DEFINE_WAIT(wait); + while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) + if (atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); } - return 0; +} + +static inline void btrfs_remove_log_ctx(struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + if (!ctx) + return; + + mutex_lock(&root->log_mutex); + list_del_init(&ctx->list); + mutex_unlock(&root->log_mutex); +} + +/* + * Invoked in log mutex context, or be sure there is no other task which + * can access the list. + */ +static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, + int index, int error) +{ + struct btrfs_log_ctx *ctx; + + if (!error) { + INIT_LIST_HEAD(&root->log_ctxs[index]); + return; + } + + list_for_each_entry(ctx, &root->log_ctxs[index], list) + ctx->log_ret = error; + + INIT_LIST_HEAD(&root->log_ctxs[index]); } /* @@ -1973,7 +2461,7 @@ static int wait_for_writer(struct btrfs_trans_handle *trans, * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, struct btrfs_log_ctx *ctx) { int index1; int index2; @@ -1981,41 +2469,52 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - unsigned long log_transid = 0; + int log_transid = 0; + struct btrfs_log_ctx root_log_ctx; + struct blk_plug plug; mutex_lock(&root->log_mutex); - index1 = root->log_transid % 2; + log_transid = ctx->log_transid; + if (root->log_transid_committed >= log_transid) { + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + + index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + wait_log_commit(trans, root, log_transid); mutex_unlock(&root->log_mutex); - return 0; + return ctx->log_ret; } + ASSERT(log_transid == root->log_transid); atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root->log_transid - 1); + wait_log_commit(trans, root, log_transid - 1); while (1) { - unsigned long batch = root->log_batch; - if (root->log_multiple_pids) { + int batch = atomic_read(&root->log_batch); + /* when we're on an ssd, just kick the log commit out */ + if (!btrfs_test_opt(root, SSD) && + test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); } wait_for_writer(trans, root); - if (batch == root->log_batch) + if (batch == atomic_read(&root->log_batch)) break; } /* bail out if we need to do a full commit */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } - log_transid = root->log_transid; if (log_transid % 2 == 0) mark = EXTENT_DIRTY; else @@ -2024,16 +2523,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* we start IO on all the marked extents here, but we don't actually * wait for them until later. */ + blk_start_plug(&plug); ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); - BUG_ON(ret); + if (ret) { + blk_finish_plug(&plug); + btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); + btrfs_set_log_full_commit(root->fs_info, trans); + mutex_unlock(&root->log_mutex); + goto out; + } btrfs_set_root_node(&log->root_item, log->node); - root->log_batch = 0; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; - smp_mb(); /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to @@ -2041,13 +2546,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); + btrfs_init_log_ctx(&root_log_ctx); + mutex_lock(&log_root_tree->log_mutex); - log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); - BUG_ON(ret); mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { @@ -2056,19 +2567,48 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, wake_up(&log_root_tree->log_writer_wait); } - index2 = log_root_tree->log_transid % 2; + if (ret) { + if (!list_empty(&root_log_ctx.list)) + list_del_init(&root_log_ctx.list); + + blk_finish_plug(&plug); + btrfs_set_log_full_commit(root->fs_info, trans); + + if (ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out; + } + + if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; + goto out; + } + + index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + root_log_ctx.log_transid); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; goto out; } + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid - 1); + root_log_ctx.log_transid - 1); } wait_for_writer(trans, log_root_tree); @@ -2077,28 +2617,38 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { + blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; } - ret = btrfs_write_and_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); - BUG_ON(ret); + ret = btrfs_write_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + blk_finish_plug(&plug); + if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_NEW | EXTENT_DIRTY); + btrfs_wait_logged_extents(log, log_transid); - btrfs_set_super_log_root(&root->fs_info->super_for_commit, + btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); - btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); - log_root_tree->log_batch = 0; log_root_tree->log_transid++; - smp_mb(); - mutex_unlock(&log_root_tree->log_mutex); /* @@ -2108,8 +2658,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - write_ctree_super(trans, root->fs_info->tree_root, 1); - ret = 0; + ret = write_ctree_super(trans, root->fs_info->tree_root, 1); + if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_abort_transaction(trans, root, ret); + goto out_wake_log_root; + } mutex_lock(&root->log_mutex); if (root->last_log_commit < log_transid) @@ -2117,27 +2671,37 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); out_wake_log_root: + /* + * We needn't get log_mutex here because we are sure all + * the other tasks are blocked. + */ + btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_transid_committed++; atomic_set(&log_root_tree->log_commit[index2], 0); - smp_mb(); + mutex_unlock(&log_root_tree->log_mutex); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: + /* See above. */ + btrfs_remove_all_log_ctxs(root, index1, ret); + + mutex_lock(&root->log_mutex); + root->log_transid_committed++; atomic_set(&root->log_commit[index1], 0); - smp_mb(); + mutex_unlock(&root->log_mutex); + if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); - return 0; + return ret; } -/* - * free all the extents used by the tree log. This should be called - * at commit time of the full transaction - */ -int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +static void free_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log) { int ret; - struct btrfs_root *log; - struct key; u64 start; u64 end; struct walk_control wc = { @@ -2145,16 +2709,15 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) .process_func = process_one_buffer }; - if (!root->log_root || root->fs_info->log_root_recovering) - return 0; - - log = root->log_root; ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); + /* I don't think this can happen but just in case */ + if (ret) + btrfs_abort_transaction(trans, log, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, + NULL); if (ret) break; @@ -2162,14 +2725,38 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } - if (log->log_transid > 0) { - ret = btrfs_del_root(trans, root->fs_info->log_root_tree, - &log->root_key); - BUG_ON(ret); - } - root->log_root = NULL; + /* + * We may have short-circuited the log tree with the full commit logic + * and left ordered extents on our list, so clear these out to keep us + * from leaking inodes and memory. + */ + btrfs_free_logged_extents(log, 0); + btrfs_free_logged_extents(log, 1); + free_extent_buffer(log->node); kfree(log); +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + if (root->log_root) { + free_log_tree(trans, root->log_root); + root->log_root = NULL; + } + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + if (fs_info->log_root_tree) { + free_log_tree(trans, fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + } return 0; } @@ -2203,7 +2790,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di; struct btrfs_path *path; int ret; + int err = 0; int bytes_del = 0; + u64 dir_ino = btrfs_ino(dir); if (BTRFS_I(dir)->logged_trans < trans->transid) return 0; @@ -2216,20 +2805,39 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, log = root->log_root; path = btrfs_alloc_path(); - di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, + if (!path) { + err = -ENOMEM; + goto out_unlock; + } + + di = btrfs_lookup_dir_item(trans, log, path, dir_ino, name, name_len, -1); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; - BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } } - btrfs_release_path(log, path); - di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, + btrfs_release_path(path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, index, name, name_len, -1); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; - BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } } /* update the directory size in the log to reflect the names @@ -2238,12 +2846,16 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (bytes_del) { struct btrfs_key key; - key.objectid = dir->i_ino; + key.objectid = dir_ino; key.offset = 0; key.type = BTRFS_INODE_ITEM_KEY; - btrfs_release_path(log, path); + btrfs_release_path(path); ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto fail; + } if (ret == 0) { struct btrfs_inode_item *item; u64 i_size; @@ -2259,14 +2871,21 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); } else ret = 0; - btrfs_release_path(log, path); + btrfs_release_path(path); } - +fail: btrfs_free_path(path); +out_unlock: mutex_unlock(&BTRFS_I(dir)->log_mutex); + if (ret == -ENOSPC) { + btrfs_set_log_full_commit(root->fs_info, trans); + ret = 0; + } else if (ret < 0) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_log_trans(root); - return 0; + return err; } /* see comments for btrfs_del_dir_entries_in_log */ @@ -2288,9 +2907,14 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, log = root->log_root; mutex_lock(&BTRFS_I(inode)->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, + ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); + if (ret == -ENOSPC) { + btrfs_set_log_full_commit(root->fs_info, trans); + ret = 0; + } else if (ret < 0 && ret != -ENOENT) + btrfs_abort_transaction(trans, root, ret); btrfs_end_log_trans(root); return ret; @@ -2318,13 +2942,14 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, else key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - BUG_ON(ret); + if (ret) + return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); btrfs_set_dir_log_end(path->nodes[0], item, last_offset); btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(log, path); + btrfs_release_path(path); return 0; } @@ -2340,45 +2965,41 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; - struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src; + int err = 0; int ret; int i; int nritems; u64 first_offset = min_offset; u64 last_offset = (u64)-1; + u64 ino = btrfs_ino(inode); log = root->log_root; - max_key.objectid = inode->i_ino; - max_key.offset = (u64)-1; - max_key.type = key_type; - min_key.objectid = inode->i_ino; + min_key.objectid = ino; min_key.type = key_type; min_key.offset = min_offset; path->keep_locks = 1; - ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); /* * we didn't find anything from this transaction, see if there * is anything at all */ - if (ret != 0 || min_key.objectid != inode->i_ino || - min_key.type != key_type) { - min_key.objectid = inode->i_ino; + if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { + min_key.objectid = ino; min_key.type = key_type; min_key.offset = (u64)-1; - btrfs_release_path(root, path); + btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret < 0) { - btrfs_release_path(root, path); + btrfs_release_path(path); return ret; } - ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + ret = btrfs_previous_item(root, path, ino, key_type); /* if ret == 0 there are items for this type, * create a range to tell us the last key of this type. @@ -2396,7 +3017,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, } /* go backward to find any previous key */ - ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + ret = btrfs_previous_item(root, path, ino, key_type); if (ret == 0) { struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); @@ -2405,16 +3026,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); + if (ret) { + err = ret; + goto done; + } } } - btrfs_release_path(root, path); + btrfs_release_path(path); /* find the first key from this transaction again */ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret != 0) { - WARN_ON(1); + if (WARN_ON(ret != 0)) goto done; - } /* * we have a block from this transaction, log every item in it @@ -2427,12 +3050,14 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, for (i = path->slots[0]; i < nritems; i++) { btrfs_item_key_to_cpu(src, &min_key, i); - if (min_key.objectid != inode->i_ino || - min_key.type != key_type) + if (min_key.objectid != ino || min_key.type != key_type) goto done; ret = overwrite_item(trans, log, dst_path, src, i, &min_key); - BUG_ON(ret); + if (ret) { + err = ret; + goto done; + } } path->slots[0] = nritems; @@ -2446,7 +3071,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, goto done; } btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.objectid != inode->i_ino || tmp.type != key_type) { + if (tmp.objectid != ino || tmp.type != key_type) { last_offset = (u64)-1; goto done; } @@ -2454,22 +3079,29 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); - - BUG_ON(ret); - last_offset = tmp.offset; + if (ret) + err = ret; + else + last_offset = tmp.offset; goto done; } } done: - *last_offset_ret = last_offset; - btrfs_release_path(root, path); - btrfs_release_path(log, dst_path); - - /* insert the log range keys to indicate where the log is valid */ - ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, - first_offset, last_offset); - BUG_ON(ret); - return 0; + btrfs_release_path(path); + btrfs_release_path(dst_path); + + if (err == 0) { + *last_offset_ret = last_offset; + /* + * insert the log range keys to indicate where the log + * is valid + */ + ret = insert_dir_log_key(trans, log, path, key_type, + ino, first_offset, last_offset); + if (ret) + err = ret; + } + return err; } /* @@ -2501,7 +3133,8 @@ again: ret = log_dir_items(trans, root, inode, path, dst_path, key_type, min_key, &max_key); - BUG_ON(ret); + if (ret) + return ret; if (max_key == (u64)-1) break; min_key = max_key + 1; @@ -2528,6 +3161,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, int ret; struct btrfs_key key; struct btrfs_key found_key; + int start_slot; key.objectid = objectid; key.type = max_key_type; @@ -2535,8 +3169,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - - if (ret != 1) + BUG_ON(ret == 0); /* Logic error */ + if (ret < 0) break; if (path->slots[0] == 0) @@ -2549,35 +3183,133 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, if (found_key.objectid != objectid) break; - ret = btrfs_del_item(trans, log, path); - BUG_ON(ret); - btrfs_release_path(log, path); + found_key.offset = 0; + found_key.type = 0; + ret = btrfs_bin_search(path->nodes[0], &found_key, 0, + &start_slot); + + ret = btrfs_del_items(trans, log, path, start_slot, + path->slots[0] - start_slot + 1); + /* + * If start slot isn't 0 then we don't need to re-search, we've + * found the last guy with the objectid in this tree. + */ + if (ret || start_slot != 0) + break; + btrfs_release_path(path); } - btrfs_release_path(log, path); + btrfs_release_path(path); + if (ret > 0) + ret = 0; + return ret; +} + +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode, int log_inode_only) +{ + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + if (log_inode_only) { + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_token_inode_generation(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, 0, &token); + } else { + btrfs_set_token_inode_generation(leaf, item, + BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); + } + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); +} + +static int log_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct btrfs_path *path, + struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + int ret; + + ret = btrfs_insert_empty_item(trans, log, path, + &BTRFS_I(inode)->location, + sizeof(*inode_item)); + if (ret && ret != -EEXIST) + return ret; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); + btrfs_release_path(path); return 0; } static noinline int copy_items(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct inode *inode, struct btrfs_path *dst_path, - struct extent_buffer *src, + struct btrfs_path *src_path, u64 *last_extent, int start_slot, int nr, int inode_only) { unsigned long src_offset; unsigned long dst_offset; + struct btrfs_root *log = BTRFS_I(inode)->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; + struct extent_buffer *src = src_path->nodes[0]; + struct btrfs_key first_key, last_key, key; int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; char *ins_data; int i; struct list_head ordered_sums; + int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + bool has_extents = false; + bool need_find_last_extent = (*last_extent == 0); + bool done = false; INIT_LIST_HEAD(&ordered_sums); ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + first_key.objectid = (u64)-1; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -2587,7 +3319,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, } ret = btrfs_insert_empty_items(trans, log, dst_path, ins_keys, ins_sizes, nr); - BUG_ON(ret); + if (ret) { + kfree(ins_data); + return ret; + } for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], @@ -2595,36 +3330,50 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); - copy_extent_buffer(dst_path->nodes[0], src, dst_offset, - src_offset, ins_sizes[i]); + if ((i == (nr - 1))) + last_key = ins_keys[i]; - if (inode_only == LOG_INODE_EXISTS && - ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], struct btrfs_inode_item); - btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); + fill_inode_item(trans, dst_path->nodes[0], inode_item, + inode, inode_only == LOG_INODE_EXISTS); + } else { + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); + } - /* set the generation to zero so the recover code - * can tell the difference between an logging - * just to say 'this inode exists' and a logging - * to say 'update this inode with these values' - */ - btrfs_set_inode_generation(dst_path->nodes[0], - inode_item, 0); + /* + * We set need_find_last_extent here in case we know we were + * processing other items and then walk into the first extent in + * the inode. If we don't hit an extent then nothing changes, + * we'll do the last search the next time around. + */ + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { + has_extents = true; + if (need_find_last_extent && + first_key.objectid == (u64)-1) + first_key = ins_keys[i]; + } else { + need_find_last_extent = false; } + /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again */ - if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && + !skip_csum) { int found_type; extent = btrfs_item_ptr(src, start_slot + i, struct btrfs_file_extent_item); + if (btrfs_file_extent_generation(src, extent) < trans->transid) + continue; + found_type = btrfs_file_extent_type(src, extent); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (found_type == BTRFS_FILE_EXTENT_REG) { u64 ds, dl, cs, cl; ds = btrfs_file_extent_disk_bytenr(src, extent); @@ -2646,32 +3395,445 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_range( log->fs_info->csum_root, ds + cs, ds + cs + cl - 1, - &ordered_sums); - BUG_ON(ret); + &ordered_sums, 0); + if (ret) { + btrfs_release_path(dst_path); + kfree(ins_data); + return ret; + } } } } btrfs_mark_buffer_dirty(dst_path->nodes[0]); - btrfs_release_path(log, dst_path); + btrfs_release_path(dst_path); kfree(ins_data); /* * we have to do this after the loop above to avoid changing the * log tree while trying to change the log tree. */ + ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); - ret = btrfs_csum_file_blocks(trans, log, sums); - BUG_ON(ret); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); list_del(&sums->list); kfree(sums); } + + if (!has_extents) + return ret; + + /* + * Because we use btrfs_search_forward we could skip leaves that were + * not modified and then assume *last_extent is valid when it really + * isn't. So back up to the previous leaf and read the end of the last + * extent before we go and fill in holes. + */ + if (need_find_last_extent) { + u64 len; + + ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + if (ret) + goto fill_holes; + if (src_path->slots[0]) + src_path->slots[0]--; + src = src_path->nodes[0]; + btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + goto fill_holes; + extent = btrfs_item_ptr(src, src_path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, + src_path->slots[0], + extent); + *last_extent = ALIGN(key.offset + len, + log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + *last_extent = key.offset + len; + } + } +fill_holes: + /* So we did prev_leaf, now we need to move to the next leaf, but a few + * things could have happened + * + * 1) A merge could have happened, so we could currently be on a leaf + * that holds what we were copying in the first place. + * 2) A split could have happened, and now not all of the items we want + * are on the same leaf. + * + * So we need to adjust how we search for holes, we need to drop the + * path and re-search for the first extent key we found, and then walk + * forward until we hit the last one we copied. + */ + if (need_find_last_extent) { + /* btrfs_prev_leaf could return 1 without releasing the path */ + btrfs_release_path(src_path); + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, + src_path, 0, 0); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = src_path->slots[0]; + } else { + i = start_slot; + } + + /* + * Ok so here we need to go through and fill in any holes we may have + * to make sure that holes are punched for those areas in case they had + * extents previously. + */ + while (!done) { + u64 offset, len; + u64 extent_end; + + if (i >= btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = 0; + } + + btrfs_item_key_to_cpu(src, &key, i); + if (!btrfs_comp_cpu_keys(&key, &last_key)) + done = true; + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + i++; + continue; + } + extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, i, extent); + extent_end = ALIGN(key.offset + len, log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + extent_end = key.offset + len; + } + i++; + + if (*last_extent == key.offset) { + *last_extent = extent_end; + continue; + } + offset = *last_extent; + len = key.offset - *last_extent; + ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), + offset, 0, 0, len, 0, len, 0, + 0, 0); + if (ret) + break; + *last_extent = offset + len; + } + /* + * Need to let the callers know we dropped the path so they should + * re-search. + */ + if (!ret && need_find_last_extent) + ret = 1; + return ret; +} + +static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct extent_map *em1, *em2; + + em1 = list_entry(a, struct extent_map, list); + em2 = list_entry(b, struct extent_map, list); + + if (em1->start < em2->start) + return -1; + else if (em1->start > em2->start) + return 1; return 0; } +static int log_one_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_root *root, + struct extent_map *em, struct btrfs_path *path, + struct list_head *logged_list) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct btrfs_ordered_extent *ordered; + struct list_head ordered_sums; + struct btrfs_map_token token; + struct btrfs_key key; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; + u64 csum_offset; + u64 csum_len; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; + int ret; + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + int extent_inserted = 0; + + INIT_LIST_HEAD(&ordered_sums); + btrfs_init_map_token(&token); + + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); + if (ret) + return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + &token); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + skip_csum = true; + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC, + &token); + } else { + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG, + &token); + if (em->block_start == EXTENT_MAP_HOLE) + skip_csum = true; + } + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start, + &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, + &token); + } + + btrfs_set_token_file_extent_offset(leaf, fi, + em->start - em->orig_start, + &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, + &token); + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); + if (ret) { + return ret; + } + + if (skip_csum) + return 0; + + /* + * First check and see if our csums are on our outstanding ordered + * extents. + */ + list_for_each_entry(ordered, logged_list, log_list) { + struct btrfs_ordered_sum *sum; + + if (!mod_len) + break; + + if (ordered->file_offset + ordered->len <= mod_start || + mod_start + mod_len <= ordered->file_offset) + continue; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this + * ordered extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered->file_offset + ordered->len >= + mod_start + mod_len) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered->file_offset + ordered->len < + mod_start + mod_len) { + mod_len = (mod_start + mod_len) - + (ordered->file_offset + ordered->len); + mod_start = ordered->file_offset + + ordered->len; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, + &ordered->flags)) + continue; + + if (ordered->csum_bytes_left) { + btrfs_start_ordered_extent(inode, ordered, 0); + wait_event(ordered->wait, + ordered->csum_bytes_left == 0); + } + + list_for_each_entry(sum, &ordered->list, list) { + ret = btrfs_csum_file_blocks(trans, log, sum); + if (ret) + goto unlocked; + } + + } +unlocked: + + if (!mod_len || ret) + return ret; + + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } else { + csum_offset = mod_start - em->start; + csum_len = mod_len; + } + + /* block start is already adjusted for the file extent offset. */ + ret = btrfs_lookup_csums_range(log->fs_info->csum_root, + em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0); + if (ret) + return ret; + + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); + list_del(&sums->list); + kfree(sums); + } + + return ret; +} + +static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct list_head *logged_list) +{ + struct extent_map *em, *n; + struct list_head extents; + struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + u64 test_gen; + int ret = 0; + int num = 0; + + INIT_LIST_HEAD(&extents); + + write_lock(&tree->lock); + test_gen = root->fs_info->last_trans_committed; + + list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + list_del_init(&em->list); + + /* + * Just an arbitrary number, this can be really CPU intensive + * once we start getting a lot of extents, and really once we + * have a bunch of extents we just want to commit since it will + * be faster. + */ + if (++num > 32768) { + list_del_init(&tree->modified_extents); + ret = -EFBIG; + goto process; + } + + if (em->generation <= test_gen) + continue; + /* Need a ref to keep it from getting evicted from cache */ + atomic_inc(&em->refs); + set_bit(EXTENT_FLAG_LOGGING, &em->flags); + list_add_tail(&em->list, &extents); + num++; + } + + list_sort(NULL, &extents, extent_cmp); + +process: + while (!list_empty(&extents)) { + em = list_entry(extents.next, struct extent_map, list); + + list_del_init(&em->list); + + /* + * If we had an error we just need to delete everybody from our + * private list. + */ + if (ret) { + clear_em_logging(tree, em); + free_extent_map(em); + continue; + } + + write_unlock(&tree->lock); + + ret = log_one_extent(trans, inode, root, em, path, logged_list); + write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); + } + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); + + btrfs_release_path(path); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -2696,35 +3858,57 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; - u32 size; + LIST_HEAD(logged_list); + u64 last_extent = 0; + int err = 0; int ret; int nritems; int ins_start_slot = 0; int ins_nr; - - log = root->log_root; + bool fast_search = false; + u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; dst_path = btrfs_alloc_path(); + if (!dst_path) { + btrfs_free_path(path); + return -ENOMEM; + } - min_key.objectid = inode->i_ino; + min_key.objectid = ino; min_key.type = BTRFS_INODE_ITEM_KEY; min_key.offset = 0; - max_key.objectid = inode->i_ino; + max_key.objectid = ino; - /* today the code can only do partial logging of directories */ - if (!S_ISDIR(inode->i_mode)) - inode_only = LOG_INODE_ALL; - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + /* today the code can only do partial logging of directories */ + if (S_ISDIR(inode->i_mode) || + (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags) && + inode_only == LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; max_key.offset = (u64)-1; + /* Only run delayed items if we are a dir or a new file */ + if (S_ISDIR(inode->i_mode) || + BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; + } + } + mutex_lock(&BTRFS_I(inode)->log_mutex); + btrfs_get_logged_extents(inode, &logged_list); + /* * a brute force approach to making sure we get the most uptodate * copies of everything. @@ -2734,29 +3918,54 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, - inode->i_ino, max_key_type); + ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { - ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); + ret = btrfs_truncate_inode_items(trans, log, + inode, 0, 0); + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || + inode_only == LOG_INODE_EXISTS) { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + max_key.type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, ino, + max_key.type); + } else { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + ret = log_inode_item(trans, log, dst_path, inode); + if (ret) { + err = ret; + goto out_unlock; + } + goto log_extents; + } + + } + if (ret) { + err = ret; + goto out_unlock; } - BUG_ON(ret); path->keep_locks = 1; while (1) { ins_nr = 0; - ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); + ret = btrfs_search_forward(root, &min_key, + path, trans->transid); if (ret != 0) break; again: /* note, ins_nr might be > 0 here, cleanup outside the loop */ - if (min_key.objectid != inode->i_ino) + if (min_key.objectid != ino) break; if (min_key.type > max_key.type) break; src = path->nodes[0]; - size = btrfs_item_size_nr(src, path->slots[0]); if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; goto next_slot; @@ -2766,9 +3975,16 @@ again: goto next_slot; } - ret = copy_items(trans, log, dst_path, src, ins_start_slot, - ins_nr, inode_only); - BUG_ON(ret); + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { + err = ret; + goto out_unlock; + } if (ret) { + ins_nr = 0; + btrfs_release_path(path); + continue; + } ins_nr = 1; ins_start_slot = path->slots[0]; next_slot: @@ -2781,43 +3997,77 @@ next_slot: goto again; } if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, - ins_start_slot, + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ret = 0; ins_nr = 0; } - btrfs_release_path(root, path); + btrfs_release_path(path); - if (min_key.offset < (u64)-1) + if (min_key.offset < (u64)-1) { min_key.offset++; - else if (min_key.type < (u8)-1) + } else if (min_key.type < max_key.type) { min_key.type++; - else if (min_key.objectid < (u64)-1) - min_key.objectid++; - else + min_key.offset = 0; + } else { break; + } } if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, - ins_start_slot, - ins_nr, inode_only); - BUG_ON(ret); + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ret = 0; ins_nr = 0; } - WARN_ON(ins_nr); + +log_extents: + btrfs_release_path(path); + btrfs_release_path(dst_path); + if (fast_search) { + ret = btrfs_log_changed_extents(trans, root, inode, dst_path, + &logged_list); + if (ret) { + err = ret; + goto out_unlock; + } + } else if (inode_only == LOG_INODE_ALL) { + struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em, *n; + + write_lock(&tree->lock); + list_for_each_entry_safe(em, n, &tree->modified_extents, list) + list_del_init(&em->list); + write_unlock(&tree->lock); + } + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - btrfs_release_path(root, path); - btrfs_release_path(log, dst_path); ret = log_directory_changes(trans, root, inode, path, dst_path); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } } BTRFS_I(inode)->logged_trans = trans->transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; +out_unlock: + if (unlikely(err)) + btrfs_put_logged_extents(&logged_list); + else + btrfs_submit_logged_extents(&logged_list, log); mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); btrfs_free_path(dst_path); - return 0; + return err; } /* @@ -2834,6 +4084,8 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root; + struct dentry *old_parent = NULL; + struct inode *orig_inode = inode; /* * for regular files, if its inode is already on disk, we don't @@ -2853,7 +4105,14 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, } while (1) { - BTRFS_I(inode)->logged_trans = trans->transid; + /* + * If we are logging a directory then we start with our inode, + * not our parents inode, so we need to skipp setting the + * logged_trans so that further down in the log code we don't + * think this inode has already been logged. + */ + if (inode != orig_inode) + BTRFS_I(inode)->logged_trans = trans->transid; smp_mb(); if (BTRFS_I(inode)->last_unlink_trans > last_committed) { @@ -2863,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * make sure any commits to the log are forced * to be full commits */ - root->fs_info->last_trans_log_full_commit = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 1; break; } @@ -2875,41 +4133,31 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; inode = parent->d_inode; } + dput(old_parent); out: return ret; } -static int inode_in_log(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - - mutex_lock(&root->log_mutex); - if (BTRFS_I(inode)->logged_trans == trans->transid && - BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) - ret = 1; - mutex_unlock(&root->log_mutex); - return ret; -} - - /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref * only logging is done of any parent directories that are older than * the last committed transaction */ -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) +static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only, + struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; + struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; @@ -2920,6 +4168,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } + /* + * The prev transaction commit doesn't complete, we need do + * full commit by ourselves. + */ if (root->fs_info->last_trans_log_full_commit > root->fs_info->last_trans_committed) { ret = 1; @@ -2937,15 +4189,18 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - if (inode_in_log(trans, inode)) { + if (btrfs_inode_in_log(inode, trans->transid)) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } - start_log_trans(trans, root); + ret = start_log_trans(trans, root, ctx); + if (ret) + goto end_no_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); + if (ret) + goto end_trans; /* * for regular files, if its inode is already on disk, we don't @@ -2955,8 +4210,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, */ if (S_ISREG(inode->i_mode) && BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto no_parent; + BTRFS_I(inode)->last_unlink_trans <= last_committed) { + ret = 0; + goto end_trans; + } inode_only = LOG_INODE_EXISTS; while (1) { @@ -2970,15 +4227,26 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); + if (ret) + goto end_trans; } if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; } -no_parent: ret = 0; +end_trans: + dput(old_parent); + if (ret < 0) { + btrfs_set_log_full_commit(root->fs_info, trans); + ret = 1; + } + + if (ret) + btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); end_no_trans: return ret; @@ -2991,10 +4259,17 @@ end_no_trans: * data on disk. */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx) { - return btrfs_log_inode_parent(trans, root, dentry->d_inode, - dentry->d_parent, 0); + struct dentry *parent = dget_parent(dentry); + int ret; + + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + 0, ctx); + dput(parent); + + return ret; } /* @@ -3016,16 +4291,27 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) .stage = 0, }; - fs_info->log_root_recovering = 1; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; - trans = btrfs_start_transaction(fs_info->tree_root, 1); + fs_info->log_root_recovering = 1; + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto error; + } wc.trans = trans; wc.pin = 1; - walk_log_tree(trans, log_root_tree, &wc); + ret = walk_log_tree(trans, log_root_tree, &wc); + if (ret) { + btrfs_error(fs_info, ret, "Failed to pin buffers while " + "recovering log root tree."); + goto error; + } again: key.objectid = BTRFS_TREE_LOG_OBJECTID; @@ -3034,8 +4320,12 @@ again: while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); - if (ret < 0) - break; + + if (ret < 0) { + btrfs_error(fs_info, ret, + "Couldn't find tree log root."); + goto error; + } if (ret > 0) { if (path->slots[0] == 0) break; @@ -3043,31 +4333,40 @@ again: } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - btrfs_release_path(log_root_tree, path); + btrfs_release_path(path); if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_fs_root_no_radix(log_root_tree, - &found_key); - BUG_ON(!log); - + log = btrfs_read_fs_root(log_root_tree, &found_key); + if (IS_ERR(log)) { + ret = PTR_ERR(log); + btrfs_error(fs_info, ret, + "Couldn't read tree log root."); + goto error; + } tmp_key.objectid = found_key.offset; tmp_key.type = BTRFS_ROOT_ITEM_KEY; tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); - BUG_ON(!wc.replay_dest); + if (IS_ERR(wc.replay_dest)) { + ret = PTR_ERR(wc.replay_dest); + free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); + kfree(log); + btrfs_error(fs_info, ret, "Couldn't read target root " + "for tree log recovery."); + goto error; + } wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); - if (wc.stage == LOG_WALK_REPLAY_ALL) { + if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, path); - BUG_ON(ret); } key.offset = found_key.offset - 1; @@ -3076,10 +4375,13 @@ again: free_extent_buffer(log->commit_root); kfree(log); + if (ret) + goto error; + if (found_key.offset == 0) break; } - btrfs_release_path(log_root_tree, path); + btrfs_release_path(path); /* step one is to pin it all, step two is to replay just inodes */ if (wc.pin) { @@ -3096,15 +4398,22 @@ again: btrfs_free_path(path); + /* step 4: commit the transaction, which also unpins the blocks */ + ret = btrfs_commit_transaction(trans, fs_info->tree_root); + if (ret) + return ret; + free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; fs_info->log_root_recovering = 0; - - /* step 4: commit the transaction, which also unpins the blocks */ - btrfs_commit_transaction(trans, fs_info->tree_root); - kfree(log_root_tree); + return 0; +error: + if (wc.trans) + btrfs_end_transaction(wc.trans, fs_info->tree_root); + btrfs_free_path(path); + return ret; } /* @@ -3194,6 +4503,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1); + return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 0776eacb508..7f5b41bd537 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,15 +19,47 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +#include "ctree.h" +#include "transaction.h" + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +struct btrfs_log_ctx { + int log_ret; + int log_transid; + struct list_head list; +}; + +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + INIT_LIST_HEAD(&ctx->list); +} + +static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid; +} + +static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + return ACCESS_ONCE(fs_info->last_trans_log_full_commit) == + trans->transid; +} + int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -36,12 +68,8 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *inode, u64 dirid); -int btrfs_join_running_log_trans(struct btrfs_root *root); -int btrfs_end_log_trans(struct btrfs_root *root); +void btrfs_end_log_trans(struct btrfs_root *root); int btrfs_pin_log_trans(struct btrfs_root *root); -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, int for_rename); diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c new file mode 100644 index 00000000000..840a38b2778 --- /dev/null +++ b/fs/btrfs/ulist.c @@ -0,0 +1,251 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen <sensille@gmx.net> + * Distributed under the GNU GPL license version 2. + */ + +#include <linux/slab.h> +#include "ulist.h" +#include "ctree.h" + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * A sample usage for ulists is the enumeration of directed graphs without + * visiting a node twice. The pseudo-code could look like this: + * + * ulist = ulist_alloc(); + * ulist_add(ulist, root); + * ULIST_ITER_INIT(&uiter); + * + * while ((elem = ulist_next(ulist, &uiter)) { + * for (all child nodes n in elem) + * ulist_add(ulist, n); + * do something useful with the node; + * } + * ulist_free(ulist); + * + * This assumes the graph nodes are adressable by u64. This stems from the + * usage for tree enumeration in btrfs, where the logical addresses are + * 64 bit. + * + * It is also useful for tree enumeration which could be done elegantly + * recursively, but is not possible due to kernel stack limitations. The + * loop would be similar to the above. + */ + +/** + * ulist_init - freshly initialize a ulist + * @ulist: the ulist to initialize + * + * Note: don't use this function to init an already used ulist, use + * ulist_reinit instead. + */ +void ulist_init(struct ulist *ulist) +{ + INIT_LIST_HEAD(&ulist->nodes); + ulist->root = RB_ROOT; + ulist->nnodes = 0; +} + +/** + * ulist_fini - free up additionally allocated memory for the ulist + * @ulist: the ulist from which to free the additional memory + * + * This is useful in cases where the base 'struct ulist' has been statically + * allocated. + */ +static void ulist_fini(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_node *next; + + list_for_each_entry_safe(node, next, &ulist->nodes, list) { + kfree(node); + } + ulist->root = RB_ROOT; + INIT_LIST_HEAD(&ulist->nodes); +} + +/** + * ulist_reinit - prepare a ulist for reuse + * @ulist: ulist to be reused + * + * Free up all additional memory allocated for the list elements and reinit + * the ulist. + */ +void ulist_reinit(struct ulist *ulist) +{ + ulist_fini(ulist); + ulist_init(ulist); +} + +/** + * ulist_alloc - dynamically allocate a ulist + * @gfp_mask: allocation flags to for base allocation + * + * The allocated ulist will be returned in an initialized state. + */ +struct ulist *ulist_alloc(gfp_t gfp_mask) +{ + struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); + + if (!ulist) + return NULL; + + ulist_init(ulist); + + return ulist; +} + +/** + * ulist_free - free dynamically allocated ulist + * @ulist: ulist to free + * + * It is not necessary to call ulist_fini before. + */ +void ulist_free(struct ulist *ulist) +{ + if (!ulist) + return; + ulist_fini(ulist); + kfree(ulist); +} + +static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) +{ + struct rb_node *n = ulist->root.rb_node; + struct ulist_node *u = NULL; + + while (n) { + u = rb_entry(n, struct ulist_node, rb_node); + if (u->val < val) + n = n->rb_right; + else if (u->val > val) + n = n->rb_left; + else + return u; + } + return NULL; +} + +static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) +{ + struct rb_node **p = &ulist->root.rb_node; + struct rb_node *parent = NULL; + struct ulist_node *cur = NULL; + + while (*p) { + parent = *p; + cur = rb_entry(parent, struct ulist_node, rb_node); + + if (cur->val < ins->val) + p = &(*p)->rb_right; + else if (cur->val > ins->val) + p = &(*p)->rb_left; + else + return -EEXIST; + } + rb_link_node(&ins->rb_node, parent, p); + rb_insert_color(&ins->rb_node, &ulist->root); + return 0; +} + +/** + * ulist_add - add an element to the ulist + * @ulist: ulist to add the element to + * @val: value to add to ulist + * @aux: auxiliary value to store along with val + * @gfp_mask: flags to use for allocation + * + * Note: locking must be provided by the caller. In case of rwlocks write + * locking is needed + * + * Add an element to a ulist. The @val will only be added if it doesn't + * already exist. If it is added, the auxiliary value @aux is stored along with + * it. In case @val already exists in the ulist, @aux is ignored, even if + * it differs from the already stored value. + * + * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been + * inserted. + * In case of allocation failure -ENOMEM is returned and the ulist stays + * unaltered. + */ +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) +{ + return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); +} + +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask) +{ + int ret; + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + if (node) { + if (old_aux) + *old_aux = node->aux; + return 0; + } + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + return -ENOMEM; + + node->val = val; + node->aux = aux; +#ifdef CONFIG_BTRFS_DEBUG + node->seqnum = ulist->nnodes; +#endif + + ret = ulist_rbtree_insert(ulist, node); + ASSERT(!ret); + list_add_tail(&node->list, &ulist->nodes); + ulist->nnodes++; + + return 1; +} + +/** + * ulist_next - iterate ulist + * @ulist: ulist to iterate + * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) + * + * Note: locking must be provided by the caller. In case of rwlocks only read + * locking is needed + * + * This function is used to iterate an ulist. + * It returns the next element from the ulist or %NULL when the + * end is reached. No guarantee is made with respect to the order in which + * the elements are returned. They might neither be returned in order of + * addition nor in ascending order. + * It is allowed to call ulist_add during an enumeration. Newly added items + * are guaranteed to show up in the running enumeration. + */ +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) +{ + struct ulist_node *node; + + if (list_empty(&ulist->nodes)) + return NULL; + if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes) + return NULL; + if (uiter->cur_list) { + uiter->cur_list = uiter->cur_list->next; + } else { + uiter->cur_list = ulist->nodes.next; +#ifdef CONFIG_BTRFS_DEBUG + uiter->i = 0; +#endif + } + node = list_entry(uiter->cur_list, struct ulist_node, list); +#ifdef CONFIG_BTRFS_DEBUG + ASSERT(node->seqnum == uiter->i); + ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); + uiter->i++; +#endif + return node; +} diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h new file mode 100644 index 00000000000..7f78cbf5cf4 --- /dev/null +++ b/fs/btrfs/ulist.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen <sensille@gmx.net> + * Distributed under the GNU GPL license version 2. + * + */ + +#ifndef __ULIST__ +#define __ULIST__ + +#include <linux/list.h> +#include <linux/rbtree.h> + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + */ +struct ulist_iterator { +#ifdef CONFIG_BTRFS_DEBUG + int i; +#endif + struct list_head *cur_list; /* hint to start search */ +}; + +/* + * element of the list + */ +struct ulist_node { + u64 val; /* value to store */ + u64 aux; /* auxiliary value saved along with the val */ + +#ifdef CONFIG_BTRFS_DEBUG + int seqnum; /* sequence number this node is added */ +#endif + + struct list_head list; /* used to link node */ + struct rb_node rb_node; /* used to speed up search */ +}; + +struct ulist { + /* + * number of elements stored in list + */ + unsigned long nnodes; + + struct list_head nodes; + struct rb_root root; +}; + +void ulist_init(struct ulist *ulist); +void ulist_reinit(struct ulist *ulist); +struct ulist *ulist_alloc(gfp_t gfp_mask); +void ulist_free(struct ulist *ulist); +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask); +struct ulist_node *ulist_next(struct ulist *ulist, + struct ulist_iterator *uiter); + +#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) + +#endif diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c new file mode 100644 index 00000000000..f6a4c03ee7d --- /dev/null +++ b/fs/btrfs/uuid-tree.c @@ -0,0 +1,355 @@ +/* + * Copyright (C) STRATO AG 2013. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/uuid.h> +#include <asm/unaligned.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + + +static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) +{ + key->type = type; + key->objectid = get_unaligned_le64(uuid); + key->offset = get_unaligned_le64(uuid + sizeof(u64)); +} + +/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */ +static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, + u8 type, u64 subid) +{ + int ret; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + int slot; + u32 item_size; + unsigned long offset; + struct btrfs_key key; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -ENOENT; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + offset = btrfs_item_ptr_offset(eb, slot); + ret = -ENOENT; + + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", + (unsigned long)item_size); + goto out; + } + while (item_size) { + __le64 data; + + read_extent_buffer(eb, &data, offset, sizeof(data)); + if (le64_to_cpu(data) == subid) { + ret = 0; + break; + } + offset += sizeof(data); + item_size -= sizeof(data); + } + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid_cpu) +{ + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + __le64 subid_le; + + ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu); + if (ret != -ENOENT) + return ret; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, + sizeof(subid_le)); + if (ret >= 0) { + /* Add an item for the type for the first time */ + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + } else if (ret == -EEXIST) { + /* + * An item with that type already exists. + * Extend the item and store the new subid at the end. + */ + btrfs_extend_item(uuid_root, path, sizeof(subid_le)); + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); + } else if (ret < 0) { + btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d " + "(0x%016llx, 0x%016llx) type %u!", + ret, (unsigned long long)key.objectid, + (unsigned long long)key.offset, type); + goto out; + } + + ret = 0; + subid_le = cpu_to_le64(subid_cpu); + write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid) +{ + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + u32 item_size; + unsigned long move_dst; + unsigned long move_src; + unsigned long move_len; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); + if (ret < 0) { + btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!", + ret); + goto out; + } + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + item_size = btrfs_item_size_nr(eb, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", + (unsigned long)item_size); + ret = -ENOENT; + goto out; + } + while (item_size) { + __le64 read_subid; + + read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid)); + if (le64_to_cpu(read_subid) == subid) + break; + offset += sizeof(read_subid); + item_size -= sizeof(read_subid); + } + + if (!item_size) { + ret = -ENOENT; + goto out; + } + + item_size = btrfs_item_size_nr(eb, slot); + if (item_size == sizeof(subid)) { + ret = btrfs_del_item(trans, uuid_root, path); + goto out; + } + + move_dst = offset; + move_src = offset + sizeof(subid); + move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); + memmove_extent_buffer(eb, move_dst, move_src, move_len); + btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1); + +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* 1 - for the uuid item */ + trans = btrfs_start_transaction(uuid_root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid); + btrfs_end_transaction(trans, uuid_root); + +out: + return ret; +} + +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, + int (*check_func)(struct btrfs_fs_info *, u8 *, u8, + u64)) +{ + struct btrfs_root *root = fs_info->uuid_root; + struct btrfs_key key; + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + int slot; + u32 item_size; + unsigned long offset; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = 0; + key.offset = 0; + +again_search_slot: + path->keep_locks = 1; + ret = btrfs_search_forward(root, &key, path, 0); + if (ret) { + if (ret > 0) + ret = 0; + goto out; + } + + while (1) { + cond_resched(); + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_UUID_KEY_SUBVOL && + key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto skip; + + offset = btrfs_item_ptr_offset(leaf, slot); + item_size = btrfs_item_size_nr(leaf, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(fs_info, "uuid item with illegal size %lu!", + (unsigned long)item_size); + goto skip; + } + while (item_size) { + u8 uuid[BTRFS_UUID_SIZE]; + __le64 subid_le; + u64 subid_cpu; + + put_unaligned_le64(key.objectid, uuid); + put_unaligned_le64(key.offset, uuid + sizeof(u64)); + read_extent_buffer(leaf, &subid_le, offset, + sizeof(subid_le)); + subid_cpu = le64_to_cpu(subid_le); + ret = check_func(fs_info, uuid, key.type, subid_cpu); + if (ret < 0) + goto out; + if (ret > 0) { + btrfs_release_path(path); + ret = btrfs_uuid_iter_rem(root, uuid, key.type, + subid_cpu); + if (ret == 0) { + /* + * this might look inefficient, but the + * justification is that it is an + * exception that check_func returns 1, + * and that in the regular case only one + * entry per UUID exists. + */ + goto again_search_slot; + } + if (ret < 0 && ret != -ENOENT) + goto out; + } + item_size -= sizeof(subid_le); + offset += sizeof(subid_le); + } + +skip: + ret = btrfs_next_item(root, path); + if (ret == 0) + continue; + else if (ret > 0) + ret = 0; + break; + } + +out: + btrfs_free_path(path); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); + return 0; +} diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h deleted file mode 100644 index 9bf3946d5ef..00000000000 --- a/fs/btrfs/version.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __BTRFS_VERSION_H -#define __BTRFS_VERSION_H -#define BTRFS_BUILD_VERSION "Btrfs" -#endif diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh deleted file mode 100644 index 1ca1952fd91..00000000000 --- a/fs/btrfs/version.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# -# determine-version -- report a useful version for releases -# -# Copyright 2008, Aron Griffis <agriffis@n01se.net> -# Copyright 2008, Oracle -# Released under the GNU GPLv2 - -v="v0.16" - -which git &> /dev/null -if [ $? == 0 ]; then - git branch >& /dev/null - if [ $? == 0 ]; then - if head=`git rev-parse --verify HEAD 2>/dev/null`; then - if tag=`git describe --tags 2>/dev/null`; then - v="$tag" - fi - - # Are there uncommitted changes? - git update-index --refresh --unmerged > /dev/null - if git diff-index --name-only HEAD | \ - grep -v "^scripts/package" \ - | read dummy; then - v="$v"-dirty - fi - fi - fi -fi - -echo "#ifndef __BUILD_VERSION" > .build-version.h -echo "#define __BUILD_VERSION" >> .build-version.h -echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h -echo "#endif" >> .build-version.h - -diff -q version.h .build-version.h >& /dev/null - -if [ $? == 0 ]; then - rm .build-version.h - exit 0 -fi - -mv .build-version.h version.h diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index aa7dc36dac7..6cb82f62cb7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -22,56 +22,87 @@ #include <linux/blkdev.h> #include <linux/random.h> #include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/ratelimit.h> +#include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <linux/semaphore.h> #include <asm/div64.h> -#include "compat.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" #include "transaction.h" #include "print-tree.h" #include "volumes.h" +#include "raid56.h" #include "async-thread.h" - -struct map_lookup { - u64 type; - int io_align; - int io_width; - int stripe_len; - int sector_size; - int num_stripes; - int sub_stripes; - struct btrfs_bio_stripe stripes[]; -}; +#include "check-integrity.h" +#include "rcu-string.h" +#include "math.h" +#include "dev-replace.h" +#include "sysfs.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); static int btrfs_relocate_sys_chunks(struct btrfs_root *root); - -#define map_lookup_size(n) (sizeof(struct map_lookup) + \ - (sizeof(struct btrfs_bio_stripe) * (n))) +static void __btrfs_reset_dev_stats(struct btrfs_device *dev); +static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); +static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); -void btrfs_lock_volumes(void) +static void lock_chunks(struct btrfs_root *root) { - mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->chunk_mutex); } -void btrfs_unlock_volumes(void) +static void unlock_chunks(struct btrfs_root *root) { - mutex_unlock(&uuid_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); } -static void lock_chunks(struct btrfs_root *root) +static struct btrfs_fs_devices *__alloc_fs_devices(void) { - mutex_lock(&root->fs_info->chunk_mutex); + struct btrfs_fs_devices *fs_devs; + + fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); + if (!fs_devs) + return ERR_PTR(-ENOMEM); + + mutex_init(&fs_devs->device_list_mutex); + + INIT_LIST_HEAD(&fs_devs->devices); + INIT_LIST_HEAD(&fs_devs->alloc_list); + INIT_LIST_HEAD(&fs_devs->list); + + return fs_devs; } -static void unlock_chunks(struct btrfs_root *root) +/** + * alloc_fs_devices - allocate struct btrfs_fs_devices + * @fsid: a pointer to UUID for this FS. If NULL a new UUID is + * generated. + * + * Return: a pointer to a new &struct btrfs_fs_devices on success; + * ERR_PTR() on error. Returned struct is not linked onto any lists and + * can be destroyed with kfree() right away. + */ +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) { - mutex_unlock(&root->fs_info->chunk_mutex); + struct btrfs_fs_devices *fs_devs; + + fs_devs = __alloc_fs_devices(); + if (IS_ERR(fs_devs)) + return fs_devs; + + if (fsid) + memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + else + generate_random_uuid(fs_devs->fsid); + + return fs_devs; } static void free_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -82,13 +113,26 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - kfree(device->name); + rcu_string_free(device->name); kfree(device); } kfree(fs_devices); } -int btrfs_cleanup_fs_uuids(void) +static void btrfs_kobject_uevent(struct block_device *bdev, + enum kobject_action action) +{ + int ret; + + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); + if (ret) + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", + action, + kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), + &disk_to_dev(bdev->bd_disk)->kobj); +} + +void btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -98,7 +142,27 @@ int btrfs_cleanup_fs_uuids(void) list_del(&fs_devices->list); free_fs_devices(fs_devices); } - return 0; +} + +static struct btrfs_device *__alloc_device(void) +{ + struct btrfs_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_NOFS); + if (!dev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dev->dev_list); + INIT_LIST_HEAD(&dev->dev_alloc_list); + + spin_lock_init(&dev->io_lock); + + spin_lock_init(&dev->reada_lock); + atomic_set(&dev->reada_in_flight, 0); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); + + return dev; } static noinline struct btrfs_device *__find_device(struct list_head *head, @@ -126,6 +190,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static int +btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, + int flush, struct block_device **bdev, + struct buffer_head **bh) +{ + int ret; + + *bdev = blkdev_get_by_path(device_path, flags, holder); + + if (IS_ERR(*bdev)) { + ret = PTR_ERR(*bdev); + printk(KERN_INFO "BTRFS: open %s failed\n", device_path); + goto error; + } + + if (flush) + filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + ret = set_blocksize(*bdev, 4096); + if (ret) { + blkdev_put(*bdev, flags); + goto error; + } + invalidate_bdev(*bdev); + *bh = btrfs_read_dev_super(*bdev); + if (!*bh) { + ret = -EINVAL; + blkdev_put(*bdev, flags); + goto error; + } + + return 0; + +error: + *bdev = NULL; + *bh = NULL; + return ret; +} + static void requeue_list(struct btrfs_pending_bios *pending_bios, struct bio *head, struct bio *tail) { @@ -151,7 +253,7 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios, * the list if the block device is congested. This way, multiple devices * can make progress from a single worker thread. */ -static noinline int run_scheduled_bios(struct btrfs_device *device) +static noinline void run_scheduled_bios(struct btrfs_device *device) { struct bio *pending; struct backing_dev_info *bdi; @@ -161,22 +263,26 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) struct bio *cur; int again = 0; unsigned long num_run; - unsigned long num_sync_run; unsigned long batch_run = 0; unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; + int sync_pending = 0; + struct blk_plug plug; + + /* + * this function runs all the bios we've collected for + * a particular device. We don't want to wander off to + * another device without first sending all of these down. + * So, setup a plug here and finish it off before we return + */ + blk_start_plug(&plug); bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; - /* we want to make sure that every time we switch from the sync - * list to the normal list, we unplug - */ - num_sync_run = 0; - loop: spin_lock(&device->io_lock); @@ -222,15 +328,6 @@ loop_lock: spin_unlock(&device->io_lock); - /* - * if we're doing the regular priority list, make sure we unplug - * for any high prio bios we've sent down - */ - if (pending_bios == &device->pending_bios && num_sync_run > 0) { - num_sync_run = 0; - blk_run_backing_dev(bdi, NULL); - } - while (pending) { rmb(); @@ -250,27 +347,34 @@ loop_lock: cur = pending; pending = pending->bi_next; cur->bi_next = NULL; - atomic_dec(&fs_info->nr_async_bios); - if (atomic_read(&fs_info->nr_async_bios) < limit && + if (atomic_dec_return(&fs_info->nr_async_bios) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); BUG_ON(atomic_read(&cur->bi_cnt) == 0); - if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) - num_sync_run++; + /* + * if we're doing the sync list, record that our + * plug has some sync requests on it + * + * If we're doing the regular list and there are + * sync requests sitting around, unplug before + * we add more + */ + if (pending_bios == &device->pending_sync_bios) { + sync_pending = 1; + } else if (sync_pending) { + blk_finish_plug(&plug); + blk_start_plug(&plug); + sync_pending = 0; + } - submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; - if (need_resched()) { - if (num_sync_run) { - blk_run_backing_dev(bdi, NULL); - num_sync_run = 0; - } + if (need_resched()) cond_resched(); - } /* * we made progress, there is more work to do and the bdi @@ -303,13 +407,8 @@ loop_lock: * against it before looping */ last_waited = ioc->last_waited; - if (need_resched()) { - if (num_sync_run) { - blk_run_backing_dev(bdi, NULL); - num_sync_run = 0; - } + if (need_resched()) cond_resched(); - } continue; } spin_lock(&device->io_lock); @@ -317,27 +416,18 @@ loop_lock: device->running_pending = 1; spin_unlock(&device->io_lock); - btrfs_requeue_work(&device->work); + btrfs_queue_work(fs_info->submit_workers, + &device->work); goto done; } + /* unplug every 64 requests just for good measure */ + if (batch_run % 64 == 0) { + blk_finish_plug(&plug); + blk_start_plug(&plug); + sync_pending = 0; + } } - if (num_sync_run) { - num_sync_run = 0; - blk_run_backing_dev(bdi, NULL); - } - /* - * IO has already been through a long path to get here. Checksumming, - * async helper threads, perhaps compression. We've done a pretty - * good job of collecting a batch of IO and should just unplug - * the device right away. - * - * This will help anyone who is waiting on the IO, they might have - * already unplugged, but managed to do so before the bio they - * cared about found its way down here. - */ - blk_run_backing_dev(bdi, NULL); - cond_resched(); if (again) goto loop; @@ -348,7 +438,7 @@ loop_lock: spin_unlock(&device->io_lock); done: - return 0; + blk_finish_plug(&plug); } static void pending_bios_fn(struct btrfs_work *work) @@ -359,27 +449,34 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } +/* + * Add new device to list of registered devices + * + * Returns: + * 1 - first time device is seen + * 0 - device already known + * < 0 - error + */ static noinline int device_list_add(const char *path, struct btrfs_super_block *disk_super, u64 devid, struct btrfs_fs_devices **fs_devices_ret) { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; + struct rcu_string *name; + int ret = 0; u64 found_transid = btrfs_super_generation(disk_super); - char *name; fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { - fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!fs_devices) - return -ENOMEM; - INIT_LIST_HEAD(&fs_devices->devices); - INIT_LIST_HEAD(&fs_devices->alloc_list); + fs_devices = alloc_fs_devices(disk_super->fsid); + if (IS_ERR(fs_devices)) + return PTR_ERR(fs_devices); + list_add(&fs_devices->list, &fs_uuids); - memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; - mutex_init(&fs_devices->device_list_mutex); + device = NULL; } else { device = __find_device(&fs_devices->devices, devid, @@ -389,36 +486,37 @@ static noinline int device_list_add(const char *path, if (fs_devices->opened) return -EBUSY; - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { + device = btrfs_alloc_device(NULL, &devid, + disk_super->dev_item.uuid); + if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ - return -ENOMEM; + return PTR_ERR(device); } - device->devid = devid; - device->work.func = pending_bios_fn; - memcpy(device->uuid, disk_super->dev_item.uuid, - BTRFS_UUID_SIZE); - device->barriers = 1; - spin_lock_init(&device->io_lock); - device->name = kstrdup(path, GFP_NOFS); - if (!device->name) { + + name = rcu_string_strdup(path, GFP_NOFS); + if (!name) { kfree(device); return -ENOMEM; } - INIT_LIST_HEAD(&device->dev_alloc_list); + rcu_assign_pointer(device->name, name); mutex_lock(&fs_devices->device_list_mutex); - list_add(&device->dev_list, &fs_devices->devices); + list_add_rcu(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); + ret = 1; device->fs_devices = fs_devices; - fs_devices->num_devices++; - } else if (strcmp(device->name, path)) { - name = kstrdup(path, GFP_NOFS); + } else if (!device->name || strcmp(device->name->str, path)) { + name = rcu_string_strdup(path, GFP_NOFS); if (!name) return -ENOMEM; - kfree(device->name); - device->name = name; + rcu_string_free(device->name); + rcu_assign_pointer(device->name, name); + if (device->missing) { + fs_devices->missing_devices--; + device->missing = 0; + } } if (found_transid > fs_devices->latest_trans) { @@ -426,7 +524,8 @@ static noinline int device_list_add(const char *path, fs_devices->latest_trans = found_transid; } *fs_devices_ret = fs_devices; - return 0; + + return ret; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) @@ -435,85 +534,135 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *device; struct btrfs_device *orig_dev; - fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!fs_devices) - return ERR_PTR(-ENOMEM); + fs_devices = alloc_fs_devices(orig->fsid); + if (IS_ERR(fs_devices)) + return fs_devices; - INIT_LIST_HEAD(&fs_devices->devices); - INIT_LIST_HEAD(&fs_devices->alloc_list); - INIT_LIST_HEAD(&fs_devices->list); - mutex_init(&fs_devices->device_list_mutex); fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_trans = orig->latest_trans; - memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + fs_devices->total_devices = orig->total_devices; - mutex_lock(&orig->device_list_mutex); + /* We have held the volume lock, it is safe to get the devices. */ list_for_each_entry(orig_dev, &orig->devices, dev_list) { - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) - goto error; + struct rcu_string *name; - device->name = kstrdup(orig_dev->name, GFP_NOFS); - if (!device->name) { - kfree(device); + device = btrfs_alloc_device(NULL, &orig_dev->devid, + orig_dev->uuid); + if (IS_ERR(device)) goto error; - } - device->devid = orig_dev->devid; - device->work.func = pending_bios_fn; - memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); - device->barriers = 1; - spin_lock_init(&device->io_lock); - INIT_LIST_HEAD(&device->dev_list); - INIT_LIST_HEAD(&device->dev_alloc_list); + /* + * This is ok to do without rcu read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + if (orig_dev->name) { + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { + kfree(device); + goto error; + } + rcu_assign_pointer(device->name, name); + } list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; } - mutex_unlock(&orig->device_list_mutex); return fs_devices; error: - mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(-ENOMEM); } -int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; + struct block_device *latest_bdev = NULL; + u64 latest_devid = 0; + u64 latest_transid = 0; + mutex_lock(&uuid_mutex); again: - mutex_lock(&fs_devices->device_list_mutex); + /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) + if (device->in_fs_metadata) { + if (!device->is_tgtdev_for_dev_replace && + (!latest_transid || + device->generation > latest_transid)) { + latest_devid = device->devid; + latest_transid = device->generation; + latest_bdev = device->bdev; + } continue; + } + if (device->devid == BTRFS_DEV_REPLACE_DEVID) { + /* + * In the first step, keep the device which has + * the correct fsid and the devid that is used + * for the dev_replace procedure. + * In the second step, the dev_replace state is + * read from the device tree and it is known + * whether the procedure is really active or + * not, which means whether this device is + * used or whether it should be removed. + */ + if (step == 0 || device->is_tgtdev_for_dev_replace) { + continue; + } + } if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; fs_devices->open_devices--; } if (device->writeable) { list_del_init(&device->dev_alloc_list); device->writeable = 0; - fs_devices->rw_devices--; + if (!device->is_tgtdev_for_dev_replace) + fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; - kfree(device->name); + rcu_string_free(device->name); kfree(device); } - mutex_unlock(&fs_devices->device_list_mutex); if (fs_devices->seed) { fs_devices = fs_devices->seed; goto again; } + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + mutex_unlock(&uuid_mutex); - return 0; +} + +static void __free_device(struct work_struct *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, rcu_work); + + if (device->bdev) + blkdev_put(device->bdev, device->mode); + + rcu_string_free(device->name); + kfree(device); +} + +static void free_device(struct rcu_head *head) +{ + struct btrfs_device *device; + + device = container_of(head, struct btrfs_device, rcu); + + INIT_WORK(&device->rcu_work, __free_device); + schedule_work(&device->rcu_work); } static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) @@ -523,20 +672,43 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) if (--fs_devices->opened > 0) return 0; + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) fs_devices->open_devices--; - } - if (device->writeable) { + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } - device->bdev = NULL; - device->writeable = 0; - device->in_fs_metadata = 0; + if (device->can_discard) + fs_devices->num_can_discard--; + if (device->missing) + fs_devices->missing_devices--; + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); } + mutex_unlock(&fs_devices->device_list_mutex); + WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; @@ -564,12 +736,19 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); } + /* + * Wait for rcu kworkers under __btrfs_close_devices + * to finish all blkdev_puts so device is really + * free when umount is done. + */ + rcu_barrier(); return ret; } static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { + struct request_queue *q; struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; @@ -582,22 +761,18 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int seeding = 1; int ret = 0; + flags |= FMODE_EXCL; + list_for_each_entry(device, head, dev_list) { if (device->bdev) continue; if (!device->name) continue; - bdev = open_bdev_exclusive(device->name, flags, holder); - if (IS_ERR(bdev)) { - printk(KERN_INFO "open %s failed\n", device->name); - goto error; - } - set_blocksize(bdev, 4096); - - bh = btrfs_read_dev_super(bdev); - if (!bh) - goto error_close; + /* Just open everything we can; ignore failures here */ + if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh)) + continue; disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -622,6 +797,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, seeding = 0; } + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) { + device->can_discard = 1; + fs_devices->num_can_discard++; + } + device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; @@ -630,22 +811,22 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fs_devices->rotating = 1; fs_devices->open_devices++; - if (device->writeable) { + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { fs_devices->rw_devices++; list_add(&device->dev_alloc_list, &fs_devices->alloc_list); } + brelse(bh); continue; error_brelse: brelse(bh); -error_close: - close_bdev_exclusive(bdev, FMODE_READ); -error: + blkdev_put(bdev, flags); continue; } if (fs_devices->open_devices == 0) { - ret = -EIO; + ret = -EINVAL; goto out; } fs_devices->seeding = seeding; @@ -674,109 +855,290 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } +/* + * Look for a btrfs signature on a device. This may be called out of the mount path + * and we are not allowed to call set_blocksize during the scan. The superblock + * is read via pagecache + */ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret) { struct btrfs_super_block *disk_super; struct block_device *bdev; - struct buffer_head *bh; - int ret; + struct page *page; + void *p; + int ret = -EINVAL; u64 devid; u64 transid; + u64 total_devices; + u64 bytenr; + pgoff_t index; + /* + * we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + bytenr = btrfs_sb_offset(0); + flags |= FMODE_EXCL; mutex_lock(&uuid_mutex); - bdev = open_bdev_exclusive(path, flags, holder); + bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto error; } - ret = set_blocksize(bdev, 4096); - if (ret) - goto error_close; - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EIO; - goto error_close; - } - disk_super = (struct btrfs_super_block *)bh->b_data; + /* make sure our super fits in the device */ + if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) + goto error_bdev_put; + + /* make sure our super fits in the page */ + if (sizeof(*disk_super) > PAGE_CACHE_SIZE) + goto error_bdev_put; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_CACHE_SHIFT; + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) + goto error_bdev_put; + + /* pull in the page with our super */ + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_NOFS); + + if (IS_ERR_OR_NULL(page)) + goto error_bdev_put; + + p = kmap(page); + + /* align our pointer to the offset of the super block */ + disk_super = p + (bytenr & ~PAGE_CACHE_MASK); + + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) + goto error_unmap; + devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); - if (disk_super->label[0]) - printk(KERN_INFO "device label %s ", disk_super->label); - else { - /* FIXME, make a readl uuid parser */ - printk(KERN_INFO "device fsid %llx-%llx ", - *(unsigned long long *)disk_super->fsid, - *(unsigned long long *)(disk_super->fsid + 8)); - } - printk(KERN_CONT "devid %llu transid %llu %s\n", - (unsigned long long)devid, (unsigned long long)transid, path); + total_devices = btrfs_super_num_devices(disk_super); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + if (ret > 0) { + if (disk_super->label[0]) { + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; + printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); + } else { + printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); + } - brelse(bh); -error_close: - close_bdev_exclusive(bdev, flags); + printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); + ret = 0; + } + if (!ret && fs_devices_ret) + (*fs_devices_ret)->total_devices = total_devices; + +error_unmap: + kunmap(page); + page_cache_release(page); + +error_bdev_put: + blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); return ret; } +/* helper to account the used device space in the range */ +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { + *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int contains_pending_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 *start, u64 len) +{ + struct extent_map *em; + int ret = 0; + + list_for_each_entry(em, &trans->transaction->pending_chunks, list) { + struct map_lookup *map; + int i; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev != device) + continue; + if (map->stripes[i].physical >= *start + len || + map->stripes[i].physical + em->orig_block_len <= + *start) + continue; + *start = map->stripes[i].physical + + em->orig_block_len; + ret = 1; + } + } + + return ret; +} + + /* + * find_free_dev_extent - find free space in the specified device + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size of the max + * free space if we don't find suitable free space + * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number * of extents + * + * @start is used to store the start of the free space if we find. But if we + * don't find suitable free space, it will be used to store the start position + * of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. */ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail) + u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; - u64 hole_size = 0; - u64 last_byte = 0; - u64 search_start = 0; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; u64 search_end = device->total_bytes; int ret; - int slot = 0; - int start_found; + int slot; struct extent_buffer *l; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 2; - start_found = 0; - /* FIXME use last free of some kind */ /* we don't want to overwrite the superblock on the drive, * so we make sure to start at an offset of at least 1MB */ - search_start = max((u64)1024 * 1024, search_start); + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: + max_hole_start = search_start; + max_hole_size = 0; + hole_size = 0; + + if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { + ret = -ENOSPC; + goto out; + } - if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) - search_start = max(root->fs_info->alloc_start, search_start); + path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto error; + goto out; if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, key.type); if (ret < 0) - goto error; - if (ret > 0) - start_found = 1; + goto out; } - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { l = path->nodes[0]; slot = path->slots[0]; @@ -785,24 +1147,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, if (ret == 0) continue; if (ret < 0) - goto error; -no_more_items: - if (!start_found) { - if (search_start >= search_end) { - ret = -ENOSPC; - goto error; - } - *start = search_start; - start_found = 1; - goto check_pending; - } - *start = last_byte > search_start ? - last_byte : search_start; - if (search_end <= *start) { - ret = -ENOSPC; - goto error; - } - goto check_pending; + goto out; + + break; } btrfs_item_key_to_cpu(l, &key, slot); @@ -810,48 +1157,82 @@ no_more_items: goto next; if (key.objectid > device->devid) - goto no_more_items; + break; - if (key.offset >= search_start && key.offset > last_byte && - start_found) { - if (last_byte < search_start) - last_byte = search_start; - hole_size = key.offset - last_byte; + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + if (key.offset > search_start) { + hole_size = key.offset - search_start; - if (hole_size > *max_avail) - *max_avail = hole_size; + /* + * Have to check before we set max_hole_start, otherwise + * we could end up sending back this offset anyway. + */ + if (contains_pending_extent(trans, device, + &search_start, + hole_size)) + hole_size = 0; + + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } - if (key.offset > last_byte && - hole_size >= num_bytes) { - *start = last_byte; - goto check_pending; + /* + * If this free space is greater than which we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. + */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; } } - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - start_found = 1; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; next: path->slots[0]++; cond_resched(); } -check_pending: - /* we have to make sure we didn't find an extent that has already - * been allocated by the map tree or the original allocation + + /* + * At this point, search_start should be the end of + * allocated dev extents, and when shrinking the device, + * search_end may be smaller than search_start. */ - BUG_ON(*start < search_start); + if (search_end > search_start) + hole_size = search_end - search_start; - if (*start + num_bytes > search_end) { - ret = -ENOSPC; - goto error; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; } - /* check for pending inserts here */ - ret = 0; -error: + if (contains_pending_extent(trans, device, &search_start, hole_size)) { + btrfs_release_path(path); + goto again; + } + + /* See above. */ + if (hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: btrfs_free_path(path); + *start = max_hole_start; + if (len) + *len = max_hole_size; return ret; } @@ -874,39 +1255,52 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; - +again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, BTRFS_DEV_EXTENT_KEY); - BUG_ON(ret); + if (ret) + goto out; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); - ret = 0; + key = found_key; + btrfs_release_path(path); + goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + } else { + btrfs_error(root->fs_info, ret, "Slot search failed"); + goto out; } - BUG_ON(ret); - if (device->bytes_used > 0) - device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + if (device->bytes_used > 0) { + u64 len = btrfs_dev_extent_length(leaf, extent); + device->bytes_used -= len; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += len; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = btrfs_del_item(trans, root, path); - BUG_ON(ret); - + if (ret) { + btrfs_error(root->fs_info, ret, + "Failed to remove dev extent item"); + } +out: btrfs_free_path(path); return ret; } -int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset, u64 start, u64 num_bytes) +static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) { int ret; struct btrfs_path *path; @@ -916,6 +1310,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; WARN_ON(!device->in_fs_metadata); + WARN_ON(device->is_tgtdev_for_dev_replace); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -925,7 +1320,8 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_DEV_EXTENT_KEY; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); - BUG_ON(ret); + if (ret) + goto out; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], @@ -935,67 +1331,42 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), - BTRFS_UUID_SIZE); + btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); btrfs_set_dev_extent_length(leaf, extent, num_bytes); btrfs_mark_buffer_dirty(leaf); +out: btrfs_free_path(path); return ret; } -static noinline int find_next_chunk(struct btrfs_root *root, - u64 objectid, u64 *offset) +static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; - int ret; - struct btrfs_key key; - struct btrfs_chunk *chunk; - struct btrfs_key found_key; - - path = btrfs_alloc_path(); - BUG_ON(!path); - - key.objectid = objectid; - key.offset = (u64)-1; - key.type = BTRFS_CHUNK_ITEM_KEY; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - - BUG_ON(ret == 0); + struct extent_map_tree *em_tree; + struct extent_map *em; + struct rb_node *n; + u64 ret = 0; - ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) { - *offset = 0; - } else { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - if (found_key.objectid != objectid) - *offset = 0; - else { - chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_chunk); - *offset = found_key.offset + - btrfs_chunk_length(path->nodes[0], chunk); - } + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + n = rb_last(&em_tree->map); + if (n) { + em = rb_entry(n, struct extent_map, rb_node); + ret = em->start + em->len; } - ret = 0; -error: - btrfs_free_path(path); + read_unlock(&em_tree->lock); + return ret; } -static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +static noinline int find_next_devid(struct btrfs_fs_info *fs_info, + u64 *devid_ret) { int ret; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; - root = root->fs_info->chunk_root; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1004,20 +1375,21 @@ static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) key.type = BTRFS_DEV_ITEM_KEY; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ - ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + ret = btrfs_previous_item(fs_info->chunk_root, path, + BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY); if (ret) { - *objectid = 1; + *devid_ret = 1; } else { btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - *objectid = found_key.offset + 1; + *devid_ret = found_key.offset + 1; } ret = 0; error: @@ -1029,9 +1401,9 @@ error: * the device information is stored in the chunk root * the btrfs_device struct should be fully filled in */ -int btrfs_add_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device) +static int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) { int ret; struct btrfs_path *path; @@ -1071,9 +1443,9 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_bandwidth(leaf, dev_item, 0); btrfs_set_device_start_offset(leaf, dev_item, 0); - ptr = (unsigned long)btrfs_device_uuid(dev_item); + ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); - ptr = (unsigned long)btrfs_device_fsid(dev_item); + ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); @@ -1083,6 +1455,22 @@ out: return ret; } +/* + * Function to update ctime/mtime for a given device path. + * Mainly used for ctime/mtime based probe like libblkid. + */ +static void update_dev_time(char *path_name) +{ + struct file *filp; + + filp = filp_open(path_name, O_RDWR, 0); + if (!filp) + return; + file_update_time(filp); + filp_close(filp, NULL); + return; +} + static int btrfs_rm_dev_item(struct btrfs_root *root, struct btrfs_device *device) { @@ -1097,7 +1485,11 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1129,32 +1521,51 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) struct block_device *bdev; struct buffer_head *bh = NULL; struct btrfs_super_block *disk_super; + struct btrfs_fs_devices *cur_devices; u64 all_avail; u64 devid; u64 num_devices; u8 *dev_uuid; + unsigned seq; int ret = 0; + bool clear_super = false; mutex_lock(&uuid_mutex); - mutex_lock(&root->fs_info->volume_mutex); - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->num_devices <= 4) { - printk(KERN_ERR "btrfs: unable to go below four devices " - "on raid10\n"); - ret = -EINVAL; + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); + + num_devices = root->fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&root->fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + WARN_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { + ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; goto out; } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->num_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid1\n"); - ret = -EINVAL; + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { + ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && + root->fs_info->fs_devices->rw_devices <= 2) { + ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; + goto out; + } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && + root->fs_info->fs_devices->rw_devices <= 3) { + ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; goto out; } @@ -1164,40 +1575,36 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device = NULL; devices = &root->fs_info->fs_devices->devices; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + /* + * It is safe to read the devices since the volume_mutex + * is held. + */ list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && !tmp->bdev) { + if (tmp->in_fs_metadata && + !tmp->is_tgtdev_for_dev_replace && + !tmp->bdev) { device = tmp; break; } } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); bdev = NULL; bh = NULL; disk_super = NULL; if (!device) { - printk(KERN_ERR "btrfs: no missing devices found to " - "remove\n"); + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; goto out; } } else { - bdev = open_bdev_exclusive(device_path, FMODE_READ, - root->fs_info->bdev_holder); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); + ret = btrfs_get_bdev_and_sb(device_path, + FMODE_WRITE | FMODE_EXCL, + root->fs_info->bdev_holder, 0, + &bdev, &bh); + if (ret) goto out; - } - - set_blocksize(bdev, 4096); - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EIO; - goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root, devid, dev_uuid, + device = btrfs_find_device(root->fs_info, devid, dev_uuid, disk_super->fsid); if (!device) { ret = -ENOENT; @@ -1205,38 +1612,66 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } + if (device->is_tgtdev_for_dev_replace) { + ret = BTRFS_ERROR_DEV_TGT_REPLACE; + goto error_brelse; + } + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { - printk(KERN_ERR "btrfs: unable to remove the only writeable " - "device\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; goto error_brelse; } if (device->writeable) { + lock_chunks(root); list_del_init(&device->dev_alloc_list); + unlock_chunks(root); root->fs_info->fs_devices->rw_devices--; + clear_super = true; } + mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); + mutex_lock(&uuid_mutex); if (ret) - goto error_brelse; + goto error_undo; + /* + * TODO: the superblock still includes this device in its num_devices + * counter although write_all_supers() is not locked out. This + * could give a filesystem state which requires a degraded mount. + */ ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) - goto error_brelse; + goto error_undo; + + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space = device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); device->in_fs_metadata = 0; + btrfs_scrub_cancel_dev(root->fs_info, device); /* * the device list mutex makes sure that we don't change * the device list while someone else is writing out all - * the device supers. + * the device supers. Whoever is writing all supers, should + * lock the device list mutex before getting the number of + * devices in the super block (super_copy). Conversely, + * whoever updates the number of devices in the super block + * (super_copy) should hold the device list mutex. */ + + cur_devices = device->fs_devices; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - list_del_init(&device->dev_list); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + list_del_rcu(&device->dev_list); device->fs_devices->num_devices--; + device->fs_devices->total_devices--; + + if (device->missing) + root->fs_info->fs_devices->missing_devices--; next_device = list_entry(root->fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); @@ -1246,66 +1681,229 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->latest_bdev = next_device->bdev; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); - device->bdev = NULL; device->fs_devices->open_devices--; + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); } - num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + call_rcu(&device->rcu, free_device); + + num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - if (device->fs_devices->open_devices == 0) { + if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; fs_devices = root->fs_info->fs_devices; while (fs_devices) { - if (fs_devices->seed == device->fs_devices) + if (fs_devices->seed == cur_devices) { + fs_devices->seed = cur_devices->seed; break; + } fs_devices = fs_devices->seed; } - fs_devices->seed = device->fs_devices->seed; - device->fs_devices->seed = NULL; - __btrfs_close_devices(device->fs_devices); - free_fs_devices(device->fs_devices); + cur_devices->seed = NULL; + lock_chunks(root); + __btrfs_close_devices(cur_devices); + unlock_chunks(root); + free_fs_devices(cur_devices); } + root->fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); + /* * at this point, the device is zero sized. We want to * remove it from the devices list and zero out the old super */ - if (device->writeable) { + if (clear_super && disk_super) { + u64 bytenr; + int i; + /* make sure this device isn't detected as part of * the FS anymore */ memset(&disk_super->magic, 0, sizeof(disk_super->magic)); set_buffer_dirty(bh); sync_dirty_buffer(bh); + + /* clear the mirror copies of super block on the disk + * being removed, 0th copy is been taken care above and + * the below would take of the rest + */ + for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) + break; + + brelse(bh); + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + continue; + } + memset(&disk_super->magic, 0, + sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } } - kfree(device->name); - kfree(device); ret = 0; + if (bdev) { + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); + } + error_brelse: brelse(bh); -error_close: if (bdev) - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: - mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; +error_undo: + if (device->writeable) { + lock_chunks(root); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + unlock_chunks(root); + root->fs_info->fs_devices->rw_devices++; + } + goto error_brelse; +} + +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + + list_del_rcu(&srcdev->dev_list); + list_del_rcu(&srcdev->dev_alloc_list); + fs_info->fs_devices->num_devices--; + if (srcdev->missing) { + fs_info->fs_devices->missing_devices--; + fs_info->fs_devices->rw_devices++; + } + if (srcdev->can_discard) + fs_info->fs_devices->num_can_discard--; + if (srcdev->bdev) { + fs_info->fs_devices->open_devices--; + + /* zero out the old super */ + btrfs_scratch_superblock(srcdev); + } + + call_rcu(&srcdev->rcu, free_device); +} + +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + struct btrfs_device *next_device; + + WARN_ON(!tgtdev); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + if (tgtdev->bdev) { + btrfs_scratch_superblock(tgtdev); + fs_info->fs_devices->open_devices--; + } + fs_info->fs_devices->num_devices--; + if (tgtdev->can_discard) + fs_info->fs_devices->num_can_discard++; + + next_device = list_entry(fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (tgtdev->bdev == fs_info->sb->s_bdev) + fs_info->sb->s_bdev = next_device->bdev; + if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; + list_del_rcu(&tgtdev->dev_list); + + call_rcu(&tgtdev->rcu, free_device); + + mutex_unlock(&fs_info->fs_devices->device_list_mutex); +} + +static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device) +{ + int ret = 0; + struct btrfs_super_block *disk_super; + u64 devid; + u8 *dev_uuid; + struct block_device *bdev; + struct buffer_head *bh; + + *device = NULL; + ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, + root->fs_info->bdev_holder, 0, &bdev, &bh); + if (ret) + return ret; + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_uuid = disk_super->dev_item.uuid; + *device = btrfs_find_device(root->fs_info, devid, dev_uuid, + disk_super->fsid); + brelse(bh); + if (!*device) + ret = -ENOENT; + blkdev_put(bdev, FMODE_READ); + return ret; +} + +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device) +{ + *device = NULL; + if (strcmp(device_path, "missing") == 0) { + struct list_head *devices; + struct btrfs_device *tmp; + + devices = &root->fs_info->fs_devices->devices; + /* + * It is safe to read the devices since the volume_mutex + * is held by the caller. + */ + list_for_each_entry(tmp, devices, dev_list) { + if (tmp->in_fs_metadata && !tmp->bdev) { + *device = tmp; + break; + } + } + + if (!*device) { + btrfs_err(root->fs_info, "no missing device found"); + return -ENOENT; + } + + return 0; + } else { + return btrfs_find_device_by_path(root, device_path, device); + } } /* * does all the dirty work required for changing file system's UUID. */ -static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_root *root) { struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct btrfs_device *device; u64 super_flags; @@ -1313,9 +1911,9 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, if (!fs_devices->seeding) return -EINVAL; - seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!seed_devices) - return -ENOMEM; + seed_devices = __alloc_fs_devices(); + if (IS_ERR(seed_devices)) + return PTR_ERR(seed_devices); old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { @@ -1330,7 +1928,11 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->alloc_list); mutex_init(&seed_devices->device_list_mutex); - list_splice_init(&fs_devices->devices, &seed_devices->devices); + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, + synchronize_rcu); + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); list_for_each_entry(device, &seed_devices->devices, dev_list) { device->fs_devices = seed_devices; @@ -1344,6 +1946,8 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, generate_random_uuid(fs_devices->fsid); memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; btrfs_set_super_flags(disk_super, super_flags); @@ -1391,7 +1995,7 @@ next_slot: goto error; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(root, path); + btrfs_release_path(path); continue; } @@ -1403,14 +2007,13 @@ next_slot: dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); devid = btrfs_device_id(leaf, dev_item); - read_extent_buffer(leaf, dev_uuid, - (unsigned long)btrfs_device_uuid(dev_item), + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); - read_extent_buffer(leaf, fs_uuid, - (unsigned long)btrfs_device_fsid(dev_item), + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); - BUG_ON(!device); + device = btrfs_find_device(root->fs_info, devid, dev_uuid, + fs_uuid); + BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { btrfs_set_device_generation(leaf, dev_item, @@ -1429,19 +2032,22 @@ error: int btrfs_init_new_device(struct btrfs_root *root, char *device_path) { + struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; struct list_head *devices; struct super_block *sb = root->fs_info->sb; + struct rcu_string *name; u64 total_bytes; int seeding_dev = 0; int ret = 0; if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) - return -EINVAL; + return -EROFS; - bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -1452,48 +2058,49 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } filemap_write_and_wait(bdev->bd_inode->i_mapping); - mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; - /* - * we have the volume lock, so we don't need the extra - * device list mutex while reading the list here. - */ + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); goto error; } } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { + device = btrfs_alloc_device(root->fs_info, NULL, NULL); + if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ - ret = -ENOMEM; + ret = PTR_ERR(device); goto error; } - device->name = kstrdup(device_path, GFP_NOFS); - if (!device->name) { + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { kfree(device); ret = -ENOMEM; goto error; } + rcu_assign_pointer(device->name, name); - ret = find_next_devid(root, &device->devid); - if (ret) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + rcu_string_free(device->name); kfree(device); + ret = PTR_ERR(trans); goto error; } - trans = btrfs_start_transaction(root, 1); lock_chunks(root); - device->barriers = 1; + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; device->writeable = 1; - device->work.func = pending_bios_fn; - generate_random_uuid(device->uuid); - spin_lock_init(&device->io_lock); device->generation = trans->transid; device->io_width = root->sectorsize; device->io_align = root->sectorsize; @@ -1503,49 +2110,77 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; - device->mode = 0; + device->is_tgtdev_for_dev_replace = 0; + device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; set_blocksize(device->bdev, 4096); if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(trans, root); - BUG_ON(ret); + ret = btrfs_prepare_sprout(root); + BUG_ON(ret); /* -ENOMEM */ } device->fs_devices = root->fs_info->fs_devices; - /* - * we don't want write_supers to jump in here with our device - * half setup - */ mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - list_add(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); root->fs_info->fs_devices->num_devices++; root->fs_info->fs_devices->open_devices++; root->fs_info->fs_devices->rw_devices++; + root->fs_info->fs_devices->total_devices++; + if (device->can_discard) + root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes; + spin_unlock(&root->fs_info->free_chunk_lock); + if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + btrfs_set_super_total_bytes(root->fs_info->super_copy, total_bytes + device->total_bytes); - total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); - btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); + + /* add sysfs device entry */ + btrfs_kobj_add_device(root->fs_info, device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = init_first_rw_device(trans, root, device); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error_trans; + } ret = btrfs_finish_sprout(trans, root); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error_trans; + } + + /* Sprouting would change fsid of the mounted root, + * so rename the fsid on the sysfs + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", + root->fs_info->fsid); + if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error_trans; + } } /* @@ -1555,25 +2190,139 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_clear_space_info_full(root->fs_info); unlock_chunks(root); - btrfs_commit_transaction(trans, root); + root->fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); + ret = btrfs_commit_transaction(trans, root); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); + if (ret) /* transaction commit */ + return ret; + ret = btrfs_relocate_sys_chunks(root); - BUG_ON(ret); + if (ret < 0) + btrfs_error(root->fs_info, ret, + "Failed to relocate sys chunks after " + "device initialization. This can be fixed " + "using the \"btrfs balance\" command."); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); } -out: - mutex_unlock(&root->fs_info->volume_mutex); + + /* Update ctime/mtime for libblkid */ + update_dev_time(device_path); return ret; + +error_trans: + unlock_chunks(root); + btrfs_end_transaction(trans, root); + rcu_string_free(device->name); + btrfs_kobj_rm_device(root->fs_info, device); + kfree(device); error: - close_bdev_exclusive(bdev, 0); + blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } - goto out; + return ret; +} + +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device **device_out) +{ + struct request_queue *q; + struct btrfs_device *device; + struct block_device *bdev; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *devices; + struct rcu_string *name; + u64 devid = BTRFS_DEV_REPLACE_DEVID; + int ret = 0; + + *device_out = NULL; + if (fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + fs_info->bdev_holder); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = btrfs_alloc_device(NULL, &devid, NULL); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto error; + } + + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + rcu_assign_pointer(device->name, name); + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + device->writeable = 1; + device->generation = 0; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; + device->dev_root = fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->is_tgtdev_for_dev_replace = 1; + device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; + set_blocksize(device->bdev, 4096); + device->fs_devices = fs_info->fs_devices; + list_add(&device->dev_list, &fs_info->fs_devices->devices); + fs_info->fs_devices->num_devices++; + fs_info->fs_devices->open_devices++; + if (device->can_discard) + fs_info->fs_devices->num_can_discard++; + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + *device_out = device; + return ret; + +error: + blkdev_put(bdev, FMODE_EXCL); + return ret; +} + +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + WARN_ON(fs_info->fs_devices->rw_devices == 0); + tgtdev->io_width = fs_info->dev_root->sectorsize; + tgtdev->io_align = fs_info->dev_root->sectorsize; + tgtdev->sector_size = fs_info->dev_root->sectorsize; + tgtdev->dev_root = fs_info->dev_root; + tgtdev->in_fs_metadata = 1; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -1626,13 +2375,14 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = - &device->dev_root->fs_info->super_copy; + device->dev_root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; if (!device->writeable) return -EACCES; - if (new_size <= device->total_bytes) + if (new_size <= device->total_bytes || + device->is_tgtdev_for_dev_replace) return -EINVAL; btrfs_set_super_total_bytes(super_copy, old_total + diff); @@ -1674,19 +2424,28 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, key.type = BTRFS_CHUNK_ITEM_KEY; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - BUG_ON(ret); + if (ret < 0) + goto out; + else if (ret > 0) { /* Logic error or corruption */ + btrfs_error(root->fs_info, -ENOENT, + "Failed lookup while freeing chunk."); + ret = -ENOENT; + goto out; + } ret = btrfs_del_item(trans, root, path); - BUG_ON(ret); - + if (ret < 0) + btrfs_error(root->fs_info, ret, + "Failed to delete chunk item."); +out: btrfs_free_path(path); - return 0; + return ret; } static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; @@ -1751,10 +2510,15 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, /* step one, relocate all the extents inside this chunk */ ret = btrfs_relocate_block_group(extent_root, chunk_offset); - BUG_ON(ret); + if (ret) + return ret; - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_std_error(root->fs_info, ret); + return ret; + } lock_chunks(root); @@ -1766,7 +2530,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - BUG_ON(em->start > chunk_offset || + BUG_ON(!em || em->start > chunk_offset || em->start + em->len < chunk_offset); map = (struct map_lookup *)em->bdev; @@ -1785,6 +2549,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, BUG_ON(ret); + trace_btrfs_chunk_free(root, map, chunk_offset, em->len); + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); BUG_ON(ret); @@ -1797,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - kfree(map); - em->bdev = NULL; - /* once for the tree */ free_extent_map(em); /* once for us */ @@ -1837,7 +2600,7 @@ again: ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); @@ -1852,7 +2615,7 @@ again: chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk); chunk_type = btrfs_chunk_type(leaf, chunk); - btrfs_release_path(chunk_root, path); + btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_relocate_chunk(chunk_root, chunk_tree, @@ -1873,8 +2636,7 @@ again: failed = 0; retried = true; goto again; - } else if (failed && retried) { - WARN_ON(1); + } else if (WARN_ON(failed && retried)) { ret = -ENOSPC; } error: @@ -1882,42 +2644,399 @@ error: return ret; } -static u64 div_factor(u64 num, int factor) +static int insert_balance_item(struct btrfs_root *root, + struct btrfs_balance_control *bctl) { - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; + struct btrfs_trans_handle *trans; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); + btrfs_set_balance_data(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); + btrfs_set_balance_meta(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); + btrfs_set_balance_sys(leaf, item, &disk_bargs); + + btrfs_set_balance_flags(leaf, item, bctl->flags); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; } -int btrfs_balance(struct btrfs_root *dev_root) +static int del_balance_item(struct btrfs_root *root) { - int ret; - struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +/* + * This is a heuristic used to reduce the number of chunks balanced on + * resume after balance was interrupted. + */ +static void update_balance_args(struct btrfs_balance_control *bctl) +{ + /* + * Turn on soft mode for chunk types that were being converted. + */ + if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; + + /* + * Turn on usage filter if is not already used. The idea is + * that chunks that we have already balanced should be + * reasonably full. Don't do it for chunks that are being + * converted - that will keep us from relocating unconverted + * (albeit full) chunks. + */ + if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->data.usage = 90; + } + if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->sys.usage = 90; + } + if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->meta.usage = 90; + } +} + +/* + * Should be called with both balance and volume mutexes held to + * serialize other volume operations (add_dev/rm_dev/resize) with + * restriper. Same goes for unset_balance_control. + */ +static void set_balance_control(struct btrfs_balance_control *bctl) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + + BUG_ON(fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); +} + +static void unset_balance_control(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + BUG_ON(!fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = NULL; + spin_unlock(&fs_info->balance_lock); + + kfree(bctl); +} + +/* + * Balance filters. Return 1 if chunk should be filtered out + * (should not be balanced). + */ +static int chunk_profiles_filter(u64 chunk_type, + struct btrfs_balance_args *bargs) +{ + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; + + if (bargs->profiles & chunk_type) + return 0; + + return 1; +} + +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; + u64 chunk_used, user_thresh; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + if (bargs->usage == 0) + user_thresh = 1; + else if (bargs->usage > 100) + user_thresh = cache->key.offset; + else + user_thresh = div_factor_fine(cache->key.offset, + bargs->usage); + + if (chunk_used < user_thresh) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_devid_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + int i; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) + return 0; + } + + return 1; +} + +/* [pstart, pend) */ +static int chunk_drange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + u64 stripe_offset; + u64 stripe_length; + int factor; + int i; + + if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) + return 0; + + if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { + factor = num_stripes / 2; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { + factor = num_stripes - 1; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { + factor = num_stripes - 2; + } else { + factor = num_stripes; + } + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) + continue; + + stripe_offset = btrfs_stripe_offset(leaf, stripe); + stripe_length = btrfs_chunk_length(leaf, chunk); + do_div(stripe_length, factor); + + if (stripe_offset < bargs->pend && + stripe_offset + stripe_length > bargs->pstart) + return 0; + } + + return 1; +} + +/* [vstart, vend) */ +static int chunk_vrange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + if (chunk_offset < bargs->vend && + chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) + /* at least part of the chunk is inside this vrange */ + return 0; + + return 1; +} + +static int chunk_soft_convert_filter(u64 chunk_type, + struct btrfs_balance_args *bargs) +{ + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return 0; + + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; + + if (bargs->target == chunk_type) + return 1; + + return 0; +} + +static int should_balance_chunk(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 chunk_offset) +{ + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + struct btrfs_balance_args *bargs = NULL; + u64 chunk_type = btrfs_chunk_type(leaf, chunk); + + /* type filter */ + if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & + (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { + return 0; + } + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + bargs = &bctl->data; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + bargs = &bctl->sys; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + bargs = &bctl->meta; + + /* profiles filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && + chunk_profiles_filter(chunk_type, bargs)) { + return 0; + } + + /* usage filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && + chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; + } + + /* devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && + chunk_devid_filter(leaf, chunk, bargs)) { + return 0; + } + + /* drange filter, makes sense only with devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && + chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* vrange filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && + chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* soft profile changing mode */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && + chunk_soft_convert_filter(chunk_type, bargs)) { + return 0; + } + + /* + * limited by count, must be the last filter + */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { + if (bargs->limit == 0) + return 0; + else + bargs->limit--; + } + + return 1; +} + +static int __btrfs_balance(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_root *dev_root = fs_info->dev_root; + struct list_head *devices; struct btrfs_device *device; u64 old_size; u64 size_to_free; + struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_chunk *chunk; - struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; - struct btrfs_trans_handle *trans; struct btrfs_key found_key; - - if (dev_root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - mutex_lock(&dev_root->fs_info->volume_mutex); - dev_root = dev_root->fs_info->dev_root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + int slot; + int ret; + int enospc_errors = 0; + bool counting = true; + u64 limit_data = bctl->data.limit; + u64 limit_meta = bctl->meta.limit; + u64 limit_sys = bctl->sys.limit; /* step one make some room on all the devices */ + devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); if (!device->writeable || - device->total_bytes - device->bytes_used > size_to_free) + device->total_bytes - device->bytes_used > size_to_free || + device->is_tgtdev_for_dev_replace) continue; ret = btrfs_shrink_device(device, old_size - size_to_free); @@ -1925,8 +3044,8 @@ int btrfs_balance(struct btrfs_root *dev_root) break; BUG_ON(ret); - trans = btrfs_start_transaction(dev_root, 1); - BUG_ON(!trans); + trans = btrfs_start_transaction(dev_root, 0); + BUG_ON(IS_ERR(trans)); ret = btrfs_grow_device(trans, device, old_size); BUG_ON(ret); @@ -1936,13 +3055,32 @@ int btrfs_balance(struct btrfs_root *dev_root) /* step two, relocate all the chunks */ path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto error; + } + /* zero out stat counters */ + spin_lock(&fs_info->balance_lock); + memset(&bctl->stat, 0, sizeof(bctl->stat)); + spin_unlock(&fs_info->balance_lock); +again: + if (!counting) { + bctl->data.limit = limit_data; + bctl->meta.limit = limit_meta; + bctl->sys.limit = limit_sys; + } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + if ((!counting && atomic_read(&fs_info->balance_pause_req)) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -ECANCELED; + goto error; + } + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; @@ -1952,40 +3090,736 @@ int btrfs_balance(struct btrfs_root *dev_root) * failed */ if (ret == 0) - break; + BUG(); /* FIXME break ? */ ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) + if (ret) { + ret = 0; break; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); if (found_key.objectid != key.objectid) break; - chunk = btrfs_item_ptr(path->nodes[0], - path->slots[0], - struct btrfs_chunk); - /* chunk zero is special */ - if (found_key.offset == 0) - break; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + + if (!counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.considered++; + spin_unlock(&fs_info->balance_lock); + } + + ret = should_balance_chunk(chunk_root, leaf, chunk, + found_key.offset); + btrfs_release_path(path); + if (!ret) + goto loop; + + if (counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.expected++; + spin_unlock(&fs_info->balance_lock); + goto loop; + } - btrfs_release_path(chunk_root, path); ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, found_key.offset); - BUG_ON(ret && ret != -ENOSPC); + if (ret && ret != -ENOSPC) + goto error; + if (ret == -ENOSPC) { + enospc_errors++; + } else { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed++; + spin_unlock(&fs_info->balance_lock); + } +loop: + if (found_key.offset == 0) + break; key.offset = found_key.offset - 1; } - ret = 0; + + if (counting) { + btrfs_release_path(path); + counting = false; + goto again; + } error: btrfs_free_path(path); - mutex_unlock(&dev_root->fs_info->volume_mutex); + if (enospc_errors) { + btrfs_info(fs_info, "%d enospc errors during balance", + enospc_errors); + if (!ret) + ret = -ENOSPC; + } + return ret; } +/** + * alloc_profile_is_valid - see if a given profile is valid and reduced + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile + */ +static int alloc_profile_is_valid(u64 flags, int extended) +{ + u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : + BTRFS_BLOCK_GROUP_PROFILE_MASK); + + flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; + + /* 1) check that all other bits are zeroed */ + if (flags & ~mask) + return 0; + + /* 2) see if profile is reduced */ + if (flags == 0) + return !extended; /* "0" is valid for usual profiles */ + + /* true if exactly one bit set */ + return (flags & (flags - 1)) == 0; +} + +static inline int balance_need_close(struct btrfs_fs_info *fs_info) +{ + /* cancel requested || normal exit path */ + return atomic_read(&fs_info->balance_cancel_req) || + (atomic_read(&fs_info->balance_pause_req) == 0 && + atomic_read(&fs_info->balance_cancel_req) == 0); +} + +static void __cancel_balance(struct btrfs_fs_info *fs_info) +{ + int ret; + + unset_balance_control(fs_info); + ret = del_balance_item(fs_info->tree_root); + if (ret) + btrfs_std_error(fs_info, ret); + + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +} + +/* + * Should be called with both balance and volume mutexes held + */ +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 allowed; + int mixed = 0; + int ret; + u64 num_devices; + unsigned seq; + + if (btrfs_fs_closing(fs_info) || + atomic_read(&fs_info->balance_pause_req) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -EINVAL; + goto out; + } + + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + /* + * In case of mixed groups both data and meta should be picked, + * and identical options should be given for both of them. + */ + allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; + if (mixed && (bctl->flags & allowed)) { + if (!(bctl->flags & BTRFS_BALANCE_DATA) || + !(bctl->flags & BTRFS_BALANCE_METADATA) || + memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { + btrfs_err(fs_info, "with mixed groups data and " + "metadata balance options must be the same"); + ret = -EINVAL; + goto out; + } + } + + num_devices = fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + BUG_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&fs_info->dev_replace); + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + if (num_devices == 1) + allowed |= BTRFS_BLOCK_GROUP_DUP; + else if (num_devices > 1) + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); + if (num_devices > 2) + allowed |= BTRFS_BLOCK_GROUP_RAID5; + if (num_devices > 3) + allowed |= (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID6); + if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->data.target, 1) || + (bctl->data.target & ~allowed))) { + btrfs_err(fs_info, "unable to start balance with target " + "data profile %llu", + bctl->data.target); + ret = -EINVAL; + goto out; + } + if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->meta.target, 1) || + (bctl->meta.target & ~allowed))) { + btrfs_err(fs_info, + "unable to start balance with target metadata profile %llu", + bctl->meta.target); + ret = -EINVAL; + goto out; + } + if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->sys.target, 1) || + (bctl->sys.target & ~allowed))) { + btrfs_err(fs_info, + "unable to start balance with target system profile %llu", + bctl->sys.target); + ret = -EINVAL; + goto out; + } + + /* allow dup'ed data chunks only in mixed mode */ + if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { + btrfs_err(fs_info, "dup for data is not allowed"); + ret = -EINVAL; + goto out; + } + + /* allow to reduce meta or sys integrity only if force set */ + allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6; + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, "force reducing metadata integrity"); + } else { + btrfs_err(fs_info, "balance will reduce metadata " + "integrity, use force if you want this"); + ret = -EINVAL; + goto out; + } + } + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + int num_tolerated_disk_barrier_failures; + u64 target = bctl->sys.target; + + num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (num_tolerated_disk_barrier_failures > 0 && + (target & + (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_AVAIL_ALLOC_BIT_SINGLE))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1 && + (target & + (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) + num_tolerated_disk_barrier_failures = 1; + + fs_info->num_tolerated_disk_barrier_failures = + num_tolerated_disk_barrier_failures; + } + + ret = insert_balance_item(fs_info->tree_root, bctl); + if (ret && ret != -EEXIST) + goto out; + + if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { + BUG_ON(ret == -EEXIST); + set_balance_control(bctl); + } else { + BUG_ON(ret != -EEXIST); + spin_lock(&fs_info->balance_lock); + update_balance_args(bctl); + spin_unlock(&fs_info->balance_lock); + } + + atomic_inc(&fs_info->balance_running); + mutex_unlock(&fs_info->balance_mutex); + + ret = __btrfs_balance(fs_info); + + mutex_lock(&fs_info->balance_mutex); + atomic_dec(&fs_info->balance_running); + + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + } + + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + update_ioctl_balance_args(fs_info, 0, bargs); + } + + if ((ret && ret != -ECANCELED && ret != -ENOSPC) || + balance_need_close(fs_info)) { + __cancel_balance(fs_info); + } + + wake_up(&fs_info->balance_wait_q); + + return ret; +out: + if (bctl->flags & BTRFS_BALANCE_RESUME) + __cancel_balance(fs_info); + else { + kfree(bctl); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + } + return ret; +} + +static int balance_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + int ret = 0; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) { + btrfs_info(fs_info, "continuing balance"); + ret = btrfs_balance(fs_info->balance_ctl, NULL); + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + + return ret; +} + +int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *tsk; + + spin_lock(&fs_info->balance_lock); + if (!fs_info->balance_ctl) { + spin_unlock(&fs_info->balance_lock); + return 0; + } + spin_unlock(&fs_info->balance_lock); + + if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + btrfs_info(fs_info, "force skipping balance"); + return 0; + } + + tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); + return PTR_ERR_OR_ZERO(tsk); +} + +int btrfs_recover_balance(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { /* ret = -ENOENT; */ + ret = 0; + goto out; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + bctl->fs_info = fs_info; + bctl->flags = btrfs_balance_flags(leaf, item); + bctl->flags |= BTRFS_BALANCE_RESUME; + + btrfs_balance_data(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); + btrfs_balance_meta(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); + btrfs_balance_sys(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + set_balance_control(bctl); + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_pause_balance(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + if (atomic_read(&fs_info->balance_running)) { + atomic_inc(&fs_info->balance_pause_req); + mutex_unlock(&fs_info->balance_mutex); + + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + + mutex_lock(&fs_info->balance_mutex); + /* we are good with balance_ctl ripped off from under us */ + BUG_ON(atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_pause_req); + } else { + ret = -ENOTCONN; + } + + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) +{ + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + atomic_inc(&fs_info->balance_cancel_req); + /* + * if we are running just wait and return, balance item is + * deleted in btrfs_balance in this case + */ + if (atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + mutex_lock(&fs_info->balance_mutex); + } else { + /* __cancel_balance needs volume_mutex */ + mutex_unlock(&fs_info->balance_mutex); + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) + __cancel_balance(fs_info); + + mutex_unlock(&fs_info->volume_mutex); + } + + BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_cancel_req); + mutex_unlock(&fs_info->balance_mutex); + return 0; +} + +static int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_key max_key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + max_key.objectid = (u64)-1; + max_key.type = BTRFS_ROOT_ITEM_KEY; + max_key.offset = (u64)-1; + + path->keep_locks = 1; + + while (1) { + ret = btrfs_search_forward(root, &key, path, 0); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + +skip: + if (trans) { + ret = btrfs_end_transaction(trans, fs_info->uuid_root); + trans = NULL; + if (ret) + break; + } + + btrfs_release_path(path); + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fs_info->uuid_root); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); + else + fs_info->update_uuid_tree_gen = 1; + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +/* + * Callback for btrfs_uuid_tree_iterate(). + * returns: + * 0 check succeeded, the entry is not outdated. + * < 0 if an error occured. + * > 0 if the check failed, which means the caller shall remove the entry. + */ +static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, + u8 *uuid, u8 type, u64 subid) +{ + struct btrfs_key key; + int ret = 0; + struct btrfs_root *subvol_root; + + if (type != BTRFS_UUID_KEY_SUBVOL && + type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto out; + + key.objectid = subid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(subvol_root)) { + ret = PTR_ERR(subvol_root); + if (ret == -ENOENT) + ret = 1; + goto out; + } + + switch (type) { + case BTRFS_UUID_KEY_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) + ret = 1; + break; + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.received_uuid, + BTRFS_UUID_SIZE)) + ret = 1; + break; + } + +out: + return ret; +} + +static int btrfs_uuid_rescan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + int ret; + + /* + * 1st step is to iterate through the existing UUID tree and + * to delete all entries that contain outdated data. + * 2nd step is to add all missing entries to the UUID tree. + */ + ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); + if (ret < 0) { + btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); + up(&fs_info->uuid_tree_rescan_sem); + return ret; + } + return btrfs_uuid_scan_kthread(data); +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, fs_info, + BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + btrfs_abort_transaction(trans, tree_root, + PTR_ERR(uuid_root)); + return PTR_ERR(uuid_root); + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_scan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + +int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_rescan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -2007,12 +3841,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) bool retried = false; struct extent_buffer *l; struct btrfs_key key; - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; - if (new_size >= device->total_bytes) + if (device->is_tgtdev_for_dev_replace) return -EINVAL; path = btrfs_alloc_path(); @@ -2024,8 +3858,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); device->total_bytes = new_size; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space -= diff; + spin_unlock(&root->fs_info->free_chunk_lock); + } unlock_chunks(root); again: @@ -2033,7 +3871,7 @@ again: key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; - while (1) { + do { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto done; @@ -2043,7 +3881,7 @@ again: goto done; if (ret) { ret = 0; - btrfs_release_path(root, path); + btrfs_release_path(path); break; } @@ -2052,7 +3890,7 @@ again: btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { - btrfs_release_path(root, path); + btrfs_release_path(path); break; } @@ -2060,14 +3898,14 @@ again: length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { - btrfs_release_path(root, path); + btrfs_release_path(path); break; } chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); - btrfs_release_path(root, path); + btrfs_release_path(path); ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, chunk_offset); @@ -2075,8 +3913,7 @@ again: goto done; if (ret == -ENOSPC) failed++; - key.offset -= 1; - } + } while (key.offset-- > 0); if (failed && !retried) { failed = 0; @@ -2089,16 +3926,20 @@ again: device->total_bytes = old_size; if (device->writeable) device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); unlock_chunks(root); goto done; } /* Shrinking succeeded, else we would be at "done". */ - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto done; } + lock_chunks(root); device->disk_total_bytes = new_size; @@ -2118,18 +3959,18 @@ done: return ret; } -static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int btrfs_add_system_chunk(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; array_size = btrfs_super_sys_array_size(super_copy); - if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + if (array_size + item_size + sizeof(disk_key) + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) return -EFBIG; ptr = super_copy->sys_chunk_array + array_size; @@ -2142,275 +3983,470 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, return 0; } -static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, - int num_stripes, int sub_stripes) +/* + * sort the devices in descending order by max_avail, total_avail + */ +static int btrfs_cmp_device_info(const void *a, const void *b) { - if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) - return calc_size; - else if (type & BTRFS_BLOCK_GROUP_RAID10) - return calc_size * (num_stripes / sub_stripes); - else - return calc_size * num_stripes; + const struct btrfs_device_info *di_a = a; + const struct btrfs_device_info *di_b = b; + + if (di_a->max_avail > di_b->max_avail) + return -1; + if (di_a->max_avail < di_b->max_avail) + return 1; + if (di_a->total_avail > di_b->total_avail) + return -1; + if (di_a->total_avail < di_b->total_avail) + return 1; + return 0; +} + +static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .devs_increment = 1, + .ncopies = 3, + }, +}; + +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) +{ + /* TODO allow them to set a preferred stripe size */ + return 64 * 1024; +} + +static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ + if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) + return; + + btrfs_set_fs_incompat(info, RAID56); } +#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ + - sizeof(struct btrfs_item) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes, u64 *stripe_size, - u64 start, u64 type) + struct btrfs_root *extent_root, u64 start, + u64 type) { struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_device *device = NULL; struct btrfs_fs_devices *fs_devices = info->fs_devices; struct list_head *cur; struct map_lookup *map = NULL; struct extent_map_tree *em_tree; struct extent_map *em; - struct list_head private_devs; - int min_stripe_size = 1 * 1024 * 1024; - u64 calc_size = 1024 * 1024 * 1024; - u64 max_chunk_size = calc_size; - u64 min_free; - u64 avail; - u64 max_avail = 0; - u64 dev_offset; - int num_stripes = 1; - int min_stripes = 1; - int sub_stripes = 0; - int looped = 0; + struct btrfs_device_info *devices_info = NULL; + u64 total_avail; + int num_stripes; /* total number of stripes to allocate */ + int data_stripes; /* number of stripes that count for + block group size */ + int sub_stripes; /* sub_stripes info for map */ + int dev_stripes; /* stripes per dev */ + int devs_max; /* max devs to use */ + int devs_min; /* min devs needed */ + int devs_increment; /* ndevs has to be a multiple of this */ + int ncopies; /* how many copies to data has */ int ret; + u64 max_stripe_size; + u64 max_chunk_size; + u64 stripe_size; + u64 num_bytes; + u64 raid_stripe_len = BTRFS_STRIPE_LEN; + int ndevs; + int i; + int j; int index; - int stripe_len = 64 * 1024; - if ((type & BTRFS_BLOCK_GROUP_RAID1) && - (type & BTRFS_BLOCK_GROUP_DUP)) { - WARN_ON(1); - type &= ~BTRFS_BLOCK_GROUP_DUP; - } + BUG_ON(!alloc_profile_is_valid(type, 0)); + if (list_empty(&fs_devices->alloc_list)) return -ENOSPC; - if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - num_stripes = fs_devices->rw_devices; - min_stripes = 2; - } - if (type & (BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = 2; - min_stripes = 2; - } - if (type & (BTRFS_BLOCK_GROUP_RAID1)) { - if (fs_devices->rw_devices < 2) - return -ENOSPC; - num_stripes = 2; - min_stripes = 2; - } - if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - num_stripes = fs_devices->rw_devices; - if (num_stripes < 4) - return -ENOSPC; - num_stripes &= ~(u32)1; - sub_stripes = 2; - min_stripes = 4; - } + index = __get_raid_index(type); + + sub_stripes = btrfs_raid_array[index].sub_stripes; + dev_stripes = btrfs_raid_array[index].dev_stripes; + devs_max = btrfs_raid_array[index].devs_max; + devs_min = btrfs_raid_array[index].devs_min; + devs_increment = btrfs_raid_array[index].devs_increment; + ncopies = btrfs_raid_array[index].ncopies; if (type & BTRFS_BLOCK_GROUP_DATA) { - max_chunk_size = 10 * calc_size; - min_stripe_size = 64 * 1024 * 1024; + max_stripe_size = 1024 * 1024 * 1024; + max_chunk_size = 10 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_chunk_size = 256 * 1024 * 1024; - min_stripe_size = 32 * 1024 * 1024; + /* for larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) + max_stripe_size = 1024 * 1024 * 1024; + else + max_stripe_size = 256 * 1024 * 1024; + max_chunk_size = max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - calc_size = 8 * 1024 * 1024; - max_chunk_size = calc_size * 2; - min_stripe_size = 1 * 1024 * 1024; + max_stripe_size = 32 * 1024 * 1024; + max_chunk_size = 2 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; + } else { + btrfs_err(info, "invalid chunk type 0x%llx requested", + type); + BUG_ON(1); } /* we don't want a chunk larger than 10% of writeable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); -again: - max_avail = 0; - if (!map || map->num_stripes != num_stripes) { - kfree(map); - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) - return -ENOMEM; - map->num_stripes = num_stripes; - } - - if (calc_size * num_stripes > max_chunk_size) { - calc_size = max_chunk_size; - do_div(calc_size, num_stripes); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; - } - - /* we don't want tiny stripes */ - if (!looped) - calc_size = max_t(u64, min_stripe_size, calc_size); - - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; cur = fs_devices->alloc_list.next; - index = 0; - - if (type & BTRFS_BLOCK_GROUP_DUP) - min_free = calc_size * 2; - else - min_free = calc_size; /* - * we add 1MB because we never use the first 1MB of the device, unless - * we've looped, then we are likely allocating the maximum amount of - * space left already + * in the first pass through the devices list, we gather information + * about the available holes on each device. */ - if (!looped) - min_free += 1024 * 1024; + ndevs = 0; + while (cur != &fs_devices->alloc_list) { + struct btrfs_device *device; + u64 max_avail; + u64 dev_offset; - INIT_LIST_HEAD(&private_devs); - while (index < num_stripes) { device = list_entry(cur, struct btrfs_device, dev_alloc_list); - BUG_ON(!device->writeable); + + cur = cur->next; + + if (!device->writeable) { + WARN(1, KERN_ERR + "BTRFS: read-only device in alloc_list\n"); + continue; + } + + if (!device->in_fs_metadata || + device->is_tgtdev_for_dev_replace) + continue; + if (device->total_bytes > device->bytes_used) - avail = device->total_bytes - device->bytes_used; + total_avail = device->total_bytes - device->bytes_used; else - avail = 0; - cur = cur->next; + total_avail = 0; - if (device->in_fs_metadata && avail >= min_free) { - ret = find_free_dev_extent(trans, device, - min_free, &dev_offset, - &max_avail); - if (ret == 0) { - list_move_tail(&device->dev_alloc_list, - &private_devs); - map->stripes[index].dev = device; - map->stripes[index].physical = dev_offset; - index++; - if (type & BTRFS_BLOCK_GROUP_DUP) { - map->stripes[index].dev = device; - map->stripes[index].physical = - dev_offset + calc_size; - index++; - } - } - } else if (device->in_fs_metadata && avail > max_avail) - max_avail = avail; - if (cur == &fs_devices->alloc_list) + /* If there is no space on this device, skip it. */ + if (total_avail == 0) + continue; + + ret = find_free_dev_extent(trans, device, + max_stripe_size * dev_stripes, + &dev_offset, &max_avail); + if (ret && ret != -ENOSPC) + goto error; + + if (ret == 0) + max_avail = max_stripe_size * dev_stripes; + + if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) + continue; + + if (ndevs == fs_devices->rw_devices) { + WARN(1, "%s: found more than %llu devices\n", + __func__, fs_devices->rw_devices); break; - } - list_splice(&private_devs, &fs_devices->alloc_list); - if (index < num_stripes) { - if (index >= min_stripes) { - num_stripes = index; - if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - num_stripes /= sub_stripes; - num_stripes *= sub_stripes; - } - looped = 1; - goto again; } - if (!looped && max_avail > 0) { - looped = 1; - calc_size = max_avail; - goto again; + devices_info[ndevs].dev_offset = dev_offset; + devices_info[ndevs].max_avail = max_avail; + devices_info[ndevs].total_avail = total_avail; + devices_info[ndevs].dev = device; + ++ndevs; + } + + /* + * now sort the devices by hole size / available space + */ + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + + /* round down to number of usable stripes */ + ndevs -= ndevs % devs_increment; + + if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { + ret = -ENOSPC; + goto error; + } + + if (devs_max && ndevs > devs_max) + ndevs = devs_max; + /* + * the primary goal is to maximize the number of stripes, so use as many + * devices as possible, even if the stripes are not maximum sized. + */ + stripe_size = devices_info[ndevs-1].max_avail; + num_stripes = ndevs * dev_stripes; + + /* + * this will have to be fixed for RAID1 and RAID10 over + * more drives + */ + data_stripes = num_stripes / ncopies; + + if (type & BTRFS_BLOCK_GROUP_RAID5) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 1, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 1; + } + if (type & BTRFS_BLOCK_GROUP_RAID6) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 2, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 2; + } + + /* + * Use the number of data stripes to figure out how big this chunk + * is really going to be in terms of logical address space, + * and compare that answer with the max chunk size + */ + if (stripe_size * data_stripes > max_chunk_size) { + u64 mask = (1ULL << 24) - 1; + stripe_size = max_chunk_size; + do_div(stripe_size, data_stripes); + + /* bump the answer up to a 16MB boundary */ + stripe_size = (stripe_size + mask) & ~mask; + + /* but don't go higher than the limits we found + * while searching for free extents + */ + if (stripe_size > devices_info[ndevs-1].max_avail) + stripe_size = devices_info[ndevs-1].max_avail; + } + + do_div(stripe_size, dev_stripes); + + /* align to BTRFS_STRIPE_LEN */ + do_div(stripe_size, raid_stripe_len); + stripe_size *= raid_stripe_len; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; + + for (i = 0; i < ndevs; ++i) { + for (j = 0; j < dev_stripes; ++j) { + int s = i * dev_stripes + j; + map->stripes[s].dev = devices_info[i].dev; + map->stripes[s].physical = devices_info[i].dev_offset + + j * stripe_size; } - kfree(map); - return -ENOSPC; } map->sector_size = extent_root->sectorsize; - map->stripe_len = stripe_len; - map->io_align = stripe_len; - map->io_width = stripe_len; + map->stripe_len = raid_stripe_len; + map->io_align = raid_stripe_len; + map->io_width = raid_stripe_len; map->type = type; - map->num_stripes = num_stripes; map->sub_stripes = sub_stripes; - *map_ret = map; - *stripe_size = calc_size; - *num_bytes = chunk_bytes_by_type(type, calc_size, - num_stripes, sub_stripes); + num_bytes = stripe_size * data_stripes; - em = alloc_extent_map(GFP_NOFS); + trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); + + em = alloc_extent_map(); if (!em) { kfree(map); - return -ENOMEM; + ret = -ENOMEM; + goto error; } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->bdev = (struct block_device *)map; em->start = start; - em->len = *num_bytes; + em->len = num_bytes; em->block_start = 0; em->block_len = em->len; + em->orig_block_len = stripe_size; em_tree = &extent_root->fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); + if (!ret) { + list_add_tail(&em->list, &trans->transaction->pending_chunks); + atomic_inc(&em->refs); + } write_unlock(&em_tree->lock); - BUG_ON(ret); - free_extent_map(em); + if (ret) { + free_extent_map(em); + goto error; + } ret = btrfs_make_block_group(trans, extent_root, 0, type, BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, *num_bytes); - BUG_ON(ret); - - index = 0; - while (index < map->num_stripes) { - device = map->stripes[index].dev; - dev_offset = map->stripes[index].physical; + start, num_bytes); + if (ret) + goto error_del_extent; - ret = btrfs_alloc_dev_extent(trans, device, - info->chunk_root->root_key.objectid, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, dev_offset, calc_size); - BUG_ON(ret); - index++; - } + free_extent_map(em); + check_raid56_incompat_flag(extent_root->fs_info, type); + kfree(devices_info); return 0; + +error_del_extent: + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + /* One for our allocation */ + free_extent_map(em); + /* One for the tree reference */ + free_extent_map(em); +error: + kfree(devices_info); + return ret; } -static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, - struct map_lookup *map, u64 chunk_offset, - u64 chunk_size, u64 stripe_size) + u64 chunk_offset, u64 chunk_size) { - u64 dev_offset; struct btrfs_key key; struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; struct btrfs_device *device; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - size_t item_size = btrfs_chunk_item_size(map->num_stripes); - int index = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct map_lookup *map; + size_t item_size; + u64 dev_offset; + u64 stripe_size; + int i = 0; int ret; + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(extent_root->fs_info, "unable to find logical " + "%Lu len %Lu", chunk_offset, chunk_size); + return -EINVAL; + } + + if (em->start != chunk_offset || em->len != chunk_size) { + btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" + " %Lu-%Lu, found %Lu-%Lu", chunk_offset, + chunk_size, em->start, em->len); + free_extent_map(em); + return -EINVAL; + } + + map = (struct map_lookup *)em->bdev; + item_size = btrfs_chunk_item_size(map->num_stripes); + stripe_size = em->orig_block_len; + chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) - return -ENOMEM; + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; - index = 0; - while (index < map->num_stripes) { - device = map->stripes[index].dev; device->bytes_used += stripe_size; ret = btrfs_update_device(trans, device); - BUG_ON(ret); - index++; + if (ret) + goto out; + ret = btrfs_alloc_dev_extent(trans, device, + chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + chunk_offset, dev_offset, + stripe_size); + if (ret) + goto out; } - index = 0; + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + stripe = &chunk->stripe; - while (index < map->num_stripes) { - device = map->stripes[index].dev; - dev_offset = map->stripes[index].physical; + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; btrfs_set_stack_stripe_devid(stripe, device->devid); btrfs_set_stack_stripe_offset(stripe, dev_offset); memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; - index++; } btrfs_set_stack_chunk_length(chunk, chunk_size); @@ -2428,15 +4464,19 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, key.offset = chunk_offset; ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); - BUG_ON(ret); - - if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + /* + * TODO: Cleanup of inserted chunk root in case of + * failure. + */ + ret = btrfs_add_system_chunk(chunk_root, &key, chunk, item_size); - BUG_ON(ret); } + +out: kfree(chunk); - return 0; + free_extent_map(em); + return ret; } /* @@ -2450,26 +4490,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 type) { u64 chunk_offset; - u64 chunk_size; - u64 stripe_size; - struct map_lookup *map; - struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; - int ret; - ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, - &chunk_offset); - if (ret) - return ret; - - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, type); - if (ret) - return ret; - - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - BUG_ON(ret); - return 0; + chunk_offset = find_next_chunk(extent_root->fs_info); + return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); } static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, @@ -2478,60 +4501,32 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, { u64 chunk_offset; u64 sys_chunk_offset; - u64 chunk_size; - u64 sys_chunk_size; - u64 stripe_size; - u64 sys_stripe_size; u64 alloc_profile; - struct map_lookup *map; - struct map_lookup *sys_map; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; int ret; - ret = find_next_chunk(fs_info->chunk_root, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); - BUG_ON(ret); - - alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - (fs_info->metadata_alloc_profile & - fs_info->avail_metadata_alloc_bits); - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, alloc_profile); - BUG_ON(ret); - - sys_chunk_offset = chunk_offset + chunk_size; - - alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - (fs_info->system_alloc_profile & - fs_info->avail_system_alloc_bits); - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + chunk_offset = find_next_chunk(fs_info); + alloc_profile = btrfs_get_alloc_profile(extent_root, 0); + ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, + alloc_profile); + if (ret) + return ret; - ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, - &sys_chunk_size, &sys_stripe_size, - sys_chunk_offset, alloc_profile); - BUG_ON(ret); + sys_chunk_offset = find_next_chunk(root->fs_info); + alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); + ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, + alloc_profile); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } ret = btrfs_add_device(trans, fs_info->chunk_root, device); - BUG_ON(ret); - - /* - * Modifying chunk tree needs allocating new blocks from both - * system block group and metadata block group. So we only can - * do operations require modifying the chunk tree after both - * block groups were created. - */ - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - BUG_ON(ret); - - ret = __finish_chunk_alloc(trans, extent_root, sys_map, - sys_chunk_offset, sys_chunk_size, - sys_stripe_size); - BUG_ON(ret); - return 0; + if (ret) + btrfs_abort_transaction(trans, root, ret); +out: + return ret; } int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) @@ -2566,7 +4561,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) void btrfs_mapping_init(struct btrfs_mapping_tree *tree) { - extent_map_tree_init(&tree->map_tree, GFP_NOFS); + extent_map_tree_init(&tree->map_tree); } void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) @@ -2581,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) write_unlock(&tree->map_tree.lock); if (!em) break; - kfree(em->bdev); /* once for us */ free_extent_map(em); /* once for the tree */ @@ -2589,8 +4583,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) } } -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; struct extent_map_tree *em_tree = &map_tree->map_tree; @@ -2599,152 +4594,388 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, len); read_unlock(&em_tree->lock); - BUG_ON(!em); - BUG_ON(em->start > logical || em->start + em->len < logical); + /* + * We could return errors for these cases, but that could get ugly and + * we'd probably do the same thing which is just not do anything else + * and exit, so return 1 so the callers don't try to use other copies. + */ + if (!em) { + btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, + logical+len); + return 1; + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " + "%Lu-%Lu", logical, logical+len, em->start, + em->start + em->len); + free_extent_map(em); + return 1; + } + map = (struct map_lookup *)em->bdev; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) + ret = 2; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + ret = 3; else ret = 1; free_extent_map(em); + + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + ret++; + btrfs_dev_replace_unlock(&fs_info->dev_replace); + return ret; } -static int find_live_mirror(struct map_lookup *map, int first, int num, - int optimal) +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + unsigned long len = root->sectorsize; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + len = map->stripe_len * nr_data_stripes(map); + } + free_extent_map(em); + return len; +} + +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) + ret = 1; + free_extent_map(em); + return ret; +} + +static int find_live_mirror(struct btrfs_fs_info *fs_info, + struct map_lookup *map, int first, int num, + int optimal, int dev_replace_is_ongoing) { int i; - if (map->stripes[optimal].dev->bdev) - return optimal; - for (i = first; i < first + num; i++) { - if (map->stripes[i].dev->bdev) - return i; + int tolerance; + struct btrfs_device *srcdev; + + if (dev_replace_is_ongoing && + fs_info->dev_replace.cont_reading_from_srcdev_mode == + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) + srcdev = fs_info->dev_replace.srcdev; + else + srcdev = NULL; + + /* + * try to avoid the drive that is the source drive for a + * dev-replace procedure, only choose it if no other non-missing + * mirror is available + */ + for (tolerance = 0; tolerance < 2; tolerance++) { + if (map->stripes[optimal].dev->bdev && + (tolerance || map->stripes[optimal].dev != srcdev)) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev && + (tolerance || map->stripes[i].dev != srcdev)) + return i; + } } + /* we couldn't find one that doesn't fail. Just return something * and the io error handling code will clean up eventually */ return optimal; } -static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +static inline int parity_smaller(u64 a, u64 b) +{ + return a > b; +} + +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) +{ + struct btrfs_bio_stripe s; + int i; + u64 l; + int again = 1; + + while (again) { + again = 0; + for (i = 0; i < bbio->num_stripes - 1; i++) { + if (parity_smaller(raid_map[i], raid_map[i+1])) { + s = bbio->stripes[i]; + l = raid_map[i]; + bbio->stripes[i] = bbio->stripes[i+1]; + raid_map[i] = raid_map[i+1]; + bbio->stripes[i+1] = s; + raid_map[i+1] = l; + again = 1; + } + } + } +} + +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, - int mirror_num, struct page *unplug_page) + struct btrfs_bio **bbio_ret, + int mirror_num, u64 **raid_map_ret) { struct extent_map *em; struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; + u64 stripe_end_offset; u64 stripe_nr; - int stripes_allocated = 8; - int stripes_required = 1; + u64 stripe_nr_orig; + u64 stripe_nr_end; + u64 stripe_len; + u64 *raid_map = NULL; int stripe_index; int i; + int ret = 0; int num_stripes; int max_errors = 0; - struct btrfs_multi_bio *multi = NULL; - - if (multi_ret && !(rw & (1 << BIO_RW))) - stripes_allocated = 1; -again: - if (multi_ret) { - multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), - GFP_NOFS); - if (!multi) - return -ENOMEM; - - atomic_set(&multi->error, 0); - } + struct btrfs_bio *bbio = NULL; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; + int num_alloc_stripes; + int patch_the_first_stripe_for_dev_replace = 0; + u64 physical_to_patch_in_first_stripe = 0; + u64 raid56_full_stripe_start = (u64)-1; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); - if (!em && unplug_page) { - kfree(multi); - return 0; + if (!em) { + btrfs_crit(fs_info, "unable to find logical %llu len %llu", + logical, *length); + return -EINVAL; } - if (!em) { - printk(KERN_CRIT "unable to find logical %llu len %llu\n", - (unsigned long long)logical, - (unsigned long long)*length); - BUG(); + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " + "found %Lu-%Lu", logical, em->start, + em->start + em->len); + free_extent_map(em); + return -EINVAL; } - BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; offset = logical - em->start; - if (mirror_num > map->num_stripes) - mirror_num = 0; - - /* if our multi bio struct is too small, back off and try again */ - if (rw & (1 << BIO_RW)) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP)) { - stripes_required = map->num_stripes; - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripes_required = map->sub_stripes; - max_errors = 1; - } - } - if (multi_ret && (rw & (1 << BIO_RW)) && - stripes_allocated < stripes_required) { - stripes_allocated = map->num_stripes; - free_extent_map(em); - kfree(multi); - goto again; - } + stripe_len = map->stripe_len; stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, stripe_len); - stripe_offset = stripe_nr * map->stripe_len; + stripe_offset = stripe_nr * stripe_len; BUG_ON(offset < stripe_offset); /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { - /* we limit the length of each bio to what fits in a stripe */ - *length = min_t(u64, em->len - offset, - map->stripe_len - stripe_offset); + /* if we're here for raid56, we need to know the stripe aligned start */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + raid56_full_stripe_start = offset; + + /* allow a write of a full stripe, but make sure we don't + * allow straddling of stripes + */ + do_div(raid56_full_stripe_start, full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + } + + if (rw & REQ_DISCARD) { + /* we don't discard raid56 yet */ + if (map->type & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + ret = -EOPNOTSUPP; + goto out; + } + *length = min_t(u64, em->len - offset, *length); + } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len; + /* For writes to RAID[56], allow a full stripeset across all disks. + For other RAID types and for RAID[56] reads, just allow a single + stripe (on a single disk). */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + (rw & REQ_WRITE)) { + max_len = stripe_len * nr_data_stripes(map) - + (offset - raid56_full_stripe_start); + } else { + /* we limit the length of each bio to what fits in a stripe */ + max_len = stripe_len - stripe_offset; + } + *length = min_t(u64, em->len - offset, max_len); } else { *length = em->len - offset; } - if (!multi_ret && !unplug_page) + /* This is for when we're called from btrfs_merge_bio_hook() and all + it cares about is the length */ + if (!bbio_ret) goto out; + btrfs_dev_replace_lock(dev_replace); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (!dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); + + if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && + !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && + dev_replace->tgtdev != NULL) { + /* + * in dev-replace case, for repair case (that's the only + * case where the mirror is selected explicitly when + * calling btrfs_map_block), blocks left of the left cursor + * can also be read from the target drive. + * For REQ_GET_READ_MIRRORS, the target drive is added as + * the last one to the array of stripes. For READ, it also + * needs to be supported using the same mirror number. + * If the requested block is not left of the left cursor, + * EIO is returned. This can happen because btrfs_num_copies() + * returns one more in the dev-replace case. + */ + u64 tmp_length = *length; + struct btrfs_bio *tmp_bbio = NULL; + int tmp_num_stripes; + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, + logical, &tmp_length, &tmp_bbio, 0, NULL); + if (ret) { + WARN_ON(tmp_bbio != NULL); + goto out; + } + + tmp_num_stripes = tmp_bbio->num_stripes; + if (mirror_num > tmp_num_stripes) { + /* + * REQ_GET_READ_MIRRORS does not contain this + * mirror, that means that the requested area + * is not left of the left cursor + */ + ret = -EIO; + kfree(tmp_bbio); + goto out; + } + + /* + * process the rest of the function using the mirror_num + * of the source drive. Therefore look it up first. + * At the end, patch the device pointer to the one of the + * target drive. + */ + for (i = 0; i < tmp_num_stripes; i++) { + if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + tmp_bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = + tmp_bbio->stripes[i].physical; + } + } + + if (found) { + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; + } else { + WARN_ON(1); + ret = -EIO; + kfree(tmp_bbio); + goto out; + } + + kfree(tmp_bbio); + } else if (mirror_num > map->num_stripes) { + mirror_num = 0; + } + num_stripes = 1; stripe_index = 0; - if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (unplug_page || (rw & (1 << BIO_RW))) + stripe_nr_orig = stripe_nr; + stripe_nr_end = ALIGN(offset + *length, map->stripe_len); + do_div(stripe_nr_end, map->stripe_len); + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + *length); + + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->num_stripes, + stripe_nr_end - stripe_nr_orig); + stripe_index = do_div(stripe_nr, map->num_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; else { - stripe_index = find_live_mirror(map, 0, + stripe_index = find_live_mirror(fs_info, map, 0, map->num_stripes, - current->pid % map->num_stripes); + current->pid % map->num_stripes, + dev_replace_is_ongoing); + mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (1 << BIO_RW)) + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 1; + } else { + mirror_num = 1; + } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; @@ -2752,14 +4983,81 @@ again: stripe_index = do_div(stripe_nr, factor); stripe_index *= map->sub_stripes; - if (unplug_page || (rw & (1 << BIO_RW))) + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) num_stripes = map->sub_stripes; + else if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->sub_stripes * + (stripe_nr_end - stripe_nr_orig), + map->num_stripes); else if (mirror_num) stripe_index += mirror_num - 1; else { - stripe_index = find_live_mirror(map, stripe_index, + int old_stripe_index = stripe_index; + stripe_index = find_live_mirror(fs_info, map, + stripe_index, map->sub_stripes, stripe_index + - current->pid % map->sub_stripes); + current->pid % map->sub_stripes, + dev_replace_is_ongoing); + mirror_num = stripe_index - old_stripe_index + 1; + } + + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + u64 tmp; + + if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) + && raid_map_ret) { + int i, rot; + + /* push stripe_nr back to the start of the full stripe */ + stripe_nr = raid56_full_stripe_start; + do_div(stripe_nr, stripe_len); + + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + + /* RAID[56] write or recovery. Return all stripes */ + num_stripes = map->num_stripes; + max_errors = nr_parity_stripes(map); + + raid_map = kmalloc_array(num_stripes, sizeof(u64), + GFP_NOFS); + if (!raid_map) { + ret = -ENOMEM; + goto out; + } + + /* Work out the disk rotation on this stripe-set */ + tmp = stripe_nr; + rot = do_div(tmp, num_stripes); + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + for (i = 0; i < nr_data_stripes(map); i++) + raid_map[(i+rot) % num_stripes] = + em->start + (tmp + i) * map->stripe_len; + + raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + raid_map[(i+rot+1) % num_stripes] = + RAID6_Q_STRIPE; + + *length = map->stripe_len; + stripe_index = 0; + stripe_offset = 0; + } else { + /* + * Mirror #0 or #1 means the original data block. + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + if (mirror_num > 1) + stripe_index = nr_data_stripes(map) + + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ + tmp = stripe_nr + stripe_index; + stripe_index = do_div(tmp, map->num_stripes); } } else { /* @@ -2768,43 +5066,229 @@ again: * stripe_index is the number of our device in the stripe array */ stripe_index = do_div(stripe_nr, map->num_stripes); + mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); - for (i = 0; i < num_stripes; i++) { - if (unplug_page) { - struct btrfs_device *device; - struct backing_dev_info *bdi; - - device = map->stripes[stripe_index].dev; - if (device->bdev) { - bdi = blk_get_backing_dev_info(device->bdev); - if (bdi->unplug_io_fn) - bdi->unplug_io_fn(bdi, unplug_page); - } - } else { - multi->stripes[i].physical = + num_alloc_stripes = num_stripes; + if (dev_replace_is_ongoing) { + if (rw & (REQ_WRITE | REQ_DISCARD)) + num_alloc_stripes <<= 1; + if (rw & REQ_GET_READ_MIRRORS) + num_alloc_stripes++; + } + bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); + if (!bbio) { + kfree(raid_map); + ret = -ENOMEM; + goto out; + } + atomic_set(&bbio->error, 0); + + if (rw & REQ_DISCARD) { + int factor = 0; + int sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + + if (map->type & + (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + stripes_per_dev = div_u64_rem(stripe_nr_end - + stripe_nr_orig, + factor, + &remaining_stripes); + div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); + last_stripe *= sub_stripes; + } + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = map->stripes[stripe_index].dev; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ + if (i < sub_stripes) + bbio->stripes[i].length -= + stripe_offset; + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) + bbio->stripes[i].length -= + stripe_end_offset; + + if (i == sub_stripes - 1) + stripe_offset = 0; + } else + bbio->stripes[i].length = *length; + + stripe_index++; + if (stripe_index == map->num_stripes) { + /* This could only happen for RAID0/10 */ + stripe_index = 0; + stripe_nr++; + } + } + } else { + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; + } + } + + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { + max_errors = 2; } - stripe_index++; } - if (multi_ret) { - *multi_ret = multi; - multi->num_stripes = num_stripes; - multi->max_errors = max_errors; + + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && + dev_replace->tgtdev != NULL) { + int index_where_to_add; + u64 srcdev_devid = dev_replace->srcdev->devid; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk + * to the new disk takes place at run time while the + * filesystem is mounted writable, the regular write + * operations to the old disk have to be duplicated to go + * to the new disk as well. + * Note that device->missing is handled by the caller, and + * that the write to the old disk is already set up in the + * stripes array. + */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + index_where_to_add++; + max_errors++; + } + } + num_stripes = index_where_to_add; + } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && + dev_replace->tgtdev != NULL) { + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can + * also be used to read data in case it is needed to repair + * a corrupt block elsewhere. This is possible if the + * requested area is left of the left cursor. In this area, + * the target drive is a full copy of the source drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + u64 length = map->stripe_len; + + if (physical_of_found + length <= + dev_replace->cursor_left) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + + num_stripes++; + } + } + } + + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; + + /* + * this is the case that REQ_READ && dev_replace_is_ongoing && + * mirror_num == num_stripes + 1 && dev_replace target drive is + * available as a mirror + */ + if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { + WARN_ON(num_stripes > 1); + bbio->stripes[0].dev = dev_replace->tgtdev; + bbio->stripes[0].physical = physical_to_patch_in_first_stripe; + bbio->mirror_num = map->num_stripes + 1; + } + if (raid_map) { + sort_parity_stripes(bbio, raid_map); + *raid_map_ret = raid_map; } out: + if (dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); free_extent_map(em); - return 0; + return ret; } -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, mirror_num, NULL); } @@ -2819,23 +5303,42 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 bytenr; u64 length; u64 stripe_nr; + u64 rmap_len; int i, j, nr = 0; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_start, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start != chunk_start); + if (!em) { + printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", + chunk_start); + return -EIO; + } + + if (em->start != chunk_start) { + printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", + em->start, chunk_start); + free_extent_map(em); + return -EIO; + } map = (struct map_lookup *)em->bdev; length = em->len; + rmap_len = map->stripe_len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, map->num_stripes); + else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + do_div(length, nr_data_stripes(map)); + rmap_len = map->stripe_len * nr_data_stripes(map); + } buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); - BUG_ON(!buf); + BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { if (devid && map->stripes[i].dev->devid != devid) @@ -2852,8 +5355,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(stripe_nr, map->sub_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; - } - bytenr = chunk_start + stripe_nr * map->stripe_len; + } /* else if RAID[56], multiply by nr_data_stripes(). + * Alternatively, just use rmap_len below instead of + * map->stripe_len */ + + bytenr = chunk_start + stripe_nr * rmap_len; WARN_ON(nr >= map->num_stripes); for (j = 0; j < nr; j++) { if (buf[j] == bytenr) @@ -2867,44 +5373,70 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, *logical = buf; *naddrs = nr; - *stripe_len = map->stripe_len; + *stripe_len = rmap_len; free_extent_map(em); return 0; } -int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, - u64 logical, struct page *page) +static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) { - u64 length = PAGE_CACHE_SIZE; - return __btrfs_map_block(map_tree, READ, logical, &length, - NULL, 0, page); + if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) + bio_endio_nodec(bio, err); + else + bio_endio(bio, err); + kfree(bbio); } -static void end_bio_multi_stripe(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_multi_bio *multi = bio->bi_private; + struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; - if (err) - atomic_inc(&multi->error); + if (err) { + atomic_inc(&bbio->error); + if (err == -EIO || err == -EREMOTEIO) { + unsigned int stripe_index = + btrfs_io_bio(bio)->stripe_index; + + BUG_ON(stripe_index >= bbio->num_stripes); + dev = bbio->stripes[stripe_index].dev; + if (dev->bdev) { + if (bio->bi_rw & WRITE) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_WRITE_ERRS); + else + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_READ_ERRS); + if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_print_on_error(dev); + } + } + } - if (bio == multi->orig_bio) + if (bio == bbio->orig_bio) is_orig_bio = 1; - if (atomic_dec_and_test(&multi->stripes_pending)) { + btrfs_bio_counter_dec(bbio->fs_info); + + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = multi->orig_bio; + bio = bbio->orig_bio; } - bio->bi_private = multi->private; - bio->bi_end_io = multi->end_io; + + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is - * beyond the tolerance of the multi-bio + * beyond the tolerance of the btrfs bio */ - if (atomic_read(&multi->error) > multi->max_errors) { + if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; - } else if (err) { + } else { /* * this bio is actually up to date, we didn't * go over the max number of errors @@ -2912,21 +5444,13 @@ static void end_bio_multi_stripe(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(multi); - bio_endio(bio, err); + btrfs_end_bbio(bbio, bio, err); } else if (!is_orig_bio) { bio_put(bio); } } -struct async_sched { - struct bio *bio; - int rw; - struct btrfs_fs_info *info; - struct btrfs_work work; -}; - /* * see run_scheduled_bios for a description of why bios are collected for * async submit. @@ -2934,19 +5458,24 @@ struct async_sched { * This will add one bio to the pending list for a device and make sure * the work struct is scheduled. */ -static noinline int schedule_bio(struct btrfs_root *root, - struct btrfs_device *device, - int rw, struct bio *bio) +static noinline void btrfs_schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; + if (device->missing || !device->bdev) { + bio_endio(bio, -EIO); + return; + } + /* don't bother with additional async steps for reads, right now */ - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { bio_get(bio); - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); bio_put(bio); - return 0; + return; } /* @@ -2961,7 +5490,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); - if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) + if (bio->bi_rw & REQ_SYNC) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; @@ -2978,85 +5507,221 @@ static noinline int schedule_bio(struct btrfs_root *root, spin_unlock(&device->io_lock); if (should_queue) - btrfs_queue_worker(&root->fs_info->submit_workers, - &device->work); + btrfs_queue_work(root->fs_info->submit_workers, + &device->work); +} + +static int bio_size_ok(struct block_device *bdev, struct bio *bio, + sector_t sector) +{ + struct bio_vec *prev; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_sectors = queue_max_sectors(q); + struct bvec_merge_data bvm = { + .bi_bdev = bdev, + .bi_sector = sector, + .bi_rw = bio->bi_rw, + }; + + if (WARN_ON(bio->bi_vcnt == 0)) + return 1; + + prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if (bio_sectors(bio) > max_sectors) + return 0; + + if (!q->merge_bvec_fn) + return 1; + + bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) + return 0; + return 1; +} + +static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *bio, u64 physical, int dev_nr, + int rw, int async) +{ + struct btrfs_device *dev = bbio->stripes[dev_nr].dev; + + bio->bi_private = bbio; + btrfs_io_bio(bio)->stripe_index = dev_nr; + bio->bi_end_io = btrfs_end_bio; + bio->bi_iter.bi_sector = physical >> 9; +#ifdef DEBUG + { + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + name->str, dev->devid, bio->bi_size); + rcu_read_unlock(); + } +#endif + bio->bi_bdev = dev->bdev; + + btrfs_bio_counter_inc_noblocked(root->fs_info); + + if (async) + btrfs_schedule_bio(root, dev, rw, bio); + else + btrfsic_submit_bio(rw, bio); +} + +static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *first_bio, struct btrfs_device *dev, + int dev_nr, int rw, int async) +{ + struct bio_vec *bvec = first_bio->bi_io_vec; + struct bio *bio; + int nr_vecs = bio_get_nr_vecs(dev->bdev); + u64 physical = bbio->stripes[dev_nr].physical; + +again: + bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); + if (!bio) + return -ENOMEM; + + while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { + if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len) { + u64 len = bio->bi_iter.bi_size; + + atomic_inc(&bbio->stripes_pending); + submit_stripe_bio(root, bbio, bio, physical, dev_nr, + rw, async); + physical += len; + goto again; + } + bvec++; + } + + submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); return 0; } +static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +{ + atomic_inc(&bbio->error); + if (atomic_dec_and_test(&bbio->stripes_pending)) { + /* Shoud be the original bio. */ + WARN_ON(bio != bbio->orig_bio); + + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + bio->bi_iter.bi_sector = logical >> 9; + + btrfs_end_bbio(bbio, bio, -EIO); + } +} + int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int mirror_num, int async_submit) { - struct btrfs_mapping_tree *map_tree; struct btrfs_device *dev; struct bio *first_bio = bio; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; - struct btrfs_multi_bio *multi = NULL; + u64 *raid_map = NULL; int ret; int dev_nr = 0; int total_devs = 1; + struct btrfs_bio *bbio = NULL; - length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; + length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, - mirror_num); - BUG_ON(ret); + btrfs_bio_counter_inc_blocked(root->fs_info); + ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, + mirror_num, &raid_map); + if (ret) { + btrfs_bio_counter_dec(root->fs_info); + return ret; + } + + total_devs = bbio->num_stripes; + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + bbio->fs_info = root->fs_info; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); + + if (raid_map) { + /* In this case, map_length has been set to the length of + a single stripe; not the whole write */ + if (rw & WRITE) { + ret = raid56_parity_write(root, bio, bbio, + raid_map, map_length); + } else { + ret = raid56_parity_recover(root, bio, bbio, + raid_map, map_length, + mirror_num); + } + /* + * FIXME, replace dosen't support raid56 yet, please fix + * it in the future. + */ + btrfs_bio_counter_dec(root->fs_info); + return ret; + } - total_devs = multi->num_stripes; if (map_length < length) { - printk(KERN_CRIT "mapping failed logical %llu bio len %llu " - "len %llu\n", (unsigned long long)logical, - (unsigned long long)length, - (unsigned long long)map_length); + btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", + logical, length, map_length); BUG(); } - multi->end_io = first_bio->bi_end_io; - multi->private = first_bio->bi_private; - multi->orig_bio = first_bio; - atomic_set(&multi->stripes_pending, multi->num_stripes); while (dev_nr < total_devs) { - if (total_devs > 1) { - if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); - } else { - bio = first_bio; - } - bio->bi_private = multi; - bio->bi_end_io = end_bio_multi_stripe; - } - bio->bi_sector = multi->stripes[dev_nr].physical >> 9; - dev = multi->stripes[dev_nr].dev; - BUG_ON(rw == WRITE && !dev->writeable); - if (dev && dev->bdev) { - bio->bi_bdev = dev->bdev; - if (async_submit) - schedule_bio(root, dev, rw, bio); - else - submit_bio(rw, bio); + dev = bbio->stripes[dev_nr].dev; + if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { + bbio_error(bbio, first_bio, logical); + dev_nr++; + continue; + } + + /* + * Check and see if we're ok with this bio based on it's size + * and offset with the given device. + */ + if (!bio_size_ok(dev->bdev, first_bio, + bbio->stripes[dev_nr].physical >> 9)) { + ret = breakup_stripe_bio(root, bbio, first_bio, dev, + dev_nr, rw, async_submit); + BUG_ON(ret); + dev_nr++; + continue; + } + + if (dev_nr < total_devs - 1) { + bio = btrfs_bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); /* -ENOMEM */ } else { - bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; - bio->bi_sector = logical >> 9; - bio_endio(bio, -EIO); + bio = first_bio; + bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; } + + submit_stripe_bio(root, bbio, bio, + bbio->stripes[dev_nr].physical, dev_nr, rw, + async_submit); dev_nr++; } - if (total_devs == 1) - kfree(multi); + btrfs_bio_counter_dec(root->fs_info); return 0; } -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; - cur_devices = root->fs_info->fs_devices; + cur_devices = fs_info->fs_devices; while (cur_devices) { if (!fsid || !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { @@ -3076,23 +5741,70 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) + device = btrfs_alloc_device(NULL, &devid, dev_uuid); + if (IS_ERR(device)) return NULL; - list_add(&device->dev_list, - &fs_devices->devices); - device->barriers = 1; - device->dev_root = root->fs_info->dev_root; - device->devid = devid; - device->work.func = pending_bios_fn; + + list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; - spin_lock_init(&device->io_lock); - INIT_LIST_HEAD(&device->dev_alloc_list); - memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + + device->missing = 1; + fs_devices->missing_devices++; + return device; } +/** + * btrfs_alloc_device - allocate struct btrfs_device + * @fs_info: used only for generating a new devid, can be NULL if + * devid is provided (i.e. @devid != NULL). + * @devid: a pointer to devid for this device. If NULL a new devid + * is generated. + * @uuid: a pointer to UUID for this device. If NULL a new UUID + * is generated. + * + * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() + * on error. Returned struct is not linked onto any lists and can be + * destroyed with kfree() right away. + */ +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, + const u8 *uuid) +{ + struct btrfs_device *dev; + u64 tmp; + + if (WARN_ON(!devid && !fs_info)) + return ERR_PTR(-EINVAL); + + dev = __alloc_device(); + if (IS_ERR(dev)) + return dev; + + if (devid) + tmp = *devid; + else { + int ret; + + ret = find_next_devid(fs_info, &tmp); + if (ret) { + kfree(dev); + return ERR_PTR(ret); + } + } + dev->devid = tmp; + + if (uuid) + memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); + else + generate_random_uuid(dev->uuid); + + btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); + + return dev; +} + static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -3123,7 +5835,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); } - em = alloc_extent_map(GFP_NOFS); + em = alloc_extent_map(); if (!em) return -ENOMEM; num_stripes = btrfs_chunk_num_stripes(leaf, chunk); @@ -3133,9 +5845,11 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, return -ENOMEM; } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->bdev = (struct block_device *)map; em->start = logical; em->len = length; + em->orig_start = 0; em->block_start = 0; em->block_len = em->len; @@ -3153,10 +5867,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(root, devid, uuid, - NULL); + map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, + uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { - kfree(map); free_extent_map(em); return -EIO; } @@ -3164,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev = add_missing_dev(root, devid, uuid); if (!map->stripes[i].dev) { - kfree(map); free_extent_map(em); return -EIO; } @@ -3173,15 +5885,15 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } write_lock(&map_tree->map_tree.lock); - ret = add_extent_mapping(&map_tree->map_tree, em); + ret = add_extent_mapping(&map_tree->map_tree, em, 0); write_unlock(&map_tree->map_tree.lock); - BUG_ON(ret); + BUG_ON(ret); /* Tree corruption */ free_extent_map(em); return 0; } -static int fill_device_from_item(struct extent_buffer *leaf, +static void fill_device_from_item(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item, struct btrfs_device *device) { @@ -3195,11 +5907,11 @@ static int fill_device_from_item(struct extent_buffer *leaf, device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); + WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); + device->is_tgtdev_for_dev_replace = 0; - ptr = (unsigned long)btrfs_device_uuid(dev_item); + ptr = btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); - - return 0; } static int open_seed_devices(struct btrfs_root *root, u8 *fsid) @@ -3207,7 +5919,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) struct btrfs_fs_devices *fs_devices; int ret; - mutex_lock(&uuid_mutex); + BUG_ON(!mutex_is_locked(&uuid_mutex)); fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { @@ -3232,8 +5944,10 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) ret = __btrfs_open_devices(fs_devices, FMODE_READ, root->fs_info->bdev_holder); - if (ret) + if (ret) { + free_fs_devices(fs_devices); goto out; + } if (!fs_devices->seeding) { __btrfs_close_devices(fs_devices); @@ -3245,7 +5959,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: - mutex_unlock(&uuid_mutex); return ret; } @@ -3260,11 +5973,9 @@ static int read_one_dev(struct btrfs_root *root, u8 dev_uuid[BTRFS_UUID_SIZE]; devid = btrfs_device_id(leaf, dev_item); - read_extent_buffer(leaf, dev_uuid, - (unsigned long)btrfs_device_uuid(dev_item), + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); - read_extent_buffer(leaf, fs_uuid, - (unsigned long)btrfs_device_fsid(dev_item), + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { @@ -3273,17 +5984,25 @@ static int read_one_dev(struct btrfs_root *root, return ret; } - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); if (!device || !device->bdev) { if (!btrfs_test_opt(root, DEGRADED)) return -EIO; if (!device) { - printk(KERN_WARNING "warning devid %llu missing\n", - (unsigned long long)devid); + btrfs_warn(root->fs_info, "devid %llu missing", devid); device = add_missing_dev(root, devid, dev_uuid); if (!device) return -ENOMEM; + } else if (!device->missing) { + /* + * this happens when a device that was properly setup + * in the device info lists suddenly goes bad. + * device->bdev is NULL, and so we have to set + * device->missing to one here + */ + root->fs_info->fs_devices->missing_devices++; + device->missing = 1; } } @@ -3295,26 +6014,21 @@ static int read_one_dev(struct btrfs_root *root, } fill_device_from_item(leaf, dev_item, device); - device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) + if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = 0; return ret; } -int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) -{ - struct btrfs_dev_item *dev_item; - - dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, - dev_item); - return read_one_dev(root, buf, dev_item); -} - int btrfs_read_sys_array(struct btrfs_root *root) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; @@ -3332,7 +6046,21 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); - btrfs_set_buffer_lockdep_class(sb, 0); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); + /* + * The sb extent buffer is artifical and just used to read the system array. + * btrfs_set_buffer_uptodate() call does not properly mark all it's + * pages up-to-date when the page is larger: extent does not cover the + * whole page and consequently check_page_uptodate does not find all + * the page's extents up-to-date (the hole beyond sb), + * write_extent_buffer then triggers a WARN_ON. + * + * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, + * but sb spans only this function. Add an explicit SetPageUptodate call + * to silence the warning eg. on PowerPC 64. + */ + if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) + SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); @@ -3383,14 +6111,18 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) if (!path) return -ENOMEM; - /* first we search for all of the device items, and then we - * read in all of the chunk items. This way we can create chunk - * mappings that reference all of the devices that are afound + mutex_lock(&uuid_mutex); + lock_chunks(root); + + /* + * Read all device items, and then all the chunk items. All + * device items are found before any chunk item (their object id + * is smaller than the lowest possible object id for a chunk + * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). */ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = 0; -again: ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto error; @@ -3406,17 +6138,13 @@ again: break; } btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { - if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) - break; - if (found_key.type == BTRFS_DEV_ITEM_KEY) { - struct btrfs_dev_item *dev_item; - dev_item = btrfs_item_ptr(leaf, slot, + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); - ret = read_one_dev(root, leaf, dev_item); - if (ret) - goto error; - } + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -3426,13 +6154,276 @@ again: } path->slots[0]++; } - if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { - key.objectid = 0; - btrfs_release_path(root, path); - goto again; - } ret = 0; error: + unlock_chunks(root); + mutex_unlock(&uuid_mutex); + btrfs_free_path(path); return ret; } + +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + while (fs_devices) { + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->dev_root = fs_info->dev_root; + mutex_unlock(&fs_devices->device_list_mutex); + + fs_devices = fs_devices->seed; + } +} + +static void __btrfs_reset_dev_stats(struct btrfs_device *dev) +{ + int i; + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_reset(dev, i); +} + +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_device *device; + struct btrfs_path *path = NULL; + int i; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + int item_size; + struct btrfs_dev_stats_item *ptr; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { + __btrfs_reset_dev_stats(device); + device->dev_stats_valid = 1; + btrfs_release_path(path); + continue; + } + slot = path->slots[0]; + eb = path->nodes[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, + struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_reset(device, i); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + } + mutex_unlock(&fs_devices->device_list_mutex); + +out: + btrfs_free_path(path); + return ret < 0 ? ret : 0; +} + +static int update_dev_stat_item(struct btrfs_trans_handle *trans, + struct btrfs_root *dev_root, + struct btrfs_device *device) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_stats_item *ptr; + int ret; + int i; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + printk_in_rcu(KERN_WARNING "BTRFS: " + "error %d while searching for dev_stats item for device %s!\n", + ret, rcu_str_deref(device->name)); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + printk_in_rcu(KERN_WARNING "BTRFS: " + "delete too small dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + printk_in_rcu(KERN_WARNING "BTRFS: " + "insert dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_set_dev_stats_value(eb, ptr, i, + btrfs_dev_stat_read(device, i)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes all changed device stats to disk. + */ +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->dev_stats_valid || !device->dev_stats_dirty) + continue; + + ret = update_dev_stat_item(trans, dev_root, device); + if (!ret) + device->dev_stats_dirty = 0; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) +{ + btrfs_dev_stat_inc(dev, index); + btrfs_dev_stat_print_on_error(dev); +} + +static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) +{ + if (!dev->dev_stats_valid) + return; + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + rcu_str_deref(dev->name), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); +} + +static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) +{ + int i; + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (btrfs_dev_stat_read(dev, i) != 0) + break; + if (i == BTRFS_DEV_STAT_VALUES_MAX) + return; /* all values == 0, suppress message */ + + printk_in_rcu(KERN_INFO "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + rcu_str_deref(dev->name), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); +} + +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats) +{ + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int i; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + btrfs_warn(root->fs_info, "get dev_stats failed, device not found"); + return -ENODEV; + } else if (!dev->dev_stats_valid) { + btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid"); + return -ENODEV; + } else if (stats->flags & BTRFS_DEV_STATS_RESET) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (stats->nr_items > i) + stats->values[i] = + btrfs_dev_stat_read_and_reset(dev, i); + else + btrfs_dev_stat_reset(dev, i); + } + } else { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (stats->nr_items > i) + stats->values[i] = btrfs_dev_stat_read(dev, i); + } + if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) + stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; + return 0; +} + +int btrfs_scratch_superblock(struct btrfs_device *device) +{ + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + + bh = btrfs_read_dev_super(device->bdev); + if (!bh) + return -EINVAL; + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 31b0fabdd2e..2aaa00c4781 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -20,8 +20,12 @@ #define __BTRFS_VOLUMES_ #include <linux/bio.h> +#include <linux/sort.h> +#include <linux/btrfs.h> #include "async-thread.h" +#define BTRFS_STRIPE_LEN (64 * 1024) + struct buffer_head; struct btrfs_pending_bios { struct bio *head; @@ -39,21 +43,22 @@ struct btrfs_device { /* WRITE_SYNC bios */ struct btrfs_pending_bios pending_sync_bios; - int running_pending; u64 generation; - - int barriers; + int running_pending; int writeable; int in_fs_metadata; + int missing; + int can_discard; + int is_tgtdev_for_dev_replace; spinlock_t io_lock; + /* the mode sent to blkdev_get */ + fmode_t mode; struct block_device *bdev; - /* the mode sent to open_bdev_exclusive */ - fmode_t mode; - char *name; + struct rcu_string *name; /* the internal btrfs device id */ u64 devid; @@ -72,17 +77,42 @@ struct btrfs_device { /* optimal io width for this device */ u32 io_width; + /* type and info about this device */ + u64 type; /* minimal io size for this device */ u32 sector_size; - /* type and info about this device */ - u64 type; /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_UUID_SIZE]; + /* for sending down flush barriers */ + int nobarriers; + struct bio *flush_bio; + struct completion flush_wait; + + /* per-device scrub information */ + struct scrub_ctx *scrub_device; + struct btrfs_work work; + struct rcu_head rcu; + struct work_struct rcu_work; + + /* readahead state */ + spinlock_t reada_lock; + atomic_t reada_in_flight; + u64 reada_next; + struct reada_zone *reada_curr_zone; + struct radix_tree_root reada_zones; + struct radix_tree_root reada_extents; + + + /* disk I/O failure stats. For detailed description refer to + * enum btrfs_dev_stat_values in ioctl.h */ + int dev_stats_valid; + int dev_stats_dirty; /* counters need to be written to disk */ + atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; }; struct btrfs_fs_devices { @@ -94,12 +124,17 @@ struct btrfs_fs_devices { u64 num_devices; u64 open_devices; u64 rw_devices; + u64 missing_devices; u64 total_rw_bytes; + u64 num_can_discard; + u64 total_devices; struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex * so we can safely walk it to write out the supers without - * worrying about add/remove by the multi-device code + * worrying about add/remove by the multi-device code. + * Scrubbing super can kick off supers writing by holding + * this mutex lock. */ struct mutex device_list_mutex; struct list_head devices; @@ -119,32 +154,143 @@ struct btrfs_fs_devices { int rotating; }; +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + +/* + * we need the mirror number and stripe index to be passed around + * the call chain while we are processing end_io (especially errors). + * Really, what we need is a btrfs_bio structure that has this info + * and is properly sized with its stripe array, but we're not there + * quite yet. We have our own btrfs bioset, and all of the bios + * we allocate are actually btrfs_io_bios. We'll cram as much of + * struct btrfs_bio as we can into this over time. + */ +typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); +struct btrfs_io_bio { + unsigned long mirror_num; + unsigned long stripe_index; + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + u8 *csum_allocated; + btrfs_io_bio_end_io_t *end_io; + struct bio bio; +}; + +static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_io_bio, bio); +} + struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; + u64 length; /* only used for discard mappings */ }; -struct btrfs_multi_bio { +struct btrfs_bio; +typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); + +#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 + +struct btrfs_bio { atomic_t stripes_pending; + struct btrfs_fs_info *fs_info; bio_end_io_t *end_io; struct bio *orig_bio; + unsigned long flags; void *private; atomic_t error; int max_errors; int num_stripes; + int mirror_num; struct btrfs_bio_stripe stripes[]; }; -#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; + u64 total_avail; +}; + +struct btrfs_raid_attr { + int sub_stripes; /* sub_stripes info for map */ + int dev_stripes; /* stripes per dev */ + int devs_max; /* max devs to use */ + int devs_min; /* min devs needed */ + int devs_increment; /* ndevs has to be a multiple of this */ + int ncopies; /* how many copies to data has */ +}; + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +/* + * Restriper's general type filter + */ +#define BTRFS_BALANCE_DATA (1ULL << 0) +#define BTRFS_BALANCE_SYSTEM (1ULL << 1) +#define BTRFS_BALANCE_METADATA (1ULL << 2) + +#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ + BTRFS_BALANCE_SYSTEM | \ + BTRFS_BALANCE_METADATA) + +#define BTRFS_BALANCE_FORCE (1ULL << 3) +#define BTRFS_BALANCE_RESUME (1ULL << 4) + +/* + * Balance filters + */ +#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) +#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) +#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) +#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) +#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) + +/* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be + * half-filled). + */ +#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) +#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) + +struct btrfs_balance_args; +struct btrfs_balance_progress; +struct btrfs_balance_control { + struct btrfs_fs_info *fs_info; + + struct btrfs_balance_args data; + struct btrfs_balance_args meta; + struct btrfs_balance_args sys; + + u64 flags; + + struct btrfs_balance_progress stat; +}; + +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length); + +#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset, u64 start, u64 num_bytes); -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num); + struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); @@ -156,32 +302,97 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int mirror_num, int async_submit); -int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); -int btrfs_add_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device); +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step); +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device); +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, + const u8 *uuid); int btrfs_rm_device(struct btrfs_root *root, char *device_path); -int btrfs_cleanup_fs_uuids(void); -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); -int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, - u64 logical, struct page *page); +void btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_root *dev_root); -void btrfs_unlock_volumes(void); -void btrfs_lock_volumes(void); +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device **device_out); +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs); +int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); +int btrfs_recover_balance(struct btrfs_fs_info *fs_info); +int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats); +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); +int btrfs_scratch_superblock(struct btrfs_device *device); +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num); +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical); +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + u64 chunk_offset, u64 chunk_size); +static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, + int index) +{ + atomic_inc(dev->dev_stat_values + index); + dev->dev_stats_dirty = 1; +} + +static inline int btrfs_dev_stat_read(struct btrfs_device *dev, + int index) +{ + return atomic_read(dev->dev_stat_values + index); +} + +static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, + int index) +{ + int ret; + + ret = atomic_xchg(dev->dev_stat_values + index, 0); + dev->dev_stats_dirty = 1; + return ret; +} + +static inline void btrfs_dev_stat_set(struct btrfs_device *dev, + int index, unsigned long val) +{ + atomic_set(dev->dev_stat_values + index, val); + dev->dev_stats_dirty = 1; +} + +static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, + int index) +{ + btrfs_dev_stat_set(dev, index, 0); +} #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 193b58f7d3f..ad8328d797e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -22,11 +22,13 @@ #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" #include "disk-io.h" +#include "props.h" ssize_t __btrfs_getxattr(struct inode *inode, const char *name, @@ -44,7 +46,7 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name, return -ENOMEM; /* lookup the xattr by name */ - di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, strlen(name), 0); if (!di) { ret = -ENODATA; @@ -102,48 +104,92 @@ static int do_setxattr(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - /* first lets see if we already have this xattr */ - di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, - strlen(name), -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } - - /* ok we already have this xattr, lets remove it */ - if (di) { - /* if we want create only exit */ - if (flags & XATTR_CREATE) { - ret = -EEXIST; + if (flags & XATTR_REPLACE) { + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, + name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + ret = -ENODATA; goto out; } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); - btrfs_release_path(root, path); + if (ret) + goto out; + btrfs_release_path(path); - /* if we don't have a value then we are removing the xattr */ + /* + * remove the attribute + */ if (!value) goto out; } else { - btrfs_release_path(root, path); - - if (flags & XATTR_REPLACE) { - /* we couldn't find the attr to replace */ - ret = -ENODATA; + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), + name, name_len, 0); + if (IS_ERR(di)) { + ret = PTR_ERR(di); goto out; } + if (!di && !value) + goto out; + btrfs_release_path(path); } - /* ok we have to create a completely new xattr */ - ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino, +again: + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), name, name_len, value, size); - BUG_ON(ret); + /* + * If we're setting an xattr to a new value but the new value is say + * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting + * back from split_leaf. This is because it thinks we'll be extending + * the existing item size, but we're asking for enough space to add the + * item itself. So if we get EOVERFLOW just set ret to EEXIST and let + * the rest of the function figure it out. + */ + if (ret == -EOVERFLOW) + ret = -EEXIST; + + if (ret == -EEXIST) { + if (flags & XATTR_CREATE) + goto out; + /* + * We can't use the path we already have since we won't have the + * proper locking for a delete, so release the path and + * re-lookup to delete the thing. + */ + btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + /* Shouldn't happen but just in case... */ + btrfs_release_path(path); + goto again; + } + + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + + /* + * We have a value to set, so go back and try to insert it now. + */ + if (value) { + btrfs_release_path(path); + goto again; + } + } out: btrfs_free_path(path); return ret; } +/* + * @value: "" makes the attribute to empty, NULL removes it + */ int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -154,27 +200,21 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (trans) return do_setxattr(trans, inode, name, value, size, flags); - ret = btrfs_reserve_metadata_space(root, 2); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; - } - btrfs_set_trans_block_group(trans, inode); + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = do_setxattr(trans, inode, name, value, size, flags); if (ret) goto out; + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: - btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 2); + btrfs_end_transaction(trans, root); return ret; } @@ -184,21 +224,19 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; - struct btrfs_item *item; struct extent_buffer *leaf; struct btrfs_dir_item *di; - int ret = 0, slot, advance; + int ret = 0, slot; size_t total_size = 0, size_left = size; unsigned long name_ptr; size_t name_len; - u32 nritems; /* * ok we want all objects associated with this id. * NOTE: we set key.offset = 0; because we want to start with the * first xattr that we find and walk forward */ - key.objectid = inode->i_ino; + key.objectid = btrfs_ino(inode); btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); key.offset = 0; @@ -211,36 +249,25 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; - advance = 0; + while (1) { leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; /* this is where we start walking through the path */ - if (advance || slot >= nritems) { + if (slot >= btrfs_header_nritems(leaf)) { /* * if we've reached the last slot in this leaf we need * to go to the next leaf and reset everything */ - if (slot >= nritems-1) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } else { - /* - * just walking through the slots on this leaf - */ - slot++; - path->slots[0]++; - } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; } - advance = 1; - item = btrfs_item_nr(leaf, slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); /* check to make sure this item is what we want */ @@ -250,13 +277,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) break; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + if (verify_dir_item(root, leaf, di)) + goto next; name_len = btrfs_dir_name_len(leaf, di); total_size += name_len + 1; /* we are just looking for how big our buffer needs to be */ if (!size) - continue; + goto next; if (!buffer || (name_len + 1) > size_left) { ret = -ERANGE; @@ -269,6 +298,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) size_left -= name_len + 1; buffer += name_len + 1; +next: + path->slots[0]++; } ret = total_size; @@ -282,10 +313,10 @@ err: * List of handlers for synthetic system.* attributes. All real ondisk * attributes are handled directly. */ -struct xattr_handler *btrfs_xattr_handlers[] = { +const struct xattr_handler *btrfs_xattr_handlers[] = { #ifdef CONFIG_BTRFS_FS_POSIX_ACL - &btrfs_xattr_acl_access_handler, - &btrfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL, }; @@ -302,7 +333,8 @@ static bool btrfs_is_valid_xattr(const char *name) XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || + !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); } ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, @@ -324,6 +356,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -335,6 +376,10 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + value, size, flags); + if (size == 0) value = ""; /* empty EA, do not remove */ @@ -344,6 +389,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, int btrfs_removexattr(struct dentry *dentry, const char *name) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -355,38 +409,44 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + NULL, 0, XATTR_REPLACE); + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); } -int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) +static int btrfs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) { - int err; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + struct btrfs_trans_handle *trans = fs_info; char *name; + int err = 0; - err = security_inode_init_security(inode, dir, &suffix, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, - GFP_NOFS); - if (!name) { - err = -ENOMEM; - } else { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - err = __btrfs_setxattr(trans, inode, name, value, len, 0); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = __btrfs_setxattr(trans, inode, name, + xattr->value, xattr->value_len, 0); kfree(name); + if (err < 0) + break; } - - kfree(suffix); - kfree(value); return err; } + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &btrfs_initxattrs, trans); +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 721efa0346e..5049608d138 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -21,9 +21,7 @@ #include <linux/xattr.h> -extern struct xattr_handler btrfs_xattr_acl_access_handler; -extern struct xattr_handler btrfs_xattr_acl_default_handler; -extern struct xattr_handler *btrfs_xattr_handlers[]; +extern const struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); @@ -37,6 +35,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name, extern int btrfs_removexattr(struct dentry *dentry, const char *name); extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); + struct inode *inode, struct inode *dir, + const struct qstr *qstr); #endif /* __XATTR__ */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 3e2b90eaa23..b67d8fc8127 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -32,15 +32,6 @@ #include <linux/bio.h> #include "compression.h" -/* Plan: call deflate() with avail_in == *sourcelen, - avail_out = *dstlen - 12 and flush == Z_FINISH. - If it doesn't manage to finish, call it again with - avail_in == 0 and avail_out set to the remaining 12 - bytes for it to clean up. - Q: Is 12 bytes sufficient? -*/ -#define STREAM_END_SPACE 12 - struct workspace { z_stream inf_strm; z_stream def_strm; @@ -48,172 +39,66 @@ struct workspace { struct list_head list; }; -static LIST_HEAD(idle_workspace); -static DEFINE_SPINLOCK(workspace_lock); -static unsigned long num_workspace; -static atomic_t alloc_workspace = ATOMIC_INIT(0); -static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); - -/* - * this finds an available zlib workspace or allocates a new one - * NULL or an ERR_PTR is returned if things go bad. - */ -static struct workspace *find_zlib_workspace(void) +static void zlib_free_workspace(struct list_head *ws) { - struct workspace *workspace; - int ret; - int cpus = num_online_cpus(); - -again: - spin_lock(&workspace_lock); - if (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - num_workspace--; - spin_unlock(&workspace_lock); - return workspace; - - } - spin_unlock(&workspace_lock); - if (atomic_read(&alloc_workspace) > cpus) { - DEFINE_WAIT(wait); - prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(&alloc_workspace) > cpus) - schedule(); - finish_wait(&workspace_wait, &wait); - goto again; - } - atomic_inc(&alloc_workspace); - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); - if (!workspace) { - ret = -ENOMEM; - goto fail; - } - - workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); - if (!workspace->def_strm.workspace) { - ret = -ENOMEM; - goto fail; - } - workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); - if (!workspace->inf_strm.workspace) { - ret = -ENOMEM; - goto fail_inflate; - } - workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!workspace->buf) { - ret = -ENOMEM; - goto fail_kmalloc; - } - return workspace; - -fail_kmalloc: - vfree(workspace->inf_strm.workspace); -fail_inflate: - vfree(workspace->def_strm.workspace); -fail: - kfree(workspace); - atomic_dec(&alloc_workspace); - wake_up(&workspace_wait); - return ERR_PTR(ret); -} + struct workspace *workspace = list_entry(ws, struct workspace, list); -/* - * put a workspace struct back on the list or free it if we have enough - * idle ones sitting around - */ -static int free_workspace(struct workspace *workspace) -{ - spin_lock(&workspace_lock); - if (num_workspace < num_online_cpus()) { - list_add_tail(&workspace->list, &idle_workspace); - num_workspace++; - spin_unlock(&workspace_lock); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; - } - spin_unlock(&workspace_lock); vfree(workspace->def_strm.workspace); vfree(workspace->inf_strm.workspace); kfree(workspace->buf); kfree(workspace); - - atomic_dec(&alloc_workspace); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; } -/* - * cleanup function for module exit - */ -static void free_workspaces(void) +static struct list_head *zlib_alloc_workspace(void) { struct workspace *workspace; - while (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); - atomic_dec(&alloc_workspace); - } + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( + MAX_WBITS, MAX_MEM_LEVEL)); + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->def_strm.workspace || + !workspace->inf_strm.workspace || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); } -/* - * given an address space and start/len, compress the bytes. - * - * pages are allocated to hold the compressed result and stored - * in 'pages' - * - * out_pages is used to return the number of pages allocated. There - * may be pages allocated even if we return an error - * - * total_in is used to return the number of bytes actually read. It - * may be smaller then len if we had to exit early because we - * ran out of room in the pages array or because we cross the - * max_out threshold. - * - * total_out is used to return the total number of compressed bytes - * - * max_out tells us the max number of bytes that we're allowed to - * stuff into pages - */ -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) +static int zlib_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - struct workspace *workspace; char *data_in; char *cpage_out; int nr_pages = 0; struct page *in_page = NULL; struct page *out_page = NULL; - int out_written = 0; - int in_read = 0; unsigned long bytes_left; *out_pages = 0; *total_out = 0; *total_in = 0; - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -1; - if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { - printk(KERN_WARNING "deflateInit failed\n"); - ret = -1; + printk(KERN_WARNING "BTRFS: deflateInit failed\n"); + ret = -EIO; goto out; } @@ -224,6 +109,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, data_in = kmap(in_page); out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -233,16 +122,13 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, workspace->def_strm.avail_out = PAGE_CACHE_SIZE; workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); - out_written = 0; - in_read = 0; - while (workspace->def_strm.total_in < len) { ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); if (ret != Z_OK) { - printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); - ret = -1; + ret = -EIO; goto out; } @@ -250,7 +136,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, if (workspace->def_strm.total_in > 8192 && workspace->def_strm.total_in < workspace->def_strm.total_out) { - ret = -1; + ret = -E2BIG; goto out; } /* we need another page for writing out. Test this @@ -261,10 +147,14 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; - ret = -1; + ret = -E2BIG; goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; @@ -298,12 +188,12 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, zlib_deflateEnd(&workspace->def_strm); if (ret != Z_STREAM_END) { - ret = -1; + ret = -EIO; goto out; } if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { - ret = -1; + ret = -E2BIG; goto out; } @@ -319,55 +209,26 @@ out: kunmap(in_page); page_cache_release(in_page); } - free_workspace(workspace); return ret; } -/* - * pages_in is an array of pages with compressed data. - * - * disk_start is the starting logical offset of this array in the file - * - * bvec is a bio_vec of pages from the file that we want to decompress into - * - * vcnt is the count of pages in the biovec - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. - */ -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen) +static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) { - int ret = 0; + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; int wbits = MAX_WBITS; - struct workspace *workspace; char *data_in; size_t total_out = 0; - unsigned long page_bytes_left; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - struct page *page_out; unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - unsigned long working_bytes; unsigned long pg_offset; - unsigned long start_byte; - unsigned long current_buf_start; - char *kaddr; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; data_in = kmap(pages_in[page_in_index]); workspace->inf_strm.next_in = data_in; @@ -377,8 +238,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, workspace->inf_strm.total_out = 0; workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - page_out = bvec[page_out_index].bv_page; - page_bytes_left = PAGE_CACHE_SIZE; pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then @@ -393,108 +252,30 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); + return -EIO; } while (workspace->inf_strm.total_in < srclen) { ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) break; - /* - * buf start is the byte offset we're of the start of - * our workspace buffer - */ - buf_start = total_out; - /* total_out is the last byte of the workspace buffer */ + buf_start = total_out; total_out = workspace->inf_strm.total_out; - working_bytes = total_out - buf_start; - - /* - * start byte is the first byte of the page we're currently - * copying into relative to the start of the compressed data. - */ - start_byte = page_offset(page_out) - disk_start; - - if (working_bytes == 0) { - /* we didn't make progress in this inflate - * call, we're done - */ - if (ret != Z_STREAM_END) - ret = -1; + /* we didn't make progress in this inflate call, we're done */ + if (buf_start == total_out) break; - } - - /* we haven't yet hit data corresponding to this page */ - if (total_out <= start_byte) - goto next; - /* - * the start of the data we care about is offset into - * the middle of our working buffer - */ - if (total_out > start_byte && buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes -= buf_offset; - } else { - buf_offset = 0; + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + total_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) { + ret = 0; + goto done; } - current_buf_start = buf_start; - - /* copy bytes from the working buffer into the pages */ - while (working_bytes > 0) { - bytes = min(PAGE_CACHE_SIZE - pg_offset, - PAGE_CACHE_SIZE - buf_offset); - bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out, KM_USER0); - memcpy(kaddr + pg_offset, workspace->buf + buf_offset, - bytes); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page_out); - - pg_offset += bytes; - page_bytes_left -= bytes; - buf_offset += bytes; - working_bytes -= bytes; - current_buf_start += bytes; - - /* check if we need to pick another page */ - if (page_bytes_left == 0) { - page_out_index++; - if (page_out_index >= vcnt) { - ret = 0; - goto done; - } - - page_out = bvec[page_out_index].bv_page; - pg_offset = 0; - page_bytes_left = PAGE_CACHE_SIZE; - start_byte = page_offset(page_out) - disk_start; - - /* - * make sure our new page is covered by this - * working buffer - */ - if (total_out <= start_byte) - goto next; - - /* the next page in the biovec might not - * be adjacent to the last page, but it - * might still be found inside this working - * buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + - buf_offset; - } - } - } -next: + workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; @@ -514,42 +295,28 @@ next: } } if (ret != Z_STREAM_END) - ret = -1; + ret = -EIO; else ret = 0; done: zlib_inflateEnd(&workspace->inf_strm); if (data_in) kunmap(pages_in[page_in_index]); -out: - free_workspace(workspace); return ret; } -/* - * a less complex decompression routine. Our compressed data fits in a - * single page, and we want to read a single page out of it. - * start_byte tells us the offset into the compressed data we're interested in - */ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +static int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - struct workspace *workspace; unsigned long bytes_left = destlen; unsigned long total_out = 0; char *kaddr; - if (destlen > PAGE_CACHE_SIZE) - return -ENOMEM; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; - workspace->inf_strm.next_in = data_in; workspace->inf_strm.avail_in = srclen; workspace->inf_strm.total_in = 0; @@ -569,9 +336,8 @@ int btrfs_zlib_decompress(unsigned char *data_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); + return -EIO; } while (bytes_left > 0) { @@ -588,7 +354,7 @@ int btrfs_zlib_decompress(unsigned char *data_in, total_out = workspace->inf_strm.total_out; if (total_out == buf_start) { - ret = -1; + ret = -EIO; break; } @@ -604,9 +370,9 @@ int btrfs_zlib_decompress(unsigned char *data_in, PAGE_CACHE_SIZE - buf_offset); bytes = min(bytes, bytes_left); - kaddr = kmap_atomic(dest_page, KM_USER0); + kaddr = kmap_atomic(dest_page); memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); pg_offset += bytes; bytes_left -= bytes; @@ -616,17 +382,18 @@ next: } if (ret != Z_STREAM_END && bytes_left != 0) - ret = -1; + ret = -EIO; else ret = 0; zlib_inflateEnd(&workspace->inf_strm); -out: - free_workspace(workspace); return ret; } -void btrfs_zlib_exit(void) -{ - free_workspaces(); -} +struct btrfs_compress_op btrfs_zlib_compress = { + .alloc_workspace = zlib_alloc_workspace, + .free_workspace = zlib_free_workspace, + .compress_pages = zlib_compress_pages, + .decompress_biovec = zlib_decompress_biovec, + .decompress = zlib_decompress, +}; |
