diff options
35 files changed, 2137 insertions, 1626 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index c1e0b0caf9c..ecb5832c096 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -21,708 +22,313 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> +#include <linux/workqueue.h> #include "async-thread.h" +#include "ctree.h" + +#define WORK_DONE_BIT 0 +#define WORK_ORDER_DONE_BIT 1 +#define WORK_HIGH_PRIO_BIT 2 + +#define NO_THRESHOLD (-1) +#define DFT_THRESHOLD (32) + +struct __btrfs_workqueue { + struct workqueue_struct *normal_wq; + /* List head pointing to ordered work list */ + struct list_head ordered_list; + + /* Spinlock for ordered_list */ + spinlock_t list_lock; + + /* Thresholding related variants */ + atomic_t pending; + int max_active; + int current_max; + int thresh; + unsigned int count; + spinlock_t thres_lock; +}; -#define WORK_QUEUED_BIT 0 -#define WORK_DONE_BIT 1 -#define WORK_ORDER_DONE_BIT 2 -#define WORK_HIGH_PRIO_BIT 3 - -/* - * container for the kthread task pointer and the list of pending work - * One of these is allocated per thread. - */ -struct btrfs_worker_thread { - /* pool we belong to */ - struct btrfs_workers *workers; - - /* list of struct btrfs_work that are waiting for service */ - struct list_head pending; - struct list_head prio_pending; - - /* list of worker threads from struct btrfs_workers */ - struct list_head worker_list; - - /* kthread */ - struct task_struct *task; +struct btrfs_workqueue { + struct __btrfs_workqueue *normal; + struct __btrfs_workqueue *high; +}; - /* number of things on the pending list */ - atomic_t num_pending; +static inline struct __btrfs_workqueue +*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, + int thresh) +{ + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); - /* reference counter for this struct */ - atomic_t refs; + if (unlikely(!ret)) + return NULL; - unsigned long sequence; + ret->max_active = max_active; + atomic_set(&ret->pending, 0); + if (thresh == 0) + thresh = DFT_THRESHOLD; + /* For low threshold, disabling threshold is a better choice */ + if (thresh < DFT_THRESHOLD) { + ret->current_max = max_active; + ret->thresh = NO_THRESHOLD; + } else { + ret->current_max = 1; + ret->thresh = thresh; + } - /* protects the pending list. 
*/ - spinlock_t lock; + if (flags & WQ_HIGHPRI) + ret->normal_wq = alloc_workqueue("%s-%s-high", flags, + ret->max_active, + "btrfs", name); + else + ret->normal_wq = alloc_workqueue("%s-%s", flags, + ret->max_active, "btrfs", + name); + if (unlikely(!ret->normal_wq)) { + kfree(ret); + return NULL; + } - /* set to non-zero when this thread is already awake and kicking */ - int working; + INIT_LIST_HEAD(&ret->ordered_list); + spin_lock_init(&ret->list_lock); + spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); + return ret; +} - /* are we currently idle */ - int idle; -}; +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -static int __btrfs_start_workers(struct btrfs_workers *workers); +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + int flags, + int max_active, + int thresh) +{ + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); -/* - * btrfs_start_workers uses kthread_run, which can block waiting for memory - * for a very long time. It will actually throttle on page writeback, - * and so it may not make progress until after our btrfs worker threads - * process all of the pending work structs in their queue - * - * This means we can't use btrfs_start_workers from inside a btrfs worker - * thread that is used as part of cleaning dirty memory, which pretty much - * involves all of the worker threads. - * - * Instead we have a helper queue who never has more than one thread - * where we scheduler thread start operations. This worker_start struct - * is used to contain the work and hold a pointer to the queue that needs - * another worker. - */ -struct worker_start { - struct btrfs_work work; - struct btrfs_workers *queue; -}; + if (unlikely(!ret)) + return NULL; -static void start_new_worker_func(struct btrfs_work *work) -{ - struct worker_start *start; - start = container_of(work, struct worker_start, work); - __btrfs_start_workers(start->queue); - kfree(start); -} + ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + max_active, thresh); + if (unlikely(!ret->normal)) { + kfree(ret); + return NULL; + } -/* - * helper function to move a thread onto the idle list after it - * has finished some requests. - */ -static void check_idle_worker(struct btrfs_worker_thread *worker) -{ - if (!worker->idle && atomic_read(&worker->num_pending) < - worker->workers->idle_thresh / 2) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 1; - - /* the list may be empty if the worker is just starting */ - if (!list_empty(&worker->worker_list) && - !worker->workers->stopping) { - list_move(&worker->worker_list, - &worker->workers->idle_list); + if (flags & WQ_HIGHPRI) { + ret->high = __btrfs_alloc_workqueue(name, flags, max_active, + thresh); + if (unlikely(!ret->high)) { + __btrfs_destroy_workqueue(ret->normal); + kfree(ret); + return NULL; } - spin_unlock_irqrestore(&worker->workers->lock, flags); } + return ret; } /* - * helper function to move a thread off the idle list after new - * pending work is added. + * Hook for threshold which will be called in btrfs_queue_work. 
+ * This hook WILL be called in IRQ handler context, + * so workqueue_set_max_active MUST NOT be called in this hook */ -static void check_busy_worker(struct btrfs_worker_thread *worker) +static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) { - if (worker->idle && atomic_read(&worker->num_pending) >= - worker->workers->idle_thresh) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 0; - - if (!list_empty(&worker->worker_list) && - !worker->workers->stopping) { - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - } - spin_unlock_irqrestore(&worker->workers->lock, flags); - } + if (wq->thresh == NO_THRESHOLD) + return; + atomic_inc(&wq->pending); } -static void check_pending_worker_creates(struct btrfs_worker_thread *worker) +/* + * Hook for threshold which will be called before executing the work, + * This hook is called in kthread content. + * So workqueue_set_max_active is called here. + */ +static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) { - struct btrfs_workers *workers = worker->workers; - struct worker_start *start; - unsigned long flags; + int new_max_active; + long pending; + int need_change = 0; - rmb(); - if (!workers->atomic_start_pending) + if (wq->thresh == NO_THRESHOLD) return; - start = kzalloc(sizeof(*start), GFP_NOFS); - if (!start) - return; - - start->work.func = start_new_worker_func; - start->queue = workers; - - spin_lock_irqsave(&workers->lock, flags); - if (!workers->atomic_start_pending) - goto out; - - workers->atomic_start_pending = 0; - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) - goto out; - - workers->num_workers_starting += 1; - spin_unlock_irqrestore(&workers->lock, flags); - btrfs_queue_worker(workers->atomic_worker_start, &start->work); - return; + atomic_dec(&wq->pending); + spin_lock(&wq->thres_lock); + /* + * Use wq->count to limit the calling frequency of + * workqueue_set_max_active. + */ + wq->count++; + wq->count %= (wq->thresh / 4); + if (!wq->count) + goto out; + new_max_active = wq->current_max; + /* + * pending may be changed later, but it's OK since we really + * don't need it so accurate to calculate new_max_active. 
+ */ + pending = atomic_read(&wq->pending); + if (pending > wq->thresh) + new_max_active++; + if (pending < wq->thresh / 2) + new_max_active--; + new_max_active = clamp_val(new_max_active, 1, wq->max_active); + if (new_max_active != wq->current_max) { + need_change = 1; + wq->current_max = new_max_active; + } out: - kfree(start); - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&wq->thres_lock); + + if (need_change) { + workqueue_set_max_active(wq->normal_wq, wq->current_max); + } } -static noinline void run_ordered_completions(struct btrfs_workers *workers, - struct btrfs_work *work) +static void run_ordered_work(struct __btrfs_workqueue *wq) { - if (!workers->ordered) - return; - - set_bit(WORK_DONE_BIT, &work->flags); - - spin_lock(&workers->order_lock); + struct list_head *list = &wq->ordered_list; + struct btrfs_work *work; + spinlock_t *lock = &wq->list_lock; + unsigned long flags; while (1) { - if (!list_empty(&workers->prio_order_list)) { - work = list_entry(workers->prio_order_list.next, - struct btrfs_work, order_list); - } else if (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - } else { + spin_lock_irqsave(lock, flags); + if (list_empty(list)) break; - } + work = list_entry(list->next, struct btrfs_work, + ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; - /* we are going to call the ordered done function, but + /* + * we are going to call the ordered done function, but * we leave the work item on the list as a barrier so * that later work items that are done don't have their * functions called before this one returns */ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) break; - - spin_unlock(&workers->order_lock); - + trace_btrfs_ordered_sched(work); + spin_unlock_irqrestore(lock, flags); work->ordered_func(work); /* now take the lock again and drop our item from the list */ - spin_lock(&workers->order_lock); - list_del(&work->order_list); - spin_unlock(&workers->order_lock); + spin_lock_irqsave(lock, flags); + list_del(&work->ordered_list); + spin_unlock_irqrestore(lock, flags); /* * we don't want to call the ordered free functions * with the lock held though */ work->ordered_free(work); - spin_lock(&workers->order_lock); - } - - spin_unlock(&workers->order_lock); -} - -static void put_worker(struct btrfs_worker_thread *worker) -{ - if (atomic_dec_and_test(&worker->refs)) - kfree(worker); -} - -static int try_worker_shutdown(struct btrfs_worker_thread *worker) -{ - int freeit = 0; - - spin_lock_irq(&worker->lock); - spin_lock(&worker->workers->lock); - if (worker->workers->num_workers > 1 && - worker->idle && - !worker->working && - !list_empty(&worker->worker_list) && - list_empty(&worker->prio_pending) && - list_empty(&worker->pending) && - atomic_read(&worker->num_pending) == 0) { - freeit = 1; - list_del_init(&worker->worker_list); - worker->workers->num_workers--; + trace_btrfs_all_work_done(work); } - spin_unlock(&worker->workers->lock); - spin_unlock_irq(&worker->lock); - - if (freeit) - put_worker(worker); - return freeit; + spin_unlock_irqrestore(lock, flags); } -static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, - struct list_head *prio_head, - struct list_head *head) -{ - struct btrfs_work *work = NULL; - struct list_head *cur = NULL; - - if (!list_empty(prio_head)) - cur = prio_head->next; - - smp_mb(); - if (!list_empty(&worker->prio_pending)) - goto refill; - - if (!list_empty(head)) - cur = head->next; - - if (cur) - goto out; - 
-refill: - spin_lock_irq(&worker->lock); - list_splice_tail_init(&worker->prio_pending, prio_head); - list_splice_tail_init(&worker->pending, head); - - if (!list_empty(prio_head)) - cur = prio_head->next; - else if (!list_empty(head)) - cur = head->next; - spin_unlock_irq(&worker->lock); - - if (!cur) - goto out_fail; - -out: - work = list_entry(cur, struct btrfs_work, list); - -out_fail: - return work; -} - -/* - * main loop for servicing work items - */ -static int worker_loop(void *arg) +static void normal_work_helper(struct work_struct *arg) { - struct btrfs_worker_thread *worker = arg; - struct list_head head; - struct list_head prio_head; struct btrfs_work *work; + struct __btrfs_workqueue *wq; + int need_order = 0; - INIT_LIST_HEAD(&head); - INIT_LIST_HEAD(&prio_head); - - do { -again: - while (1) { - - - work = get_next_work(worker, &prio_head, &head); - if (!work) - break; - - list_del(&work->list); - clear_bit(WORK_QUEUED_BIT, &work->flags); - - work->worker = worker; - - work->func(work); - - atomic_dec(&worker->num_pending); - /* - * unless this is an ordered work queue, - * 'work' was probably freed by func above. - */ - run_ordered_completions(worker->workers, work); - - check_pending_worker_creates(worker); - cond_resched(); - } - - spin_lock_irq(&worker->lock); - check_idle_worker(worker); - - if (freezing(current)) { - worker->working = 0; - spin_unlock_irq(&worker->lock); - try_to_freeze(); - } else { - spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) { - cpu_relax(); - /* - * we've dropped the lock, did someone else - * jump_in? - */ - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - /* - * this short schedule allows more work to - * come in without the queue functions - * needing to go through wake_up_process() - * - * worker->working is still 1, so nobody - * is going to try and wake us up - */ - schedule_timeout(1); - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - if (kthread_should_stop()) - break; - - /* still no more work?, sleep for real */ - spin_lock_irq(&worker->lock); - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) { - spin_unlock_irq(&worker->lock); - set_current_state(TASK_RUNNING); - goto again; - } - - /* - * this makes sure we get a wakeup when someone - * adds something new to the queue - */ - worker->working = 0; - spin_unlock_irq(&worker->lock); - - if (!kthread_should_stop()) { - schedule_timeout(HZ * 120); - if (!worker->working && - try_worker_shutdown(worker)) { - return 0; - } - } - } - __set_current_state(TASK_RUNNING); - } - } while (!kthread_should_stop()); - return 0; -} - -/* - * this will wait for all the worker threads to shutdown - */ -void btrfs_stop_workers(struct btrfs_workers *workers) -{ - struct list_head *cur; - struct btrfs_worker_thread *worker; - int can_stop; - - spin_lock_irq(&workers->lock); - workers->stopping = 1; - list_splice_init(&workers->idle_list, &workers->worker_list); - while (!list_empty(&workers->worker_list)) { - cur = workers->worker_list.next; - worker = list_entry(cur, struct btrfs_worker_thread, - worker_list); - - atomic_inc(&worker->refs); - workers->num_workers -= 1; - if (!list_empty(&worker->worker_list)) { - list_del_init(&worker->worker_list); - put_worker(worker); - can_stop = 1; - } else - can_stop = 0; - spin_unlock_irq(&workers->lock); - if (can_stop) - kthread_stop(worker->task); - 
spin_lock_irq(&workers->lock); - put_worker(worker); + work = container_of(arg, struct btrfs_work, normal_work); + /* + * We should not touch things inside work in the following cases: + * 1) after work->func() if it has no ordered_free + * Since the struct is freed in work->func(). + * 2) after setting WORK_DONE_BIT + * The work may be freed in other threads almost instantly. + * So we save the needed things here. + */ + if (work->ordered_func) + need_order = 1; + wq = work->wq; + + trace_btrfs_work_sched(work); + thresh_exec_hook(wq); + work->func(work); + if (need_order) { + set_bit(WORK_DONE_BIT, &work->flags); + run_ordered_work(wq); } - spin_unlock_irq(&workers->lock); + if (!need_order) + trace_btrfs_all_work_done(work); } -/* - * simple init on struct btrfs_workers - */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_helper) +void btrfs_init_work(struct btrfs_work *work, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free) { - workers->num_workers = 0; - workers->num_workers_starting = 0; - INIT_LIST_HEAD(&workers->worker_list); - INIT_LIST_HEAD(&workers->idle_list); - INIT_LIST_HEAD(&workers->order_list); - INIT_LIST_HEAD(&workers->prio_order_list); - spin_lock_init(&workers->lock); - spin_lock_init(&workers->order_lock); - workers->max_workers = max; - workers->idle_thresh = 32; - workers->name = name; - workers->ordered = 0; - workers->atomic_start_pending = 0; - workers->atomic_worker_start = async_helper; - workers->stopping = 0; + work->func = func; + work->ordered_func = ordered_func; + work->ordered_free = ordered_free; + INIT_WORK(&work->normal_work, normal_work_helper); + INIT_LIST_HEAD(&work->ordered_list); + work->flags = 0; } -/* - * starts new worker threads. This does not enforce the max worker - * count in case you need to temporarily go past it. 
- */ -static int __btrfs_start_workers(struct btrfs_workers *workers) +static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, + struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - int ret = 0; - - worker = kzalloc(sizeof(*worker), GFP_NOFS); - if (!worker) { - ret = -ENOMEM; - goto fail; - } - - INIT_LIST_HEAD(&worker->pending); - INIT_LIST_HEAD(&worker->prio_pending); - INIT_LIST_HEAD(&worker->worker_list); - spin_lock_init(&worker->lock); - - atomic_set(&worker->num_pending, 0); - atomic_set(&worker->refs, 1); - worker->workers = workers; - worker->task = kthread_create(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + 1); - if (IS_ERR(worker->task)) { - ret = PTR_ERR(worker->task); - goto fail; - } + unsigned long flags; - spin_lock_irq(&workers->lock); - if (workers->stopping) { - spin_unlock_irq(&workers->lock); - ret = -EINVAL; - goto fail_kthread; + work->wq = wq; + thresh_queue_hook(wq); + if (work->ordered_func) { + spin_lock_irqsave(&wq->list_lock, flags); + list_add_tail(&work->ordered_list, &wq->ordered_list); + spin_unlock_irqrestore(&wq->list_lock, flags); } - list_add_tail(&worker->worker_list, &workers->idle_list); - worker->idle = 1; - workers->num_workers++; - workers->num_workers_starting--; - WARN_ON(workers->num_workers_starting < 0); - spin_unlock_irq(&workers->lock); - - wake_up_process(worker->task); - return 0; - -fail_kthread: - kthread_stop(worker->task); -fail: - kfree(worker); - spin_lock_irq(&workers->lock); - workers->num_workers_starting--; - spin_unlock_irq(&workers->lock); - return ret; + queue_work(wq->normal_wq, &work->normal_work); + trace_btrfs_work_queued(work); } -int btrfs_start_workers(struct btrfs_workers *workers) +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work) { - spin_lock_irq(&workers->lock); - workers->num_workers_starting++; - spin_unlock_irq(&workers->lock); - return __btrfs_start_workers(workers); -} - -/* - * run through the list and find a worker thread that doesn't have a lot - * to do right now. This can return null if we aren't yet at the thread - * count limit and all of the threads are busy. - */ -static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) -{ - struct btrfs_worker_thread *worker; - struct list_head *next; - int enforce_min; - - enforce_min = (workers->num_workers + workers->num_workers_starting) < - workers->max_workers; - - /* - * if we find an idle thread, don't move it to the end of the - * idle list. This improves the chance that the next submission - * will reuse the same thread, and maybe catch it while it is still - * working - */ - if (!list_empty(&workers->idle_list)) { - next = workers->idle_list.next; - worker = list_entry(next, struct btrfs_worker_thread, - worker_list); - return worker; - } - if (enforce_min || list_empty(&workers->worker_list)) - return NULL; - - /* - * if we pick a busy task, move the task to the end of the list. - * hopefully this will keep things somewhat evenly balanced. - * Do the move in batches based on the sequence number. This groups - * requests submitted at roughly the same time onto the same worker. 
- */ - next = workers->worker_list.next; - worker = list_entry(next, struct btrfs_worker_thread, worker_list); - worker->sequence++; + struct __btrfs_workqueue *dest_wq; - if (worker->sequence % workers->idle_thresh == 0) - list_move_tail(next, &workers->worker_list); - return worker; + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) + dest_wq = wq->high; + else + dest_wq = wq->normal; + __btrfs_queue_work(dest_wq, work); } -/* - * selects a worker thread to take the next job. This will either find - * an idle worker, start a new worker up to the max count, or just return - * one of the existing busy workers. - */ -static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) { - struct btrfs_worker_thread *worker; - unsigned long flags; - struct list_head *fallback; - int ret; - - spin_lock_irqsave(&workers->lock, flags); -again: - worker = next_worker(workers); - - if (!worker) { - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) { - goto fallback; - } else if (workers->atomic_worker_start) { - workers->atomic_start_pending = 1; - goto fallback; - } else { - workers->num_workers_starting++; - spin_unlock_irqrestore(&workers->lock, flags); - /* we're below the limit, start another worker */ - ret = __btrfs_start_workers(workers); - spin_lock_irqsave(&workers->lock, flags); - if (ret) - goto fallback; - goto again; - } - } - goto found; - -fallback: - fallback = NULL; - /* - * we have failed to find any workers, just - * return the first one we can find. - */ - if (!list_empty(&workers->worker_list)) - fallback = workers->worker_list.next; - if (!list_empty(&workers->idle_list)) - fallback = workers->idle_list.next; - BUG_ON(!fallback); - worker = list_entry(fallback, - struct btrfs_worker_thread, worker_list); -found: - /* - * this makes sure the worker doesn't exit before it is placed - * onto a busy/idle list - */ - atomic_inc(&worker->num_pending); - spin_unlock_irqrestore(&workers->lock, flags); - return worker; + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); + kfree(wq); } -/* - * btrfs_requeue_work just puts the work item back on the tail of the list - * it was taken from. It is intended for use with long running work functions - * that make some progress and want to give the cpu up for others. 
- */ -void btrfs_requeue_work(struct btrfs_work *work) +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { - struct btrfs_worker_thread *worker = work->worker; - unsigned long flags; - int wake = 0; - - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + if (!wq) return; - - spin_lock_irqsave(&worker->lock, flags); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - atomic_inc(&worker->num_pending); - - /* by definition we're busy, take ourselves off the idle - * list - */ - if (worker->idle) { - spin_lock(&worker->workers->lock); - worker->idle = 0; - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - spin_unlock(&worker->workers->lock); - } - if (!worker->working) { - wake = 1; - worker->working = 1; - } - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); + if (wq->high) + __btrfs_destroy_workqueue(wq->high); + __btrfs_destroy_workqueue(wq->normal); + kfree(wq); } -void btrfs_set_work_high_prio(struct btrfs_work *work) +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) { - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); + wq->normal->max_active = max; + if (wq->high) + wq->high->max_active = max; } -/* - * places a struct btrfs_work into the pending queue of one of the kthreads - */ -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +void btrfs_set_work_high_priority(struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - unsigned long flags; - int wake = 0; - - /* don't requeue something already on a list */ - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - return; - - worker = find_worker(workers); - if (workers->ordered) { - /* - * you're not allowed to do ordered queues from an - * interrupt handler - */ - spin_lock(&workers->order_lock); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { - list_add_tail(&work->order_list, - &workers->prio_order_list); - } else { - list_add_tail(&work->order_list, &workers->order_list); - } - spin_unlock(&workers->order_lock); - } else { - INIT_LIST_HEAD(&work->order_list); - } - - spin_lock_irqsave(&worker->lock, flags); - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - check_busy_worker(worker); - - /* - * avoid calling into wake_up_process if this thread has already - * been kicked - */ - if (!worker->working) - wake = 1; - worker->working = 1; - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1f26792683e..9c6b66d15fb 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -19,103 +20,35 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ -struct btrfs_worker_thread; +struct btrfs_workqueue; +/* Internal use only */ +struct __btrfs_workqueue; +struct btrfs_work; +typedef void (*btrfs_func_t)(struct btrfs_work *arg); -/* - * This is similar to a workqueue, but it is meant to spread the operations - * across all available cpus instead of just the CPU that was used to - * queue the work. There is also some batching introduced to try and - * cut down on context switches. - * - * By default threads are added on demand up to 2 * the number of cpus. - * Changing struct btrfs_workers->max_workers is one way to prevent - * demand creation of kthreads. - * - * the basic model of these worker threads is to embed a btrfs_work - * structure in your own data struct, and use container_of in a - * work function to get back to your data struct. - */ struct btrfs_work { - /* - * func should be set to the function you want called - * your work struct is passed as the only arg - * - * ordered_func must be set for work sent to an ordered work queue, - * and it is called to complete a given work item in the same - * order they were sent to the queue. - */ - void (*func)(struct btrfs_work *work); - void (*ordered_func)(struct btrfs_work *work); - void (*ordered_free)(struct btrfs_work *work); - - /* - * flags should be set to zero. It is used to make sure the - * struct is only inserted once into the list. - */ + btrfs_func_t func; + btrfs_func_t ordered_func; + btrfs_func_t ordered_free; + + /* Don't touch things below */ + struct work_struct normal_work; + struct list_head ordered_list; + struct __btrfs_workqueue *wq; unsigned long flags; - - /* don't touch these */ - struct btrfs_worker_thread *worker; - struct list_head list; - struct list_head order_list; -}; - -struct btrfs_workers { - /* current number of running workers */ - int num_workers; - - int num_workers_starting; - - /* max number of workers allowed. changed by btrfs_start_workers */ - int max_workers; - - /* once a worker has this many requests or fewer, it is idle */ - int idle_thresh; - - /* force completions in the order they were queued */ - int ordered; - - /* more workers required, but in an interrupt handler */ - int atomic_start_pending; - - /* - * are we allowed to sleep while starting workers or are we required - * to start them at a later time? If we can't sleep, this indicates - * which queue we need to use to schedule thread creation. - */ - struct btrfs_workers *atomic_worker_start; - - /* list with all the work threads. The workers on the idle thread - * may be actively servicing jobs, but they haven't yet hit the - * idle thresh limit above. 
- */ - struct list_head worker_list; - struct list_head idle_list; - - /* - * when operating in ordered mode, this maintains the list - * of work items waiting for completion - */ - struct list_head order_list; - struct list_head prio_order_list; - - /* lock for finding the next worker thread to queue on */ - spinlock_t lock; - - /* lock for the ordered lists */ - spinlock_t order_lock; - - /* extra name for this worker, used for current->name */ - char *name; - - int stopping; }; -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -int btrfs_start_workers(struct btrfs_workers *workers); -void btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_starter); -void btrfs_requeue_work(struct btrfs_work *work); -void btrfs_set_work_high_prio(struct btrfs_work *work); +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + int flags, + int max_active, + int thresh); +void btrfs_init_work(struct btrfs_work *work, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free); +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work); +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); +void btrfs_set_work_high_priority(struct btrfs_work *work); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index aded3ef3d3d..aad7201ad11 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct __prelim_ref *ref, - int level, u64 time_seq, const u64 *extent_item_pos) + int level, u64 time_seq, const u64 *extent_item_pos, + u64 total_refs) { int ret = 0; int slot; @@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - while (!ret && count < ref->count) { + while (!ret && count < total_refs) { eb = path->nodes[0]; slot = path->slots[0]; @@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct __prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos) + const u64 *extent_item_pos, u64 total_refs) { struct btrfs_root *root; struct btrfs_key root_key; @@ -361,7 +362,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos); + extent_item_pos, total_refs); out: path->lowest_level = 0; btrfs_release_path(path); @@ -374,7 +375,7 @@ out: static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct list_head *head, - const u64 *extent_item_pos) + const u64 *extent_item_pos, u64 total_refs) { int err; int ret = 0; @@ -400,7 +401,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, if (ref->count == 0) continue; err = __resolve_indirect_ref(fs_info, path, time_seq, ref, - parents, extent_item_pos); + parents, extent_item_pos, + total_refs); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. 
@@ -557,7 +559,7 @@ static void __merge_refs(struct list_head *head, int mode) * smaller or equal that seq to the list */ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct list_head *prefs) + struct list_head *prefs, u64 *total_refs) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; @@ -593,6 +595,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, default: BUG_ON(1); } + *total_refs += (node->ref_mod * sgn); switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { struct btrfs_delayed_tree_ref *ref; @@ -653,7 +656,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, */ static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - int *info_level, struct list_head *prefs) + int *info_level, struct list_head *prefs, + u64 *total_refs) { int ret = 0; int slot; @@ -677,6 +681,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); + *total_refs += btrfs_extent_refs(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); @@ -859,6 +864,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct list_head prefs; struct __prelim_ref *ref; struct extent_inode_elem *eie = NULL; + u64 total_refs = 0; INIT_LIST_HEAD(&prefs); INIT_LIST_HEAD(&prefs_delayed); @@ -873,8 +879,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!trans) + if (!trans) { path->search_commit_root = 1; + path->skip_locking = 1; + } /* * grab both a lock on the path and a lock on the delayed ref head. 
@@ -915,7 +923,7 @@ again: } spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, - &prefs_delayed); + &prefs_delayed, &total_refs); mutex_unlock(&head->mutex); if (ret) goto out; @@ -936,7 +944,8 @@ again: (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { ret = __add_inline_refs(fs_info, path, bytenr, - &info_level, &prefs); + &info_level, &prefs, + &total_refs); if (ret) goto out; ret = __add_keyed_refs(fs_info, path, bytenr, @@ -956,7 +965,7 @@ again: __merge_refs(&prefs, 1); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, - extent_item_pos); + extent_item_pos, total_refs); if (ret) goto out; @@ -965,7 +974,7 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); WARN_ON(ref->count < 0); - if (ref->count && ref->root_id && ref->parent == 0) { + if (roots && ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) @@ -1061,22 +1070,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, u64 time_seq, struct ulist **leafs, const u64 *extent_item_pos) { - struct ulist *tmp; int ret; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) { - ulist_free(tmp); + if (!*leafs) return -ENOMEM; - } ret = find_parent_nodes(trans, fs_info, bytenr, - time_seq, *leafs, tmp, extent_item_pos); - ulist_free(tmp); - + time_seq, *leafs, NULL, extent_item_pos); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1333,38 +1334,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, if (ret < 0) return ret; - while (1) { - u32 nritems; - if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); - ret = btrfs_prev_leaf(fs_info->extent_root, path); - if (ret != 0) { - if (ret > 0) { - pr_debug("logical %llu is not within " - "any extent\n", logical); - ret = -ENOENT; - } - return ret; - } - } else { - path->slots[0]--; - } - nritems = btrfs_header_nritems(path->nodes[0]); - if (nritems == 0) { - pr_debug("logical %llu is not within any extent\n", - logical); - return -ENOENT; - } - if (path->slots[0] == nritems) - path->slots[0]--; - - btrfs_item_key_to_cpu(path->nodes[0], found_key, - path->slots[0]); - if (found_key->type == BTRFS_EXTENT_ITEM_KEY || - found_key->type == BTRFS_METADATA_ITEM_KEY) - break; + ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ret; } - + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type == BTRFS_METADATA_ITEM_KEY) size = fs_info->extent_root->leafsize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8fed2125689..c9a24444ec9 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -109,14 +109,17 @@ struct btrfs_inode { u64 last_trans; /* - * log transid when this inode was last modified + * transid that last logged this inode */ - u64 last_sub_trans; + u64 logged_trans; /* - * transid that last logged this inode + * log transid when this inode was last modified */ - u64 logged_trans; + int last_sub_trans; + + /* a local copy of root's last_log_commit */ + int last_log_commit; /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file @@ -155,9 +158,6 @@ struct btrfs_inode { /* flags field from the on disk inode */ u32 flags; - /* a local copy of root's 
last_log_commit */ - unsigned long last_log_commit; - /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cbd3a7d6fa6..88d1b1eedc9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5376,6 +5376,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root, int advance_right; u64 left_blockptr; u64 right_blockptr; + u64 left_gen; + u64 right_gen; u64 left_start_ctransid; u64 right_start_ctransid; u64 ctransid; @@ -5640,7 +5642,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_blockptr = btrfs_node_blockptr( right_path->nodes[right_level], right_path->slots[right_level]); - if (left_blockptr == right_blockptr) { + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { /* * As we're on a shared block, don't * allow to go deeper. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c1a42ca519..bc96c03dd25 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_ERROR 0 #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 +#define BTRFS_FS_STATE_DEV_REPLACING 3 /* Super block flags */ /* Errors detected */ @@ -1489,6 +1490,7 @@ struct btrfs_fs_info { */ struct list_head ordered_roots; + struct mutex delalloc_root_mutex; spinlock_t delalloc_root_lock; /* all fs/file tree roots that have delalloc inodes. */ struct list_head delalloc_roots; @@ -1503,28 +1505,27 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ - struct btrfs_workers generic_worker; - struct btrfs_workers workers; - struct btrfs_workers delalloc_workers; - struct btrfs_workers flush_workers; - struct btrfs_workers endio_workers; - struct btrfs_workers endio_meta_workers; - struct btrfs_workers endio_raid56_workers; - struct btrfs_workers rmw_workers; - struct btrfs_workers endio_meta_write_workers; - struct btrfs_workers endio_write_workers; - struct btrfs_workers endio_freespace_worker; - struct btrfs_workers submit_workers; - struct btrfs_workers caching_workers; - struct btrfs_workers readahead_workers; + struct btrfs_workqueue *workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct btrfs_workqueue *endio_workers; + struct btrfs_workqueue *endio_meta_workers; + struct btrfs_workqueue *endio_raid56_workers; + struct btrfs_workqueue *rmw_workers; + struct btrfs_workqueue *endio_meta_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *submit_workers; + struct btrfs_workqueue *caching_workers; + struct btrfs_workqueue *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. 
It happens * for the sys_munmap function call path */ - struct btrfs_workers fixup_workers; - struct btrfs_workers delayed_workers; + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; @@ -1604,9 +1605,9 @@ struct btrfs_fs_info { atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; int scrub_workers_refcnt; - struct btrfs_workers scrub_workers; - struct btrfs_workers scrub_wr_completion_workers; - struct btrfs_workers scrub_nocow_workers; + struct btrfs_workqueue *scrub_workers; + struct btrfs_workqueue *scrub_wr_completion_workers; + struct btrfs_workqueue *scrub_nocow_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1647,7 +1648,7 @@ struct btrfs_fs_info { /* qgroup rescan items */ struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; - struct btrfs_workers qgroup_rescan_workers; + struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; @@ -1674,10 +1675,18 @@ struct btrfs_fs_info { atomic_t mutually_exclusive_operation_running; + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; + struct semaphore uuid_tree_rescan_sem; unsigned int update_uuid_tree_gen:1; }; +struct btrfs_subvolume_writers { + struct percpu_counter counter; + wait_queue_head_t wait; +}; + /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. @@ -1714,11 +1723,15 @@ struct btrfs_root { struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; - unsigned long log_transid; - unsigned long last_log_commit; + int log_transid; + /* No matter the commit succeeds or not*/ + int log_transid_committed; + /* Just be updated when the commit succeeds. */ + int last_log_commit; pid_t log_start_pid; bool log_multiple_pids; @@ -1793,6 +1806,7 @@ struct btrfs_root { spinlock_t root_item_lock; atomic_t refs; + struct mutex delalloc_mutex; spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. 
It is possible for @@ -1802,6 +1816,8 @@ struct btrfs_root { struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; + + struct mutex ordered_extent_mutex; /* * this is used by the balancing code to wait for all the pending * ordered extents @@ -1822,6 +1838,8 @@ struct btrfs_root { * manipulation with the read-only status via SUBVOL_SETFLAGS */ int send_in_progress; + struct btrfs_subvolume_writers *subv_writers; + atomic_t will_be_snapshoted; }; struct btrfs_ioctl_defrag_range_args { @@ -3346,6 +3364,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); + +int btrfs_start_nocow_write(struct btrfs_root *root); +void btrfs_end_nocow_write(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3723,7 +3744,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, @@ -4005,6 +4027,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* dev-replace.c */ +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); + /* reada.c */ struct reada_control { struct btrfs_root *root; /* tree to prefetch */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 451b00c86f6..33e561a8401 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - async_work->work.func = btrfs_async_run_delayed_root; - async_work->work.flags = 0; + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, + NULL, NULL); async_work->nr = nr; - btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); + btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); return 0; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index f3bff89eecf..31299646024 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, */ static struct btrfs_delayed_ref_head * find_ref_head(struct rb_root *root, u64 bytenr, - struct btrfs_delayed_ref_head **last, int return_bigger) + int return_bigger) { struct rb_node *n; struct btrfs_delayed_ref_head *entry; - int cmp = 0; -again: n = root->rb_node; entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (last) - *last = entry; if (bytenr < entry->node.bytenr) - cmp = -1; - else if (bytenr > entry->node.bytenr) - cmp = 1; - else - cmp = 0; - - if (cmp < 0) n = n->rb_left; - else if (cmp > 0) + else if (bytenr > entry->node.bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { - 
if (cmp > 0) { + if (bytenr > entry->node.bytenr) { n = rb_next(&entry->href_node); if (!n) n = rb_first(root); entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - bytenr = entry->node.bytenr; - return_bigger = 0; - goto again; + return entry; } return entry; } @@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans) again: start = delayed_refs->run_delayed_start; - head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + head = find_ref_head(&delayed_refs->href_root, start, 1); if (!head && !loop) { delayed_refs->run_delayed_start = 0; start = 0; loop = true; - head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + head = find_ref_head(&delayed_refs->href_root, start, 1); if (!head) return NULL; } else if (!head && loop) { @@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref = btrfs_delayed_node_to_head(update); BUG_ON(existing_ref->is_data != ref->is_data); + spin_lock(&existing_ref->lock); if (ref->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref @@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ - spin_lock(&existing_ref->lock); existing->ref_mod += update->ref_mod; spin_unlock(&existing_ref->lock); } @@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); + return find_ref_head(&delayed_refs->href_root, bytenr, 0); } void btrfs_delayed_ref_exit(void) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 564c92638b2..9f2290509ac 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -431,6 +431,35 @@ leave_no_lock: return ret; } +/* + * blocked until all flighting bios are finished. + */ +static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) +{ + s64 writers; + DEFINE_WAIT(wait); + + set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + do { + prepare_to_wait(&fs_info->replace_wait, &wait, + TASK_UNINTERRUPTIBLE); + writers = percpu_counter_sum(&fs_info->bio_counter); + if (writers) + schedule(); + finish_wait(&fs_info->replace_wait, &wait); + } while (writers); +} + +/* + * we have removed target device, it is safe to allow new bios request. 
+ */ +static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device = dev_replace->srcdev; btrfs_dev_replace_unlock(dev_replace); - /* replace old device with new one in mapping tree */ - if (!scrub_ret) - btrfs_dev_replace_update_device_in_mapping_tree(fs_info, - src_device, - tgt_device); - /* * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(root->fs_info, 0); + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; @@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->chunk_mutex); mutex_lock(&root->fs_info->fs_devices->device_list_mutex); btrfs_dev_replace_lock(dev_replace); dev_replace->replace_state = @@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - if (scrub_ret) { + /* replace old device with new one in mapping tree */ + if (!scrub_ret) { + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + } else { printk_in_rcu(KERN_ERR "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", src_device->missing ? "<missing disk>" : @@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + btrfs_rm_dev_replace_blocked(fs_info); + btrfs_rm_dev_replace_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_unblocked(fs_info); + /* * this is again a consistent state where no dev_replace procedure * is running, the target device is part of the filesystem, the @@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, */ btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) mutex_unlock(&dev_replace->lock_management_lock); } } + +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) +{ + percpu_counter_inc(&fs_info->bio_counter); +} + +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + percpu_counter_dec(&fs_info->bio_counter); + + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) +{ + DEFINE_WAIT(wait); 
+again: + percpu_counter_inc(&fs_info->bio_counter); + if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { + btrfs_bio_counter_dec(fs_info); + wait_event(fs_info->replace_wait, + !test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state)); + goto again; + } + +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 81ea55314b1..bd0f752b797 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -678,32 +678,31 @@ static void end_workqueue_bio(struct bio *bio, int err) fs_info = end_io_wq->info; end_io_wq->error = err; - end_io_wq->work.func = end_workqueue_fn; - end_io_wq->work.flags = 0; + btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - btrfs_queue_worker(&fs_info->endio_meta_write_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_meta_write_workers, + &end_io_wq->work); else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - btrfs_queue_worker(&fs_info->endio_freespace_worker, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_freespace_worker, + &end_io_wq->work); else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_worker(&fs_info->endio_raid56_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_write_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_write_workers, + &end_io_wq->work); } else { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_worker(&fs_info->endio_raid56_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); else if (end_io_wq->metadata) - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_meta_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_workers, + &end_io_wq->work); } } @@ -738,7 +737,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) { unsigned long limit = min_t(unsigned long, - info->workers.max_workers, + info->thread_pool_size, info->fs_devices->open_devices); return 256 * limit; } @@ -811,11 +810,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_start = submit_bio_start; async->submit_bio_done = submit_bio_done; - async->work.func = run_one_async_start; - async->work.ordered_func = run_one_async_done; - async->work.ordered_free = run_one_async_free; + btrfs_init_work(&async->work, run_one_async_start, + run_one_async_done, run_one_async_free); - async->work.flags = 0; async->bio_flags = bio_flags; async->bio_offset = bio_offset; @@ -824,9 +821,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); if (rw & REQ_SYNC) - btrfs_set_work_high_prio(&async->work); + btrfs_set_work_high_priority(&async->work); - btrfs_queue_worker(&fs_info->workers, &async->work); + btrfs_queue_work(fs_info->workers, &async->work); while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { @@ -1149,6 +1146,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, } } +static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) +{ + struct btrfs_subvolume_writers *writers; + int ret; + + writers = kmalloc(sizeof(*writers), GFP_NOFS); + 
if (!writers) + return ERR_PTR(-ENOMEM); + + ret = percpu_counter_init(&writers->counter, 0); + if (ret < 0) { + kfree(writers); + return ERR_PTR(ret); + } + + init_waitqueue_head(&writers->wait); + return writers; +} + +static void +btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) +{ + percpu_counter_destroy(&writers->counter); + kfree(writers); +} + static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, @@ -1194,16 +1217,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->log_extents_lock[1]); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); + mutex_init(&root->ordered_extent_mutex); + mutex_init(&root->delalloc_mutex); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); + INIT_LIST_HEAD(&root->log_ctxs[0]); + INIT_LIST_HEAD(&root->log_ctxs[1]); atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); + atomic_set(&root->will_be_snapshoted, 0); root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; if (fs_info) extent_io_tree_init(&root->dirty_log_pages, @@ -1417,6 +1446,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; return 0; } @@ -1498,6 +1528,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, int btrfs_init_fs_root(struct btrfs_root *root) { int ret; + struct btrfs_subvolume_writers *writers; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), @@ -1507,6 +1538,13 @@ int btrfs_init_fs_root(struct btrfs_root *root) goto fail; } + writers = btrfs_alloc_subvolume_writers(); + if (IS_ERR(writers)) { + ret = PTR_ERR(writers); + goto fail; + } + root->subv_writers = writers; + btrfs_init_free_ino_ctl(root); mutex_init(&root->fs_commit_mutex); spin_lock_init(&root->cache_lock); @@ -1514,8 +1552,11 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + goto free_writers; return 0; + +free_writers: + btrfs_free_subvolume_writers(root->subv_writers); fail: kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); @@ -1990,23 +2031,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info, /* helper to cleanup workers */ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_raid56_workers); - btrfs_stop_workers(&fs_info->rmw_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); - btrfs_stop_workers(&fs_info->readahead_workers); - 
btrfs_stop_workers(&fs_info->flush_workers); - btrfs_stop_workers(&fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->fixup_workers); + btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->workers); + btrfs_destroy_workqueue(fs_info->endio_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_workers); + btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->rmw_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + btrfs_destroy_workqueue(fs_info->endio_write_workers); + btrfs_destroy_workqueue(fs_info->endio_freespace_worker); + btrfs_destroy_workqueue(fs_info->submit_workers); + btrfs_destroy_workqueue(fs_info->delayed_workers); + btrfs_destroy_workqueue(fs_info->caching_workers); + btrfs_destroy_workqueue(fs_info->readahead_workers); + btrfs_destroy_workqueue(fs_info->flush_workers); + btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2097,6 +2137,8 @@ int open_ctree(struct super_block *sb, int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; + int max_active; + int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; bool create_uuid_tree; bool check_uuid_tree; @@ -2133,10 +2175,16 @@ int open_ctree(struct super_block *sb, goto fail_dirty_metadata_bytes; } + ret = percpu_counter_init(&fs_info->bio_counter, 0); + if (ret) { + err = ret; + goto fail_delalloc_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_delalloc_bytes; + goto fail_bio_counter; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2159,6 +2207,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); + mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); @@ -2211,6 +2260,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->scrub_pause_req, 0); atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->replace_wait); init_waitqueue_head(&fs_info->scrub_pause_wait); fs_info->scrub_workers_refcnt = 0; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY @@ -2458,104 +2508,68 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - btrfs_init_workers(&fs_info->generic_worker, - "genwork", 1, NULL); - - btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size, - &fs_info->generic_worker); + max_active = fs_info->thread_pool_size; - btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size, NULL); + fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); - btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", - fs_info->thread_pool_size, NULL); + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); - btrfs_init_workers(&fs_info->submit_workers, "submit", - min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size), NULL); + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); - btrfs_init_workers(&fs_info->caching_workers, "cache", - fs_info->thread_pool_size, NULL); + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); - /* a higher idle thresh on the submit workers makes it much more + /* + * a higher idle thresh on the submit 
workers makes it much more * likely that bios will be send down in a sane order to the * devices */ - fs_info->submit_workers.idle_thresh = 64; - - fs_info->workers.idle_thresh = 16; - fs_info->workers.ordered = 1; - - fs_info->delalloc_workers.idle_thresh = 2; - fs_info->delalloc_workers.ordered = 1; - - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_raid56_workers, - "endio-raid56", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->rmw_workers, - "rmw", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", - 1, &fs_info->generic_worker); - btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->readahead_workers, "readahead", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, - &fs_info->generic_worker); + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); + + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ - fs_info->endio_workers.idle_thresh = 4; - fs_info->endio_meta_workers.idle_thresh = 4; - fs_info->endio_raid56_workers.idle_thresh = 4; - fs_info->rmw_workers.idle_thresh = 2; - - fs_info->endio_write_workers.idle_thresh = 2; - fs_info->endio_meta_write_workers.idle_thresh = 2; - fs_info->readahead_workers.idle_thresh = 2; - - /* - * btrfs_start_workers can really only fail because of ENOMEM so just - * return -ENOMEM if any of these fail. 
- */ - ret = btrfs_start_workers(&fs_info->workers); - ret |= btrfs_start_workers(&fs_info->generic_worker); - ret |= btrfs_start_workers(&fs_info->submit_workers); - ret |= btrfs_start_workers(&fs_info->delalloc_workers); - ret |= btrfs_start_workers(&fs_info->fixup_workers); - ret |= btrfs_start_workers(&fs_info->endio_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_workers); - ret |= btrfs_start_workers(&fs_info->rmw_workers); - ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); - ret |= btrfs_start_workers(&fs_info->delayed_workers); - ret |= btrfs_start_workers(&fs_info->caching_workers); - ret |= btrfs_start_workers(&fs_info->readahead_workers); - ret |= btrfs_start_workers(&fs_info->flush_workers); - ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); - if (ret) { + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + fs_info->readahead_workers = + btrfs_alloc_workqueue("readahead", flags, max_active, 2); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; } @@ -2963,6 +2977,8 @@ fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); +fail_bio_counter: + percpu_counter_destroy(&fs_info->bio_counter); fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: @@ -3244,6 +3260,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info) /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; if (!dev->bdev) { errors_send++; continue; @@ -3258,6 +3276,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info) /* wait for all the barriers */ list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; if (!dev->bdev) { errors_wait++; continue; @@ -3477,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root) root->orphan_block_rsv = NULL; if (root->anon_dev) free_anon_bdev(root->anon_dev); + if (root->subv_writers) + btrfs_free_subvolume_writers(root->subv_writers); free_extent_buffer(root->node); free_extent_buffer(root->commit_root); 
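
[Editor's sketch, not part of the patch] The open_ctree() hunk above replaces the per-pool btrfs_init_workers()/btrfs_start_workers() sequence with btrfs_alloc_workqueue() calls and one collective NULL check, torn down later via btrfs_destroy_workqueue(). A condensed sketch of that setup/teardown pattern, using only helpers visible in this diff (error handling trimmed; the explicit NULL guards before destroy are an extra safety assumption):

        /* Illustrative only -- assumes in-tree fs/btrfs context. */
        #include <linux/workqueue.h>
        #include "ctree.h"
        #include "async-thread.h"

        static int example_alloc_workqueues(struct btrfs_fs_info *fs_info)
        {
                int max_active = fs_info->thread_pool_size;
                int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;

                /* name, workqueue flags, max_active, ordering threshold */
                fs_info->workers = btrfs_alloc_workqueue("worker",
                                                         flags | WQ_HIGHPRI,
                                                         max_active, 16);
                fs_info->endio_workers = btrfs_alloc_workqueue("endio", flags,
                                                               max_active, 4);

                /* failures are caught with a single collective check */
                if (!fs_info->workers || !fs_info->endio_workers) {
                        if (fs_info->endio_workers)
                                btrfs_destroy_workqueue(fs_info->endio_workers);
                        if (fs_info->workers)
                                btrfs_destroy_workqueue(fs_info->workers);
                        return -ENOMEM;
                }
                return 0;
        }
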
kfree(root->free_ino_ctl); @@ -3610,6 +3632,7 @@ int close_ctree(struct btrfs_root *root) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->bio_counter); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); @@ -3791,9 +3814,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) list_move_tail(&root->ordered_root, &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); btrfs_destroy_ordered_extents(root); - cond_resched_lock(&fs_info->ordered_root_lock); + cond_resched(); + spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 32312e09f0f..c6b6a6e3e73 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; atomic_set(&caching_ctl->count, 1); - caching_ctl->work.func = caching_thread; + btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -640,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, btrfs_get_block_group(cache); - btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); return ret; } @@ -3971,7 +3971,7 @@ static int can_overcommit(struct btrfs_root *root, } static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, - unsigned long nr_pages) + unsigned long nr_pages, int nr_items) { struct super_block *sb = root->fs_info->sb; @@ -3986,9 +3986,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_delalloc_roots(root->fs_info, 0); + btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, nr_items); } } @@ -4045,7 +4045,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - btrfs_writeback_inodes_sb_nr(root, nr_pages); + btrfs_writeback_inodes_sb_nr(root, nr_pages, items); /* * We need to wait for the async pages to actually start before * we do anything. 
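
[Editor's sketch, not part of the patch] The caching_thread and end-io conversions above follow the same two-step recipe used throughout this series: btrfs_init_work() replaces the open-coded work.func/work.flags assignments, and btrfs_queue_work() takes a struct btrfs_workqueue pointer instead of the old pool. A minimal caller-side sketch; the example_job/example_worker names are hypothetical, only btrfs_init_work() and btrfs_queue_work() come from the patch:

        /* Illustrative only -- assumes in-tree fs/btrfs context. */
        #include <linux/slab.h>
        #include "ctree.h"
        #include "async-thread.h"

        struct example_job {
                struct btrfs_fs_info *fs_info;
                struct btrfs_work work;
        };

        static void example_worker(struct btrfs_work *work)
        {
                struct example_job *job =
                        container_of(work, struct example_job, work);

                /* ... do the deferred work for job->fs_info ... */
                kfree(job);
        }

        static int example_queue(struct btrfs_fs_info *fs_info)
        {
                struct example_job *job = kzalloc(sizeof(*job), GFP_NOFS);

                if (!job)
                        return -ENOMEM;
                job->fs_info = fs_info;
                /* ordered_func/ordered_free stay NULL unless ordered completion is needed */
                btrfs_init_work(&job->work, example_worker, NULL, NULL);
                btrfs_queue_work(fs_info->endio_workers, &job->work);
                return 0;
        }
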
@@ -4112,13 +4112,9 @@ static int may_commit_transaction(struct btrfs_root *root, goto commit; /* See if there is enough pinned space to make this reservation */ - spin_lock(&space_info->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes) >= 0) { - spin_unlock(&space_info->lock); + bytes) >= 0) goto commit; - } - spin_unlock(&space_info->lock); /* * See if there is some space in the delayed insertion reservation for @@ -4127,16 +4123,13 @@ static int may_commit_transaction(struct btrfs_root *root, if (space_info != delayed_rsv->space_info) return -ENOSPC; - spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, bytes - delayed_rsv->size) >= 0) { spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); commit: trans = btrfs_join_transaction(root); @@ -4181,7 +4174,7 @@ static int flush_space(struct btrfs_root *root, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes, orig_bytes, + shrink_delalloc(root, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -8938,3 +8931,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) range->len = trimmed; return ret; } + +/* + * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), + * they are used to prevent the some tasks writing data into the page cache + * by nocow before the subvolume is snapshoted, but flush the data into + * the disk after the snapshot creation. + */ +void btrfs_end_nocow_write(struct btrfs_root *root) +{ + percpu_counter_dec(&root->subv_writers->counter); + /* + * Make sure counter is updated before we wake up + * waiters. + */ + smp_mb(); + if (waitqueue_active(&root->subv_writers->wait)) + wake_up(&root->subv_writers->wait); +} + +int btrfs_start_nocow_write(struct btrfs_root *root) +{ + if (unlikely(atomic_read(&root->will_be_snapshoted))) + return 0; + + percpu_counter_inc(&root->subv_writers->counter); + /* + * Make sure counter is updated before we check for snapshot creation. + */ + smp_mb(); + if (unlikely(atomic_read(&root->will_be_snapshoted))) { + btrfs_end_nocow_write(root); + return 0; + } + return 1; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 85bbd01f127..ae69a00387e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state) } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, +static struct rb_node *tree_insert(struct rb_root *root, + struct rb_node *search_start, + u64 offset, struct rb_node *node, struct rb_node ***p_in, struct rb_node **parent_in) { - struct rb_node **p = &root->rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct tree_entry *entry; @@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, goto do_insert; } + p = search_start ? 
&search_start : &root->rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree, set_state_bits(tree, state, bits); - node = tree_insert(&tree->state, end, &state->rb_node, p, parent); + node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); @@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, - NULL, NULL); + node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, + &prealloc->rb_node, NULL, NULL); if (node) { free_extent_state(prealloc); return -EEXIST; @@ -2757,7 +2760,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, if (em_cached && *em_cached) { em = *em_cached; - if (em->in_tree && start >= em->start && + if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { atomic_inc(&em->refs); return em; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 996ad56b57d..1874aee69c8 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void) em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); if (!em) return NULL; - em->in_tree = 0; + RB_CLEAR_NODE(&em->rb_node); em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; em->generation = 0; @@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em) return; WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { - WARN_ON(em->in_tree); + WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } @@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - WARN_ON(!entry->in_tree); - if (em->start < entry->start) p = &(*p)->rb_left; else if (em->start >= extent_map_end(entry)) @@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) if (end > entry->start && em->start < extent_map_end(entry)) return -EEXIST; - em->in_tree = 1; rb_link_node(&em->rb_node, orig_parent, p); rb_insert_color(&em->rb_node, root); return 0; @@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, prev = n; prev_entry = entry; - WARN_ON(!entry->in_tree); - if (offset < entry->start) n = n->rb_left; else if (offset >= extent_map_end(entry)) @@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->len += merge->len; em->block_len += merge->block_len; em->block_start = merge->block_start; - merge->in_tree = 0; em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); rb_erase(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); } } @@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->len += merge->len; em->block_len += merge->block_len; rb_erase(&merge->rb_node, &tree->map); - merge->in_tree = 0; + RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); free_extent_map(merge); @@ -319,7 +314,21 @@ out: void 
clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - if (em->in_tree) + if (extent_map_in_tree(em)) + try_merge_map(tree, em); +} + +static inline void setup_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, + int modified) +{ + atomic_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + + if (modified) + list_move(&em->list, &tree->modified_extents); + else try_merge_map(tree, em); } @@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree, if (ret) goto out; - atomic_inc(&em->refs); - - em->mod_start = em->start; - em->mod_len = em->len; - - if (modified) - list_move(&em->list, &tree->modified_extents); - else - try_merge_map(tree, em); + setup_extent_mapping(tree, em, modified); out: return ret; } @@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) rb_erase(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); - em->in_tree = 0; + RB_CLEAR_NODE(&em->rb_node); return ret; } + +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) +{ + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + ASSERT(extent_map_in_tree(cur)); + if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + list_del_init(&cur->list); + rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map); + RB_CLEAR_NODE(&cur->rb_node); + + setup_extent_mapping(tree, new, modified); +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 93fba716d7f..e7fd8a56a14 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -33,7 +33,6 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - unsigned int in_tree; unsigned int compress_type; struct list_head list; }; @@ -44,6 +43,11 @@ struct extent_map_tree { rwlock_t lock; }; +static inline int extent_map_in_tree(const struct extent_map *em) +{ + return !RB_EMPTY_NODE(&em->rb_node); +} + static inline u64 extent_map_end(struct extent_map *em) { if (em->start + em->len < em->start) @@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified); struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7331a230e30..e1ffb1e2289 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -591,7 +591,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &flags); modified = !list_empty(&em->list); - remove_extent_mapping(em_tree, em); if (no_splits) goto next; @@ -622,8 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; @@ -661,12 +659,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->orig_block_len = 0; } - ret = 
add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + ret = add_extent_mapping(em_tree, split, + modified); + ASSERT(ret == 0); /* Logic error */ + } free_extent_map(split); split = NULL; } next: + if (extent_map_in_tree(em)) + remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); /* once for us */ @@ -720,7 +726,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - if (start >= BTRFS_I(inode)->disk_i_size) + if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; while (1) { @@ -798,7 +804,10 @@ next_slot: */ if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EINVAL; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = start; @@ -841,7 +850,10 @@ next_slot: * | -------- extent -------- | */ if (start <= key.offset && end < extent_end) { - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EINVAL; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; @@ -864,7 +876,10 @@ next_slot: */ if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EINVAL; + break; + } btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); @@ -938,34 +953,42 @@ next_slot: * Set path->slots[0] to first slot, so that after the delete * if items are move off from our leaf to its immediate left or * right neighbor leafs, we end up with a correct and adjusted - * path->slots[0] for our insertion. + * path->slots[0] for our insertion (if replace_extent != 0). */ path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, root, ret); + } - leaf = path->nodes[0]; - /* - * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that - * is, its contents got pushed to its neighbors), in which case - * it means path->locks[0] == 0 - */ - if (!ret && replace_extent && leafs_visited == 1 && - path->locks[0] && - btrfs_leaf_free_space(root, leaf) >= - sizeof(struct btrfs_item) + extent_item_size) { - - key.objectid = ino; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - setup_items_for_insert(root, path, &key, - &extent_item_size, - extent_item_size, - sizeof(struct btrfs_item) + - extent_item_size, 1); - *key_inserted = 1; + leaf = path->nodes[0]; + /* + * If btrfs_del_items() was called, it might have deleted a leaf, in + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. 
+ */ + if (!ret && replace_extent && leafs_visited == 1 && + (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || + path->locks[0] == BTRFS_WRITE_LOCK) && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { + struct btrfs_key slot_key; + + btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); + if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) + path->slots[0]++; } + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; } if (!replace_extent || !(*key_inserted)) @@ -1346,11 +1369,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, last_pos, 0, cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); + ordered = btrfs_lookup_ordered_range(inode, start_pos, + last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->len > start_pos && ordered->file_offset <= last_pos) { - btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start_pos, last_pos, cached_state, GFP_NOFS); @@ -1358,12 +1381,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, unlock_page(pages[i]); page_cache_release(pages[i]); } - ret = btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos + 1); - if (ret) - return ret; - else - return -EAGAIN; + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); @@ -1396,8 +1416,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; + ret = btrfs_start_nocow_write(root); + if (!ret) + return -ENOSPC; + lockstart = round_down(pos, root->sectorsize); - lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; + lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; while (1) { lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1415,12 +1439,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; + btrfs_end_nocow_write(root); } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - NULL, GFP_NOFS); - *write_bytes = min_t(size_t, *write_bytes, num_bytes); + *write_bytes = min_t(size_t, *write_bytes , + num_bytes - pos + lockstart); } unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1510,6 +1532,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!only_release_metadata) btrfs_free_reserved_data_space(inode, reserve_bytes); + else + btrfs_end_nocow_write(root); break; } @@ -1598,6 +1622,9 @@ again: } release_bytes = 0; + if (only_release_metadata) + btrfs_end_nocow_write(root); + if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); u64 lockend = lockstart + @@ -1624,10 +1651,12 @@ again: kfree(pages); if (release_bytes) { - if (only_release_metadata) + if (only_release_metadata) { + btrfs_end_nocow_write(root); btrfs_delalloc_release_metadata(inode, release_bytes); - else + } else { btrfs_delalloc_release_space(inode, release_bytes); + } } 
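
[Editor's sketch, not part of the patch] check_can_nocow() and the buffered-write cleanup above bracket the nocow path with the new btrfs_start_nocow_write()/btrfs_end_nocow_write() gate, pairing the per-subvolume percpu counter against root->will_be_snapshoted, which snapshot creation sets before draining writers. A minimal sketch of the expected writer-side usage; the helpers are the ones added in extent-tree.c, and their declaration in ctree.h is assumed:

        /* Illustrative only -- assumes in-tree fs/btrfs context. */
        #include "ctree.h"

        static int example_nocow_write(struct btrfs_root *root)
        {
                /* returns 0 when a snapshot is pending: fall back to COW */
                if (!btrfs_start_nocow_write(root))
                        return -ENOSPC;

                /* ... can_nocow_extent()-style checks and the nocow write ... */

                /* drop the gate so a pending snapshot can proceed */
                btrfs_end_nocow_write(root);
                return 0;
        }
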
return num_written ? num_written : ret; @@ -1856,8 +1885,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; bool full_sync = 0; trace_btrfs_sync_file(file, datasync); @@ -1951,7 +1981,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - ret = btrfs_log_dentry_safe(trans, root, dentry); + btrfs_init_log_ctx(&ctx); + + ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -1971,7 +2003,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { - ret = btrfs_sync_log(trans, root); + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { ret = btrfs_end_transaction(trans, root); goto out; @@ -2157,6 +2189,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool same_page = ((offset >> PAGE_CACHE_SHIFT) == ((offset + len - 1) >> PAGE_CACHE_SHIFT)); bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); + u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2172,14 +2205,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + if (offset < ino_size) ret = btrfs_truncate_page(inode, offset, len, 0); mutex_unlock(&inode->i_mutex); return ret; } /* zero back part of the first page */ - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + if (offset < ino_size) { ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2188,7 +2221,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } /* zero the front end of the last page */ - if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + if (offset + len < ino_size) { ret = btrfs_truncate_page(inode, offset + len, 0, 1); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2277,10 +2310,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - break; + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_end); + if (ret) { + err = ret; + break; + } } cur_offset = drop_end; @@ -2313,10 +2349,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - goto out_trans; + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } } out_trans: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 49ec1398879..06e9a4152b1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -864,7 +864,8 @@ static noinline int cow_file_range(struct inode *inode, if (btrfs_is_free_space_inode(inode)) { WARN_ON_ONCE(1); - return -EINVAL; + ret = -EINVAL; + goto out_unlock; } num_bytes = ALIGN(end - start + 1, blocksize); @@ -1075,17 +1076,15 @@ static int 
cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); - async_cow->work.func = async_cow_start; - async_cow->work.ordered_func = async_cow_submit; - async_cow->work.ordered_free = async_cow_free; - async_cow->work.flags = 0; + btrfs_init_work(&async_cow->work, async_cow_start, + async_cow_submit, async_cow_free); nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); - btrfs_queue_worker(&root->fs_info->delalloc_workers, - &async_cow->work); + btrfs_queue_work(root->fs_info->delalloc_workers, + &async_cow->work); if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { wait_event(root->fs_info->async_submit_wait, @@ -1843,9 +1842,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) SetPageChecked(page); page_cache_get(page); - fixup->work.func = btrfs_writepage_fixup_worker; + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); return -EBUSY; } @@ -2239,6 +2238,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path, return PTR_ERR(root); } + if (btrfs_root_readonly(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + /* step 2: get inode */ key.objectid = backref->inum; key.type = BTRFS_INODE_ITEM_KEY; @@ -2759,7 +2763,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workers *workers; + struct btrfs_workqueue *workers; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -2768,14 +2772,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, end - start + 1, uptodate)) return 0; - ordered_extent->work.func = finish_ordered_fn; - ordered_extent->work.flags = 0; + btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); if (btrfs_is_free_space_inode(inode)) - workers = &root->fs_info->endio_freespace_worker; + workers = root->fs_info->endio_freespace_worker; else - workers = &root->fs_info->endio_write_workers; - btrfs_queue_worker(workers, &ordered_extent->work); + workers = root->fs_info->endio_write_workers; + btrfs_queue_work(workers, &ordered_extent->work); return 0; } @@ -4924,7 +4927,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root) struct inode *inode; u64 objectid = 0; - WARN_ON(btrfs_root_refs(&root->root_item) != 0); + if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); again: @@ -5799,6 +5803,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } out_unlock: btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); if (drop_inode) { inode_dec_link_count(inode); @@ -5872,6 +5877,7 @@ out_unlock: inode_dec_link_count(inode); iput(inode); } + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; } @@ -5930,6 +5936,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); fail: if (drop_inode) { inode_dec_link_count(inode); @@ -5996,6 +6003,7 @@ out_fail: 
btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; } @@ -6550,6 +6558,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, int ret; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 disk_bytenr; @@ -6626,6 +6635,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (btrfs_extent_readonly(root, disk_bytenr)) goto out; + + num_bytes = min(offset + *len, extent_end) - offset; + if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 range_end; + + range_end = round_up(offset + num_bytes, root->sectorsize) - 1; + ret = test_range_bit(io_tree, offset, range_end, + EXTENT_DELALLOC, 0, NULL); + if (ret) { + ret = -EAGAIN; + goto out; + } + } + btrfs_release_path(path); /* @@ -6654,7 +6677,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, */ disk_bytenr += backref_offset; disk_bytenr += offset - key.offset; - num_bytes = min(offset + *len, extent_end) - offset; if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out; /* @@ -7024,10 +7046,9 @@ again: if (!ret) goto out_test; - ordered->work.func = finish_ordered_fn; - ordered->work.flags = 0; - btrfs_queue_worker(&root->fs_info->endio_write_workers, - &ordered->work); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(root->fs_info->endio_write_workers, + &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -7404,15 +7425,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, smp_mb__after_atomic_inc(); /* - * The generic stuff only does filemap_write_and_wait_range, which isn't - * enough if we've written compressed pages to this area, so we need to - * call btrfs_wait_ordered_range to make absolutely sure that any - * outstanding dirty pages are on disk. + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so + * we need to flush the dirty pages again to make absolutely sure + * that any outstanding dirty pages are on disk. */ count = iov_length(iov, nr_segs); - ret = btrfs_wait_ordered_range(inode, offset, count); - if (ret) - return ret; + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, offset, count); if (rw & WRITE) { /* @@ -8404,7 +8425,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, work->inode = inode; work->wait = wait; work->delay_iput = delay_iput; - work->work.func = btrfs_run_delalloc_work; + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); return work; } @@ -8419,7 +8440,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
*/ -static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, + int nr) { struct btrfs_inode *binode; struct inode *inode; @@ -8431,6 +8453,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); + mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { @@ -8453,12 +8476,14 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) else iput(inode); ret = -ENOMEM; - goto out; + break; } list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); - + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); + ret++; + if (nr != -1 && ret >= nr) + break; cond_resched(); spin_lock(&root->delalloc_lock); } @@ -8468,18 +8493,13 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } - return 0; -out: - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); - } if (!list_empty_careful(&splice)) { spin_lock(&root->delalloc_lock); list_splice_tail(&splice, &root->delalloc_inodes); spin_unlock(&root->delalloc_lock); } + mutex_unlock(&root->delalloc_mutex); return ret; } @@ -8490,7 +8510,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EROFS; - ret = __start_delalloc_inodes(root, delay_iput); + ret = __start_delalloc_inodes(root, delay_iput, -1); + if (ret > 0) + ret = 0; /* * the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that @@ -8507,7 +8529,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return ret; } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr) { struct btrfs_root *root; struct list_head splice; @@ -8518,9 +8541,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) INIT_LIST_HEAD(&splice); + mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); - while (!list_empty(&splice)) { + while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_fs_root(root); @@ -8529,15 +8553,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = __start_delalloc_inodes(root, delay_iput); + ret = __start_delalloc_inodes(root, delay_iput, nr); btrfs_put_fs_root(root); - if (ret) + if (ret < 0) goto out; + if (nr != -1) { + nr -= ret; + WARN_ON(nr < 0); + } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); + ret = 0; atomic_inc(&fs_info->async_submit_draining); while (atomic_read(&fs_info->nr_async_submits) || atomic_read(&fs_info->async_delalloc_pages)) { @@ -8546,13 +8575,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) atomic_read(&fs_info->async_delalloc_pages) == 0)); } atomic_dec(&fs_info->async_submit_draining); - return 0; out: if (!list_empty_careful(&splice)) { 
spin_lock(&fs_info->delalloc_root_lock); list_splice_tail(&splice, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } + mutex_unlock(&fs_info->delalloc_root_mutex); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a6d8efa46bf..0401397b5c9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -59,6 +59,32 @@ #include "props.h" #include "sysfs.h" +#ifdef CONFIG_64BIT +/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI + * structures are incorrect, as the timespec structure from userspace + * is 4 bytes too small. We define these alternatives here to teach + * the kernel about the 32-bit struct packing. + */ +struct btrfs_ioctl_timespec_32 { + __u64 sec; + __u32 nsec; +} __attribute__ ((__packed__)); + +struct btrfs_ioctl_received_subvol_args_32 { + char uuid[BTRFS_UUID_SIZE]; /* in */ + __u64 stransid; /* in */ + __u64 rtransid; /* out */ + struct btrfs_ioctl_timespec_32 stime; /* in */ + struct btrfs_ioctl_timespec_32 rtime; /* out */ + __u64 flags; /* in */ + __u64 reserved[16]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ + struct btrfs_ioctl_received_subvol_args_32) +#endif + + static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff); @@ -585,6 +611,23 @@ fail: return ret; } +static void btrfs_wait_nocow_write(struct btrfs_root *root) +{ + s64 writers; + DEFINE_WAIT(wait); + + do { + prepare_to_wait(&root->subv_writers->wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&root->subv_writers->counter); + if (writers) + schedule(); + + finish_wait(&root->subv_writers->wait, &wait); + } while (writers); +} + static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, char *name, int namelen, u64 *async_transid, bool readonly, @@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root->ref_cows) return -EINVAL; + atomic_inc(&root->will_be_snapshoted); + smp_mb__after_atomic_inc(); + btrfs_wait_nocow_write(root); + ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - return ret; + goto out; btrfs_wait_ordered_extents(root, -1); pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) - return -ENOMEM; + if (!pending_snapshot) { + ret = -ENOMEM; + goto out; + } btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->qgroup_reserved, false); if (ret) - goto out; + goto free; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -674,8 +723,10 @@ fail: btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); -out: +free: kfree(pending_snapshot); +out: + atomic_dec(&root->will_be_snapshoted); return ret; } @@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root, min_key.type = BTRFS_EXTENT_DATA_KEY; min_key.offset = *off; - path->keep_locks = 1; - while (1) { + path->keep_locks = 1; ret = btrfs_search_forward(root, &min_key, path, newer_than); if (ret != 0) goto none; + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); +process_slot: if (min_key.objectid != ino) goto none; if (min_key.type != BTRFS_EXTENT_DATA_KEY) @@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root, return 0; } + path->slots[0]++; + if (path->slots[0] < 
btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); + goto process_slot; + } + if (min_key.offset == (u64)-1) goto none; @@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) read_unlock(&em_tree->lock); if (!em) { + struct extent_state *cached = NULL; + u64 end = start + len - 1; + /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1); + lock_extent_bits(io_tree, start, end, 0, &cached); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1); + unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); if (IS_ERR(em)) return NULL; @@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return false; next = defrag_lookup_extent(inode, em->start + em->len); - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || + (em->block_start + em->block_len == next->block_start)) ret = false; free_extent_map(next); @@ -1076,10 +1139,12 @@ again: page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { - lock_extent(tree, page_start, page_end); + lock_extent_bits(tree, page_start, page_end, + 0, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); - unlock_extent(tree, page_start, page_end); + unlock_extent_cached(tree, page_start, page_end, + &cached_state, GFP_NOFS); if (!ordered) break; @@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } } - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) + if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { /* the filemap_flush will queue IO into the worker threads, but @@ -1573,7 +1642,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (src_inode->i_sb != file_inode(file)->i_sb) { btrfs_info(BTRFS_I(src_inode)->root->fs_info, "Snapshot src from another FS"); - ret = -EINVAL; + ret = -EXDEV; } else if (!inode_owner_or_capable(src_inode)) { /* * Subvolume creation is not restricted, but snapshots @@ -1797,7 +1866,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { - ret = -ENOTEMPTY; + ret = -EPERM; + btrfs_err(root->fs_info, "deleting default subvolume " + "%llu is not allowed", key.objectid); goto out; } btrfs_release_path(path); @@ -2994,8 +3065,9 @@ process_slot: new_key.offset + datal, 1); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + if (ret != -EINVAL) + btrfs_abort_transaction(trans, + root, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3153,8 +3225,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * decompress into destination's address_space (the file offset * may change, so source mapping won't do), then recompress (or * otherwise reinsert) a subrange. - * - allow ranges within the same file to be cloned (provided - * they don't overlap)? + * + * - split destination inode's inline extents. The inline extents can + * be either compressed or non-compressed. 
*/ /* the destination must be opened for writing */ @@ -4353,10 +4426,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) return btrfs_qgroup_wait_for_completion(root->fs_info); } -static long btrfs_ioctl_set_received_subvol(struct file *file, - void __user *arg) +static long _btrfs_ioctl_set_received_subvol(struct file *file, + struct btrfs_ioctl_received_subvol_args *sa) { - struct btrfs_ioctl_received_subvol_args *sa = NULL; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; @@ -4384,13 +4456,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } - sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) { - ret = PTR_ERR(sa); - sa = NULL; - goto out; - } - /* * 1 - root item * 2 - uuid items (received uuid + subvol uuid) @@ -4444,14 +4509,91 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } +out: + up_write(&root->fs_info->subvol_sem); + mnt_drop_write_file(file); + return ret; +} + +#ifdef CONFIG_64BIT +static long btrfs_ioctl_set_received_subvol_32(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; + struct btrfs_ioctl_received_subvol_args *args64 = NULL; + int ret = 0; + + args32 = memdup_user(arg, sizeof(*args32)); + if (IS_ERR(args32)) { + ret = PTR_ERR(args32); + args32 = NULL; + goto out; + } + + args64 = kmalloc(sizeof(*args64), GFP_NOFS); + if (IS_ERR(args64)) { + ret = PTR_ERR(args64); + args64 = NULL; + goto out; + } + + memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); + args64->stransid = args32->stransid; + args64->rtransid = args32->rtransid; + args64->stime.sec = args32->stime.sec; + args64->stime.nsec = args32->stime.nsec; + args64->rtime.sec = args32->rtime.sec; + args64->rtime.nsec = args32->rtime.nsec; + args64->flags = args32->flags; + + ret = _btrfs_ioctl_set_received_subvol(file, args64); + if (ret) + goto out; + + memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); + args32->stransid = args64->stransid; + args32->rtransid = args64->rtransid; + args32->stime.sec = args64->stime.sec; + args32->stime.nsec = args64->stime.nsec; + args32->rtime.sec = args64->rtime.sec; + args32->rtime.nsec = args64->rtime.nsec; + args32->flags = args64->flags; + + ret = copy_to_user(arg, args32, sizeof(*args32)); + if (ret) + ret = -EFAULT; + +out: + kfree(args32); + kfree(args64); + return ret; +} +#endif + +static long btrfs_ioctl_set_received_subvol(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args *sa = NULL; + int ret = 0; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + sa = NULL; + goto out; + } + + ret = _btrfs_ioctl_set_received_subvol(file, sa); + + if (ret) + goto out; + ret = copy_to_user(arg, sa, sizeof(*sa)); if (ret) ret = -EFAULT; out: kfree(sa); - up_write(&root->fs_info->subvol_sem); - mnt_drop_write_file(file); return ret; } @@ -4746,7 +4888,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(root->fs_info, 0); + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); if (ret) return ret; ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); @@ -4770,6 +4912,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance_progress(root, argp); case BTRFS_IOC_SET_RECEIVED_SUBVOL: return btrfs_ioctl_set_received_subvol(file, argp); +#ifdef CONFIG_64BIT + case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: + 
return btrfs_ioctl_set_received_subvol_32(file, argp); +#endif case BTRFS_IOC_SEND: return btrfs_ioctl_send(file, argp); case BTRFS_IOC_GET_DEV_STATS: diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b16450b840e..a94b05f7286 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - if (entry->bytes_left == 0) + if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { ret = 1; + } out: if (!ret && cached && entry) { *cached = entry; @@ -410,10 +413,13 @@ have_entry: if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - if (entry->bytes_left == 0) + if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { ret = 1; + } out: if (!ret && cached && entry) { *cached = entry; @@ -424,27 +430,48 @@ out: } /* Needs to either be called under a log transaction or the log_mutex */ -void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list) { struct btrfs_ordered_inode_tree *tree; struct btrfs_ordered_extent *ordered; struct rb_node *n; - int index = log->log_transid % 2; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); for (n = rb_first(&tree->tree); n; n = rb_next(n)) { ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); - spin_lock(&log->log_extents_lock[index]); - if (list_empty(&ordered->log_list)) { - list_add_tail(&ordered->log_list, &log->logged_list[index]); - atomic_inc(&ordered->refs); - } - spin_unlock(&log->log_extents_lock[index]); + if (!list_empty(&ordered->log_list)) + continue; + list_add_tail(&ordered->log_list, logged_list); + atomic_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); } +void btrfs_put_logged_extents(struct list_head *logged_list) +{ + struct btrfs_ordered_extent *ordered; + + while (!list_empty(logged_list)) { + ordered = list_first_entry(logged_list, + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log) +{ + int index = log->log_transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + list_splice_tail(logged_list, &log->logged_list[index]); + spin_unlock_irq(&log->log_extents_lock[index]); +} + void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; @@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); - mutex_lock(&root->fs_info->ordered_operations_mutex); + mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); list_splice_init(&root->ordered_extents, &splice); while (!list_empty(&splice) && nr) { @@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) atomic_inc(&ordered->refs); spin_unlock(&root->ordered_extent_lock); - ordered->flush_work.func = btrfs_run_ordered_extent_work; + btrfs_init_work(&ordered->flush_work, + btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - 
&ordered->flush_work); + btrfs_queue_work(root->fs_info->flush_workers, + &ordered->flush_work); cond_resched(); spin_lock(&root->ordered_extent_lock); @@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) btrfs_put_ordered_extent(ordered); cond_resched(); } - mutex_unlock(&root->fs_info->ordered_operations_mutex); + mutex_unlock(&root->ordered_extent_mutex); return count; } @@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) INIT_LIST_HEAD(&splice); + mutex_lock(&fs_info->ordered_operations_mutex); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); while (!list_empty(&splice) && nr) { @@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) } list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); + mutex_unlock(&fs_info->ordered_operations_mutex); } /* @@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, goto out; } list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); cond_resched(); spin_lock(&root->fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 9b0450f7ac2..246897058ef 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); -void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list); +void btrfs_put_logged_extents(struct list_head *logged_list); +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log); void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 472302a2d74..2cf905877aa 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); } ret = 0; } @@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_work, 0, sizeof(fs_info->qgroup_rescan_work)); - fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; + btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_worker, NULL, NULL); if (ret) { err: @@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); return 0; } @@ -2190,6 +2191,6 @@ void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + 
btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 9af0b25d991..4055291a523 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1416,20 +1416,18 @@ cleanup: static void async_rmw_stripe(struct btrfs_raid_bio *rbio) { - rbio->work.flags = 0; - rbio->work.func = rmw_work; + btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); - btrfs_queue_worker(&rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); } static void async_read_rebuild(struct btrfs_raid_bio *rbio) { - rbio->work.flags = 0; - rbio->work.func = read_rebuild_work; + btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); - btrfs_queue_worker(&rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); } /* @@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - plug->work.flags = 0; - plug->work.func = unplug_work; - btrfs_queue_worker(&plug->info->rmw_workers, - &plug->work); + btrfs_init_work(&plug->work, unplug_work, NULL, NULL); + btrfs_queue_work(plug->info->rmw_workers, + &plug->work); return; } run_plug(plug); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 31c797c48c3..30947f92362 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - rmw->work.func = reada_start_machine_worker; + btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; - btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); + btrfs_queue_work(fs_info->readahead_workers, &rmw->work); } #ifdef DEBUG diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 07b3b36f40e..def428a25b2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4248,7 +4248,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_roots(fs_info, 0); + ret = btrfs_start_delalloc_roots(fs_info, 0, -1); if (ret < 0) { err = ret; goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 1389b69059d..38bb47e7d6b 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/err.h> #include <linux/uuid.h> #include "ctree.h" #include "transaction.h" @@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.offset++; root = btrfs_read_fs_root(tree_root, &root_key); - err = PTR_RET(root); + err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { break; } else if (err == -ENOENT) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index efba5d1282e..93e6d717284 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) atomic_inc(&fs_info->scrubs_running); atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); + + /* + * check if @scrubs_running=@scrubs_paused condition + * inside wait_event() is not an atomic operation. + * which means we may inc/dec @scrub_running/paused + * at any time. Let's wake up @scrub_pause_wait as + * much as we can to let commit transaction blocked less. 
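+ * A spurious wake up is harmless here: wait_event() always re-checks + * its condition, so waking more often only shortens how long a + * transaction commit stays blocked.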
+ */ + wake_up(&fs_info->scrub_pause_wait); + atomic_inc(&sctx->workers_pending); } @@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - sbio->work.func = scrub_bio_end_io_worker; + btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, + NULL, NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -987,9 +998,10 @@ nodatasum_case: fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; scrub_pending_trans_workers_inc(sctx); - fixup_nodatasum->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, - &fixup_nodatasum->work); + btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, + NULL, NULL); + btrfs_queue_work(fs_info->scrub_workers, + &fixup_nodatasum->work); goto out; } @@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - sbio->work.func = scrub_wr_bio_end_io_worker; - btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); + btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) @@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); + btrfs_queue_work(fs_info->scrub_workers, &sbio->work); } static void scrub_bio_end_io_worker(struct btrfs_work *work) @@ -2686,10 +2698,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + /* + * must be called before we decrease @scrub_paused. + * make sure we don't block transaction commit while + * we are waiting pending workers finished. 
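+ * scrubs_paused was increased above, so a transaction commit waiting + * for scrubs_running == scrubs_paused can make progress while we + * wait for the pending workers here.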
+ */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - scrub_blocked_if_needed(fs_info); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); btrfs_put_block_group(cache); if (ret) @@ -2757,33 +2782,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { int ret = 0; + int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) - btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, - &fs_info->generic_worker); + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + 1, 4); else - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_workers.idle_thresh = 4; - ret = btrfs_start_workers(&fs_info->scrub_workers); - if (ret) + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + max_active, 4); + if (!fs_info->scrub_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_wr_completion_workers, - "scrubwrc", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_wr_completion_workers.idle_thresh = 2; - ret = btrfs_start_workers( - &fs_info->scrub_wr_completion_workers); - if (ret) + } + fs_info->scrub_wr_completion_workers = + btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + max_active, 2); + if (!fs_info->scrub_wr_completion_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, - &fs_info->generic_worker); - ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); - if (ret) + } + fs_info->scrub_nocow_workers = + btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + if (!fs_info->scrub_nocow_workers) { + ret = -ENOMEM; goto out; + } } ++fs_info->scrub_workers_refcnt; out: @@ -2793,9 +2820,9 @@ out: static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { if (--fs_info->scrub_workers_refcnt == 0) { - btrfs_stop_workers(&fs_info->scrub_workers); - btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); - btrfs_stop_workers(&fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } @@ -3106,10 +3133,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->len = len; nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - nocow_ctx->work.func = copy_nocow_pages_worker; + btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); INIT_LIST_HEAD(&nocow_ctx->inodes); - btrfs_queue_worker(&fs_info->scrub_nocow_workers, - &nocow_ctx->work); + btrfs_queue_work(fs_info->scrub_nocow_workers, + &nocow_ctx->work); return 0; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9dde9717c1b..9b6da9d55f9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -51,15 +51,18 @@ struct fs_path { struct { char *start; char *end; - char *prepared; char *buf; - int buf_len; - unsigned int reversed:1; - unsigned int virtual_mem:1; + unsigned short buf_len:15; + unsigned short reversed:1; char inline_buf[]; }; - char pad[PAGE_SIZE]; + /* + * Average path length does 
not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * a allocation later during send. + */ + char pad[256]; }; }; #define FS_PATH_INLINE_SIZE \ @@ -109,6 +112,7 @@ struct send_ctx { int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; + u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 send_progress; @@ -120,6 +124,8 @@ struct send_ctx { struct list_head name_cache_list; int name_cache_size; + struct file_ra_state ra; + char *read_buf; /* @@ -175,6 +181,47 @@ struct send_ctx { * own move/rename can be performed. */ struct rb_root waiting_dir_moves; + + /* + * A directory that is going to be rm'ed might have a child directory + * which is in the pending directory moves index above. In this case, + * the directory can only be removed after the move/rename of its child + * is performed. Example: + * + * Parent snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- c/ (ino 259) + * | |-- x/ (ino 260) + * | + * |-- y/ (ino 261) + * + * Send snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- YY/ (ino 261) + * |-- x/ (ino 260) + * + * Sequence of steps that lead to the send snapshot: + * rm -f /a/b/c/foo.txt + * mv /a/b/y /a/b/YY + * mv /a/b/c/x /a/b/YY + * rmdir /a/b/c + * + * When the child is processed, its move/rename is delayed until its + * parent is processed (as explained above), but all other operations + * like update utimes, chown, chgrp, etc, are performed and the paths + * that it uses for those operations must use the orphanized name of + * its parent (the directory we're going to rm later), so we need to + * memorize that name. + * + * Indexed by the inode number of the directory to be deleted. + */ + struct rb_root orphan_dirs; }; struct pending_dir_move { @@ -189,6 +236,18 @@ struct pending_dir_move { struct waiting_dir_move { struct rb_node node; u64 ino; + /* + * There might be some directory that could not be removed because it + * was waiting for this directory inode to be moved first. Therefore + * after this directory is moved, we can try to rmdir the ino rmdir_ino. 
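+ * A rmdir_ino of 0 means no directory is waiting on this move; see + * apply_dir_move(), which retries the rmdir once the move is applied.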
+ */ + u64 rmdir_ino; +}; + +struct orphan_dir_info { + struct rb_node node; + u64 ino; + u64 gen; }; struct name_cache_entry { @@ -214,6 +273,11 @@ struct name_cache_entry { static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino); + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino); + static int need_send_hole(struct send_ctx *sctx) { return (sctx->parent_root && !sctx->cur_inode_new && @@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void) if (!p) return NULL; p->reversed = 0; - p->virtual_mem = 0; p->buf = p->inline_buf; p->buf_len = FS_PATH_INLINE_SIZE; fs_path_reset(p); @@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p) { if (!p) return; - if (p->buf != p->inline_buf) { - if (p->virtual_mem) - vfree(p->buf); - else - kfree(p->buf); - } + if (p->buf != p->inline_buf) + kfree(p->buf); kfree(p); } @@ -292,40 +351,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) path_len = p->end - p->start; old_buf_len = p->buf_len; - len = PAGE_ALIGN(len); - - if (p->buf == p->inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - p->virtual_mem = 1; - } - memcpy(tmp_buf, p->buf, p->buf_len); - p->buf = tmp_buf; - p->buf_len = len; - } else { - if (p->virtual_mem) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - vfree(p->buf); - } else { - tmp_buf = krealloc(p->buf, len, GFP_NOFS); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - kfree(p->buf); - p->virtual_mem = 1; - } - } - p->buf = tmp_buf; - p->buf_len = len; - } + + /* + * First time the inline_buf does not suffice + */ + if (p->buf == p->inline_buf) + tmp_buf = kmalloc(len, GFP_NOFS); + else + tmp_buf = krealloc(p->buf, len, GFP_NOFS); + if (!tmp_buf) + return -ENOMEM; + p->buf = tmp_buf; + /* + * The real size of the buffer is bigger, this will let the fast path + * happen most of the time + */ + p->buf_len = ksize(p->buf); + if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; p->end = p->buf + p->buf_len - 1; @@ -338,7 +380,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) return 0; } -static int fs_path_prepare_for_add(struct fs_path *p, int name_len) +static int fs_path_prepare_for_add(struct fs_path *p, int name_len, + char **prepared) { int ret; int new_len; @@ -354,11 +397,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len) if (p->start != p->end) *--p->start = '/'; p->start -= name_len; - p->prepared = p->start; + *prepared = p->start; } else { if (p->start != p->end) *p->end++ = '/'; - p->prepared = p->end; + *prepared = p->end; p->end += name_len; *p->end = 0; } @@ -370,12 +413,12 @@ out: static int fs_path_add(struct fs_path *p, const char *name, int name_len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, name_len); + ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, name, name_len); - p->prepared = NULL; + memcpy(prepared, name, name_len); out: return ret; @@ -384,12 +427,12 @@ out: static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, p2->end - p2->start); + ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, p2->start, p2->end - 
p2->start); - p->prepared = NULL; + memcpy(prepared, p2->start, p2->end - p2->start); out: return ret; @@ -400,13 +443,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, unsigned long off, int len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, len); + ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) goto out; - read_extent_buffer(eb, p->prepared, off, len); - p->prepared = NULL; + read_extent_buffer(eb, prepared, off, len); out: return ret; @@ -915,9 +958,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - char *buf2 = NULL; - int buf_len; - int buf_virtual = 0; + const int buf_len = PATH_MAX; u32 name_len; u32 data_len; u32 cur; @@ -927,7 +968,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - buf_len = PAGE_SIZE; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -949,30 +989,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); + /* + * Path too long + */ if (name_len + data_len > buf_len) { - buf_len = PAGE_ALIGN(name_len + data_len); - if (buf_virtual) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } - vfree(buf); - } else { - buf2 = krealloc(buf, buf_len, GFP_NOFS); - if (!buf2) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } - kfree(buf); - buf_virtual = 1; - } - } - - buf = buf2; - buf2 = NULL; + ret = -ENAMETOOLONG; + goto out; } read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -995,10 +1017,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - if (buf_virtual) - vfree(buf); - else - kfree(buf); + kfree(buf); return ret; } @@ -1292,8 +1311,6 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = logical - found_key.objectid; else extent_item_pos = 0; - - extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(sctx->send_root->fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, backref_ctx); @@ -1418,11 +1435,7 @@ static int gen_unique_name(struct send_ctx *sctx, while (1) { len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); - if (len >= sizeof(tmp)) { - /* should really not happen */ - ret = -EOVERFLOW; - goto out; - } + ASSERT(len < sizeof(tmp)); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, @@ -1898,13 +1911,20 @@ static void name_cache_delete(struct send_ctx *sctx, nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)nce->ino); - BUG_ON(!nce_head); + if (!nce_head) { + btrfs_err(sctx->send_root->fs_info, + "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", + nce->ino, sctx->name_cache_size); + } list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; - if (list_empty(nce_head)) { + /* + * We may not get to the final release of nce_head if the lookup fails + */ + if (nce_head && list_empty(nce_head)) { radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } @@ -1977,7 +1997,6 @@ static void name_cache_free(struct send_ctx *sctx) */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, - int skip_name_cache, u64 *parent_ino, u64 *parent_gen, struct fs_path *dest) @@ -1987,8 +2006,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, struct 
btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; - if (skip_name_cache) - goto get_ref; /* * First check if we already did a call to this function with the same * ino/gen. If yes, check if the cache entry is still up-to-date. If yes @@ -2033,12 +2050,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out_cache; } -get_ref: /* * Depending on whether the inode was already processed or not, use * send_root or parent_root for ref lookup. */ - if (ino < sctx->send_progress && !skip_name_cache) + if (ino < sctx->send_progress) ret = get_first_ref(sctx->send_root, ino, parent_ino, parent_gen, dest); else @@ -2062,8 +2078,6 @@ get_ref: goto out; ret = 1; } - if (skip_name_cache) - goto out; out_cache: /* @@ -2131,9 +2145,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; - u64 start_ino = ino; - u64 start_gen = gen; - int skip_name_cache = 0; name = fs_path_alloc(); if (!name) { @@ -2141,31 +2152,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, goto out; } - if (is_waiting_for_move(sctx, ino)) - skip_name_cache = 1; - -again: dest->reversed = 1; fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { fs_path_reset(name); - ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, - &parent_inode, &parent_gen, name); + if (is_waiting_for_rm(sctx, ino)) { + ret = gen_unique_name(sctx, ino, gen, name); + if (ret < 0) + goto out; + ret = fs_path_add_path(dest, name); + break; + } + + if (is_waiting_for_move(sctx, ino)) { + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret) + stop = 1; + } + if (ret < 0) goto out; - if (ret) - stop = 1; - - if (!skip_name_cache && - is_waiting_for_move(sctx, parent_inode)) { - ino = start_ino; - gen = start_gen; - stop = 0; - skip_name_cache = 1; - goto again; - } ret = fs_path_add_path(dest, name); if (ret < 0) @@ -2429,10 +2442,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, - NULL, &rdev); - if (ret < 0) - goto out; + if (ino != sctx->cur_ino) { + ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, + NULL, NULL, &rdev); + if (ret < 0) + goto out; + } else { + gen = sctx->cur_inode_gen; + mode = sctx->cur_inode_mode; + rdev = sctx->cur_inode_rdev; + } if (S_ISREG(mode)) { cmd = BTRFS_SEND_C_MKFILE; @@ -2512,17 +2531,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { - ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, - 1, 0); - if (ret < 0) - goto out; - if (!ret) { - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(sctx->send_root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; } - if (ret || found_key.objectid != key.objectid || + + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; goto out; @@ -2537,8 +2565,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) goto out; } 
- key.offset = found_key.offset + 1; - btrfs_release_path(path); + path->slots[0]++; } out: @@ -2590,7 +2617,7 @@ struct recorded_ref { * everything mixed. So we first record all refs and later process them. * This function is a helper to record one ref. */ -static int record_ref(struct list_head *head, u64 dir, +static int __record_ref(struct list_head *head, u64 dir, u64 dir_gen, struct fs_path *path) { struct recorded_ref *ref; @@ -2676,12 +2703,78 @@ out: return ret; } +static struct orphan_dir_info * +add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node **p = &sctx->orphan_dirs.rb_node; + struct rb_node *parent = NULL; + struct orphan_dir_info *entry, *odi; + + odi = kmalloc(sizeof(*odi), GFP_NOFS); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct orphan_dir_info, node); + if (dir_ino < entry->ino) { + p = &(*p)->rb_left; + } else if (dir_ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(odi); + return entry; + } + } + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); + return odi; +} + +static struct orphan_dir_info * +get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node *n = sctx->orphan_dirs.rb_node; + struct orphan_dir_info *entry; + + while (n) { + entry = rb_entry(n, struct orphan_dir_info, node); + if (dir_ino < entry->ino) + n = n->rb_left; + else if (dir_ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino) +{ + struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino); + + return odi != NULL; +} + +static void free_orphan_dir_info(struct send_ctx *sctx, + struct orphan_dir_info *odi) +{ + if (!odi) + return; + rb_erase(&odi->node, &sctx->orphan_dirs); + kfree(odi); +} + /* * Returns 1 if a directory can be removed at this point in time. * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. 
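+ * If one of the directory's children is still waiting for a move or + * rename, the directory is recorded in sctx->orphan_dirs and its + * removal is retried after that move is applied.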
*/ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + u64 send_progress) { int ret = 0; struct btrfs_root *root = sctx->parent_root; @@ -2704,31 +2797,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (!ret) { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + struct waiting_dir_move *dm; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; } - if (ret || found_key.objectid != key.objectid || - found_key.type != key.type) { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid || + found_key.type != key.type) break; - } di = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { + struct orphan_dir_info *odi; + + odi = add_orphan_dir_info(sctx, dir); + if (IS_ERR(odi)) { + ret = PTR_ERR(odi); + goto out; + } + odi->gen = dir_gen; + dm->rmdir_ino = dir; + ret = 0; + goto out; + } + if (loc.objectid > send_progress) { ret = 0; goto out; } - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } ret = 1; @@ -2740,19 +2854,9 @@ out: static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) { - struct rb_node *n = sctx->waiting_dir_moves.rb_node; - struct waiting_dir_move *entry; + struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); - while (n) { - entry = rb_entry(n, struct waiting_dir_move, node); - if (ino < entry->ino) - n = n->rb_left; - else if (ino > entry->ino) - n = n->rb_right; - else - return 1; - } - return 0; + return entry != NULL; } static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) @@ -2765,6 +2869,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) if (!dm) return -ENOMEM; dm->ino = ino; + dm->rmdir_ino = 0; while (*p) { parent = *p; @@ -2784,31 +2889,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return 0; } -static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino) { struct rb_node *n = sctx->waiting_dir_moves.rb_node; struct waiting_dir_move *entry; while (n) { entry = rb_entry(n, struct waiting_dir_move, node); - if (ino < entry->ino) { + if (ino < entry->ino) n = n->rb_left; - } else if (ino > entry->ino) { + else if (ino > entry->ino) n = n->rb_right; - } else { - rb_erase(&entry->node, &sctx->waiting_dir_moves); - kfree(entry); - return 0; - } + else + return entry; } - return -ENOENT; + return NULL; +} + +static void free_waiting_dir_move(struct send_ctx *sctx, + struct waiting_dir_move *dm) +{ + if (!dm) + return; + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); } -static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) +static int add_pending_dir_move(struct send_ctx *sctx, + u64 ino, + u64 ino_gen, + u64 parent_ino) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; - struct pending_dir_move *entry, *pm; + struct 
pending_dir_move *entry = NULL, *pm; struct recorded_ref *cur; int exists = 0; int ret; @@ -2817,8 +2932,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) if (!pm) return -ENOMEM; pm->parent_ino = parent_ino; - pm->ino = sctx->cur_ino; - pm->gen = sctx->cur_inode_gen; + pm->ino = ino; + pm->gen = ino_gen; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -2888,19 +3003,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; struct fs_path *to_path = NULL; + struct fs_path *name = NULL; u64 orig_progress = sctx->send_progress; struct recorded_ref *cur; + u64 parent_ino, parent_gen; + struct waiting_dir_move *dm = NULL; + u64 rmdir_ino = 0; int ret; + name = fs_path_alloc(); from_path = fs_path_alloc(); - if (!from_path) - return -ENOMEM; + if (!name || !from_path) { + ret = -ENOMEM; + goto out; + } - sctx->send_progress = pm->ino; - ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + rmdir_ino = dm->rmdir_ino; + free_waiting_dir_move(sctx, dm); + + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); if (ret < 0) goto out; + if (parent_ino == sctx->cur_ino) { + /* child only renamed, not moved */ + ASSERT(parent_gen == sctx->cur_inode_gen); + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + if (ret < 0) + goto out; + } else { + /* child moved and maybe renamed too */ + sctx->send_progress = pm->ino; + ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + if (ret < 0) + goto out; + } + + fs_path_free(name); + name = NULL; + to_path = fs_path_alloc(); if (!to_path) { ret = -ENOMEM; @@ -2908,9 +3056,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } sctx->send_progress = sctx->cur_ino + 1; - ret = del_waiting_dir_move(sctx, pm->ino); - ASSERT(ret == 0); - ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); if (ret < 0) goto out; @@ -2919,6 +3064,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) if (ret < 0) goto out; + if (rmdir_ino) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, rmdir_ino); + if (!odi) { + /* already deleted */ + goto finish; + } + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + if (ret < 0) + goto out; + if (!ret) + goto finish; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, name); + if (ret < 0) + goto out; + free_orphan_dir_info(sctx, odi); + } + +finish: ret = send_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -2928,12 +3102,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) * and old parent(s). 
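+ * The directory removed via rmdir_ino above is skipped, as it no + * longer exists on the receiving side.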
*/ list_for_each_entry(cur, &pm->update_refs, list) { + if (cur->dir == rmdir_ino) + continue; ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } out: + fs_path_free(name); fs_path_free(from_path); fs_path_free(to_path); sctx->send_progress = orig_progress; @@ -3005,17 +3182,19 @@ static int wait_for_parent_move(struct send_ctx *sctx, int ret; u64 ino = parent_ref->dir; u64 parent_ino_before, parent_ino_after; - u64 new_gen, old_gen; + u64 old_gen; struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; int len1, len2; - - if (parent_ref->dir <= sctx->cur_ino) - return 0; + int register_upper_dirs; + u64 gen; if (is_waiting_for_move(sctx, ino)) return 1; + if (parent_ref->dir <= sctx->cur_ino) + return 0; + ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, NULL, NULL, NULL, NULL); if (ret == -ENOENT) @@ -3023,12 +3202,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, else if (ret < 0) return ret; - ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, - NULL, NULL, NULL, NULL); - if (ret < 0) - return ret; - - if (new_gen != old_gen) + if (parent_ref->dir_gen != old_gen) return 0; path_before = fs_path_alloc(); @@ -3051,7 +3225,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, } ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - NULL, path_after); + &gen, path_after); if (ret == -ENOENT) { ret = 0; goto out; @@ -3061,13 +3235,67 @@ static int wait_for_parent_move(struct send_ctx *sctx, len1 = fs_path_len(path_before); len2 = fs_path_len(path_after); - if ((parent_ino_before != parent_ino_after) && (len1 != len2 || - memcmp(path_before->start, path_after->start, len1))) { + if (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1)) { ret = 1; goto out; } ret = 0; + /* + * Ok, our new most direct ancestor has a higher inode number but + * wasn't moved/renamed. So maybe some of the new ancestors higher in + * the hierarchy have an higher inode number too *and* were renamed + * or moved - in this case we need to wait for the ancestor's rename + * or move operation before we can do the move/rename for the current + * inode. 
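+ * The loop below walks up the new ancestor chain; once such an + * ancestor is found, a second pass (register_upper_dirs) queues a + * pending directory move for the unchanged ancestors below it.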
+ */ + register_upper_dirs = 0; + ino = parent_ino_after; +again: + while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) { + u64 parent_gen; + + fs_path_reset(path_before); + fs_path_reset(path_after); + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + &parent_gen, path_after); + if (ret < 0) + goto out; + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret == -ENOENT) { + ret = 0; + break; + } else if (ret < 0) { + goto out; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1)) { + ret = 1; + if (register_upper_dirs) { + break; + } else { + register_upper_dirs = 1; + ino = parent_ref->dir; + gen = parent_ref->dir_gen; + goto again; + } + } else if (register_upper_dirs) { + ret = add_pending_dir_move(sctx, ino, gen, + parent_ino_after); + if (ret < 0 && ret != -EEXIST) + goto out; + } + + ino = parent_ino_after; + gen = parent_gen; + } + out: fs_path_free(path_before); fs_path_free(path_after); @@ -3089,6 +3317,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) u64 ow_gen; int did_overwrite = 0; int is_orphan = 0; + u64 last_dir_ino_rm = 0; verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -3227,9 +3456,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - if (wait_for_parent_move(sctx, cur)) { + ret = wait_for_parent_move(sctx, cur); + if (ret < 0) + goto out; + if (ret) { ret = add_pending_dir_move(sctx, - cur->dir); + sctx->cur_ino, + sctx->cur_inode_gen, + cur->dir); *pending_move = 1; } else { ret = send_rename(sctx, valid_path, @@ -3259,7 +3493,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. 
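+ * The directory's generation is passed along so that, if a child is + * still waiting for a move, can_rmdir() can record this directory in + * orphan_dirs and the rmdir is retried once that move is applied.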
*/ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { @@ -3350,8 +3585,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - } else if (ret == inode_state_did_delete) { - ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { + ret = can_rmdir(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { @@ -3362,6 +3599,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = send_rmdir(sctx, valid_path); if (ret < 0) goto out; + last_dir_ino_rm = cur->dir; } } } @@ -3375,9 +3613,8 @@ out: return ret; } -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, + struct fs_path *name, void *ctx, struct list_head *refs) { int ret = 0; struct send_ctx *sctx = ctx; @@ -3388,7 +3625,7 @@ static int __record_new_ref(int num, u64 dir, int index, if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, + ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -3400,7 +3637,7 @@ static int __record_new_ref(int num, u64 dir, int index, if (ret < 0) goto out; - ret = record_ref(&sctx->new_refs, dir, gen, p); + ret = __record_ref(refs, dir, gen, p); out: if (ret) @@ -3408,37 +3645,23 @@ out: return ret; } +static int __record_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + struct send_ctx *sctx = ctx; + return record_ref(sctx->send_root, num, dir, index, name, + ctx, &sctx->new_refs); +} + + static int __record_deleted_ref(int num, u64 dir, int index, struct fs_path *name, void *ctx) { - int ret = 0; struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; - - ret = get_cur_path(sctx, dir, gen, p); - if (ret < 0) - goto out; - ret = fs_path_add_path(p, name); - if (ret < 0) - goto out; - - ret = record_ref(&sctx->deleted_refs, dir, gen, p); - -out: - if (ret) - fs_path_free(p); - return ret; + return record_ref(sctx->parent_root, num, dir, index, name, + ctx, &sctx->deleted_refs); } static int record_new_ref(struct send_ctx *sctx) @@ -3619,21 +3842,31 @@ static int process_all_refs(struct send_ctx *sctx, root = sctx->parent_root; cb = __record_deleted_ref; } else { - BUG(); + btrfs_err(sctx->send_root->fs_info, + "Wrong command %d in process_all_refs", cmd); + ret = -EINVAL; + goto out; } key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) - break; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -3642,11 +3875,10 @@ static int process_all_refs(struct send_ctx *sctx, break; ret = iterate_inode_ref(root, path, 
&found_key, 0, cb, sctx); - btrfs_release_path(path); if (ret < 0) goto out; - key.offset = found_key.offset + 1; + path->slots[0]++; } btrfs_release_path(path); @@ -3927,19 +4159,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; @@ -3951,8 +4189,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: @@ -3991,6 +4228,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) goto out; last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + + /* initial readahead */ + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, inode->i_mapping); + btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index, + last_index - index + 1); + while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_CACHE_SIZE - pg_offset); @@ -4763,18 +5007,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) ret = apply_children_dir_moves(sctx); if (ret) goto out; + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. If our inode is a directory and it's + * waiting to be moved/renamed, we will send its utimes when + * it's moved/renamed, therefore we don't need to do it here. + */ + sctx->send_progress = sctx->cur_ino + 1; + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; } - /* - * Need to send that every time, no matter if it actually - * changed between the two trees as we have done changes to - * the inode before. 
- */ - sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); - if (ret < 0) - goto out; - out: return ret; } @@ -4840,6 +5085,8 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { @@ -4884,6 +5131,8 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); ret = send_create_inode_if_needed(sctx); if (ret < 0) goto out; @@ -5118,6 +5367,7 @@ out: static int full_send_tree(struct send_ctx *sctx) { int ret; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_key found_key; @@ -5139,6 +5389,19 @@ static int full_send_tree(struct send_ctx *sctx) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; +join_trans: + /* + * We need to make sure the transaction does not get committed + * while we do anything on commit roots. Join a transaction to prevent + * this. + */ + trans = btrfs_join_transaction(send_root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + /* * Make sure the tree has not changed after re-joining. We detect this * by comparing start_ctransid and ctransid. They should always match. @@ -5162,6 +5425,19 @@ static int full_send_tree(struct send_ctx *sctx) goto out_finish; while (1) { + /* + * When someone want to commit while we iterate, end the + * joined transaction and rejoin. 
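+ * The path is released before rejoining, since the commit roots may + * change once the current transaction is allowed to commit.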
+ */ + if (btrfs_should_end_transaction(trans, send_root)) { + ret = btrfs_end_transaction(trans, send_root); + trans = NULL; + if (ret < 0) + goto out; + btrfs_release_path(path); + goto join_trans; + } + eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -5189,6 +5465,12 @@ out_finish: out: btrfs_free_path(path); + if (trans) { + if (!ret) + ret = btrfs_end_transaction(trans, send_root); + else + btrfs_end_transaction(trans, send_root); + } return ret; } @@ -5340,6 +5622,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; sctx->clone_roots = vzalloc(sizeof(struct clone_root) * (arg->clone_sources_count + 1)); @@ -5477,6 +5760,16 @@ out: kfree(dm); } + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); + while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { + struct rb_node *n; + struct orphan_dir_info *odi; + + n = rb_first(&sctx->orphan_dirs); + odi = rb_entry(n, struct orphan_dir_info, node); + free_orphan_dir_info(sctx, odi); + } + if (sort_clone_roots) { for (i = 0; i < sctx->clone_roots_cnt; i++) btrfs_root_dec_send_in_progress( diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d04db817be5..d4878ddba87 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1305,13 +1305,6 @@ error_fs_info: return ERR_PTR(error); } -static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit) -{ - spin_lock_irq(&workers->lock); - workers->max_workers = new_limit; - spin_unlock_irq(&workers->lock); -} - static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, int new_pool_size, int old_pool_size) { @@ -1323,21 +1316,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_info(fs_info, "resize thread pool %d -> %d", old_pool_size, new_pool_size); - btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->workers, new_pool_size); - btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, - new_pool_size); + btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, + new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); + btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); + 
btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, + new_pool_size); } static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) @@ -1479,6 +1471,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } out: + wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); return 0; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 865f4cf9a76..c5eb2143dc6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -24,6 +24,7 @@ #include <linux/kobject.h> #include <linux/bug.h> #include <linux/genhd.h> +#include <linux/debugfs.h> #include "ctree.h" #include "disk-io.h" @@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info) /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; +/* /sys/kernel/debug/btrfs */ +static struct dentry *btrfs_debugfs_root_dentry; + +/* Debugging tunables and exported data */ +u64 btrfs_debugfs_test; + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; @@ -642,27 +649,41 @@ failure: return error; } +static int btrfs_init_debugfs(void) +{ +#ifdef CONFIG_DEBUG_FS + btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL); + if (!btrfs_debugfs_root_dentry) + return -ENOMEM; + + debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry, + &btrfs_debugfs_test); +#endif + return 0; +} + int btrfs_init_sysfs(void) { int ret; + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; - init_feature_attrs(); + ret = btrfs_init_debugfs(); + if (ret) + return ret; + init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); - if (ret) { - kset_unregister(btrfs_kset); - return ret; - } - return 0; + return ret; } void btrfs_exit_sysfs(void) { sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); + debugfs_remove_recursive(btrfs_debugfs_root_dentry); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index f3cea3710d4..9ab576318a8 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -1,6 +1,11 @@ #ifndef _BTRFS_SYSFS_H_ #define _BTRFS_SYSFS_H_ +/* + * Data exported through sysfs + */ +extern u64 btrfs_debugfs_test; + enum btrfs_feature_set { FEAT_COMPAT, FEAT_COMPAT_RO, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 34cd83184c4..a04707f740d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -683,7 +683,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; - if (--trans->use_count) { + if (trans->use_count > 1) { + trans->use_count--; trans->block_rsv = trans->orig_rsv; return 0; } @@ -731,17 +732,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { - if (throttle) { - /* - * We may race with somebody else here so end up having - * to call end_transaction on ourselves again, so inc - * our use_count. 
- */ - trans->use_count++; + if (throttle) return btrfs_commit_transaction(trans, root); - } else { + else wake_up_process(info->transaction_kthread); - } } if (trans->type & __TRANS_FREEZABLE) @@ -1578,10 +1572,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, trace_btrfs_transaction_commit(root); - btrfs_scrub_continue(root); - if (current->journal_info == trans) current->journal_info = NULL; + btrfs_scrub_cancel(root->fs_info); kmem_cache_free(btrfs_trans_handle_cachep, trans); } @@ -1621,7 +1614,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - return btrfs_start_delalloc_roots(fs_info, 1); + return btrfs_start_delalloc_roots(fs_info, 1, -1); return 0; } @@ -1754,7 +1747,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, /* ->aborted might be set after the previous check, so check it */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - goto cleanup_transaction; + goto scrub_continue; } /* * the reloc mutex makes sure that we stop @@ -1771,7 +1764,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = create_pending_snapshots(trans, root->fs_info); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1787,13 +1780,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_items(trans, root); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1823,7 +1816,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1844,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1855,7 +1848,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = cur_trans->aborted; mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } btrfs_prepare_extent_commit(trans, root); @@ -1891,13 +1884,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; + goto scrub_continue; } ret = write_ctree_super(trans, root, 0); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1940,6 +1933,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; +scrub_continue: + btrfs_scrub_continue(root); cleanup_transaction: btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 39d83da03e0..e2f45fc0261 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, * syncing the tree wait for us to finish */ static int 
start_log_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) { + int index; int ret; - int err = 0; mutex_lock(&root->log_mutex); if (root->log_root) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { + ret = -EAGAIN; + goto out; + } + if (!root->log_start_pid) { root->log_start_pid = current->pid; root->log_multiple_pids = false; @@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } mutex_unlock(&root->log_mutex); return 0; } - root->log_multiple_pids = false; - root->log_start_pid = current->pid; + + ret = 0; mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) { + if (!root->fs_info->log_root_tree) ret = btrfs_init_log_root_tree(trans, root->fs_info); - if (ret) - err = ret; - } - if (err == 0 && !root->log_root) { + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; + + if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) - err = ret; + goto out; } - mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_multiple_pids = false; + root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } +out: mutex_unlock(&root->log_mutex); - return err; + return ret; } /* @@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) +static void wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && + if (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && + } while (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])); - return 0; } static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) { + + while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) + if (atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); } } +static inline void btrfs_remove_log_ctx(struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + if (!ctx) + return; + + mutex_lock(&root->log_mutex); + list_del_init(&ctx->list); + mutex_unlock(&root->log_mutex); +} + +/* + * Invoked in log mutex context, or be sure there is no 
other task which + * can access the list. + */ +static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, + int index, int error) +{ + struct btrfs_log_ctx *ctx; + + if (!error) { + INIT_LIST_HEAD(&root->log_ctxs[index]); + return; + } + + list_for_each_entry(ctx, &root->log_ctxs[index], list) + ctx->log_ret = error; + + INIT_LIST_HEAD(&root->log_ctxs[index]); +} + /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, @@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, struct btrfs_log_ctx *ctx) { int index1; int index2; @@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - unsigned long log_transid = 0; + int log_transid = 0; + struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; mutex_lock(&root->log_mutex); - log_transid = root->log_transid; - index1 = root->log_transid % 2; + log_transid = ctx->log_transid; + if (root->log_transid_committed >= log_transid) { + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + + index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + wait_log_commit(trans, root, log_transid); mutex_unlock(&root->log_mutex); - return 0; + return ctx->log_ret; } + ASSERT(log_transid == root->log_transid); atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root->log_transid - 1); + wait_log_commit(trans, root, log_transid - 1); + while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ @@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; mutex_unlock(&root->log_mutex); goto out; } @@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; - smp_mb(); /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. 
new modifications of the log will be written to @@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); + btrfs_init_log_ctx(&root_log_ctx); + mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); @@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { + if (!list_empty(&root_log_ctx.list)) + list_del_init(&root_log_ctx.list); + blk_finish_plug(&plug); + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } - root->fs_info->last_trans_log_full_commit = trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - index2 = log_root_tree->log_transid % 2; + if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; + goto out; + } + + index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + root_log_ctx.log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); - ret = 0; + ret = root_log_ctx.log_ret; goto out; } + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid - 1); + root_log_ctx.log_transid - 1); } wait_for_writer(trans, log_root_tree); @@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); @@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_header_level(log_root_tree->node)); log_root_tree->log_transid++; - smp_mb(); - mutex_unlock(&log_root_tree->log_mutex); /* @@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } @@ -2601,13 +2679,28 @@ int 
btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); out_wake_log_root: + /* + * We needn't get log_mutex here because we are sure all + * the other tasks are blocked. + */ + btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_transid_committed++; atomic_set(&log_root_tree->log_commit[index2], 0); - smp_mb(); + mutex_unlock(&log_root_tree->log_mutex); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: + /* See above. */ + btrfs_remove_all_log_ctxs(root, index1, ret); + + mutex_lock(&root->log_mutex); + root->log_transid_committed++; atomic_set(&root->log_commit[index1], 0); - smp_mb(); + mutex_unlock(&root->log_mutex); + if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; @@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path) + struct extent_map *em, struct btrfs_path *path, + struct list_head *logged_list) { struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; @@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; - int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; int extent_inserted = 0; @@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans, * First check and see if our csums are on our outstanding ordered * extents. */ -again: - spin_lock_irq(&log->log_extents_lock[index]); - list_for_each_entry(ordered, &log->logged_list[index], log_list) { + list_for_each_entry(ordered, logged_list, log_list) { struct btrfs_ordered_sum *sum; if (!mod_len) break; - if (ordered->inode != inode) - continue; - if (ordered->file_offset + ordered->len <= mod_start || mod_start + mod_len <= ordered->file_offset) continue; @@ -3632,12 +3720,6 @@ again: if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) continue; - atomic_inc(&ordered->refs); - spin_unlock_irq(&log->log_extents_lock[index]); - /* - * we've dropped the lock, we must either break or - * start over after this. 
- */ if (ordered->csum_bytes_left) { btrfs_start_ordered_extent(inode, ordered, 0); @@ -3647,16 +3729,11 @@ again: list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); - if (ret) { - btrfs_put_ordered_extent(ordered); + if (ret) goto unlocked; - } } - btrfs_put_ordered_extent(ordered); - goto again; } - spin_unlock_irq(&log->log_extents_lock[index]); unlocked: if (!mod_len || ret) @@ -3694,7 +3771,8 @@ unlocked: static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct list_head *logged_list) { struct extent_map *em, *n; struct list_head extents; @@ -3752,7 +3830,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path); + ret = log_one_extent(trans, inode, root, em, path, logged_list); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; + LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; int ret; @@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(log, inode); + btrfs_get_logged_extents(inode, &logged_list); /* * a brute force approach to making sure we get the most uptodate @@ -3962,7 +4041,8 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, dst_path, + &logged_list); if (ret) { err = ret; goto out_unlock; @@ -3987,8 +4067,10 @@ log_extents: BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: - if (err) - btrfs_free_logged_extents(log, log->log_transid); + if (unlikely(err)) + btrfs_put_logged_extents(&logged_list); + else + btrfs_submit_logged_extents(&logged_list, log); mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); @@ -4079,7 +4161,8 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) + struct dentry *parent, int exists_only, + struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; @@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = start_log_trans(trans, root); + ret = start_log_trans(trans, root, ctx); if (ret) - goto end_trans; + goto end_no_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); if (ret) @@ -4166,6 +4249,9 @@ end_trans: root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } + + if (ret) + btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); end_no_trans: return ret; @@ -4178,12 +4264,14 @@ end_no_trans: * data on disk. 
*/ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + 0, ctx); dput(parent); return ret; @@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1); + return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 1d4ae0d15a7..91b145fce33 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -22,14 +22,28 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +struct btrfs_log_ctx { + int log_ret; + int log_transid; + struct list_head list; +}; + +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + INIT_LIST_HEAD(&ctx->list); +} + int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bab0b84d8f8..d241130a32f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -415,7 +415,8 @@ loop_lock: device->running_pending = 1; spin_unlock(&device->io_lock); - btrfs_requeue_work(&device->work); + btrfs_queue_work(fs_info->submit_workers, + &device->work); goto done; } /* unplug every 64 requests just for good measure */ @@ -5263,6 +5264,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; if (err) { @@ -5270,7 +5272,6 @@ static void btrfs_end_bio(struct bio *bio, int err) if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; - struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; @@ -5292,6 +5293,8 @@ static void btrfs_end_bio(struct bio *bio, int err) if (bio == bbio->orig_bio) is_orig_bio = 1; + btrfs_bio_counter_dec(bbio->fs_info); + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); @@ -5328,13 +5331,6 @@ static void btrfs_end_bio(struct bio *bio, int err) } } -struct async_sched { - struct bio *bio; - int rw; - struct btrfs_fs_info *info; - struct btrfs_work work; -}; - /* * see run_scheduled_bios for a description of why bios are collected for * async submit. 
@@ -5391,8 +5387,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, spin_unlock(&device->io_lock); if (should_queue) - btrfs_queue_worker(&root->fs_info->submit_workers, - &device->work); + btrfs_queue_work(root->fs_info->submit_workers, + &device->work); } static int bio_size_ok(struct block_device *bdev, struct bio *bio, @@ -5447,6 +5443,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, } #endif bio->bi_bdev = dev->bdev; + + btrfs_bio_counter_inc_noblocked(root->fs_info); + if (async) btrfs_schedule_bio(root, dev, rw, bio); else @@ -5515,28 +5514,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, length = bio->bi_iter.bi_size; map_length = length; + btrfs_bio_counter_inc_blocked(root->fs_info); ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num, &raid_map); - if (ret) /* -ENOMEM */ + if (ret) { + btrfs_bio_counter_dec(root->fs_info); return ret; + } total_devs = bbio->num_stripes; bbio->orig_bio = first_bio; bbio->private = first_bio->bi_private; bbio->end_io = first_bio->bi_end_io; + bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); if (raid_map) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { - return raid56_parity_write(root, bio, bbio, - raid_map, map_length); + ret = raid56_parity_write(root, bio, bbio, + raid_map, map_length); } else { - return raid56_parity_recover(root, bio, bbio, - raid_map, map_length, - mirror_num); + ret = raid56_parity_recover(root, bio, bbio, + raid_map, map_length, + mirror_num); } + /* + * FIXME, replace dosen't support raid56 yet, please fix + * it in the future. + */ + btrfs_bio_counter_dec(root->fs_info); + return ret; } if (map_length < length) { @@ -5578,6 +5587,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, async_submit); dev_nr++; } + btrfs_bio_counter_dec(root->fs_info); return 0; } @@ -5666,7 +5676,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - dev->work.func = pending_bios_fn; + btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); return dev; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 8b3cd142b37..80754f9dd3d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); struct btrfs_bio { atomic_t stripes_pending; + struct btrfs_fs_info *fs_info; bio_end_io_t *end_io; struct bio *orig_bio; void *private; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 3176cdc3293..4ee4e30d26d 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -21,6 +21,8 @@ struct btrfs_block_group_cache; struct btrfs_free_cluster; struct map_lookup; struct extent_buffer; +struct btrfs_work; +struct __btrfs_workqueue; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -982,6 +984,141 @@ TRACE_EVENT(free_extent_state, (void *)__entry->ip) ); +DECLARE_EVENT_CLASS(btrfs__work, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, wq ) + __field( void *, func ) + __field( void *, ordered_func ) + __field( void *, ordered_free ) + ), + + TP_fast_assign( + __entry->work = work; + __entry->wq = work->wq; + __entry->func = work->func; + __entry->ordered_func = work->ordered_func; + __entry->ordered_free = work->ordered_free; + ), + + 
TP_printk("work=%p, wq=%p, func=%p, ordered_func=%p, ordered_free=%p", + __entry->work, __entry->wq, __entry->func, + __entry->ordered_func, __entry->ordered_free) +); + +/* For situiations that the work is freed */ +DECLARE_EVENT_CLASS(btrfs__work__done, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work), + + TP_STRUCT__entry( + __field( void *, work ) + ), + + TP_fast_assign( + __entry->work = work; + ), + + TP_printk("work->%p", __entry->work) +); + +DEFINE_EVENT(btrfs__work, btrfs_work_queued, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + +DEFINE_EVENT(btrfs__work, btrfs_work_sched, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + +DEFINE_EVENT(btrfs__work, btrfs_normal_work_done, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + +DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + +DEFINE_EVENT(btrfs__work, btrfs_ordered_sched, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + +DECLARE_EVENT_CLASS(btrfs__workqueue, + + TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high), + + TP_ARGS(wq, name, high), + + TP_STRUCT__entry( + __field( void *, wq ) + __string( name, name ) + __field( int , high ) + ), + + TP_fast_assign( + __entry->wq = wq; + __assign_str(name, name); + __entry->high = high; + ), + + TP_printk("name=%s%s, wq=%p", __get_str(name), + __print_flags(__entry->high, "", + {(WQ_HIGHPRI), "-high"}), + __entry->wq) +); + +DEFINE_EVENT(btrfs__workqueue, btrfs_workqueue_alloc, + + TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high), + + TP_ARGS(wq, name, high) +); + +DECLARE_EVENT_CLASS(btrfs__workqueue_done, + + TP_PROTO(struct __btrfs_workqueue *wq), + + TP_ARGS(wq), + + TP_STRUCT__entry( + __field( void *, wq ) + ), + + TP_fast_assign( + __entry->wq = wq; + ), + + TP_printk("wq=%p", __entry->wq) +); + +DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, + + TP_PROTO(struct __btrfs_workqueue *wq), + + TP_ARGS(wq) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ |