aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c7
-rw-r--r--fs/btrfs/async-thread.c850
-rw-r--r--fs/btrfs/async-thread.h121
-rw-r--r--fs/btrfs/backref.c158
-rw-r--r--fs/btrfs/backref.h8
-rw-r--r--fs/btrfs/btrfs_inode.h18
-rw-r--r--fs/btrfs/check-integrity.c9
-rw-r--r--fs/btrfs/compression.c12
-rw-r--r--fs/btrfs/ctree.c209
-rw-r--r--fs/btrfs/ctree.h254
-rw-r--r--fs/btrfs/delayed-inode.c13
-rw-r--r--fs/btrfs/delayed-ref.c68
-rw-r--r--fs/btrfs/delayed-ref.h24
-rw-r--r--fs/btrfs/dev-replace.c86
-rw-r--r--fs/btrfs/disk-io.c435
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c718
-rw-r--r--fs/btrfs/extent_io.c476
-rw-r--r--fs/btrfs/extent_io.h7
-rw-r--r--fs/btrfs/extent_map.c58
-rw-r--r--fs/btrfs/extent_map.h11
-rw-r--r--fs/btrfs/file-item.c80
-rw-r--r--fs/btrfs/file.c378
-rw-r--r--fs/btrfs/free-space-cache.c420
-rw-r--r--fs/btrfs/inode-map.c40
-rw-r--r--fs/btrfs/inode.c542
-rw-r--r--fs/btrfs/ioctl.c825
-rw-r--r--fs/btrfs/locking.c80
-rw-r--r--fs/btrfs/lzo.c14
-rw-r--r--fs/btrfs/ordered-data.c81
-rw-r--r--fs/btrfs/ordered-data.h6
-rw-r--r--fs/btrfs/print-tree.c9
-rw-r--r--fs/btrfs/qgroup.c954
-rw-r--r--fs/btrfs/qgroup.h107
-rw-r--r--fs/btrfs/raid56.c26
-rw-r--r--fs/btrfs/reada.c13
-rw-r--r--fs/btrfs/relocation.c44
-rw-r--r--fs/btrfs/root-tree.c5
-rw-r--r--fs/btrfs/scrub.c233
-rw-r--r--fs/btrfs/send.c1022
-rw-r--r--fs/btrfs/super.c111
-rw-r--r--fs/btrfs/sysfs.c125
-rw-r--r--fs/btrfs/sysfs.h9
-rw-r--r--fs/btrfs/tests/btrfs-tests.c97
-rw-r--r--fs/btrfs/tests/btrfs-tests.h9
-rw-r--r--fs/btrfs/tests/inode-tests.c35
-rw-r--r--fs/btrfs/tests/qgroup-tests.c470
-rw-r--r--fs/btrfs/transaction.c208
-rw-r--r--fs/btrfs/transaction.h4
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c253
-rw-r--r--fs/btrfs/tree-log.h34
-rw-r--r--fs/btrfs/volumes.c271
-rw-r--r--fs/btrfs/volumes.h5
-rw-r--r--fs/btrfs/zlib.c26
56 files changed, 6668 insertions, 3415 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index f341a98031d..6d1d0b93b1a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -16,4 +16,4 @@ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
- tests/extent-io-tests.o tests/inode-tests.o
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index ff9b3995d45..9a0124a9585 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -79,13 +79,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
const char *name;
char *value = NULL;
- if (acl) {
- ret = posix_acl_valid(acl);
- if (ret < 0)
- return ret;
- ret = 0;
- }
-
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9c..5a201d81049 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -21,708 +22,315 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
+#include <linux/workqueue.h>
#include "async-thread.h"
+#include "ctree.h"
+
+#define WORK_DONE_BIT 0
+#define WORK_ORDER_DONE_BIT 1
+#define WORK_HIGH_PRIO_BIT 2
+
+#define NO_THRESHOLD (-1)
+#define DFT_THRESHOLD (32)
+
+struct __btrfs_workqueue {
+ struct workqueue_struct *normal_wq;
+ /* List head pointing to ordered work list */
+ struct list_head ordered_list;
+
+ /* Spinlock for ordered_list */
+ spinlock_t list_lock;
+
+ /* Thresholding related variants */
+ atomic_t pending;
+ int max_active;
+ int current_max;
+ int thresh;
+ unsigned int count;
+ spinlock_t thres_lock;
+};
-#define WORK_QUEUED_BIT 0
-#define WORK_DONE_BIT 1
-#define WORK_ORDER_DONE_BIT 2
-#define WORK_HIGH_PRIO_BIT 3
-
-/*
- * container for the kthread task pointer and the list of pending work
- * One of these is allocated per thread.
- */
-struct btrfs_worker_thread {
- /* pool we belong to */
- struct btrfs_workers *workers;
-
- /* list of struct btrfs_work that are waiting for service */
- struct list_head pending;
- struct list_head prio_pending;
-
- /* list of worker threads from struct btrfs_workers */
- struct list_head worker_list;
-
- /* kthread */
- struct task_struct *task;
+struct btrfs_workqueue {
+ struct __btrfs_workqueue *normal;
+ struct __btrfs_workqueue *high;
+};
- /* number of things on the pending list */
- atomic_t num_pending;
+static inline struct __btrfs_workqueue
+*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+ int thresh)
+{
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- /* reference counter for this struct */
- atomic_t refs;
+ if (unlikely(!ret))
+ return NULL;
- unsigned long sequence;
+ ret->max_active = max_active;
+ atomic_set(&ret->pending, 0);
+ if (thresh == 0)
+ thresh = DFT_THRESHOLD;
+ /* For low threshold, disabling threshold is a better choice */
+ if (thresh < DFT_THRESHOLD) {
+ ret->current_max = max_active;
+ ret->thresh = NO_THRESHOLD;
+ } else {
+ ret->current_max = 1;
+ ret->thresh = thresh;
+ }
- /* protects the pending list. */
- spinlock_t lock;
+ if (flags & WQ_HIGHPRI)
+ ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
+ ret->max_active,
+ "btrfs", name);
+ else
+ ret->normal_wq = alloc_workqueue("%s-%s", flags,
+ ret->max_active, "btrfs",
+ name);
+ if (unlikely(!ret->normal_wq)) {
+ kfree(ret);
+ return NULL;
+ }
- /* set to non-zero when this thread is already awake and kicking */
- int working;
+ INIT_LIST_HEAD(&ret->ordered_list);
+ spin_lock_init(&ret->list_lock);
+ spin_lock_init(&ret->thres_lock);
+ trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
+ return ret;
+}
- /* are we currently idle */
- int idle;
-};
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-static int __btrfs_start_workers(struct btrfs_workers *workers);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh)
+{
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
-/*
- * btrfs_start_workers uses kthread_run, which can block waiting for memory
- * for a very long time. It will actually throttle on page writeback,
- * and so it may not make progress until after our btrfs worker threads
- * process all of the pending work structs in their queue
- *
- * This means we can't use btrfs_start_workers from inside a btrfs worker
- * thread that is used as part of cleaning dirty memory, which pretty much
- * involves all of the worker threads.
- *
- * Instead we have a helper queue who never has more than one thread
- * where we scheduler thread start operations. This worker_start struct
- * is used to contain the work and hold a pointer to the queue that needs
- * another worker.
- */
-struct worker_start {
- struct btrfs_work work;
- struct btrfs_workers *queue;
-};
+ if (unlikely(!ret))
+ return NULL;
-static void start_new_worker_func(struct btrfs_work *work)
-{
- struct worker_start *start;
- start = container_of(work, struct worker_start, work);
- __btrfs_start_workers(start->queue);
- kfree(start);
-}
+ ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+ max_active, thresh);
+ if (unlikely(!ret->normal)) {
+ kfree(ret);
+ return NULL;
+ }
-/*
- * helper function to move a thread onto the idle list after it
- * has finished some requests.
- */
-static void check_idle_worker(struct btrfs_worker_thread *worker)
-{
- if (!worker->idle && atomic_read(&worker->num_pending) <
- worker->workers->idle_thresh / 2) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 1;
-
- /* the list may be empty if the worker is just starting */
- if (!list_empty(&worker->worker_list) &&
- !worker->workers->stopping) {
- list_move(&worker->worker_list,
- &worker->workers->idle_list);
+ if (flags & WQ_HIGHPRI) {
+ ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+ thresh);
+ if (unlikely(!ret->high)) {
+ __btrfs_destroy_workqueue(ret->normal);
+ kfree(ret);
+ return NULL;
}
- spin_unlock_irqrestore(&worker->workers->lock, flags);
}
+ return ret;
}
/*
- * helper function to move a thread off the idle list after new
- * pending work is added.
+ * Hook for threshold which will be called in btrfs_queue_work.
+ * This hook WILL be called in IRQ handler context,
+ * so workqueue_set_max_active MUST NOT be called in this hook
*/
-static void check_busy_worker(struct btrfs_worker_thread *worker)
+static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
{
- if (worker->idle && atomic_read(&worker->num_pending) >=
- worker->workers->idle_thresh) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 0;
-
- if (!list_empty(&worker->worker_list) &&
- !worker->workers->stopping) {
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- }
- spin_unlock_irqrestore(&worker->workers->lock, flags);
- }
+ if (wq->thresh == NO_THRESHOLD)
+ return;
+ atomic_inc(&wq->pending);
}
-static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
+/*
+ * Hook for threshold which will be called before executing the work,
+ * This hook is called in kthread content.
+ * So workqueue_set_max_active is called here.
+ */
+static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
{
- struct btrfs_workers *workers = worker->workers;
- struct worker_start *start;
- unsigned long flags;
-
- rmb();
- if (!workers->atomic_start_pending)
- return;
+ int new_max_active;
+ long pending;
+ int need_change = 0;
- start = kzalloc(sizeof(*start), GFP_NOFS);
- if (!start)
+ if (wq->thresh == NO_THRESHOLD)
return;
- start->work.func = start_new_worker_func;
- start->queue = workers;
-
- spin_lock_irqsave(&workers->lock, flags);
- if (!workers->atomic_start_pending)
- goto out;
-
- workers->atomic_start_pending = 0;
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers)
- goto out;
-
- workers->num_workers_starting += 1;
- spin_unlock_irqrestore(&workers->lock, flags);
- btrfs_queue_worker(workers->atomic_worker_start, &start->work);
- return;
+ atomic_dec(&wq->pending);
+ spin_lock(&wq->thres_lock);
+ /*
+ * Use wq->count to limit the calling frequency of
+ * workqueue_set_max_active.
+ */
+ wq->count++;
+ wq->count %= (wq->thresh / 4);
+ if (!wq->count)
+ goto out;
+ new_max_active = wq->current_max;
+ /*
+ * pending may be changed later, but it's OK since we really
+ * don't need it so accurate to calculate new_max_active.
+ */
+ pending = atomic_read(&wq->pending);
+ if (pending > wq->thresh)
+ new_max_active++;
+ if (pending < wq->thresh / 2)
+ new_max_active--;
+ new_max_active = clamp_val(new_max_active, 1, wq->max_active);
+ if (new_max_active != wq->current_max) {
+ need_change = 1;
+ wq->current_max = new_max_active;
+ }
out:
- kfree(start);
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&wq->thres_lock);
+
+ if (need_change) {
+ workqueue_set_max_active(wq->normal_wq, wq->current_max);
+ }
}
-static noinline void run_ordered_completions(struct btrfs_workers *workers,
- struct btrfs_work *work)
+static void run_ordered_work(struct __btrfs_workqueue *wq)
{
- if (!workers->ordered)
- return;
-
- set_bit(WORK_DONE_BIT, &work->flags);
-
- spin_lock(&workers->order_lock);
+ struct list_head *list = &wq->ordered_list;
+ struct btrfs_work *work;
+ spinlock_t *lock = &wq->list_lock;
+ unsigned long flags;
while (1) {
- if (!list_empty(&workers->prio_order_list)) {
- work = list_entry(workers->prio_order_list.next,
- struct btrfs_work, order_list);
- } else if (!list_empty(&workers->order_list)) {
- work = list_entry(workers->order_list.next,
- struct btrfs_work, order_list);
- } else {
+ spin_lock_irqsave(lock, flags);
+ if (list_empty(list))
break;
- }
+ work = list_entry(list->next, struct btrfs_work,
+ ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
- /* we are going to call the ordered done function, but
+ /*
+ * we are going to call the ordered done function, but
* we leave the work item on the list as a barrier so
* that later work items that are done don't have their
* functions called before this one returns
*/
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
-
- spin_unlock(&workers->order_lock);
-
+ trace_btrfs_ordered_sched(work);
+ spin_unlock_irqrestore(lock, flags);
work->ordered_func(work);
/* now take the lock again and drop our item from the list */
- spin_lock(&workers->order_lock);
- list_del(&work->order_list);
- spin_unlock(&workers->order_lock);
+ spin_lock_irqsave(lock, flags);
+ list_del(&work->ordered_list);
+ spin_unlock_irqrestore(lock, flags);
/*
* we don't want to call the ordered free functions
* with the lock held though
*/
work->ordered_free(work);
- spin_lock(&workers->order_lock);
- }
-
- spin_unlock(&workers->order_lock);
-}
-
-static void put_worker(struct btrfs_worker_thread *worker)
-{
- if (atomic_dec_and_test(&worker->refs))
- kfree(worker);
-}
-
-static int try_worker_shutdown(struct btrfs_worker_thread *worker)
-{
- int freeit = 0;
-
- spin_lock_irq(&worker->lock);
- spin_lock(&worker->workers->lock);
- if (worker->workers->num_workers > 1 &&
- worker->idle &&
- !worker->working &&
- !list_empty(&worker->worker_list) &&
- list_empty(&worker->prio_pending) &&
- list_empty(&worker->pending) &&
- atomic_read(&worker->num_pending) == 0) {
- freeit = 1;
- list_del_init(&worker->worker_list);
- worker->workers->num_workers--;
+ trace_btrfs_all_work_done(work);
}
- spin_unlock(&worker->workers->lock);
- spin_unlock_irq(&worker->lock);
-
- if (freeit)
- put_worker(worker);
- return freeit;
+ spin_unlock_irqrestore(lock, flags);
}
-static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
- struct list_head *prio_head,
- struct list_head *head)
+static void normal_work_helper(struct work_struct *arg)
{
- struct btrfs_work *work = NULL;
- struct list_head *cur = NULL;
-
- if (!list_empty(prio_head))
- cur = prio_head->next;
-
- smp_mb();
- if (!list_empty(&worker->prio_pending))
- goto refill;
-
- if (!list_empty(head))
- cur = head->next;
-
- if (cur)
- goto out;
-
-refill:
- spin_lock_irq(&worker->lock);
- list_splice_tail_init(&worker->prio_pending, prio_head);
- list_splice_tail_init(&worker->pending, head);
-
- if (!list_empty(prio_head))
- cur = prio_head->next;
- else if (!list_empty(head))
- cur = head->next;
- spin_unlock_irq(&worker->lock);
-
- if (!cur)
- goto out_fail;
-
-out:
- work = list_entry(cur, struct btrfs_work, list);
-
-out_fail:
- return work;
-}
-
-/*
- * main loop for servicing work items
- */
-static int worker_loop(void *arg)
-{
- struct btrfs_worker_thread *worker = arg;
- struct list_head head;
- struct list_head prio_head;
struct btrfs_work *work;
+ struct __btrfs_workqueue *wq;
+ int need_order = 0;
- INIT_LIST_HEAD(&head);
- INIT_LIST_HEAD(&prio_head);
-
- do {
-again:
- while (1) {
-
-
- work = get_next_work(worker, &prio_head, &head);
- if (!work)
- break;
-
- list_del(&work->list);
- clear_bit(WORK_QUEUED_BIT, &work->flags);
-
- work->worker = worker;
-
- work->func(work);
-
- atomic_dec(&worker->num_pending);
- /*
- * unless this is an ordered work queue,
- * 'work' was probably freed by func above.
- */
- run_ordered_completions(worker->workers, work);
-
- check_pending_worker_creates(worker);
- cond_resched();
- }
-
- spin_lock_irq(&worker->lock);
- check_idle_worker(worker);
-
- if (freezing(current)) {
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
- try_to_freeze();
- } else {
- spin_unlock_irq(&worker->lock);
- if (!kthread_should_stop()) {
- cpu_relax();
- /*
- * we've dropped the lock, did someone else
- * jump_in?
- */
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- /*
- * this short schedule allows more work to
- * come in without the queue functions
- * needing to go through wake_up_process()
- *
- * worker->working is still 1, so nobody
- * is going to try and wake us up
- */
- schedule_timeout(1);
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- if (kthread_should_stop())
- break;
-
- /* still no more work?, sleep for real */
- spin_lock_irq(&worker->lock);
- set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending)) {
- spin_unlock_irq(&worker->lock);
- set_current_state(TASK_RUNNING);
- goto again;
- }
-
- /*
- * this makes sure we get a wakeup when someone
- * adds something new to the queue
- */
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
-
- if (!kthread_should_stop()) {
- schedule_timeout(HZ * 120);
- if (!worker->working &&
- try_worker_shutdown(worker)) {
- return 0;
- }
- }
- }
- __set_current_state(TASK_RUNNING);
- }
- } while (!kthread_should_stop());
- return 0;
-}
-
-/*
- * this will wait for all the worker threads to shutdown
- */
-void btrfs_stop_workers(struct btrfs_workers *workers)
-{
- struct list_head *cur;
- struct btrfs_worker_thread *worker;
- int can_stop;
-
- spin_lock_irq(&workers->lock);
- workers->stopping = 1;
- list_splice_init(&workers->idle_list, &workers->worker_list);
- while (!list_empty(&workers->worker_list)) {
- cur = workers->worker_list.next;
- worker = list_entry(cur, struct btrfs_worker_thread,
- worker_list);
-
- atomic_inc(&worker->refs);
- workers->num_workers -= 1;
- if (!list_empty(&worker->worker_list)) {
- list_del_init(&worker->worker_list);
- put_worker(worker);
- can_stop = 1;
- } else
- can_stop = 0;
- spin_unlock_irq(&workers->lock);
- if (can_stop)
- kthread_stop(worker->task);
- spin_lock_irq(&workers->lock);
- put_worker(worker);
+ work = container_of(arg, struct btrfs_work, normal_work);
+ /*
+ * We should not touch things inside work in the following cases:
+ * 1) after work->func() if it has no ordered_free
+ * Since the struct is freed in work->func().
+ * 2) after setting WORK_DONE_BIT
+ * The work may be freed in other threads almost instantly.
+ * So we save the needed things here.
+ */
+ if (work->ordered_func)
+ need_order = 1;
+ wq = work->wq;
+
+ trace_btrfs_work_sched(work);
+ thresh_exec_hook(wq);
+ work->func(work);
+ if (need_order) {
+ set_bit(WORK_DONE_BIT, &work->flags);
+ run_ordered_work(wq);
}
- spin_unlock_irq(&workers->lock);
+ if (!need_order)
+ trace_btrfs_all_work_done(work);
}
-/*
- * simple init on struct btrfs_workers
- */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_helper)
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free)
{
- workers->num_workers = 0;
- workers->num_workers_starting = 0;
- INIT_LIST_HEAD(&workers->worker_list);
- INIT_LIST_HEAD(&workers->idle_list);
- INIT_LIST_HEAD(&workers->order_list);
- INIT_LIST_HEAD(&workers->prio_order_list);
- spin_lock_init(&workers->lock);
- spin_lock_init(&workers->order_lock);
- workers->max_workers = max;
- workers->idle_thresh = 32;
- workers->name = name;
- workers->ordered = 0;
- workers->atomic_start_pending = 0;
- workers->atomic_worker_start = async_helper;
- workers->stopping = 0;
+ work->func = func;
+ work->ordered_func = ordered_func;
+ work->ordered_free = ordered_free;
+ INIT_WORK(&work->normal_work, normal_work_helper);
+ INIT_LIST_HEAD(&work->ordered_list);
+ work->flags = 0;
}
-/*
- * starts new worker threads. This does not enforce the max worker
- * count in case you need to temporarily go past it.
- */
-static int __btrfs_start_workers(struct btrfs_workers *workers)
+static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- int ret = 0;
-
- worker = kzalloc(sizeof(*worker), GFP_NOFS);
- if (!worker) {
- ret = -ENOMEM;
- goto fail;
- }
-
- INIT_LIST_HEAD(&worker->pending);
- INIT_LIST_HEAD(&worker->prio_pending);
- INIT_LIST_HEAD(&worker->worker_list);
- spin_lock_init(&worker->lock);
-
- atomic_set(&worker->num_pending, 0);
- atomic_set(&worker->refs, 1);
- worker->workers = workers;
- worker->task = kthread_create(worker_loop, worker,
- "btrfs-%s-%d", workers->name,
- workers->num_workers + 1);
- if (IS_ERR(worker->task)) {
- ret = PTR_ERR(worker->task);
- goto fail;
- }
+ unsigned long flags;
- spin_lock_irq(&workers->lock);
- if (workers->stopping) {
- spin_unlock_irq(&workers->lock);
- ret = -EINVAL;
- goto fail_kthread;
+ work->wq = wq;
+ thresh_queue_hook(wq);
+ if (work->ordered_func) {
+ spin_lock_irqsave(&wq->list_lock, flags);
+ list_add_tail(&work->ordered_list, &wq->ordered_list);
+ spin_unlock_irqrestore(&wq->list_lock, flags);
}
- list_add_tail(&worker->worker_list, &workers->idle_list);
- worker->idle = 1;
- workers->num_workers++;
- workers->num_workers_starting--;
- WARN_ON(workers->num_workers_starting < 0);
- spin_unlock_irq(&workers->lock);
-
- wake_up_process(worker->task);
- return 0;
-
-fail_kthread:
- kthread_stop(worker->task);
-fail:
- kfree(worker);
- spin_lock_irq(&workers->lock);
- workers->num_workers_starting--;
- spin_unlock_irq(&workers->lock);
- return ret;
+ queue_work(wq->normal_wq, &work->normal_work);
+ trace_btrfs_work_queued(work);
}
-int btrfs_start_workers(struct btrfs_workers *workers)
-{
- spin_lock_irq(&workers->lock);
- workers->num_workers_starting++;
- spin_unlock_irq(&workers->lock);
- return __btrfs_start_workers(workers);
-}
-
-/*
- * run through the list and find a worker thread that doesn't have a lot
- * to do right now. This can return null if we aren't yet at the thread
- * count limit and all of the threads are busy.
- */
-static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- struct list_head *next;
- int enforce_min;
+ struct __btrfs_workqueue *dest_wq;
- enforce_min = (workers->num_workers + workers->num_workers_starting) <
- workers->max_workers;
-
- /*
- * if we find an idle thread, don't move it to the end of the
- * idle list. This improves the chance that the next submission
- * will reuse the same thread, and maybe catch it while it is still
- * working
- */
- if (!list_empty(&workers->idle_list)) {
- next = workers->idle_list.next;
- worker = list_entry(next, struct btrfs_worker_thread,
- worker_list);
- return worker;
- }
- if (enforce_min || list_empty(&workers->worker_list))
- return NULL;
-
- /*
- * if we pick a busy task, move the task to the end of the list.
- * hopefully this will keep things somewhat evenly balanced.
- * Do the move in batches based on the sequence number. This groups
- * requests submitted at roughly the same time onto the same worker.
- */
- next = workers->worker_list.next;
- worker = list_entry(next, struct btrfs_worker_thread, worker_list);
- worker->sequence++;
-
- if (worker->sequence % workers->idle_thresh == 0)
- list_move_tail(next, &workers->worker_list);
- return worker;
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
+ dest_wq = wq->high;
+ else
+ dest_wq = wq->normal;
+ __btrfs_queue_work(dest_wq, work);
}
-/*
- * selects a worker thread to take the next job. This will either find
- * an idle worker, start a new worker up to the max count, or just return
- * one of the existing busy workers.
- */
-static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- struct list_head *fallback;
- int ret;
-
- spin_lock_irqsave(&workers->lock, flags);
-again:
- worker = next_worker(workers);
-
- if (!worker) {
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers) {
- goto fallback;
- } else if (workers->atomic_worker_start) {
- workers->atomic_start_pending = 1;
- goto fallback;
- } else {
- workers->num_workers_starting++;
- spin_unlock_irqrestore(&workers->lock, flags);
- /* we're below the limit, start another worker */
- ret = __btrfs_start_workers(workers);
- spin_lock_irqsave(&workers->lock, flags);
- if (ret)
- goto fallback;
- goto again;
- }
- }
- goto found;
-
-fallback:
- fallback = NULL;
- /*
- * we have failed to find any workers, just
- * return the first one we can find.
- */
- if (!list_empty(&workers->worker_list))
- fallback = workers->worker_list.next;
- if (!list_empty(&workers->idle_list))
- fallback = workers->idle_list.next;
- BUG_ON(!fallback);
- worker = list_entry(fallback,
- struct btrfs_worker_thread, worker_list);
-found:
- /*
- * this makes sure the worker doesn't exit before it is placed
- * onto a busy/idle list
- */
- atomic_inc(&worker->num_pending);
- spin_unlock_irqrestore(&workers->lock, flags);
- return worker;
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
+ kfree(wq);
}
-/*
- * btrfs_requeue_work just puts the work item back on the tail of the list
- * it was taken from. It is intended for use with long running work functions
- * that make some progress and want to give the cpu up for others.
- */
-void btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker = work->worker;
- unsigned long flags;
- int wake = 0;
-
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+ if (!wq)
return;
-
- spin_lock_irqsave(&worker->lock, flags);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- atomic_inc(&worker->num_pending);
-
- /* by definition we're busy, take ourselves off the idle
- * list
- */
- if (worker->idle) {
- spin_lock(&worker->workers->lock);
- worker->idle = 0;
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- spin_unlock(&worker->workers->lock);
- }
- if (!worker->working) {
- wake = 1;
- worker->working = 1;
- }
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
+ if (wq->high)
+ __btrfs_destroy_workqueue(wq->high);
+ __btrfs_destroy_workqueue(wq->normal);
+ kfree(wq);
}
-void btrfs_set_work_high_prio(struct btrfs_work *work)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ if (!wq)
+ return;
+ wq->normal->max_active = max;
+ if (wq->high)
+ wq->high->max_active = max;
}
-/*
- * places a struct btrfs_work into the pending queue of one of the kthreads
- */
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_set_work_high_priority(struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- int wake = 0;
-
- /* don't requeue something already on a list */
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- return;
-
- worker = find_worker(workers);
- if (workers->ordered) {
- /*
- * you're not allowed to do ordered queues from an
- * interrupt handler
- */
- spin_lock(&workers->order_lock);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
- list_add_tail(&work->order_list,
- &workers->prio_order_list);
- } else {
- list_add_tail(&work->order_list, &workers->order_list);
- }
- spin_unlock(&workers->order_lock);
- } else {
- INIT_LIST_HEAD(&work->order_list);
- }
-
- spin_lock_irqsave(&worker->lock, flags);
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- check_busy_worker(worker);
-
- /*
- * avoid calling into wake_up_process if this thread has already
- * been kicked
- */
- if (!worker->working)
- wake = 1;
- worker->working = 1;
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
+ set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683e..9c6b66d15fb 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
#ifndef __BTRFS_ASYNC_THREAD_
#define __BTRFS_ASYNC_THREAD_
-struct btrfs_worker_thread;
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-/*
- * This is similar to a workqueue, but it is meant to spread the operations
- * across all available cpus instead of just the CPU that was used to
- * queue the work. There is also some batching introduced to try and
- * cut down on context switches.
- *
- * By default threads are added on demand up to 2 * the number of cpus.
- * Changing struct btrfs_workers->max_workers is one way to prevent
- * demand creation of kthreads.
- *
- * the basic model of these worker threads is to embed a btrfs_work
- * structure in your own data struct, and use container_of in a
- * work function to get back to your data struct.
- */
struct btrfs_work {
- /*
- * func should be set to the function you want called
- * your work struct is passed as the only arg
- *
- * ordered_func must be set for work sent to an ordered work queue,
- * and it is called to complete a given work item in the same
- * order they were sent to the queue.
- */
- void (*func)(struct btrfs_work *work);
- void (*ordered_func)(struct btrfs_work *work);
- void (*ordered_free)(struct btrfs_work *work);
-
- /*
- * flags should be set to zero. It is used to make sure the
- * struct is only inserted once into the list.
- */
+ btrfs_func_t func;
+ btrfs_func_t ordered_func;
+ btrfs_func_t ordered_free;
+
+ /* Don't touch things below */
+ struct work_struct normal_work;
+ struct list_head ordered_list;
+ struct __btrfs_workqueue *wq;
unsigned long flags;
-
- /* don't touch these */
- struct btrfs_worker_thread *worker;
- struct list_head list;
- struct list_head order_list;
-};
-
-struct btrfs_workers {
- /* current number of running workers */
- int num_workers;
-
- int num_workers_starting;
-
- /* max number of workers allowed. changed by btrfs_start_workers */
- int max_workers;
-
- /* once a worker has this many requests or fewer, it is idle */
- int idle_thresh;
-
- /* force completions in the order they were queued */
- int ordered;
-
- /* more workers required, but in an interrupt handler */
- int atomic_start_pending;
-
- /*
- * are we allowed to sleep while starting workers or are we required
- * to start them at a later time? If we can't sleep, this indicates
- * which queue we need to use to schedule thread creation.
- */
- struct btrfs_workers *atomic_worker_start;
-
- /* list with all the work threads. The workers on the idle thread
- * may be actively servicing jobs, but they haven't yet hit the
- * idle thresh limit above.
- */
- struct list_head worker_list;
- struct list_head idle_list;
-
- /*
- * when operating in ordered mode, this maintains the list
- * of work items waiting for completion
- */
- struct list_head order_list;
- struct list_head prio_order_list;
-
- /* lock for finding the next worker thread to queue on */
- spinlock_t lock;
-
- /* lock for the ordered lists */
- spinlock_t order_lock;
-
- /* extra name for this worker, used for current->name */
- char *name;
-
- int stopping;
};
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers);
-void btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_starter);
-void btrfs_requeue_work(struct btrfs_work *work);
-void btrfs_set_work_high_prio(struct btrfs_work *work);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh);
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d..e25564bfcb4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
struct ulist *parents, struct __prelim_ref *ref,
- int level, u64 time_seq, const u64 *extent_item_pos)
+ int level, u64 time_seq, const u64 *extent_item_pos,
+ u64 total_refs)
{
int ret = 0;
int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
ret = btrfs_next_old_leaf(root, path, time_seq);
- while (!ret && count < ref->count) {
+ while (!ret && count < total_refs) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct __prelim_ref *ref,
struct ulist *parents,
- const u64 *extent_item_pos)
+ const u64 *extent_item_pos, u64 total_refs)
{
struct btrfs_root *root;
struct btrfs_key root_key;
@@ -329,7 +330,10 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
- root_level = btrfs_old_root_level(root, time_seq);
+ if (path->search_commit_root)
+ root_level = btrfs_header_level(root->commit_root);
+ else
+ root_level = btrfs_old_root_level(root, time_seq);
if (root_level + 1 == level) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -361,7 +365,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
ret = add_all_parents(root, path, parents, ref, level, time_seq,
- extent_item_pos);
+ extent_item_pos, total_refs);
out:
path->lowest_level = 0;
btrfs_release_path(path);
@@ -374,7 +378,7 @@ out:
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct list_head *head,
- const u64 *extent_item_pos)
+ const u64 *extent_item_pos, u64 total_refs)
{
int err;
int ret = 0;
@@ -400,7 +404,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
if (ref->count == 0)
continue;
err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
- parents, extent_item_pos);
+ parents, extent_item_pos,
+ total_refs);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
@@ -557,7 +562,7 @@ static void __merge_refs(struct list_head *head, int mode)
* smaller or equal that seq to the list
*/
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
- struct list_head *prefs)
+ struct list_head *prefs, u64 *total_refs)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct rb_node *n = &head->node.rb_node;
@@ -593,6 +598,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
default:
BUG_ON(1);
}
+ *total_refs += (node->ref_mod * sgn);
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +659,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
*/
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- int *info_level, struct list_head *prefs)
+ int *info_level, struct list_head *prefs,
+ u64 *total_refs)
{
int ret = 0;
int slot;
@@ -677,6 +684,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
+ *total_refs += btrfs_extent_refs(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
@@ -859,6 +867,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct list_head prefs;
struct __prelim_ref *ref;
struct extent_inode_elem *eie = NULL;
+ u64 total_refs = 0;
INIT_LIST_HEAD(&prefs);
INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +882,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!trans)
+ if (!trans) {
path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
/*
* grab both a lock on the path and a lock on the delayed ref head.
@@ -889,7 +900,11 @@ again:
goto out;
BUG_ON(ret == 0);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (trans && likely(trans->type != __TRANS_DUMMY)) {
+#else
if (trans) {
+#endif
/*
* look if there are updates for this ref queued and lock the
* head
@@ -915,7 +930,7 @@ again:
}
spin_unlock(&delayed_refs->lock);
ret = __add_delayed_refs(head, time_seq,
- &prefs_delayed);
+ &prefs_delayed, &total_refs);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@@ -936,7 +951,8 @@ again:
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
- &info_level, &prefs);
+ &info_level, &prefs,
+ &total_refs);
if (ret)
goto out;
ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +972,7 @@ again:
__merge_refs(&prefs, 1);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
- extent_item_pos);
+ extent_item_pos, total_refs);
if (ret)
goto out;
@@ -965,18 +981,19 @@ again:
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
WARN_ON(ref->count < 0);
- if (ref->count && ref->root_id && ref->parent == 0) {
+ if (roots && ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
if (ret < 0)
goto out;
}
if (ref->count && ref->parent) {
- if (extent_item_pos && !ref->inode_list) {
+ if (extent_item_pos && !ref->inode_list &&
+ ref->level == 0) {
u32 bsz;
struct extent_buffer *eb;
bsz = btrfs_level_size(fs_info->extent_root,
- info_level);
+ ref->level);
eb = read_tree_block(fs_info->extent_root,
ref->parent, bsz, 0);
if (!eb || !extent_buffer_uptodate(eb)) {
@@ -1061,22 +1078,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos)
{
- struct ulist *tmp;
int ret;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
*leafs = ulist_alloc(GFP_NOFS);
- if (!*leafs) {
- ulist_free(tmp);
+ if (!*leafs)
return -ENOMEM;
- }
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, *leafs, tmp, extent_item_pos);
- ulist_free(tmp);
-
+ time_seq, *leafs, NULL, extent_item_pos);
if (ret < 0 && ret != -ENOENT) {
free_leaf_list(*leafs);
return ret;
@@ -1098,9 +1107,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
*
* returns 0 on success, < 0 on error.
*/
-int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots)
+static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
{
struct ulist *tmp;
struct ulist_node *node = NULL;
@@ -1136,6 +1145,20 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return 0;
}
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
+{
+ int ret;
+
+ if (!trans)
+ down_read(&fs_info->commit_root_sem);
+ ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+ return ret;
+}
+
/*
* this makes the path point to (inum INODE_ITEM ioff)
*/
@@ -1333,38 +1356,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
if (ret < 0)
return ret;
- while (1) {
- u32 nritems;
- if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
- ret = btrfs_prev_leaf(fs_info->extent_root, path);
- if (ret != 0) {
- if (ret > 0) {
- pr_debug("logical %llu is not within "
- "any extent\n", logical);
- ret = -ENOENT;
- }
- return ret;
- }
- } else {
- path->slots[0]--;
- }
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (nritems == 0) {
- pr_debug("logical %llu is not within any extent\n",
- logical);
- return -ENOENT;
- }
- if (path->slots[0] == nritems)
- path->slots[0]--;
-
- btrfs_item_key_to_cpu(path->nodes[0], found_key,
- path->slots[0]);
- if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
- found_key->type == BTRFS_METADATA_ITEM_KEY)
- break;
+ ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
}
-
+ btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type == BTRFS_METADATA_ITEM_KEY)
size = fs_info->extent_root->leafsize;
else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
@@ -1411,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
* returns <0 on error
*/
static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
- struct btrfs_extent_item *ei, u32 item_size,
- struct btrfs_extent_inline_ref **out_eiref,
- int *out_type)
+ struct btrfs_key *key,
+ struct btrfs_extent_item *ei, u32 item_size,
+ struct btrfs_extent_inline_ref **out_eiref,
+ int *out_type)
{
unsigned long end;
u64 flags;
@@ -1423,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
/* first call */
flags = btrfs_extent_flags(eb, ei);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- info = (struct btrfs_tree_block_info *)(ei + 1);
- *out_eiref =
- (struct btrfs_extent_inline_ref *)(info + 1);
+ if (key->type == BTRFS_METADATA_ITEM_KEY) {
+ /* a skinny metadata extent */
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(ei + 1);
+ } else {
+ WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(info + 1);
+ }
} else {
*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
}
*ptr = (unsigned long)*out_eiref;
- if ((void *)*ptr >= (void *)ei + item_size)
+ if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
return -ENOENT;
}
end = (unsigned long)ei + item_size;
- *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
+ *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -1454,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
* <0 on error.
*/
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
- struct btrfs_extent_item *ei, u32 item_size,
- u64 *out_root, u8 *out_level)
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level)
{
int ret;
int type;
@@ -1466,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
return 1;
while (1) {
- ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
- &eiref, &type);
+ ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
+ &eiref, &type);
if (ret < 0)
return ret;
@@ -1540,6 +1546,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ } else {
+ down_read(&fs_info->commit_root_sem);
}
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
@@ -1550,8 +1558,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
- ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
- tree_mod_seq_elem.seq, &roots);
+ ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
+ tree_mod_seq_elem.seq, &roots);
if (ret)
break;
ULIST_ITER_INIT(&root_uiter);
@@ -1573,6 +1581,8 @@ out:
if (!search_commit_root) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_end_transaction(trans, fs_info->extent_root);
+ } else {
+ up_read(&fs_info->commit_root_sem);
}
return ret;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index a910b27a8ad..86fc20fec28 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
u64 *flags);
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
- struct btrfs_extent_item *ei, u32 item_size,
- u64 *out_root, u8 *out_level);
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level);
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
u64 extent_item_objectid,
@@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots);
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689..4794923c410 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
u64 last_trans;
/*
- * log transid when this inode was last modified
+ * transid that last logged this inode
*/
- u64 last_sub_trans;
+ u64 logged_trans;
/*
- * transid that last logged this inode
+ * log transid when this inode was last modified
*/
- u64 logged_trans;
+ int last_sub_trans;
+
+ /* a local copy of root's last_log_commit */
+ int last_log_commit;
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
/* flags field from the on disk inode */
u32 flags;
- /* a local copy of root's last_log_commit */
- unsigned long last_log_commit;
-
/*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
@@ -279,9 +279,11 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
{
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags);
}
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
+
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 49a62b4dda3..ce92ae30250 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -92,11 +92,11 @@
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/mutex.h>
-#include <linux/crc32c.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
+#include "hash.h"
#include "transaction.h"
#include "extent_io.h"
#include "volumes.h"
@@ -1093,6 +1093,7 @@ leaf_item_out_of_bounce_error:
next_stack =
btrfsic_stack_frame_alloc();
if (NULL == next_stack) {
+ sf->error = -1;
btrfsic_release_block_ctx(
&sf->
next_block_ctx);
@@ -1190,8 +1191,10 @@ continue_with_current_node_stack_frame:
sf->next_block_ctx.datav[0];
next_stack = btrfsic_stack_frame_alloc();
- if (NULL == next_stack)
+ if (NULL == next_stack) {
+ sf->error = -1;
goto one_stack_frame_backwards;
+ }
next_stack->i = -1;
next_stack->block = sf->next_block;
@@ -1823,7 +1826,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
size_t sublen = i ? PAGE_CACHE_SIZE :
(PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
- crc = crc32c(crc, data, sublen);
+ crc = btrfs_crc32c(crc, data, sublen);
}
btrfs_csum_final(crc, csum);
if (memcmp(csum, h->csum, state->csum_size))
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e2600cdb6c2..1daea0b4718 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, pg_index);
rcu_read_unlock();
- if (page) {
+ if (page && !radix_tree_exceptional_entry(page)) {
misses++;
if (misses > 4)
break;
@@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace)
spin_lock(workspace_lock);
if (*num_workspace < num_online_cpus()) {
- list_add_tail(workspace, idle_workspace);
+ list_add(workspace, idle_workspace);
(*num_workspace)++;
spin_unlock(workspace_lock);
goto wake;
@@ -887,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -1;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
start, len, pages,
@@ -923,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -ENOMEM;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
disk_start,
@@ -945,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -ENOMEM;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
dest_page, start_byte,
@@ -1010,6 +1010,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
bytes = min(bytes, working_bytes);
kaddr = kmap_atomic(page_out);
memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+ if (*pg_index == (vcnt - 1) && *pg_offset == 0)
+ memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
kunmap_atomic(kaddr);
flush_dcache_page(page_out);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa6..aeab453b8e2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -224,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
static void add_root_to_dirty_list(struct btrfs_root *root)
{
spin_lock(&root->fs_info->trans_lock);
- if (root->track_dirty && list_empty(&root->dirty_list)) {
+ if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
+ list_empty(&root->dirty_list)) {
list_add(&root->dirty_list,
&root->fs_info->dirty_cowonly_roots);
}
@@ -246,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int level;
struct btrfs_disk_key disk_key;
- WARN_ON(root->ref_cows && trans->transid !=
- root->fs_info->running_transaction->transid);
- WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
level = btrfs_header_level(buf);
if (level == 0)
@@ -354,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
}
/*
- * Increment the upper half of tree_mod_seq, set lower half zero.
- *
- * Must be called with fs_info->tree_mod_seq_lock held.
- */
-static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
-{
- u64 seq = atomic64_read(&fs_info->tree_mod_seq);
- seq &= 0xffffffff00000000ull;
- seq += 1ull << 32;
- atomic64_set(&fs_info->tree_mod_seq, seq);
- return seq;
-}
-
-/*
- * Increment the lower half of tree_mod_seq.
- *
- * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
- * are generated should not technically require a spin lock here. (Rationale:
- * incrementing the minor while incrementing the major seq number is between its
- * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
- * just returns a unique sequence number as usual.) We have decided to leave
- * that requirement in here and rethink it once we notice it really imposes a
- * problem on some workload.
+ * Pull a new tree mod seq number for our operation.
*/
-static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
- * return the last minor in the previous major tree_mod_seq number
- */
-u64 btrfs_tree_mod_seq_prev(u64 seq)
-{
- return (seq & 0xffffffff00000000ull) - 1ull;
-}
-
-/*
* This adds a new blocker to the tree mod log's blocker list if the @elem
* passed does not already have a sequence number set. So when a caller expects
* to record tree modifications, it should ensure to set elem->seq to zero
@@ -402,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
- u64 seq;
-
tree_mod_log_write_lock(fs_info);
spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) {
- elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
- seq = btrfs_inc_tree_mod_seq_minor(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock);
tree_mod_log_write_unlock(fs_info);
- return seq;
+ return elem->seq;
}
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -487,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
BUG_ON(!tm);
- spin_lock(&fs_info->tree_mod_seq_lock);
- tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
tm_root = &fs_info->tree_mod_log;
new = &tm_root->rb_node;
@@ -997,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
* snapshot and the block was not allocated by tree relocation,
* we know the block is not shared.
*/
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
buf != root->node && buf != root->commit_root &&
(btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item) ||
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
return 1;
#endif
@@ -1146,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_assert_tree_locked(buf);
- WARN_ON(root->ref_cows && trans->transid !=
- root->fs_info->running_transaction->transid);
- WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -1193,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret)
return ret;
@@ -1538,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
/* ensure we can see the force_cow */
smp_rmb();
@@ -1556,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
- !root->force_cow)
+ !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
return 0;
return 1;
}
@@ -2769,9 +2741,13 @@ again:
* the commit roots are read only
* so we always do read locks
*/
+ if (p->need_commit_sem)
+ down_read(&root->fs_info->commit_root_sem);
b = root->commit_root;
extent_buffer_get(b);
level = btrfs_header_level(b);
+ if (p->need_commit_sem)
+ up_read(&root->fs_info->commit_root_sem);
if (!p->skip_locking)
btrfs_tree_read_lock(b);
} else {
@@ -5121,7 +5097,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
return ret;
btrfs_item_key(path->nodes[0], &found_key, 0);
ret = comp_keys(&found_key, &key);
- if (ret < 0)
+ /*
+ * We might have had an item with the previous key in the tree right
+ * before we released our path. And after we released our path, that
+ * item might have been pushed to the first slot (0) of the leaf we
+ * were holding due to a tree balance. Alternatively, an item with the
+ * previous key can exist as the only element of a leaf (big fat item).
+ * Therefore account for these 2 cases, so that our callers (like
+ * btrfs_previous_item) don't miss an existing item with a key matching
+ * the previous key we computed above.
+ */
+ if (ret <= 0)
return 0;
return 1;
}
@@ -5360,7 +5346,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
{
int ret;
int cmp;
- struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *left_path = NULL;
struct btrfs_path *right_path = NULL;
struct btrfs_key left_key;
@@ -5376,9 +5361,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
int advance_right;
u64 left_blockptr;
u64 right_blockptr;
- u64 left_start_ctransid;
- u64 right_start_ctransid;
- u64 ctransid;
+ u64 left_gen;
+ u64 right_gen;
left_path = btrfs_alloc_path();
if (!left_path) {
@@ -5402,21 +5386,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_path->search_commit_root = 1;
right_path->skip_locking = 1;
- spin_lock(&left_root->root_item_lock);
- left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_item_lock);
-
- spin_lock(&right_root->root_item_lock);
- right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_item_lock);
-
- trans = btrfs_join_transaction(left_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
/*
* Strategy: Go to the first items of both trees. Then do
*
@@ -5453,6 +5422,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
* the right if possible or go up and right.
*/
+ down_read(&left_root->fs_info->commit_root_sem);
left_level = btrfs_header_level(left_root->commit_root);
left_root_level = left_level;
left_path->nodes[left_level] = left_root->commit_root;
@@ -5462,6 +5432,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_root_level = right_level;
right_path->nodes[right_level] = right_root->commit_root;
extent_buffer_get(right_path->nodes[right_level]);
+ up_read(&left_root->fs_info->commit_root_sem);
if (left_level == 0)
btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -5480,67 +5451,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
advance_left = advance_right = 0;
while (1) {
- /*
- * We need to make sure the transaction does not get committed
- * while we do anything on commit roots. This means, we need to
- * join and leave transactions for every item that we process.
- */
- if (trans && btrfs_should_end_transaction(trans, left_root)) {
- btrfs_release_path(left_path);
- btrfs_release_path(right_path);
-
- ret = btrfs_end_transaction(trans, left_root);
- trans = NULL;
- if (ret < 0)
- goto out;
- }
- /* now rejoin the transaction */
- if (!trans) {
- trans = btrfs_join_transaction(left_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
- spin_lock(&left_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_item_lock);
- if (ctransid != left_start_ctransid)
- left_start_ctransid = 0;
-
- spin_lock(&right_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_item_lock);
- if (ctransid != right_start_ctransid)
- right_start_ctransid = 0;
-
- if (!left_start_ctransid || !right_start_ctransid) {
- WARN(1, KERN_WARNING
- "BTRFS: btrfs_compare_tree detected "
- "a change in one of the trees while "
- "iterating. This is probably a "
- "bug.\n");
- ret = -EIO;
- goto out;
- }
-
- /*
- * the commit root may have changed, so start again
- * where we stopped
- */
- left_path->lowest_level = left_level;
- right_path->lowest_level = right_level;
- ret = btrfs_search_slot(NULL, left_root,
- &left_key, left_path, 0, 0);
- if (ret < 0)
- goto out;
- ret = btrfs_search_slot(NULL, right_root,
- &right_key, right_path, 0, 0);
- if (ret < 0)
- goto out;
- }
-
if (advance_left && !left_end_reached) {
ret = tree_advance(left_root, left_path, &left_level,
left_root_level,
@@ -5640,7 +5550,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_blockptr = btrfs_node_blockptr(
right_path->nodes[right_level],
right_path->slots[right_level]);
- if (left_blockptr == right_blockptr) {
+ left_gen = btrfs_node_ptr_generation(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_gen = btrfs_node_ptr_generation(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr &&
+ left_gen == right_gen) {
/*
* As we're on a shared block, don't
* allow to go deeper.
@@ -5663,14 +5580,6 @@ out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
kfree(tmp_buf);
-
- if (trans) {
- if (!ret)
- ret = btrfs_end_transaction(trans, left_root);
- else
- btrfs_end_transaction(trans, left_root);
- }
-
return ret;
}
@@ -5809,6 +5718,24 @@ again:
ret = 0;
goto done;
}
+ /*
+ * So the above check misses one case:
+ * - after releasing the path above, someone has removed the item that
+ * used to be at the very end of the block, and balance between leafs
+ * gets another one with bigger key.offset to replace it.
+ *
+ * This one should be returned as well, or we can get leaf corruption
+ * later(esp. in __btrfs_drop_extents()).
+ *
+ * And a bit more explanation about this check,
+ * with ret > 0, the key isn't found, the path points to the slot
+ * where it should be inserted, so the path->slots[0] item must be the
+ * bigger one.
+ */
+ if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+ ret = 0;
+ goto done;
+ }
while (level < BTRFS_MAX_LEVEL) {
if (!path->nodes[level]) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519..be91397f4e9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
#include <asm/kmap_types.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
+#include <linux/workqueue.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -351,6 +352,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FS_STATE_ERROR 0
#define BTRFS_FS_STATE_REMOUNTING 1
#define BTRFS_FS_STATE_TRANS_ABORTED 2
+#define BTRFS_FS_STATE_DEV_REPLACING 3
/* Super block flags */
/* Errors detected */
@@ -608,6 +610,7 @@ struct btrfs_path {
unsigned int skip_locking:1;
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
+ unsigned int need_commit_sem:1;
};
/*
@@ -754,6 +757,12 @@ struct btrfs_dir_item {
#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
+/*
+ * Internal in-memory flag that a subvolume has been marked for deletion but
+ * still visible as a directory
+ */
+#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48)
+
struct btrfs_root_item {
struct btrfs_inode_item inode;
__le64 generation;
@@ -838,7 +847,10 @@ struct btrfs_disk_balance_args {
/* BTRFS_BALANCE_ARGS_* */
__le64 flags;
- __le64 unused[8];
+ /* BTRFS_BALANCE_ARGS_LIMIT value */
+ __le64 limit;
+
+ __le64 unused[7];
} __attribute__ ((__packed__));
/*
@@ -985,7 +997,8 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
-#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
+ BTRFS_SPACE_INFO_GLOBAL_RSV)
enum btrfs_raid_types {
BTRFS_RAID_RAID10,
@@ -1017,6 +1030,12 @@ enum btrfs_raid_types {
*/
#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
+/*
+ * A fake block group type that is used to communicate global block reserve
+ * size to userspace via the SPACE_INFO ioctl.
+ */
+#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
+
#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
BTRFS_AVAIL_ALLOC_BIT_SINGLE)
@@ -1104,6 +1123,12 @@ struct btrfs_qgroup_limit_item {
__le64 rsv_excl;
} __attribute__ ((__packed__));
+/* For raid type sysfs entries */
+struct raid_kobject {
+ int raid_type;
+ struct kobject kobj;
+};
+
struct btrfs_space_info {
spinlock_t lock;
@@ -1154,7 +1179,7 @@ struct btrfs_space_info {
wait_queue_head_t wait;
struct kobject kobj;
- struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES];
+ struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};
#define BTRFS_BLOCK_RSV_GLOBAL 1
@@ -1234,11 +1259,19 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
u64 sectorsize;
u64 cache_generation;
+ /*
+ * It is just used for the delayed data space allocation because
+ * only the data space allocation and the relative metadata update
+ * can be done cross the transaction.
+ */
+ struct rw_semaphore data_rwsem;
+
/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
@@ -1304,6 +1337,8 @@ struct btrfs_stripe_hash_table {
#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+void btrfs_init_async_reclaim_work(struct work_struct *work);
+
/* fs_info */
struct reloc_control;
struct btrfs_device;
@@ -1439,7 +1474,7 @@ struct btrfs_fs_info {
*/
struct mutex ordered_extent_flush_mutex;
- struct rw_semaphore extent_commit_sem;
+ struct rw_semaphore commit_root_sem;
struct rw_semaphore cleanup_work_sem;
@@ -1489,6 +1524,7 @@ struct btrfs_fs_info {
*/
struct list_head ordered_roots;
+ struct mutex delalloc_root_mutex;
spinlock_t delalloc_root_lock;
/* all fs/file tree roots that have delalloc inodes. */
struct list_head delalloc_roots;
@@ -1503,28 +1539,30 @@ struct btrfs_fs_info {
* A third pool does submit_bio to avoid deadlocking with the other
* two
*/
- struct btrfs_workers generic_worker;
- struct btrfs_workers workers;
- struct btrfs_workers delalloc_workers;
- struct btrfs_workers flush_workers;
- struct btrfs_workers endio_workers;
- struct btrfs_workers endio_meta_workers;
- struct btrfs_workers endio_raid56_workers;
- struct btrfs_workers rmw_workers;
- struct btrfs_workers endio_meta_write_workers;
- struct btrfs_workers endio_write_workers;
- struct btrfs_workers endio_freespace_worker;
- struct btrfs_workers submit_workers;
- struct btrfs_workers caching_workers;
- struct btrfs_workers readahead_workers;
+ struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *delalloc_workers;
+ struct btrfs_workqueue *flush_workers;
+ struct btrfs_workqueue *endio_workers;
+ struct btrfs_workqueue *endio_meta_workers;
+ struct btrfs_workqueue *endio_raid56_workers;
+ struct btrfs_workqueue *rmw_workers;
+ struct btrfs_workqueue *endio_meta_write_workers;
+ struct btrfs_workqueue *endio_write_workers;
+ struct btrfs_workqueue *endio_freespace_worker;
+ struct btrfs_workqueue *submit_workers;
+ struct btrfs_workqueue *caching_workers;
+ struct btrfs_workqueue *readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
* for the sys_munmap function call path
*/
- struct btrfs_workers fixup_workers;
- struct btrfs_workers delayed_workers;
+ struct btrfs_workqueue *fixup_workers;
+ struct btrfs_workqueue *delayed_workers;
+
+ /* the extent workers do delayed refs on the extent allocation tree */
+ struct btrfs_workqueue *extent_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
int thread_pool_size;
@@ -1604,9 +1642,9 @@ struct btrfs_fs_info {
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
int scrub_workers_refcnt;
- struct btrfs_workers scrub_workers;
- struct btrfs_workers scrub_wr_completion_workers;
- struct btrfs_workers scrub_nocow_workers;
+ struct btrfs_workqueue *scrub_workers;
+ struct btrfs_workqueue *scrub_wr_completion_workers;
+ struct btrfs_workqueue *scrub_nocow_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
@@ -1627,7 +1665,10 @@ struct btrfs_fs_info {
/* holds configuration and tracking. Protected by qgroup_lock */
struct rb_root qgroup_tree;
+ struct rb_root qgroup_op_tree;
spinlock_t qgroup_lock;
+ spinlock_t qgroup_op_lock;
+ atomic_t qgroup_op_seq;
/*
* used to avoid frequently calling ulist_alloc()/ulist_free()
@@ -1647,7 +1688,7 @@ struct btrfs_fs_info {
/* qgroup rescan items */
struct mutex qgroup_rescan_lock; /* protects the progress item */
struct btrfs_key qgroup_rescan_progress;
- struct btrfs_workers qgroup_rescan_workers;
+ struct btrfs_workqueue *qgroup_rescan_workers;
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
@@ -1674,10 +1715,41 @@ struct btrfs_fs_info {
atomic_t mutually_exclusive_operation_running;
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
+
struct semaphore uuid_tree_rescan_sem;
unsigned int update_uuid_tree_gen:1;
+
+ /* Used to reclaim the metadata space in the background. */
+ struct work_struct async_reclaim_work;
};
+struct btrfs_subvolume_writers {
+ struct percpu_counter counter;
+ wait_queue_head_t wait;
+};
+
+/*
+ * The state of btrfs root
+ */
+/*
+ * btrfs_record_root_in_trans is a multi-step process,
+ * and it can race with the balancing code. But the
+ * race is very small, and only the first time the root
+ * is added to each transaction. So IN_TRANS_SETUP
+ * is used to tell us when more checks are required
+ */
+#define BTRFS_ROOT_IN_TRANS_SETUP 0
+#define BTRFS_ROOT_REF_COWS 1
+#define BTRFS_ROOT_TRACK_DIRTY 2
+#define BTRFS_ROOT_IN_RADIX 3
+#define BTRFS_ROOT_DUMMY_ROOT 4
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5
+#define BTRFS_ROOT_DEFRAG_RUNNING 6
+#define BTRFS_ROOT_FORCE_COW 7
+#define BTRFS_ROOT_MULTI_LOG_TASKS 8
+
/*
* in ram representation of the tree. extent_root is used for all allocations
* and for the extent tree extent_root root.
@@ -1689,6 +1761,7 @@ struct btrfs_root {
struct btrfs_root *log_root;
struct btrfs_root *reloc_root;
+ unsigned long state;
struct btrfs_root_item root_item;
struct btrfs_key root_key;
struct btrfs_fs_info *fs_info;
@@ -1702,7 +1775,6 @@ struct btrfs_root {
struct btrfs_block_rsv *block_rsv;
/* free ino cache stuff */
- struct mutex fs_commit_mutex;
struct btrfs_free_space_ctl *free_ino_ctl;
enum btrfs_caching_type cached;
spinlock_t cache_lock;
@@ -1714,13 +1786,16 @@ struct btrfs_root {
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
+ struct list_head log_ctxs[2];
atomic_t log_writers;
atomic_t log_commit[2];
atomic_t log_batch;
- unsigned long log_transid;
- unsigned long last_log_commit;
+ int log_transid;
+ /* No matter the commit succeeds or not*/
+ int log_transid_committed;
+ /* Just be updated when the commit succeeds. */
+ int last_log_commit;
pid_t log_start_pid;
- bool log_multiple_pids;
u64 objectid;
u64 last_trans;
@@ -1740,23 +1815,13 @@ struct btrfs_root {
u64 highest_objectid;
- /* btrfs_record_root_in_trans is a multi-step process,
- * and it can race with the balancing code. But the
- * race is very small, and only the first time the root
- * is added to each transaction. So in_trans_setup
- * is used to tell us when more checks are required
- */
- unsigned long in_trans_setup;
- int ref_cows;
- int track_dirty;
- int in_radix;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- int dummy_root;
+ u64 alloc_bytenr;
#endif
+
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
- int defrag_running;
char *name;
/* the dirty list is only used by non-reference counted roots */
@@ -1770,7 +1835,6 @@ struct btrfs_root {
spinlock_t orphan_lock;
atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
- int orphan_item_inserted;
int orphan_cleanup_state;
spinlock_t inode_lock;
@@ -1788,11 +1852,10 @@ struct btrfs_root {
*/
dev_t anon_dev;
- int force_cow;
-
spinlock_t root_item_lock;
atomic_t refs;
+ struct mutex delalloc_mutex;
spinlock_t delalloc_lock;
/*
* all of the inodes that have delalloc bytes. It is possible for
@@ -1802,6 +1865,8 @@ struct btrfs_root {
struct list_head delalloc_inodes;
struct list_head delalloc_root;
u64 nr_delalloc_inodes;
+
+ struct mutex ordered_extent_mutex;
/*
* this is used by the balancing code to wait for all the pending
* ordered extents
@@ -1822,6 +1887,8 @@ struct btrfs_root {
* manipulation with the read-only status via SUBVOL_SETFLAGS
*/
int send_in_progress;
+ struct btrfs_subvolume_writers *subv_writers;
+ atomic_t will_be_snapshoted;
};
struct btrfs_ioctl_defrag_range_args {
@@ -2033,6 +2100,20 @@ struct btrfs_ioctl_defrag_range_args {
#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
BTRFS_MOUNT_##opt)
+#define btrfs_set_and_info(root, opt, fmt, args...) \
+{ \
+ if (!btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_set_opt(root->fs_info->mount_opt, opt); \
+}
+
+#define btrfs_clear_and_info(root, opt, fmt, args...) \
+{ \
+ if (btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_clear_opt(root->fs_info->mount_opt, opt); \
+}
+
/*
* Inode flags
*/
@@ -2749,6 +2830,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
+static inline bool btrfs_root_dead(struct btrfs_root *root)
+{
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
/* struct btrfs_root_backup */
BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
tree_root, 64);
@@ -2858,6 +2944,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
cpu->vend = le64_to_cpu(disk->vend);
cpu->target = le64_to_cpu(disk->target);
cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
}
static inline void
@@ -2875,6 +2962,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
disk->vend = cpu_to_le64(cpu->vend);
disk->target = cpu_to_le64(cpu->target);
disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
}
/* struct btrfs_super_block */
@@ -3197,6 +3285,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
@@ -3234,11 +3324,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data);
+ struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow);
+ struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow);
+ struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
@@ -3246,9 +3336,10 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int for_cow);
+ u64 owner, u64 offset, int no_quota);
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+ int delalloc);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3258,7 +3349,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset, int for_cow);
+ u64 root_objectid, u64 owner, u64 offset, int no_quota);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3346,6 +3437,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int __get_raid_index(u64 flags);
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3519,7 +3612,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
-u64 btrfs_tree_mod_seq_prev(u64 seq);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
/* root-item.c */
@@ -3666,6 +3758,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em);
+
/* inode.c */
struct btrfs_delalloc_work {
struct inode *inode;
@@ -3723,7 +3821,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4104,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+
/* reada.c */
struct reada_control {
struct btrfs_root *root; /* tree to prefetch */
@@ -4021,52 +4125,6 @@ void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err);
-/* qgroup.c */
-struct qgroup_update {
- struct list_head list;
- struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op;
-};
-
-int btrfs_quota_enable(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_quota_disable(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
-void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid,
- char *name);
-int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid);
-int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid,
- struct btrfs_qgroup_limit *limit);
-int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
-void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
- struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
-
static inline int is_fstree(u64 rootid)
{
if (rootid == BTRFS_FS_TREE_OBJECTID ||
@@ -4083,6 +4141,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
#endif
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6..da775bfdebc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -149,8 +149,8 @@ again:
spin_lock(&root->inode_lock);
ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
if (ret == -EEXIST) {
- kmem_cache_free(delayed_node_cache, node);
spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
radix_tree_preload_end();
goto again;
}
@@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node(
mutex_unlock(&delayed_node->mutex);
if (atomic_dec_and_test(&delayed_node->refs)) {
+ bool free = false;
struct btrfs_root *root = delayed_node->root;
spin_lock(&root->inode_lock);
if (atomic_read(&delayed_node->refs) == 0) {
radix_tree_delete(&root->delayed_nodes_tree,
delayed_node->inode_id);
- kmem_cache_free(delayed_node_cache, delayed_node);
+ free = true;
}
spin_unlock(&root->inode_lock);
+ if (free)
+ kmem_cache_free(delayed_node_cache, delayed_node);
}
}
@@ -1392,11 +1395,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- async_work->work.func = btrfs_async_run_delayed_root;
- async_work->work.flags = 0;
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
+ NULL, NULL);
async_work->nr = nr;
- btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);
+ btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
return 0;
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf..6d16bea94e1 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
return -1;
if (ref1->type > ref2->type)
return 1;
+ if (ref1->no_quota > ref2->no_quota)
+ return 1;
+ if (ref1->no_quota < ref2->no_quota)
+ return -1;
/* merging of sequenced refs is not allowed */
if (compare_seq) {
if (ref1->seq < ref2->seq)
@@ -199,44 +203,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
*/
static struct btrfs_delayed_ref_head *
find_ref_head(struct rb_root *root, u64 bytenr,
- struct btrfs_delayed_ref_head **last, int return_bigger)
+ int return_bigger)
{
struct rb_node *n;
struct btrfs_delayed_ref_head *entry;
- int cmp = 0;
-again:
n = root->rb_node;
entry = NULL;
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
- if (last)
- *last = entry;
if (bytenr < entry->node.bytenr)
- cmp = -1;
- else if (bytenr > entry->node.bytenr)
- cmp = 1;
- else
- cmp = 0;
-
- if (cmp < 0)
n = n->rb_left;
- else if (cmp > 0)
+ else if (bytenr > entry->node.bytenr)
n = n->rb_right;
else
return entry;
}
if (entry && return_bigger) {
- if (cmp > 0) {
+ if (bytenr > entry->node.bytenr) {
n = rb_next(&entry->href_node);
if (!n)
n = rb_first(root);
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
- bytenr = entry->node.bytenr;
- return_bigger = 0;
- goto again;
+ return entry;
}
return entry;
}
@@ -415,12 +406,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
again:
start = delayed_refs->run_delayed_start;
- head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
if (!head && !loop) {
delayed_refs->run_delayed_start = 0;
start = 0;
loop = true;
- head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
if (!head)
return NULL;
} else if (!head && loop) {
@@ -508,6 +499,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
ref = btrfs_delayed_node_to_head(update);
BUG_ON(existing_ref->is_data != ref->is_data);
+ spin_lock(&existing_ref->lock);
if (ref->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
@@ -549,7 +541,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* only need the lock for this case cause we could be processing it
* currently, for refs we just added we know we're a-ok.
*/
- spin_lock(&existing_ref->lock);
existing->ref_mod += update->ref_mod;
spin_unlock(&existing_ref->lock);
}
@@ -648,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, int level,
- int action, int for_cow)
+ int action, int no_quota)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
@@ -658,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
@@ -668,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
-
- if (need_ref_seq(for_cow, ref_root))
- seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+ ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -710,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
- u64 offset, int action, int for_cow)
+ u64 offset, int action, int no_quota)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
@@ -722,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
+
/* first set the basic ref node struct up */
atomic_set(&ref->refs, 1);
ref->bytenr = bytenr;
@@ -730,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
-
- if (need_ref_seq(for_cow, ref_root))
- seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+ ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -775,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow)
+ int no_quota)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
+
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
@@ -806,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action,
- for_cow);
+ no_quota);
spin_unlock(&delayed_refs->lock);
- if (need_ref_seq(for_cow, ref_root))
- btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
@@ -823,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow)
+ int no_quota)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
+
BUG_ON(extent_op && !extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
@@ -854,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
- action, for_cow);
+ action, no_quota);
spin_unlock(&delayed_refs->lock);
- if (need_ref_seq(for_cow, ref_root))
- btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
@@ -898,7 +892,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0);
+ return find_ref_head(&delayed_refs->href_root, bytenr, 0);
}
void btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 4ba9b93022f..a764e2340d4 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
+ unsigned int no_quota:1;
/* is this node still in the rbtree? */
unsigned int is_head:1;
unsigned int in_tree:1;
@@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow);
+ int no_quota);
int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow);
+ int no_quota);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
@@ -231,25 +232,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
u64 seq);
/*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
- if (for_cow)
- return 0;
-
- if (rootid == BTRFS_FS_TREE_OBJECTID)
- return 1;
-
- if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
- return 1;
-
- return 0;
-}
-
-/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
*/
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b2..eea26e1b2fd 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -36,6 +36,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
+#include "sysfs.h"
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
@@ -313,7 +314,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
if (btrfs_fs_incompat(fs_info, RAID56)) {
btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
- return -EINVAL;
+ return -EOPNOTSUPP;
}
switch (args->start.cont_reading_from_srcdev_mode) {
@@ -431,6 +432,35 @@ leave_no_lock:
return ret;
}
+/*
+ * blocked until all flighting bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ do {
+ prepare_to_wait(&fs_info->replace_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ writers = percpu_counter_sum(&fs_info->bio_counter);
+ if (writers)
+ schedule();
+ finish_wait(&fs_info->replace_wait, &wait);
+ } while (writers);
+}
+
+/*
+ * we have removed target device, it is safe to allow new bios request.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -458,17 +488,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device = dev_replace->srcdev;
btrfs_dev_replace_unlock(dev_replace);
- /* replace old device with new one in mapping tree */
- if (!scrub_ret)
- btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
- src_device,
- tgt_device);
-
/*
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -484,6 +508,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->chunk_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
btrfs_dev_replace_lock(dev_replace);
dev_replace->replace_state =
@@ -494,7 +519,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
- if (scrub_ret) {
+ /* replace old device with new one in mapping tree */
+ if (!scrub_ret) {
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+ } else {
printk_in_rcu(KERN_ERR
"BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
src_device->missing ? "<missing disk>" :
@@ -503,6 +533,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +563,16 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ /* replace the sysfs entry */
+ btrfs_kobj_rm_device(fs_info, src_device);
+ btrfs_kobj_add_device(fs_info, tgt_device);
+
+ btrfs_rm_dev_replace_blocked(fs_info);
+
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+ btrfs_rm_dev_replace_unblocked(fs_info);
+
/*
* this is again a consistent state where no dev_replace procedure
* is running, the target device is part of the filesystem, the
@@ -543,6 +582,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
*/
btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -862,3 +902,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
mutex_unlock(&dev_replace->lock_management_lock);
}
}
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_dec(&fs_info->bio_counter);
+
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+ DEFINE_WAIT(wait);
+again:
+ percpu_counter_inc(&fs_info->bio_counter);
+ if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+ btrfs_bio_counter_dec(fs_info);
+ wait_event(fs_info->replace_wait,
+ !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state));
+ goto again;
+ }
+
+}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0e69295d003..08e65e9cf2a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,7 +26,6 @@
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
-#include <linux/crc32c.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
@@ -35,6 +34,7 @@
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
+#include "hash.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
@@ -49,6 +49,7 @@
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
+#include "qgroup.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -244,7 +245,7 @@ out:
u32 btrfs_csum_data(char *data, u32 seed, size_t len)
{
- return crc32c(seed, data, len);
+ return btrfs_crc32c(seed, data, len);
}
void btrfs_csum_final(u32 crc, char *result)
@@ -329,6 +330,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
{
struct extent_state *cached_state = NULL;
int ret;
+ bool need_lock = (current->journal_info ==
+ (void *)BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
@@ -336,6 +339,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
+ if (need_lock) {
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ }
+
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
0, &cached_state);
if (extent_buffer_uptodate(eb) &&
@@ -347,10 +355,22 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
"found %llu\n",
eb->start, parent_transid, btrfs_header_generation(eb));
ret = 1;
- clear_extent_buffer_uptodate(eb);
+
+ /*
+ * Things reading via commit roots that don't have normal protection,
+ * like send, can have a really old block in cache that may point at a
+ * block that has been free'd and re-allocated. So don't clear uptodate
+ * if we find an eb that is under IO (dirty/writeback) because we could
+ * end up reading in the stale data and then writing it back out and
+ * making everybody very sad.
+ */
+ if (!extent_buffer_under_io(eb))
+ clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
+ if (need_lock)
+ btrfs_tree_read_unlock_blocking(eb);
return ret;
}
@@ -678,32 +698,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
fs_info = end_io_wq->info;
end_io_wq->error = err;
- end_io_wq->work.func = end_workqueue_fn;
- end_io_wq->work.flags = 0;
+ btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
if (bio->bi_rw & REQ_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- btrfs_queue_worker(&fs_info->endio_meta_write_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_meta_write_workers,
+ &end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- btrfs_queue_worker(&fs_info->endio_freespace_worker,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_freespace_worker,
+ &end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_worker(&fs_info->endio_raid56_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_write_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_write_workers,
+ &end_io_wq->work);
} else {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_worker(&fs_info->endio_raid56_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else if (end_io_wq->metadata)
- btrfs_queue_worker(&fs_info->endio_meta_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_meta_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_workers,
+ &end_io_wq->work);
}
}
@@ -738,7 +757,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
unsigned long limit = min_t(unsigned long,
- info->workers.max_workers,
+ info->thread_pool_size,
info->fs_devices->open_devices);
return 256 * limit;
}
@@ -811,11 +830,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->submit_bio_start = submit_bio_start;
async->submit_bio_done = submit_bio_done;
- async->work.func = run_one_async_start;
- async->work.ordered_func = run_one_async_done;
- async->work.ordered_free = run_one_async_free;
+ btrfs_init_work(&async->work, run_one_async_start,
+ run_one_async_done, run_one_async_free);
- async->work.flags = 0;
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
@@ -824,9 +841,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
atomic_inc(&fs_info->nr_async_submits);
if (rw & REQ_SYNC)
- btrfs_set_work_high_prio(&async->work);
+ btrfs_set_work_high_priority(&async->work);
- btrfs_queue_worker(&fs_info->workers, &async->work);
+ btrfs_queue_work(fs_info->workers, &async->work);
while (atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
@@ -1094,6 +1111,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return alloc_test_extent_buffer(root->fs_info, bytenr,
+ blocksize);
+#endif
return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
}
@@ -1149,6 +1171,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
}
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+ struct btrfs_subvolume_writers *writers;
+ int ret;
+
+ writers = kmalloc(sizeof(*writers), GFP_NOFS);
+ if (!writers)
+ return ERR_PTR(-ENOMEM);
+
+ ret = percpu_counter_init(&writers->counter, 0);
+ if (ret < 0) {
+ kfree(writers);
+ return ERR_PTR(ret);
+ }
+
+ init_waitqueue_head(&writers->wait);
+ return writers;
+}
+
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+ percpu_counter_destroy(&writers->counter);
+ kfree(writers);
+}
+
static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
u32 stripesize, struct btrfs_root *root,
struct btrfs_fs_info *fs_info,
@@ -1160,10 +1208,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->nodesize = nodesize;
root->leafsize = leafsize;
root->stripesize = stripesize;
- root->ref_cows = 0;
- root->track_dirty = 0;
- root->in_radix = 0;
- root->orphan_item_inserted = 0;
+ root->state = 0;
root->orphan_cleanup_state = 0;
root->objectid = objectid;
@@ -1194,16 +1239,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
spin_lock_init(&root->log_extents_lock[1]);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
+ mutex_init(&root->ordered_extent_mutex);
+ mutex_init(&root->delalloc_mutex);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
+ INIT_LIST_HEAD(&root->log_ctxs[0]);
+ INIT_LIST_HEAD(&root->log_ctxs[1]);
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
atomic_set(&root->refs, 1);
+ atomic_set(&root->will_be_snapshoted, 0);
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
if (fs_info)
extent_io_tree_init(&root->dirty_log_pages,
@@ -1218,7 +1269,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
else
root->defrag_trans_start = 0;
init_completion(&root->kobj_unregister);
- root->defrag_running = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
@@ -1243,7 +1293,8 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
if (!root)
return ERR_PTR(-ENOMEM);
__setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
- root->dummy_root = 1;
+ set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
+ root->alloc_bytenr = 0;
return root;
}
@@ -1294,8 +1345,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
root->commit_root = btrfs_root_node(root);
- root->track_dirty = 1;
-
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
root->root_item.flags = 0;
root->root_item.byte_limit = 0;
@@ -1324,6 +1374,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
fail:
if (leaf) {
btrfs_tree_unlock(leaf);
+ free_extent_buffer(root->commit_root);
free_extent_buffer(leaf);
}
kfree(root);
@@ -1349,13 +1400,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+
/*
+ * DON'T set REF_COWS for log trees
+ *
* log trees do not get reference counted because they go away
* before a real commit is actually done. They do store pointers
* to file data extents, and those reference counts still get
* updated (along with back refs to the log tree).
*/
- root->ref_cows = 0;
leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
BTRFS_TREE_LOG_OBJECTID, NULL,
@@ -1417,6 +1470,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
return 0;
}
@@ -1488,7 +1542,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
return root;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- root->ref_cows = 1;
+ set_bit(BTRFS_ROOT_REF_COWS, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1498,6 +1552,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
int btrfs_init_fs_root(struct btrfs_root *root)
{
int ret;
+ struct btrfs_subvolume_writers *writers;
root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,15 +1562,24 @@ int btrfs_init_fs_root(struct btrfs_root *root)
goto fail;
}
+ writers = btrfs_alloc_subvolume_writers();
+ if (IS_ERR(writers)) {
+ ret = PTR_ERR(writers);
+ goto fail;
+ }
+ root->subv_writers = writers;
+
btrfs_init_free_ino_ctl(root);
- mutex_init(&root->fs_commit_mutex);
spin_lock_init(&root->cache_lock);
init_waitqueue_head(&root->cache_wait);
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto fail;
+ goto free_writers;
return 0;
+
+free_writers:
+ btrfs_free_subvolume_writers(root->subv_writers);
fail:
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
@@ -1548,7 +1612,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
(unsigned long)root->root_key.objectid,
root);
if (ret == 0)
- root->in_radix = 1;
+ set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
@@ -1604,7 +1668,7 @@ again:
if (ret < 0)
goto fail;
if (ret == 0)
- root->orphan_item_inserted = 1;
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
@@ -1990,23 +2054,23 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
- btrfs_stop_workers(&fs_info->generic_worker);
- btrfs_stop_workers(&fs_info->fixup_workers);
- btrfs_stop_workers(&fs_info->delalloc_workers);
- btrfs_stop_workers(&fs_info->workers);
- btrfs_stop_workers(&fs_info->endio_workers);
- btrfs_stop_workers(&fs_info->endio_meta_workers);
- btrfs_stop_workers(&fs_info->endio_raid56_workers);
- btrfs_stop_workers(&fs_info->rmw_workers);
- btrfs_stop_workers(&fs_info->endio_meta_write_workers);
- btrfs_stop_workers(&fs_info->endio_write_workers);
- btrfs_stop_workers(&fs_info->endio_freespace_worker);
- btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->delayed_workers);
- btrfs_stop_workers(&fs_info->caching_workers);
- btrfs_stop_workers(&fs_info->readahead_workers);
- btrfs_stop_workers(&fs_info->flush_workers);
- btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
+ btrfs_destroy_workqueue(fs_info->fixup_workers);
+ btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->workers);
+ btrfs_destroy_workqueue(fs_info->endio_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+ btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->rmw_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+ btrfs_destroy_workqueue(fs_info->submit_workers);
+ btrfs_destroy_workqueue(fs_info->delayed_workers);
+ btrfs_destroy_workqueue(fs_info->caching_workers);
+ btrfs_destroy_workqueue(fs_info->readahead_workers);
+ btrfs_destroy_workqueue(fs_info->flush_workers);
+ btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
+ btrfs_destroy_workqueue(fs_info->extent_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2033,7 +2097,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->chunk_root);
}
-static void del_fs_roots(struct btrfs_fs_info *fs_info)
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
struct btrfs_root *gang[8];
@@ -2044,7 +2108,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
- if (gang[0]->in_radix) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
btrfs_drop_and_free_fs_root(fs_info, gang[0]);
} else {
free_extent_buffer(gang[0]->node);
@@ -2097,6 +2161,8 @@ int open_ctree(struct super_block *sb,
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
+ int max_active;
+ int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
bool create_uuid_tree;
bool check_uuid_tree;
@@ -2133,10 +2199,16 @@ int open_ctree(struct super_block *sb,
goto fail_dirty_metadata_bytes;
}
+ ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ if (ret) {
+ err = ret;
+ goto fail_delalloc_bytes;
+ }
+
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
- goto fail_delalloc_bytes;
+ goto fail_bio_counter;
}
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2156,9 +2228,11 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->free_chunk_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
+ mutex_init(&fs_info->delalloc_root_mutex);
seqlock_init(&fs_info->profiles_lock);
init_completion(&fs_info->kobj_unregister);
@@ -2180,6 +2254,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
+ atomic_set(&fs_info->qgroup_op_seq, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
@@ -2211,6 +2286,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->scrub_pause_req, 0);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->replace_wait);
init_waitqueue_head(&fs_info->scrub_pause_wait);
fs_info->scrub_workers_refcnt = 0;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2224,6 +2300,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
init_waitqueue_head(&fs_info->balance_wait_q);
+ btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
@@ -2274,7 +2351,7 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
- init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
@@ -2287,6 +2364,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->qgroup_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_op_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
fs_info->quota_enabled = 0;
@@ -2458,104 +2536,73 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- btrfs_init_workers(&fs_info->generic_worker,
- "genwork", 1, NULL);
-
- btrfs_init_workers(&fs_info->workers, "worker",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
+ max_active = fs_info->thread_pool_size;
- btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
- fs_info->thread_pool_size, NULL);
+ fs_info->workers =
+ btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+ max_active, 16);
- btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
- fs_info->thread_pool_size, NULL);
+ fs_info->delalloc_workers =
+ btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
- btrfs_init_workers(&fs_info->submit_workers, "submit",
- min_t(u64, fs_devices->num_devices,
- fs_info->thread_pool_size), NULL);
+ fs_info->flush_workers =
+ btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
- btrfs_init_workers(&fs_info->caching_workers, "cache",
- fs_info->thread_pool_size, NULL);
+ fs_info->caching_workers =
+ btrfs_alloc_workqueue("cache", flags, max_active, 0);
- /* a higher idle thresh on the submit workers makes it much more
+ /*
+ * a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
* devices
*/
- fs_info->submit_workers.idle_thresh = 64;
-
- fs_info->workers.idle_thresh = 16;
- fs_info->workers.ordered = 1;
-
- fs_info->delalloc_workers.idle_thresh = 2;
- fs_info->delalloc_workers.ordered = 1;
-
- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_workers, "endio",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_write_workers,
- "endio-meta-write", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_raid56_workers,
- "endio-raid56", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->rmw_workers,
- "rmw", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
- 1, &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->readahead_workers, "readahead",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
- &fs_info->generic_worker);
+ fs_info->submit_workers =
+ btrfs_alloc_workqueue("submit", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 64);
+
+ fs_info->fixup_workers =
+ btrfs_alloc_workqueue("fixup", flags, 1, 0);
/*
* endios are largely parallel and should have a very
* low idle thresh
*/
- fs_info->endio_workers.idle_thresh = 4;
- fs_info->endio_meta_workers.idle_thresh = 4;
- fs_info->endio_raid56_workers.idle_thresh = 4;
- fs_info->rmw_workers.idle_thresh = 2;
-
- fs_info->endio_write_workers.idle_thresh = 2;
- fs_info->endio_meta_write_workers.idle_thresh = 2;
- fs_info->readahead_workers.idle_thresh = 2;
-
- /*
- * btrfs_start_workers can really only fail because of ENOMEM so just
- * return -ENOMEM if any of these fail.
- */
- ret = btrfs_start_workers(&fs_info->workers);
- ret |= btrfs_start_workers(&fs_info->generic_worker);
- ret |= btrfs_start_workers(&fs_info->submit_workers);
- ret |= btrfs_start_workers(&fs_info->delalloc_workers);
- ret |= btrfs_start_workers(&fs_info->fixup_workers);
- ret |= btrfs_start_workers(&fs_info->endio_workers);
- ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
- ret |= btrfs_start_workers(&fs_info->rmw_workers);
- ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
- ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
- ret |= btrfs_start_workers(&fs_info->endio_write_workers);
- ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
- ret |= btrfs_start_workers(&fs_info->delayed_workers);
- ret |= btrfs_start_workers(&fs_info->caching_workers);
- ret |= btrfs_start_workers(&fs_info->readahead_workers);
- ret |= btrfs_start_workers(&fs_info->flush_workers);
- ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
- if (ret) {
+ fs_info->endio_workers =
+ btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ fs_info->endio_meta_workers =
+ btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ fs_info->endio_meta_write_workers =
+ btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ fs_info->endio_raid56_workers =
+ btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->rmw_workers =
+ btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ fs_info->endio_write_workers =
+ btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ fs_info->endio_freespace_worker =
+ btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ fs_info->delayed_workers =
+ btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ fs_info->readahead_workers =
+ btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ fs_info->qgroup_rescan_workers =
+ btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+ fs_info->extent_workers =
+ btrfs_alloc_workqueue("extent-refs", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 8);
+
+ if (!(fs_info->workers && fs_info->delalloc_workers &&
+ fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->endio_workers && fs_info->endio_meta_workers &&
+ fs_info->endio_meta_write_workers &&
+ fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+ fs_info->caching_workers && fs_info->readahead_workers &&
+ fs_info->fixup_workers && fs_info->delayed_workers &&
+ fs_info->fixup_workers && fs_info->extent_workers &&
+ fs_info->qgroup_rescan_workers)) {
err = -ENOMEM;
goto fail_sb_buffer;
}
@@ -2662,7 +2709,7 @@ retry_root_backup:
ret = PTR_ERR(extent_root);
goto recovery_tree_root;
}
- extent_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
fs_info->extent_root = extent_root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
@@ -2671,7 +2718,7 @@ retry_root_backup:
ret = PTR_ERR(dev_root);
goto recovery_tree_root;
}
- dev_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
fs_info->dev_root = dev_root;
btrfs_init_devices_late(fs_info);
@@ -2681,13 +2728,13 @@ retry_root_backup:
ret = PTR_ERR(csum_root);
goto recovery_tree_root;
}
- csum_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
fs_info->csum_root = csum_root;
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
quota_root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(quota_root)) {
- quota_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
fs_info->quota_enabled = 1;
fs_info->pending_quota_state = 1;
fs_info->quota_root = quota_root;
@@ -2702,7 +2749,7 @@ retry_root_backup:
create_uuid_tree = true;
check_uuid_tree = false;
} else {
- uuid_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
fs_info->uuid_root = uuid_root;
create_uuid_tree = false;
check_uuid_tree =
@@ -2830,7 +2877,7 @@ retry_root_backup:
printk(KERN_ERR "BTRFS: failed to read log tree\n");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
@@ -2839,26 +2886,28 @@ retry_root_backup:
"Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
if (sb->s_flags & MS_RDONLY) {
ret = btrfs_commit_super(tree_root);
if (ret)
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
}
ret = btrfs_find_orphan_roots(tree_root);
if (ret)
- goto fail_trans_kthread;
+ goto fail_qgroup;
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_cleanup_fs_roots(fs_info);
if (ret)
- goto fail_trans_kthread;
+ goto fail_qgroup;
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(tree_root);
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
printk(KERN_WARNING
"BTRFS: failed to recover relocation\n");
@@ -2935,7 +2984,7 @@ fail_qgroup:
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
btrfs_cleanup_transaction(fs_info->tree_root);
- del_fs_roots(fs_info);
+ btrfs_free_fs_roots(fs_info);
fail_cleaner:
kthread_stop(fs_info->cleaner_kthread);
@@ -2963,6 +3012,8 @@ fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
iput(fs_info->btree_inode);
+fail_bio_counter:
+ percpu_counter_destroy(&fs_info->bio_counter);
fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
@@ -3244,6 +3295,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
if (!dev->bdev) {
errors_send++;
continue;
@@ -3258,6 +3311,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
/* wait for all the barriers */
list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
if (!dev->bdev) {
errors_wait++;
continue;
@@ -3464,8 +3519,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
btrfs_free_log(NULL, root);
- __btrfs_remove_free_space_cache(root->free_ino_pinned);
- __btrfs_remove_free_space_cache(root->free_ino_ctl);
+ if (root->free_ino_pinned)
+ __btrfs_remove_free_space_cache(root->free_ino_pinned);
+ if (root->free_ino_ctl)
+ __btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
}
@@ -3477,6 +3534,8 @@ static void free_fs_root(struct btrfs_root *root)
root->orphan_block_rsv = NULL;
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
+ if (root->subv_writers)
+ btrfs_free_subvolume_writers(root->subv_writers);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root->free_ino_ctl);
@@ -3494,28 +3553,51 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
u64 root_objectid = 0;
struct btrfs_root *gang[8];
- int i;
- int ret;
+ int i = 0;
+ int err = 0;
+ unsigned int ret = 0;
+ int index;
while (1) {
+ index = srcu_read_lock(&fs_info->subvol_srcu);
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
- if (!ret)
+ if (!ret) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
break;
-
+ }
root_objectid = gang[ret - 1]->root_key.objectid + 1;
+
for (i = 0; i < ret; i++) {
- int err;
+ /* Avoid to grab roots in dead_roots */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* grab all the search result for later use */
+ gang[i] = btrfs_grab_fs_root(gang[i]);
+ }
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
root_objectid = gang[i]->root_key.objectid;
err = btrfs_orphan_cleanup(gang[i]);
if (err)
- return err;
+ break;
+ btrfs_put_fs_root(gang[i]);
}
root_objectid++;
}
- return 0;
+
+ /* release the uncleaned roots due to error */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_fs_root(gang[i]);
+ }
+ return err;
}
int btrfs_commit_super(struct btrfs_root *root)
@@ -3564,6 +3646,8 @@ int close_ctree(struct btrfs_root *root)
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ cancel_work_sync(&fs_info->async_reclaim_work);
+
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
@@ -3588,12 +3672,17 @@ int close_ctree(struct btrfs_root *root)
btrfs_sysfs_remove_one(fs_info);
- del_fs_roots(fs_info);
+ btrfs_free_fs_roots(fs_info);
btrfs_put_block_group_cache(fs_info);
btrfs_free_block_groups(fs_info);
+ /*
+ * we must make sure there is not any read request to
+ * submit after we stopping all workers.
+ */
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
free_root_pointers(fs_info, 1);
@@ -3610,6 +3699,7 @@ int close_ctree(struct btrfs_root *root)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->bio_counter);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
@@ -3669,6 +3759,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
buf->len,
root->fs_info->dirty_metadata_batch);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ btrfs_print_leaf(root, buf);
+ ASSERT(0);
+ }
+#endif
}
static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3791,9 +3887,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
list_move_tail(&root->ordered_root,
&fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
btrfs_destroy_ordered_extents(root);
- cond_resched_lock(&fs_info->ordered_root_lock);
+ cond_resched();
+ spin_lock(&fs_info->ordered_root_lock);
}
spin_unlock(&fs_info->ordered_root_lock);
}
@@ -3839,7 +3937,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
rb_erase(&ref->rb_node, &head->ref_root);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
- cond_resched_lock(&head->lock);
}
if (head->must_insert_reserved)
pin_bytes = true;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 53059df350f..23ce3ceba0a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
int btrfs_init_fs_root(struct btrfs_root *root);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9c9ecc93ae2..813537f362f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,16 +26,16 @@
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
-#include "ctree.h"
+#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
-#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"
#include "sysfs.h"
+#include "qgroup.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extra_op);
+ struct btrfs_delayed_extent_op *extra_op,
+ int no_quota);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
@@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins);
+ int level, struct btrfs_key *ins,
+ int no_quota);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 flags,
int force);
@@ -103,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level,
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve);
+ u64 num_bytes, int reserve,
+ int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
@@ -419,7 +422,7 @@ static noinline void caching_thread(struct btrfs_work *work)
again:
mutex_lock(&caching_ctl->mutex);
/* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->extent_commit_sem);
+ down_read(&fs_info->commit_root_sem);
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -443,10 +446,10 @@ next:
break;
if (need_resched() ||
- rwsem_is_contended(&fs_info->extent_commit_sem)) {
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
caching_ctl->progress = last;
btrfs_release_path(path);
- up_read(&fs_info->extent_commit_sem);
+ up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
goto again;
@@ -513,7 +516,7 @@ next:
err:
btrfs_free_path(path);
- up_read(&fs_info->extent_commit_sem);
+ up_read(&fs_info->commit_root_sem);
free_excluded_extents(extent_root, block_group);
@@ -549,7 +552,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
atomic_set(&caching_ctl->count, 1);
- caching_ctl->work.func = caching_thread;
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
/*
@@ -633,14 +636,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
return 0;
}
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
atomic_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->extent_commit_sem);
+ up_write(&fs_info->commit_root_sem);
btrfs_get_block_group(cache);
- btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
+ btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
return ret;
}
@@ -1271,7 +1274,7 @@ fail:
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- int refs_to_drop)
+ int refs_to_drop, int *last_ref)
{
struct btrfs_key key;
struct btrfs_extent_data_ref *ref1 = NULL;
@@ -1307,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
if (num_refs == 0) {
ret = btrfs_del_item(trans, root, path);
+ *last_ref = 1;
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -1542,6 +1546,7 @@ again:
ret = 0;
}
if (ret) {
+ key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
btrfs_release_path(path);
@@ -1763,7 +1768,8 @@ void update_inline_extent_backref(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int *last_ref)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1807,6 +1813,7 @@ void update_inline_extent_backref(struct btrfs_root *root,
else
btrfs_set_shared_data_ref_count(leaf, sref, refs);
} else {
+ *last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = (unsigned long)iref;
@@ -1838,7 +1845,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
if (ret == 0) {
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
update_inline_extent_backref(root, path, iref,
- refs_to_add, extent_op);
+ refs_to_add, extent_op, NULL);
} else if (ret == -ENOENT) {
setup_inline_extent_backref(root, path, iref, parent,
root_objectid, owner, offset,
@@ -1871,17 +1878,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
- int refs_to_drop, int is_data)
+ int refs_to_drop, int is_data, int *last_ref)
{
int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
if (iref) {
update_inline_extent_backref(root, path, iref,
- -refs_to_drop, NULL);
+ -refs_to_drop, NULL, last_ref);
} else if (is_data) {
- ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
+ last_ref);
} else {
+ *last_ref = 1;
ret = btrfs_del_item(trans, root, path);
}
return ret;
@@ -1945,7 +1954,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset, int for_cow)
+ u64 root_objectid, u64 owner, u64 offset,
+ int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1957,12 +1967,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL, for_cow);
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL, for_cow);
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
}
return ret;
}
@@ -1972,31 +1982,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
+ int no_quota,
struct btrfs_delayed_extent_op *extent_op)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
+ struct btrfs_key key;
u64 refs;
int ret;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
+ no_quota = 1;
+
path->reada = 1;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
- ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
- path, bytenr, num_bytes, parent,
+ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
+ bytenr, num_bytes, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
- if (ret != -EAGAIN)
+ if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
goto out;
+ /*
+ * Ok we were able to insert an inline extent and it appears to be a new
+ * reference, deal with the qgroup accounting.
+ */
+ if (!ret && !no_quota) {
+ ASSERT(root->fs_info->quota_enabled);
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
+ btrfs_release_path(path);
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
+ goto out;
+ }
+
+ /*
+ * Ok we had -EAGAIN which means we didn't have space to insert and
+ * inline extent ref, so just update the reference count and add a
+ * normal backref.
+ */
leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, item);
+ if (refs)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
@@ -2004,9 +2047,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
+ if (ret)
+ goto out;
+ }
+
path->reada = 1;
path->leave_spinning = 1;
-
/* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent, root_objectid,
@@ -2040,8 +2089,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
- else
- ref_root = ref->root;
+ ref_root = ref->root;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
if (extent_op)
@@ -2055,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
node->num_bytes, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op);
+ node->no_quota, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
ret = __btrfs_free_extent(trans, root, node->bytenr,
node->num_bytes, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op);
+ extent_op, node->no_quota);
} else {
BUG();
}
@@ -2198,8 +2246,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
parent = ref->parent;
- else
- ref_root = ref->root;
+ ref_root = ref->root;
ins.objectid = node->bytenr;
if (skinny_metadata) {
@@ -2217,15 +2264,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent, ref_root,
extent_op->flags_to_set,
&extent_op->key,
- ref->level, &ins);
+ ref->level, &ins,
+ node->no_quota);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, 1, node->no_quota,
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
ret = __btrfs_free_extent(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, 1, extent_op,
+ node->no_quota);
} else {
BUG();
}
@@ -2385,6 +2435,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
locked_ref = NULL;
cond_resched();
+ count++;
continue;
}
@@ -2443,7 +2494,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&locked_ref->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&locked_ref->lock);
- if (rb_first(&locked_ref->ref_root)) {
+ if (rb_first(&locked_ref->ref_root) ||
+ locked_ref->extent_op) {
spin_unlock(&locked_ref->lock);
spin_unlock(&delayed_refs->lock);
continue;
@@ -2571,42 +2623,6 @@ static u64 find_middle(struct rb_root *root)
}
#endif
-int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct qgroup_update *qgroup_update;
- int ret = 0;
-
- if (list_empty(&trans->qgroup_ref_list) !=
- !trans->delayed_ref_elem.seq) {
- /* list without seq or seq without list */
- btrfs_err(fs_info,
- "qgroup accounting update error, list is%s empty, seq is %#x.%x",
- list_empty(&trans->qgroup_ref_list) ? "" : " not",
- (u32)(trans->delayed_ref_elem.seq >> 32),
- (u32)trans->delayed_ref_elem.seq);
- BUG();
- }
-
- if (!trans->delayed_ref_elem.seq)
- return 0;
-
- while (!list_empty(&trans->qgroup_ref_list)) {
- qgroup_update = list_first_entry(&trans->qgroup_ref_list,
- struct qgroup_update, list);
- list_del(&qgroup_update->list);
- if (!ret)
- ret = btrfs_qgroup_account_ref(
- trans, fs_info, qgroup_update->node,
- qgroup_update->extent_op);
- kfree(qgroup_update);
- }
-
- btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
-
- return ret;
-}
-
static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
{
u64 num_bytes;
@@ -2659,15 +2675,94 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
u64 num_entries =
atomic_read(&trans->transaction->delayed_refs.num_entries);
u64 avg_runtime;
+ u64 val;
smp_mb();
avg_runtime = fs_info->avg_delayed_ref_runtime;
+ val = num_entries * avg_runtime;
if (num_entries * avg_runtime >= NSEC_PER_SEC)
return 1;
+ if (val >= NSEC_PER_SEC / 2)
+ return 2;
return btrfs_check_space_for_delayed_refs(trans, root);
}
+struct async_delayed_refs {
+ struct btrfs_root *root;
+ int count;
+ int error;
+ int sync;
+ struct completion wait;
+ struct btrfs_work work;
+};
+
+static void delayed_ref_async_start(struct btrfs_work *work)
+{
+ struct async_delayed_refs *async;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ async = container_of(work, struct async_delayed_refs, work);
+
+ trans = btrfs_join_transaction(async->root);
+ if (IS_ERR(trans)) {
+ async->error = PTR_ERR(trans);
+ goto done;
+ }
+
+ /*
+ * trans->sync means that when we call end_transaciton, we won't
+ * wait on delayed refs
+ */
+ trans->sync = true;
+ ret = btrfs_run_delayed_refs(trans, async->root, async->count);
+ if (ret)
+ async->error = ret;
+
+ ret = btrfs_end_transaction(trans, async->root);
+ if (ret && !async->error)
+ async->error = ret;
+done:
+ if (async->sync)
+ complete(&async->wait);
+ else
+ kfree(async);
+}
+
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait)
+{
+ struct async_delayed_refs *async;
+ int ret;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return -ENOMEM;
+
+ async->root = root->fs_info->tree_root;
+ async->count = count;
+ async->error = 0;
+ if (wait)
+ async->sync = 1;
+ else
+ async->sync = 0;
+ init_completion(&async->wait);
+
+ btrfs_init_work(&async->work, delayed_ref_async_start,
+ NULL, NULL);
+
+ btrfs_queue_work(root->fs_info->extent_workers, &async->work);
+
+ if (wait) {
+ wait_for_completion(&async->wait);
+ ret = async->error;
+ kfree(async);
+ return ret;
+ }
+ return 0;
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2695,8 +2790,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
- btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
-
delayed_refs = &trans->transaction->delayed_refs;
if (count == 0) {
count = atomic_read(&delayed_refs->num_entries) * 2;
@@ -2755,6 +2848,9 @@ again:
goto again;
}
out:
+ ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
+ if (ret)
+ return ret;
assert_qgroups_uptodate(trans);
return 0;
}
@@ -2961,7 +3057,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc, int for_cow)
+ int full_backref, int inc, int no_quota)
{
u64 bytenr;
u64 num_bytes;
@@ -2976,11 +3072,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
u64, u64, u64, u64, u64, u64, int);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
- if (!root->ref_cows && level == 0)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
return 0;
if (inc)
@@ -3011,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, for_cow);
+ key.offset, no_quota);
if (ret)
goto fail;
} else {
@@ -3019,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_level_size(root, level - 1);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
- for_cow);
+ no_quota);
if (ret)
goto fail;
}
@@ -3030,15 +3130,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow)
+ struct extent_buffer *buf, int full_backref, int no_quota)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow)
+ struct extent_buffer *buf, int full_backref, int no_quota)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3161,7 +3261,8 @@ again:
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root, SPACE_CACHE)) {
+ !btrfs_test_opt(root, SPACE_CACHE) ||
+ block_group->delalloc_bytes) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -3398,10 +3499,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
return ret;
}
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&found->block_groups[i]);
- kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype);
- }
init_rwsem(&found->groups_sem);
spin_lock_init(&found->lock);
found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
@@ -3540,11 +3639,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
return extended_to_chunk(flags | tmp);
}
-static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
{
unsigned seq;
+ u64 flags;
do {
+ flags = orig_flags;
seq = read_seqbegin(&root->fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
@@ -3970,7 +4071,7 @@ static int can_overcommit(struct btrfs_root *root,
}
static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
- unsigned long nr_pages)
+ unsigned long nr_pages, int nr_items)
{
struct super_block *sb = root->fs_info->sb;
@@ -3985,9 +4086,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
* the filesystem is readonly(all dirty pages are written to
* the disk).
*/
- btrfs_start_delalloc_roots(root->fs_info, 0);
+ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
if (!current->journal_info)
- btrfs_wait_ordered_roots(root->fs_info, -1);
+ btrfs_wait_ordered_roots(root->fs_info, nr_items);
}
}
@@ -4044,7 +4145,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
- btrfs_writeback_inodes_sb_nr(root, nr_pages);
+ btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
/*
* We need to wait for the async pages to actually start before
* we do anything.
@@ -4111,13 +4212,9 @@ static int may_commit_transaction(struct btrfs_root *root,
goto commit;
/* See if there is enough pinned space to make this reservation */
- spin_lock(&space_info->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes) >= 0) {
- spin_unlock(&space_info->lock);
+ bytes) >= 0)
goto commit;
- }
- spin_unlock(&space_info->lock);
/*
* See if there is some space in the delayed insertion reservation for
@@ -4126,16 +4223,13 @@ static int may_commit_transaction(struct btrfs_root *root,
if (space_info != delayed_rsv->space_info)
return -ENOSPC;
- spin_lock(&space_info->lock);
spin_lock(&delayed_rsv->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
bytes - delayed_rsv->size) >= 0) {
spin_unlock(&delayed_rsv->lock);
- spin_unlock(&space_info->lock);
return -ENOSPC;
}
spin_unlock(&delayed_rsv->lock);
- spin_unlock(&space_info->lock);
commit:
trans = btrfs_join_transaction(root);
@@ -4180,7 +4274,7 @@ static int flush_space(struct btrfs_root *root,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(root, num_bytes, orig_bytes,
+ shrink_delalloc(root, num_bytes * 2, orig_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case ALLOC_CHUNK:
@@ -4206,6 +4300,104 @@ static int flush_space(struct btrfs_root *root,
return ret;
}
+
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
+ struct btrfs_space_info *space_info)
+{
+ u64 used;
+ u64 expected;
+ u64 to_reclaim;
+
+ to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
+ 16 * 1024 * 1024);
+ spin_lock(&space_info->lock);
+ if (can_overcommit(root, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL)) {
+ to_reclaim = 0;
+ goto out;
+ }
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (can_overcommit(root, space_info, 1024 * 1024,
+ BTRFS_RESERVE_FLUSH_ALL))
+ expected = div_factor_fine(space_info->total_bytes, 95);
+ else
+ expected = div_factor_fine(space_info->total_bytes, 90);
+
+ if (used > expected)
+ to_reclaim = used - expected;
+ else
+ to_reclaim = 0;
+ to_reclaim = min(to_reclaim, space_info->bytes_may_use +
+ space_info->bytes_reserved);
+out:
+ spin_unlock(&space_info->lock);
+
+ return to_reclaim;
+}
+
+static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info, u64 used)
+{
+ return (used >= div_factor_fine(space_info->total_bytes, 98) &&
+ !btrfs_fs_closing(fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info)
+{
+ u64 used;
+
+ spin_lock(&space_info->lock);
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (need_do_async_reclaim(space_info, fs_info, used)) {
+ spin_unlock(&space_info->lock);
+ return 1;
+ }
+ spin_unlock(&space_info->lock);
+
+ return 0;
+}
+
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 to_reclaim;
+ int flush_state;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ if (!to_reclaim)
+ return;
+
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ do {
+ flush_space(fs_info->fs_root, space_info, to_reclaim,
+ to_reclaim, flush_state);
+ flush_state++;
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+ return;
+ } while (flush_state <= COMMIT_TRANS);
+
+ if (btrfs_need_do_async_reclaim(space_info, fs_info))
+ queue_work(system_unbound_wq, work);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+ INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
/**
* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
* @root - the root we're allocating for
@@ -4313,8 +4505,13 @@ again:
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
flushing = true;
space_info->flush = 1;
+ } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ used += orig_bytes;
+ if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ !work_busy(&root->fs_info->async_reclaim_work))
+ queue_work(system_unbound_wq,
+ &root->fs_info->async_reclaim_work);
}
-
spin_unlock(&space_info->lock);
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
@@ -4371,7 +4568,7 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (root->ref_cows)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
block_rsv = trans->block_rsv;
if (root == root->fs_info->csum_root && trans->adding_csums)
@@ -5418,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @reserve: One of the reservation enums
+ * @delalloc: The blocks are allocated for the delalloc write
*
* This is called by the allocator when it reserves space, or by somebody who is
* freeing space that was never actually used on disk. For example if you
@@ -5436,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* succeeds.
*/
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve)
+ u64 num_bytes, int reserve, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
@@ -5455,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}
+
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
}
} else {
if (cache->ro)
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -5474,9 +5678,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
- struct btrfs_space_info *space_info;
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
@@ -5495,10 +5698,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
else
fs_info->pinned_extents = &fs_info->freed_extents[0];
- up_write(&fs_info->extent_commit_sem);
-
- list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
- percpu_counter_set(&space_info->total_bytes_pinned, 0);
+ up_write(&fs_info->commit_root_sem);
update_global_block_rsv(fs_info);
}
@@ -5537,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
@@ -5623,7 +5824,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -5639,9 +5841,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int num_to_del = 1;
u32 item_size;
u64 refs;
+ int last_ref = 0;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
+ if (!info->quota_enabled || !is_fstree(root_objectid))
+ no_quota = 1;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -5689,7 +5896,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(iref);
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
- is_data);
+ is_data, &last_ref);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5724,6 +5931,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (ret > 0 && skinny_metadata) {
skinny_metadata = false;
+ key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
btrfs_release_path(path);
@@ -5750,6 +5958,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
bytenr, parent, root_objectid, owner_objectid,
owner_offset);
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
} else {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5805,7 +6015,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
btrfs_err(info, "trying to drop %d refs but we only have %Lu "
- "for bytenr %Lu\n", refs_to_drop, refs, bytenr);
+ "for bytenr %Lu", refs_to_drop, refs, bytenr);
ret = -EINVAL;
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5813,6 +6023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs -= refs_to_drop;
if (refs > 0) {
+ type = BTRFS_QGROUP_OPER_SUB_SHARED;
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
/*
@@ -5828,7 +6039,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
- is_data);
+ is_data, &last_ref);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5849,6 +6060,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ last_ref = 1;
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
@@ -5871,6 +6083,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
}
+ btrfs_release_path(path);
+
+ /* Deal with the quota accounting */
+ if (!ret && last_ref && !no_quota) {
+ int mod_seq = 0;
+
+ if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+ type == BTRFS_QGROUP_OPER_SUB_SHARED)
+ mod_seq = 1;
+
+ ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
+ bytenr, num_bytes, type,
+ mod_seq);
+ }
out:
btrfs_free_path(path);
return ret;
@@ -5986,7 +6212,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
}
@@ -6007,11 +6233,15 @@ out:
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int for_cow)
+ u64 owner, u64 offset, int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
/*
@@ -6027,13 +6257,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+ BTRFS_DROP_DELAYED_REF, NULL, no_quota);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner,
offset, BTRFS_DROP_DELAYED_REF,
- NULL, for_cow);
+ NULL, no_quota);
}
return ret;
}
@@ -6141,6 +6371,70 @@ enum btrfs_loop_type {
LOOP_NO_EMPTY_SIZE = 3,
};
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ btrfs_get_block_group(cache);
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ int delalloc)
+{
+ struct btrfs_block_group_cache *used_bg;
+ bool locked = false;
+again:
+ spin_lock(&cluster->refill_lock);
+ if (locked) {
+ if (used_bg == cluster->block_group)
+ return used_bg;
+
+ up_read(&used_bg->data_rwsem);
+ btrfs_put_block_group(used_bg);
+ }
+
+ used_bg = cluster->block_group;
+ if (!used_bg)
+ return NULL;
+
+ if (used_bg == block_group)
+ return used_bg;
+
+ btrfs_get_block_group(used_bg);
+
+ if (!delalloc)
+ return used_bg;
+
+ if (down_read_trylock(&used_bg->data_rwsem))
+ return used_bg;
+
+ spin_unlock(&cluster->refill_lock);
+ down_read(&used_bg->data_rwsem);
+ locked = true;
+ goto again;
+}
+
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ up_read(&cache->data_rwsem);
+ btrfs_put_block_group(cache);
+}
+
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@@ -6155,7 +6449,7 @@ enum btrfs_loop_type {
static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
- u64 flags)
+ u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -6243,6 +6537,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
up_read(&space_info->groups_sem);
} else {
index = get_block_group_index(block_group);
+ btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -6257,7 +6552,7 @@ search:
u64 offset;
int cached;
- btrfs_get_block_group(block_group);
+ btrfs_grab_block_group(block_group, delalloc);
search_start = block_group->key.objectid;
/*
@@ -6305,16 +6600,16 @@ have_block_group:
* the refill lock keeps out other
* people trying to start a new cluster
*/
- spin_lock(&last_ptr->refill_lock);
- used_block_group = last_ptr->block_group;
- if (used_block_group != block_group &&
- (!used_block_group ||
- used_block_group->ro ||
- !block_group_bits(used_block_group, flags)))
+ used_block_group = btrfs_lock_cluster(block_group,
+ last_ptr,
+ delalloc);
+ if (!used_block_group)
goto refill_cluster;
- if (used_block_group != block_group)
- btrfs_get_block_group(used_block_group);
+ if (used_block_group != block_group &&
+ (used_block_group->ro ||
+ !block_group_bits(used_block_group, flags)))
+ goto release_cluster;
offset = btrfs_alloc_from_cluster(used_block_group,
last_ptr,
@@ -6328,16 +6623,15 @@ have_block_group:
used_block_group,
search_start, num_bytes);
if (used_block_group != block_group) {
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group,
+ delalloc);
block_group = used_block_group;
}
goto checks;
}
WARN_ON(last_ptr->block_group != used_block_group);
- if (used_block_group != block_group)
- btrfs_put_block_group(used_block_group);
-refill_cluster:
+release_cluster:
/* If we are on LOOP_NO_EMPTY_SIZE, we can't
* set up a new clusters, so lets just skip it
* and let the allocator find whatever block
@@ -6354,8 +6648,10 @@ refill_cluster:
* succeeding in the unclustered
* allocation. */
if (loop >= LOOP_NO_EMPTY_SIZE &&
- last_ptr->block_group != block_group) {
+ used_block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
+ btrfs_release_block_group(used_block_group,
+ delalloc);
goto unclustered_alloc;
}
@@ -6365,6 +6661,10 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (used_block_group != block_group)
+ btrfs_release_block_group(used_block_group,
+ delalloc);
+refill_cluster:
if (loop >= LOOP_NO_EMPTY_SIZE) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
@@ -6472,7 +6772,7 @@ checks:
BUG_ON(offset > search_start);
ret = btrfs_update_reserved_bytes(block_group, num_bytes,
- alloc_type);
+ alloc_type, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -6484,13 +6784,13 @@ checks:
trace_btrfs_reserve_extent(orig_root, block_group,
search_start, num_bytes);
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
}
up_read(&space_info->groups_sem);
@@ -6513,8 +6813,14 @@ loop:
loop++;
if (loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
+ int exist = 0;
+
+ trans = current->journal_info;
+ if (trans)
+ exist = 1;
+ else
+ trans = btrfs_join_transaction(root);
- trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
@@ -6531,7 +6837,8 @@ loop:
root, ret);
else
ret = 0;
- btrfs_end_transaction(trans, root);
+ if (!exist)
+ btrfs_end_transaction(trans, root);
if (ret)
goto out;
}
@@ -6596,7 +6903,7 @@ again:
int btrfs_reserve_extent(struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data)
+ struct btrfs_key *ins, int is_data, int delalloc)
{
bool final_tried = false;
u64 flags;
@@ -6606,7 +6913,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
- flags);
+ flags, delalloc);
if (ret == -ENOSPC) {
if (!final_tried && ins->offset) {
@@ -6631,7 +6938,8 @@ again:
}
static int __btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len, int pin)
+ u64 start, u64 len,
+ int pin, int delalloc)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
@@ -6650,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
pin_down_extent(root, cache, start, len, 1);
else {
btrfs_add_free_space(cache, start, len);
- btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
}
btrfs_put_block_group(cache);
@@ -6660,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
}
int btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len)
+ u64 start, u64 len, int delalloc)
{
- return __btrfs_free_reserved_extent(root, start, len, 0);
+ return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
}
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len)
{
- return __btrfs_free_reserved_extent(root, start, len, 1);
+ return __btrfs_free_reserved_extent(root, start, len, 1, 0);
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -6732,6 +7040,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
+ /* Always set parent to 0 here since its exclusive anyway. */
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, ins->offset,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+
ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -6746,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins)
+ int level, struct btrfs_key *ins,
+ int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6756,6 +7072,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
u32 size = sizeof(*extent_item) + sizeof(*iref);
+ u64 num_bytes = ins->offset;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
@@ -6789,6 +7106,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ num_bytes = root->leafsize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, key);
@@ -6810,6 +7128,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, num_bytes,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+ }
+
ret = update_block_group(root, ins->objectid, root->leafsize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -6865,7 +7191,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
return -EINVAL;
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
- RESERVE_ALLOC_NO_ACCOUNT);
+ RESERVE_ALLOC_NO_ACCOUNT, 0);
BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -6993,12 +7319,21 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+ buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
+ blocksize, level);
+ if (!IS_ERR(buf))
+ root->alloc_bytenr += blocksize;
+ return buf;
+ }
+#endif
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(root, blocksize, blocksize,
- empty_size, hint, &ins, 0);
+ empty_size, hint, &ins, 0, 0);
if (ret) {
unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
@@ -7734,7 +8069,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
- if (root->in_radix) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
} else {
free_extent_buffer(root->node);
@@ -8261,14 +8596,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- down_write(&info->extent_commit_sem);
+ down_write(&info->commit_root_sem);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
put_caching_control(caching_ctl);
}
- up_write(&info->extent_commit_sem);
+ up_write(&info->commit_root_sem);
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
@@ -8326,8 +8661,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
list_del(&space_info->list);
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
struct kobject *kobj;
- kobj = &space_info->block_group_kobjs[i];
- if (kobj->parent) {
+ kobj = space_info->block_group_kobjs[i];
+ space_info->block_group_kobjs[i] = NULL;
+ if (kobj) {
kobject_del(kobj);
kobject_put(kobj);
}
@@ -8342,22 +8678,35 @@ static void __link_block_group(struct btrfs_space_info *space_info,
struct btrfs_block_group_cache *cache)
{
int index = get_block_group_index(cache);
+ bool first = false;
down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index])) {
- struct kobject *kobj = &space_info->block_group_kobjs[index];
+ if (list_empty(&space_info->block_groups[index]))
+ first = true;
+ list_add_tail(&cache->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
+
+ if (first) {
+ struct raid_kobject *rkobj;
int ret;
- kobject_get(&space_info->kobj); /* put in release */
- ret = kobject_add(kobj, &space_info->kobj, "%s",
- get_raid_name(index));
+ rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj)
+ goto out_err;
+ rkobj->raid_type = index;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+ "%s", get_raid_name(index));
if (ret) {
- pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
- kobject_put(&space_info->kobj);
+ kobject_put(&rkobj->kobj);
+ goto out_err;
}
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
}
- list_add_tail(&cache->list, &space_info->block_groups[index]);
- up_write(&space_info->groups_sem);
+
+ return;
+out_err:
+ pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
}
static struct btrfs_block_group_cache *
@@ -8387,6 +8736,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
start);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
+ init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->new_bg_list);
@@ -8606,7 +8956,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
extent_root = root->fs_info->extent_root;
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
cache = btrfs_create_block_group_cache(root, chunk_offset, size);
if (!cache)
@@ -8692,6 +9042,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = root->fs_info->tree_root;
struct btrfs_key key;
struct inode *inode;
+ struct kobject *kobj = NULL;
int ret;
int index;
int factor;
@@ -8791,11 +9142,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
*/
list_del_init(&block_group->list);
if (list_empty(&block_group->space_info->block_groups[index])) {
- kobject_del(&block_group->space_info->block_group_kobjs[index]);
- kobject_put(&block_group->space_info->block_group_kobjs[index]);
+ kobj = block_group->space_info->block_group_kobjs[index];
+ block_group->space_info->block_group_kobjs[index] = NULL;
clear_avail_alloc_bits(root->fs_info, block_group->flags);
}
up_write(&block_group->space_info->groups_sem);
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_block_group_cache_done(block_group);
@@ -8937,3 +9292,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
range->len = trimmed;
return ret;
}
+
+/*
+ * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
+ * they are used to prevent the some tasks writing data into the page cache
+ * by nocow before the subvolume is snapshoted, but flush the data into
+ * the disk after the snapshot creation.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
+{
+ percpu_counter_dec(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we wake up
+ * waiters.
+ */
+ smp_mb();
+ if (waitqueue_active(&root->subv_writers->wait))
+ wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+ if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ return 0;
+
+ percpu_counter_inc(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we check for snapshot creation.
+ */
+ smp_mb();
+ if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+ btrfs_end_nocow_write(root);
+ return 0;
+ }
+ return 1;
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f127..a389820d158 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
}
}
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+static struct rb_node *tree_insert(struct rb_root *root,
+ struct rb_node *search_start,
+ u64 offset,
struct rb_node *node,
struct rb_node ***p_in,
struct rb_node **parent_in)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p;
struct rb_node *parent = NULL;
struct tree_entry *entry;
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
goto do_insert;
}
+ p = search_start ? &search_start : &root->rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
set_state_bits(tree, state, bits);
- node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
+ node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
struct extent_state *found;
found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
prealloc->state = orig->state;
orig->start = split;
- node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node,
- NULL, NULL);
+ node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+ &prealloc->rb_node, NULL, NULL);
if (node) {
free_extent_state(prealloc);
return -EEXIST;
@@ -746,6 +749,7 @@ again:
* our range starts
*/
node = tree_search(tree, start);
+process_node:
if (!node)
break;
@@ -766,7 +770,10 @@ again:
if (start > end)
break;
- cond_resched_lock(&tree->lock);
+ if (!cond_resched_lock(&tree->lock)) {
+ node = rb_next(node);
+ goto process_node;
+ }
}
out:
spin_unlock(&tree->lock);
@@ -1686,6 +1693,7 @@ again:
* shortening the size of the delalloc range we're searching
*/
free_extent_state(cached_state);
+ cached_state = NULL;
if (!loops) {
max_bytes = PAGE_CACHE_SIZE;
loops = 1;
@@ -2346,7 +2354,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
struct extent_io_tree *tree;
- int ret;
+ int ret = 0;
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -2360,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
if (!uptodate) {
ClearPageUptodate(page);
SetPageError(page);
+ ret = ret < 0 ? ret : -EIO;
+ mapping_set_error(page->mapping, ret);
}
return 0;
}
@@ -2757,7 +2767,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
if (em_cached && *em_cached) {
em = *em_cached;
- if (em->in_tree && start >= em->start &&
+ if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
atomic_inc(&em->refs);
return em;
@@ -3091,143 +3101,130 @@ static noinline void update_nr_written(struct page *page,
}
/*
- * the writepage semantics are similar to regular writepage. extent
- * records are inserted to lock ranges in the tree, and as dirty areas
- * are found, they are marked writeback. Then the lock bits are removed
- * and the end_io handler clears the writeback ranges
+ * helper for __extent_writepage, doing all of the delayed allocation setup.
+ *
+ * This returns 1 if our fill_delalloc function did all the work required
+ * to write the page (copy into inline extent). In this case the IO has
+ * been started and the page is already unlocked.
+ *
+ * This returns 0 if all went well (page still locked)
+ * This returns < 0 if there were errors (page still locked)
*/
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+static noinline_for_stack int writepage_delalloc(struct inode *inode,
+ struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ u64 delalloc_start,
+ unsigned long *nr_written)
+{
+ struct extent_io_tree *tree = epd->tree;
+ u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+ u64 nr_delalloc;
+ u64 delalloc_to_write = 0;
+ u64 delalloc_end = 0;
+ int ret;
+ int page_started = 0;
+
+ if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
+ return 0;
+
+ while (delalloc_end < page_end) {
+ nr_delalloc = find_lock_delalloc_range(inode, tree,
+ page,
+ &delalloc_start,
+ &delalloc_end,
+ 128 * 1024 * 1024);
+ if (nr_delalloc == 0) {
+ delalloc_start = delalloc_end + 1;
+ continue;
+ }
+ ret = tree->ops->fill_delalloc(inode, page,
+ delalloc_start,
+ delalloc_end,
+ &page_started,
+ nr_written);
+ /* File system has been set read-only */
+ if (ret) {
+ SetPageError(page);
+ /* fill_delalloc should be return < 0 for error
+ * but just in case, we use > 0 here meaning the
+ * IO is started, so we don't want to return > 0
+ * unless things are going well.
+ */
+ ret = ret < 0 ? ret : -EIO;
+ goto done;
+ }
+ /*
+ * delalloc_end is already one less than the total
+ * length, so we don't subtract one from
+ * PAGE_CACHE_SIZE
+ */
+ delalloc_to_write += (delalloc_end - delalloc_start +
+ PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+ delalloc_start = delalloc_end + 1;
+ }
+ if (wbc->nr_to_write < delalloc_to_write) {
+ int thresh = 8192;
+
+ if (delalloc_to_write < thresh * 2)
+ thresh = delalloc_to_write;
+ wbc->nr_to_write = min_t(u64, delalloc_to_write,
+ thresh);
+ }
+
+ /* did the fill delalloc function already unlock and start
+ * the IO?
+ */
+ if (page_started) {
+ /*
+ * we've unlocked the page, so we can't update
+ * the mapping's writeback index, just update
+ * nr_to_write.
+ */
+ wbc->nr_to_write -= *nr_written;
+ return 1;
+ }
+
+ ret = 0;
+
+done:
+ return ret;
+}
+
+/*
+ * helper for __extent_writepage. This calls the writepage start hooks,
+ * and does the loop to map the page into extents and bios.
+ *
+ * We return 1 if the IO is started and the page is unlocked,
+ * 0 if all went well (page still locked)
+ * < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+ struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ loff_t i_size,
+ unsigned long nr_written,
+ int write_flags, int *nr_ret)
{
- struct inode *inode = page->mapping->host;
- struct extent_page_data *epd = data;
struct extent_io_tree *tree = epd->tree;
u64 start = page_offset(page);
- u64 delalloc_start;
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
u64 cur = start;
u64 extent_offset;
- u64 last_byte = i_size_read(inode);
u64 block_start;
u64 iosize;
sector_t sector;
struct extent_state *cached_state = NULL;
struct extent_map *em;
struct block_device *bdev;
- int ret;
- int nr = 0;
size_t pg_offset = 0;
size_t blocksize;
- loff_t i_size = i_size_read(inode);
- unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
- u64 nr_delalloc;
- u64 delalloc_end;
- int page_started;
- int compressed;
- int write_flags;
- unsigned long nr_written = 0;
- bool fill_delalloc = true;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
- write_flags = WRITE_SYNC;
- else
- write_flags = WRITE;
-
- trace___extent_writepage(page, inode, wbc);
-
- WARN_ON(!PageLocked(page));
-
- ClearPageError(page);
-
- pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
- if (page->index > end_index ||
- (page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
- unlock_page(page);
- return 0;
- }
-
- if (page->index == end_index) {
- char *userpage;
-
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0,
- PAGE_CACHE_SIZE - pg_offset);
- kunmap_atomic(userpage);
- flush_dcache_page(page);
- }
- pg_offset = 0;
-
- set_page_extent_mapped(page);
-
- if (!tree->ops || !tree->ops->fill_delalloc)
- fill_delalloc = false;
-
- delalloc_start = start;
- delalloc_end = 0;
- page_started = 0;
- if (!epd->extent_locked && fill_delalloc) {
- u64 delalloc_to_write = 0;
- /*
- * make sure the wbc mapping index is at least updated
- * to this page.
- */
- update_nr_written(page, wbc, 0);
-
- while (delalloc_end < page_end) {
- nr_delalloc = find_lock_delalloc_range(inode, tree,
- page,
- &delalloc_start,
- &delalloc_end,
- 128 * 1024 * 1024);
- if (nr_delalloc == 0) {
- delalloc_start = delalloc_end + 1;
- continue;
- }
- ret = tree->ops->fill_delalloc(inode, page,
- delalloc_start,
- delalloc_end,
- &page_started,
- &nr_written);
- /* File system has been set read-only */
- if (ret) {
- SetPageError(page);
- goto done;
- }
- /*
- * delalloc_end is already one less than the total
- * length, so we don't subtract one from
- * PAGE_CACHE_SIZE
- */
- delalloc_to_write += (delalloc_end - delalloc_start +
- PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
- delalloc_start = delalloc_end + 1;
- }
- if (wbc->nr_to_write < delalloc_to_write) {
- int thresh = 8192;
-
- if (delalloc_to_write < thresh * 2)
- thresh = delalloc_to_write;
- wbc->nr_to_write = min_t(u64, delalloc_to_write,
- thresh);
- }
+ int ret = 0;
+ int nr = 0;
+ bool compressed;
- /* did the fill delalloc function already unlock and start
- * the IO?
- */
- if (page_started) {
- ret = 0;
- /*
- * we've unlocked the page, so we can't update
- * the mapping's writeback index, just update
- * nr_to_write.
- */
- wbc->nr_to_write -= nr_written;
- goto done_unlocked;
- }
- }
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
@@ -3237,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
wbc->pages_skipped++;
else
redirty_page_for_writepage(wbc, page);
+
update_nr_written(page, wbc, nr_written);
unlock_page(page);
- ret = 0;
+ ret = 1;
goto done_unlocked;
}
}
@@ -3251,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
update_nr_written(page, wbc, nr_written + 1);
end = page_end;
- if (last_byte <= start) {
+ if (i_size <= start) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
page_end, NULL, 1);
@@ -3261,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
- if (cur >= last_byte) {
+ u64 em_end;
+ if (cur >= i_size) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
page_end, NULL, 1);
@@ -3271,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
end - cur + 1, 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
+ ret = PTR_ERR_OR_ZERO(em);
break;
}
extent_offset = cur - em->start;
- BUG_ON(extent_map_end(em) <= cur);
+ em_end = extent_map_end(em);
+ BUG_ON(em_end <= cur);
BUG_ON(end < cur);
- iosize = min(extent_map_end(em) - cur, end - cur + 1);
+ iosize = min(em_end - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
sector = (em->block_start + extent_offset) >> 9;
bdev = em->bdev;
@@ -3313,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset += iosize;
continue;
}
- /* leave this out until we have a page_mkwrite call */
- if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
- EXTENT_DIRTY, 0, NULL)) {
- cur = cur + iosize;
- pg_offset += iosize;
- continue;
- }
if (tree->ops && tree->ops->writepage_io_hook) {
ret = tree->ops->writepage_io_hook(page, cur,
@@ -3330,7 +3324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (ret) {
SetPageError(page);
} else {
- unsigned long max_nr = end_index + 1;
+ unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
@@ -3352,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
nr++;
}
done:
+ *nr_ret = nr;
+
+done_unlocked:
+
+ /* drop our reference on any cached states */
+ free_extent_state(cached_state);
+ return ret;
+}
+
+/*
+ * the writepage semantics are similar to regular writepage. extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback. Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct inode *inode = page->mapping->host;
+ struct extent_page_data *epd = data;
+ u64 start = page_offset(page);
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ int ret;
+ int nr = 0;
+ size_t pg_offset = 0;
+ loff_t i_size = i_size_read(inode);
+ unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+ int write_flags;
+ unsigned long nr_written = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ write_flags = WRITE_SYNC;
+ else
+ write_flags = WRITE;
+
+ trace___extent_writepage(page, inode, wbc);
+
+ WARN_ON(!PageLocked(page));
+
+ ClearPageError(page);
+
+ pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+ if (page->index > end_index ||
+ (page->index == end_index && !pg_offset)) {
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (page->index == end_index) {
+ char *userpage;
+
+ userpage = kmap_atomic(page);
+ memset(userpage + pg_offset, 0,
+ PAGE_CACHE_SIZE - pg_offset);
+ kunmap_atomic(userpage);
+ flush_dcache_page(page);
+ }
+
+ pg_offset = 0;
+
+ set_page_extent_mapped(page);
+
+ ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
+ if (ret == 1)
+ goto done_unlocked;
+ if (ret)
+ goto done;
+
+ ret = __extent_writepage_io(inode, page, wbc, epd,
+ i_size, nr_written, write_flags, &nr);
+ if (ret == 1)
+ goto done_unlocked;
+
+done:
if (nr == 0) {
/* make sure the mapping tag for page dirty gets cleared */
set_page_writeback(page);
end_page_writeback(page);
}
+ if (PageError(page)) {
+ ret = ret < 0 ? ret : -EIO;
+ end_extent_writepage(page, ret, start, page_end);
+ }
unlock_page(page);
+ return ret;
done_unlocked:
-
- /* drop our reference on any cached states */
- free_extent_state(cached_state);
return 0;
}
@@ -3378,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
TASK_UNINTERRUPTIBLE);
}
-static int lock_extent_buffer_for_io(struct extent_buffer *eb,
- struct btrfs_fs_info *fs_info,
- struct extent_page_data *epd)
+static noinline_for_stack int
+lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct extent_page_data *epd)
{
unsigned long i, num_pages;
int flush = 0;
@@ -3451,7 +3523,7 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
@@ -3485,7 +3557,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
bio_put(bio);
}
-static int write_one_eb(struct extent_buffer *eb,
+static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct btrfs_fs_info *fs_info,
struct writeback_control *wbc,
struct extent_page_data *epd)
@@ -3683,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
+ int err = 0;
int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
@@ -3769,8 +3842,8 @@ retry:
unlock_page(page);
ret = 0;
}
- if (ret)
- done = 1;
+ if (!err && ret < 0)
+ err = ret;
/*
* the filesystem may choose to bump up nr_to_write.
@@ -3782,7 +3855,7 @@ retry:
pagevec_release(&pvec);
cond_resched();
}
- if (!scanned && !done) {
+ if (!scanned && !done && !err) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
@@ -3792,7 +3865,7 @@ retry:
goto retry;
}
btrfs_add_delayed_iput(inode);
- return ret;
+ return err;
}
static void flush_epd_write_bio(struct extent_page_data *epd)
@@ -4303,7 +4376,7 @@ static void __free_extent_buffer(struct extent_buffer *eb)
kmem_cache_free(extent_buffer_cache, eb);
}
-static int extent_buffer_under_io(struct extent_buffer *eb)
+int extent_buffer_under_io(struct extent_buffer *eb)
{
return (atomic_read(&eb->io_pages) ||
test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
@@ -4503,7 +4576,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
-static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+ struct page *accessed)
{
unsigned long num_pages, i;
@@ -4512,7 +4586,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
- mark_page_accessed(p);
+ if (p != accessed)
+ mark_page_accessed(p);
}
}
@@ -4526,7 +4601,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_extent_buffer_accessed(eb);
+ mark_extent_buffer_accessed(eb, NULL);
return eb;
}
rcu_read_unlock();
@@ -4534,6 +4609,53 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
return NULL;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len)
+{
+ struct extent_buffer *eb, *exists = NULL;
+ int ret;
+
+ eb = find_extent_buffer(fs_info, start);
+ if (eb)
+ return eb;
+ eb = alloc_dummy_extent_buffer(start, len);
+ if (!eb)
+ return NULL;
+ eb->fs_info = fs_info;
+again:
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto free_eb;
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ else
+ goto again;
+ }
+ check_buffer_tree_ref(eb);
+ set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+ /*
+ * We will free dummy extent buffer's if they come into
+ * free_extent_buffer with a ref count of 2, but if we are using this we
+ * want the buffers to stay in memory until we're done with them, so
+ * bump the ref count again.
+ */
+ atomic_inc(&eb->refs);
+ return eb;
+free_eb:
+ btrfs_release_extent_buffer(eb);
+ return exists;
+}
+#endif
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len)
{
@@ -4574,7 +4696,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
spin_unlock(&mapping->private_lock);
unlock_page(p);
page_cache_release(p);
- mark_extent_buffer_accessed(exists);
+ mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
@@ -4589,7 +4711,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
attach_extent_buffer_page(eb, p);
spin_unlock(&mapping->private_lock);
WARN_ON(PageDirty(p));
- mark_page_accessed(p);
eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -4947,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
}
}
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char __user *dst = (char __user *)dstv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ int ret = 0;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+ kaddr = page_address(page);
+ if (copy_to_user(dst, kaddr + offset, cur)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+
+ return ret;
+}
+
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
unsigned long min_len, char **map,
unsigned long *map_start,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 58b27e5ab52..ccc264e7bde 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -158,7 +158,6 @@ struct extent_buffer {
* to unlock
*/
wait_queue_head_t read_lock_wq;
- wait_queue_head_t lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
@@ -304,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
+ unsigned long start,
+ unsigned long len);
void write_extent_buffer(struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
@@ -320,6 +322,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_uptodate(struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
@@ -349,5 +352,7 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes);
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
#endif
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57d..225302b39af 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
- em->in_tree = 0;
+ RB_CLEAR_NODE(&em->rb_node);
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
em->generation = 0;
@@ -73,8 +73,10 @@ void free_extent_map(struct extent_map *em)
return;
WARN_ON(atomic_read(&em->refs) == 0);
if (atomic_dec_and_test(&em->refs)) {
- WARN_ON(em->in_tree);
+ WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ kfree(em->bdev);
kmem_cache_free(extent_map_cache, em);
}
}
@@ -99,8 +101,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- WARN_ON(!entry->in_tree);
-
if (em->start < entry->start)
p = &(*p)->rb_left;
else if (em->start >= extent_map_end(entry))
@@ -128,7 +128,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
if (end > entry->start && em->start < extent_map_end(entry))
return -EEXIST;
- em->in_tree = 1;
rb_link_node(&em->rb_node, orig_parent, p);
rb_insert_color(&em->rb_node, root);
return 0;
@@ -153,8 +152,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
prev = n;
prev_entry = entry;
- WARN_ON(!entry->in_tree);
-
if (offset < entry->start)
n = n->rb_left;
else if (offset >= extent_map_end(entry))
@@ -240,12 +237,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
- merge->in_tree = 0;
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
rb_erase(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
}
@@ -257,7 +254,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->len += merge->len;
em->block_len += merge->block_len;
rb_erase(&merge->rb_node, &tree->map);
- merge->in_tree = 0;
+ RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
free_extent_map(merge);
@@ -319,7 +316,21 @@ out:
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- if (em->in_tree)
+ if (extent_map_in_tree(em))
+ try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em,
+ int modified)
+{
+ atomic_inc(&em->refs);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (modified)
+ list_move(&em->list, &tree->modified_extents);
+ else
try_merge_map(tree, em);
}
@@ -342,15 +353,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
if (ret)
goto out;
- atomic_inc(&em->refs);
-
- em->mod_start = em->start;
- em->mod_len = em->len;
-
- if (modified)
- list_move(&em->list, &tree->modified_extents);
- else
- try_merge_map(tree, em);
+ setup_extent_mapping(tree, em, modified);
out:
return ret;
}
@@ -434,6 +437,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
rb_erase(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
- em->in_tree = 0;
+ RB_CLEAR_NODE(&em->rb_node);
return ret;
}
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified)
+{
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ ASSERT(extent_map_in_tree(cur));
+ if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ list_del_init(&cur->list);
+ rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ RB_CLEAR_NODE(&cur->rb_node);
+
+ setup_extent_mapping(tree, new, modified);
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f..b2991fd8583 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -15,6 +15,7 @@
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
struct extent_map {
struct rb_node rb_node;
@@ -33,7 +34,6 @@ struct extent_map {
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
- unsigned int in_tree;
unsigned int compress_type;
struct list_head list;
};
@@ -44,6 +44,11 @@ struct extent_map_tree {
rwlock_t lock;
};
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+ return !RB_EMPTY_NODE(&em->rb_node);
+}
+
static inline u64 extent_map_end(struct extent_map *em)
{
if (em->start + em->len < em->start)
@@ -64,6 +69,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified);
struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 127555b29f5..f46cfe45d68 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
found:
csum += count * csum_size;
nblocks -= count;
+ bio_index += count;
while (count--) {
disk_bytenr += bvec->bv_len;
offset += bvec->bv_len;
- bio_index++;
bvec++;
}
}
@@ -750,7 +750,7 @@ again:
int slot = path->slots[0] + 1;
/* we didn't find a csum item, insert one */
nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems - 1) {
+ if (!nritems || (path->slots[0] >= nritems - 1)) {
ret = btrfs_next_leaf(root, path);
if (ret == 1)
found_next = 1;
@@ -885,3 +885,79 @@ out:
fail_unlock:
goto out;
}
+
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_key key;
+ u64 extent_start, extent_end;
+ u64 bytenr;
+ u8 type = btrfs_file_extent_type(leaf, fi);
+ int compress_type = btrfs_file_extent_compression(leaf, fi);
+
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_start = key.offset;
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_end = extent_start +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ size_t size;
+ size = btrfs_file_extent_inline_len(leaf, slot, fi);
+ extent_end = ALIGN(extent_start + size, root->sectorsize);
+ }
+
+ em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ em->orig_start = extent_start -
+ btrfs_file_extent_offset(leaf, fi);
+ em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (bytenr == 0) {
+ em->block_start = EXTENT_MAP_HOLE;
+ return;
+ }
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ em->block_start = bytenr;
+ em->block_len = em->orig_block_len;
+ } else {
+ bytenr += btrfs_file_extent_offset(leaf, fi);
+ em->block_start = bytenr;
+ em->block_len = em->len;
+ if (type == BTRFS_FILE_EXTENT_PREALLOC)
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ }
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ em->block_start = EXTENT_MAP_INLINE;
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ /*
+ * Initialize orig_start and block_len with the same values
+ * as in inode.c:btrfs_get_extent().
+ */
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->block_len = (u64)-1;
+ if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
+ } else {
+ btrfs_err(root->fs_info,
+ "unknown file extent item type %d, inode %llu, offset %llu, root %llu",
+ type, btrfs_ino(inode), extent_start,
+ root->root_key.objectid);
+ }
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b8672f0..1f2b99cb55e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -40,6 +40,7 @@
#include "tree-log.h"
#include "locking.h"
#include "volumes.h"
+#include "qgroup.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -425,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
struct page *page = prepared_pages[pg];
/*
* Copy data from userspace to the current page
- *
- * Disable pagefault to avoid recursive lock since
- * the pages are already locked
*/
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
- pagefault_enable();
/* Flush processor's dcache for this page */
flush_dcache_page(page);
@@ -452,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
write_bytes -= copied;
total_copied += copied;
- /* Return to btrfs_file_aio_write to fault page */
+ /* Return to btrfs_file_write_iter to fault page */
if (unlikely(copied == 0))
break;
@@ -475,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
- * clear it here
+ * clear it here. There should be no need to mark the pages
+ * accessed as prepare_pages should have marked them accessed
+ * in prepare_pages via find_or_create_page()
*/
ClearPageChecked(pages[i]);
unlock_page(pages[i]);
- mark_page_accessed(pages[i]);
page_cache_release(pages[i]);
}
}
@@ -591,7 +588,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &flags);
modified = !list_empty(&em->list);
- remove_extent_mapping(em_tree, em);
if (no_splits)
goto next;
@@ -622,8 +618,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
- ret = add_extent_mapping(em_tree, split, modified);
- BUG_ON(ret); /* Logic error */
+ replace_extent_mapping(em_tree, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -661,12 +656,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->orig_block_len = 0;
}
- ret = add_extent_mapping(em_tree, split, modified);
- BUG_ON(ret); /* Logic error */
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ ASSERT(ret == 0); /* Logic error */
+ }
free_extent_map(split);
split = NULL;
}
next:
+ if (extent_map_in_tree(em))
+ remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for us */
@@ -713,16 +716,18 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int recow;
int ret;
int modify_tree = -1;
- int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+ int update_refs;
int found = 0;
int leafs_visited = 0;
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
- if (start >= BTRFS_I(inode)->disk_i_size)
+ if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
modify_tree = 0;
+ update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -779,6 +784,18 @@ next_slot:
extent_end = search_start;
}
+ /*
+ * Don't skip extent items representing 0 byte lengths. They
+ * used to be created (bug) if while punching holes we hit
+ * -ENOSPC condition. So if we find one here, just ensure we
+ * delete it, otherwise we would insert a new file extent item
+ * with the same key (offset) as that 0 bytes length file
+ * extent item in the call to setup_items_for_insert() later
+ * in this function.
+ */
+ if (extent_end == key.offset && extent_end >= search_start)
+ goto delete_extent_item;
+
if (extent_end <= search_start) {
path->slots[0]++;
goto next_slot;
@@ -798,7 +815,10 @@ next_slot:
*/
if (start > key.offset && end < extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = start;
@@ -831,7 +851,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset, 0);
+ start - extent_offset, 1);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -841,7 +861,10 @@ next_slot:
* | -------- extent -------- |
*/
if (start <= key.offset && end < extent_end) {
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
@@ -864,7 +887,10 @@ next_slot:
*/
if (start > key.offset && end >= extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
@@ -883,6 +909,7 @@ next_slot:
* | ------ extent ------ |
*/
if (start <= key.offset && end >= extent_end) {
+delete_extent_item:
if (del_nr == 0) {
del_slot = path->slots[0];
del_nr = 1;
@@ -938,34 +965,42 @@ next_slot:
* Set path->slots[0] to first slot, so that after the delete
* if items are move off from our leaf to its immediate left or
* right neighbor leafs, we end up with a correct and adjusted
- * path->slots[0] for our insertion.
+ * path->slots[0] for our insertion (if replace_extent != 0).
*/
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret)
btrfs_abort_transaction(trans, root, ret);
+ }
- leaf = path->nodes[0];
- /*
- * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
- * is, its contents got pushed to its neighbors), in which case
- * it means path->locks[0] == 0
- */
- if (!ret && replace_extent && leafs_visited == 1 &&
- path->locks[0] &&
- btrfs_leaf_free_space(root, leaf) >=
- sizeof(struct btrfs_item) + extent_item_size) {
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
- *key_inserted = 1;
+ leaf = path->nodes[0];
+ /*
+ * If btrfs_del_items() was called, it might have deleted a leaf, in
+ * which case it unlocked our path, so check path->locks[0] matches a
+ * write lock.
+ */
+ if (!ret && replace_extent && leafs_visited == 1 &&
+ (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+ path->locks[0] == BTRFS_WRITE_LOCK) &&
+ btrfs_leaf_free_space(root, leaf) >=
+ sizeof(struct btrfs_item) + extent_item_size) {
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+ if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+ struct btrfs_key slot_key;
+
+ btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+ if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+ path->slots[0]++;
}
+ setup_items_for_insert(root, path, &key,
+ &extent_item_size,
+ extent_item_size,
+ sizeof(struct btrfs_item) +
+ extent_item_size, 1);
+ *key_inserted = 1;
}
if (!replace_extent || !(*key_inserted))
@@ -1173,7 +1208,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset, 1);
BUG_ON(ret); /* -ENOMEM */
if (split == start) {
@@ -1346,11 +1381,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
start_pos, last_pos, 0, cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
+ ordered = btrfs_lookup_ordered_range(inode, start_pos,
+ last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->len > start_pos &&
ordered->file_offset <= last_pos) {
- btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
start_pos, last_pos,
cached_state, GFP_NOFS);
@@ -1358,12 +1393,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- ret = btrfs_wait_ordered_range(inode, start_pos,
- last_pos - start_pos + 1);
- if (ret)
- return ret;
- else
- return -EAGAIN;
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ return -EAGAIN;
}
if (ordered)
btrfs_put_ordered_extent(ordered);
@@ -1396,8 +1428,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
u64 num_bytes;
int ret;
+ ret = btrfs_start_nocow_write(root);
+ if (!ret)
+ return -ENOSPC;
+
lockstart = round_down(pos, root->sectorsize);
- lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+ lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
while (1) {
lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1451,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
+ btrfs_end_nocow_write(root);
} else {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
- NULL, GFP_NOFS);
- *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+ *write_bytes = min_t(size_t, *write_bytes ,
+ num_bytes - pos + lockstart);
}
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1510,6 +1544,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode,
reserve_bytes);
+ else
+ btrfs_end_nocow_write(root);
break;
}
@@ -1598,6 +1634,9 @@ again:
}
release_bytes = 0;
+ if (only_release_metadata)
+ btrfs_end_nocow_write(root);
+
if (only_release_metadata && copied > 0) {
u64 lockstart = round_down(pos, root->sectorsize);
u64 lockend = lockstart +
@@ -1624,37 +1663,34 @@ again:
kfree(pages);
if (release_bytes) {
- if (only_release_metadata)
+ if (only_release_metadata) {
+ btrfs_end_nocow_write(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
- else
+ } else {
btrfs_delalloc_release_space(inode, release_bytes);
+ }
}
return num_written ? num_written : ret;
}
static ssize_t __btrfs_direct_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos,
- loff_t *ppos, size_t count, size_t ocount)
+ struct iov_iter *from,
+ loff_t pos)
{
struct file *file = iocb->ki_filp;
- struct iov_iter i;
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
- count, ocount);
+ written = generic_file_direct_write(iocb, from, pos);
- if (written < 0 || written == count)
+ if (written < 0 || !iov_iter_count(from))
return written;
pos += written;
- count -= written;
- iov_iter_init(&i, iov, nr_segs, count, written);
- written_buffered = __btrfs_buffered_write(file, &i, pos);
+ written_buffered = __btrfs_buffered_write(file, from, pos);
if (written_buffered < 0) {
err = written_buffered;
goto out;
@@ -1664,7 +1700,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
if (err)
goto out;
written += written_buffered;
- *ppos = pos + written_buffered;
+ iocb->ki_pos = pos + written_buffered;
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
endbyte >> PAGE_CACHE_SHIFT);
out:
@@ -1689,29 +1725,22 @@ static void update_time_for_write(struct inode *inode)
inode_inc_iversion(inode);
}
-static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- loff_t *ppos = &iocb->ki_pos;
u64 start_pos;
+ u64 end_pos;
ssize_t num_written = 0;
ssize_t err = 0;
- size_t count, ocount;
+ size_t count = iov_iter_count(from);
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+ loff_t pos = iocb->ki_pos;
mutex_lock(&inode->i_mutex);
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
- count = ocount;
-
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err) {
@@ -1724,6 +1753,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
+ iov_iter_truncate(from, count);
+
err = file_remove_suid(file);
if (err) {
mutex_unlock(&inode->i_mutex);
@@ -1752,7 +1783,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
- err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
+ /* Expand hole size to cover write data, preventing empty gap */
+ end_pos = round_up(pos + count, root->sectorsize);
+ err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
@@ -1763,16 +1796,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (unlikely(file->f_flags & O_DIRECT)) {
- num_written = __btrfs_direct_write(iocb, iov, nr_segs,
- pos, ppos, count, ocount);
+ num_written = __btrfs_direct_write(iocb, from, pos);
} else {
- struct iov_iter i;
-
- iov_iter_init(&i, iov, nr_segs, count, num_written);
-
- num_written = __btrfs_buffered_write(file, &i, pos);
+ num_written = __btrfs_buffered_write(file, from, pos);
if (num_written > 0)
- *ppos = pos + num_written;
+ iocb->ki_pos = pos + num_written;
}
mutex_unlock(&inode->i_mutex);
@@ -1797,7 +1825,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
BTRFS_I(inode)->last_sub_trans = root->log_transid;
if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
- if (err < 0 && num_written > 0)
+ if (err < 0)
num_written = err;
}
@@ -1856,8 +1884,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
struct btrfs_trans_handle *trans;
+ struct btrfs_log_ctx ctx;
+ int ret = 0;
bool full_sync = 0;
trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1980,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
trans->sync = true;
- ret = btrfs_log_dentry_safe(trans, root, dentry);
+ btrfs_init_log_ctx(&ctx);
+
+ ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -1971,7 +2002,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
- ret = btrfs_sync_log(trans, root);
+ ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
ret = btrfs_end_transaction(trans, root);
goto out;
@@ -1980,8 +2011,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!full_sync) {
ret = btrfs_wait_ordered_range(inode, start,
end - start + 1);
- if (ret)
+ if (ret) {
+ btrfs_end_transaction(trans, root);
goto out;
+ }
}
ret = btrfs_commit_transaction(trans, root);
} else {
@@ -1993,6 +2026,7 @@ out:
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = btrfs_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -2138,6 +2172,37 @@ out:
return 0;
}
+/*
+ * Find a hole extent on given inode and change start/len to the end of hole
+ * extent.(hole/vacuum extent whose em->start <= start &&
+ * em->start + em->len > start)
+ * When a hole extent is found, return 1 and modify start/len.
+ */
+static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ return ret;
+ }
+
+ /* Hole or vacuum extent(only exists in no-hole mode) */
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ ret = 1;
+ *len = em->start + em->len > *start + *len ?
+ 0 : *start + *len - em->start - em->len;
+ *start = em->start + em->len;
+ }
+ free_extent_map(em);
+ return ret;
+}
+
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2145,24 +2210,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
struct btrfs_path *path;
struct btrfs_block_rsv *rsv;
struct btrfs_trans_handle *trans;
- u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
- u64 lockend = round_down(offset + len,
- BTRFS_I(inode)->root->sectorsize) - 1;
- u64 cur_offset = lockstart;
+ u64 lockstart;
+ u64 lockend;
+ u64 tail_start;
+ u64 tail_len;
+ u64 orig_start = offset;
+ u64 cur_offset;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
u64 drop_end;
int ret = 0;
int err = 0;
int rsv_count;
- bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
- ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+ bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
+ u64 ino_size;
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
return ret;
mutex_lock(&inode->i_mutex);
+ ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ /* Already in a large hole */
+ ret = 0;
+ goto out_only_mutex;
+ }
+
+ lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize);
+ lockend = round_down(offset + len,
+ BTRFS_I(inode)->root->sectorsize) - 1;
+ same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
/*
* We needn't truncate any page which is beyond the end of the file
* because we are sure there is no data there.
@@ -2172,14 +2255,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
* entire page.
*/
if (same_page && len < PAGE_CACHE_SIZE) {
- if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+ if (offset < ino_size)
ret = btrfs_truncate_page(inode, offset, len, 0);
- mutex_unlock(&inode->i_mutex);
- return ret;
+ goto out_only_mutex;
}
/* zero back part of the first page */
- if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ if (offset < ino_size) {
ret = btrfs_truncate_page(inode, offset, 0, 0);
if (ret) {
mutex_unlock(&inode->i_mutex);
@@ -2187,12 +2269,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
}
- /* zero the front end of the last page */
- if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
- ret = btrfs_truncate_page(inode, offset + len, 0, 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- return ret;
+ /* Check the aligned pages after the first unaligned page,
+ * if offset != orig_start, which means the first unaligned page
+ * including serveral following pages are already in holes,
+ * the extra check can be skipped */
+ if (offset == orig_start) {
+ /* after truncate page, check hole again */
+ len = offset + len - lockstart;
+ offset = lockstart;
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ ret = 0;
+ goto out_only_mutex;
+ }
+ lockstart = offset;
+ }
+
+ /* Check the tail unaligned part is in a hole */
+ tail_start = lockend + 1;
+ tail_len = offset + len - tail_start;
+ if (tail_len) {
+ ret = find_first_non_hole(inode, &tail_start, &tail_len);
+ if (unlikely(ret < 0))
+ goto out_only_mutex;
+ if (!ret) {
+ /* zero the front end of the last page */
+ if (tail_start + tail_len < ino_size) {
+ ret = btrfs_truncate_page(inode,
+ tail_start + tail_len, 0, 1);
+ if (ret)
+ goto out_only_mutex;
+ }
}
}
@@ -2218,9 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if ((!ordered ||
(ordered->file_offset + ordered->len <= lockstart ||
ordered->file_offset > lockend)) &&
- !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_UPTODATE, 0,
- cached_state)) {
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
@@ -2268,6 +2375,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
BUG_ON(ret);
trans->block_rsv = rsv;
+ cur_offset = lockstart;
+ len = lockend - cur_offset;
while (cur_offset < lockend) {
ret = __btrfs_drop_extents(trans, root, inode, path,
cur_offset, lockend + 1,
@@ -2277,10 +2386,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = fill_holes(trans, inode, path, cur_offset, drop_end);
- if (ret) {
- err = ret;
- break;
+ if (cur_offset < ino_size) {
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_end);
+ if (ret) {
+ err = ret;
+ break;
+ }
}
cur_offset = drop_end;
@@ -2305,6 +2417,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
rsv, min_size);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
+
+ ret = find_first_non_hole(inode, &cur_offset, &len);
+ if (unlikely(ret < 0))
+ break;
+ if (ret && !len) {
+ ret = 0;
+ break;
+ }
}
if (ret) {
@@ -2313,10 +2433,17 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = fill_holes(trans, inode, path, cur_offset, drop_end);
- if (ret) {
- err = ret;
- goto out_trans;
+ /*
+ * Don't insert file hole extent item if it's for a range beyond eof
+ * (because it's useless) or if it represents a 0 bytes range (when
+ * cur_offset == drop_end).
+ */
+ if (cur_offset < ino_size && cur_offset < drop_end) {
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
}
out_trans:
@@ -2336,6 +2463,7 @@ out_free:
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
+out_only_mutex:
mutex_unlock(&inode->i_mutex);
if (ret && !err)
err = ret;
@@ -2597,11 +2725,11 @@ out:
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
- .aio_write = btrfs_file_aio_write,
+ .write_iter = btrfs_file_write_iter,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 73f3de7a083..2b0a627cb5f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -274,18 +274,32 @@ struct io_ctl {
};
static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
- struct btrfs_root *root)
+ struct btrfs_root *root, int write)
{
+ int num_pages;
+ int check_crcs = 0;
+
+ num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+ check_crcs = 1;
+
+ /* Make sure we can fit our crcs into the first page */
+ if (write && check_crcs &&
+ (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+ return -ENOSPC;
+
memset(io_ctl, 0, sizeof(struct io_ctl));
- io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
- GFP_NOFS);
+
+ io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
if (!io_ctl->pages)
return -ENOMEM;
+
+ io_ctl->num_pages = num_pages;
io_ctl->root = root;
- if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
- io_ctl->check_crcs = 1;
+ io_ctl->check_crcs = check_crcs;
+
return 0;
}
@@ -666,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
generation = btrfs_free_space_generation(leaf, header);
btrfs_release_path(path);
+ if (!BTRFS_I(inode)->generation) {
+ btrfs_info(root->fs_info,
+ "The free space cache file (%llu) is invalid. skip it\n",
+ offset);
+ return 0;
+ }
+
if (BTRFS_I(inode)->generation != generation) {
btrfs_err(root->fs_info,
"free space inode generation (%llu) "
@@ -677,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (!num_entries)
return 0;
- ret = io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root, 0);
if (ret)
return ret;
@@ -831,7 +852,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
if (!matched) {
__btrfs_remove_free_space_cache(ctl);
- btrfs_err(fs_info, "block group %llu has wrong amount of free space",
+ btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
block_group->key.objectid);
ret = -1;
}
@@ -843,7 +864,7 @@ out:
spin_unlock(&block_group->lock);
ret = 0;
- btrfs_err(fs_info, "failed to load free space cache for block group %llu",
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
block_group->key.objectid);
}
@@ -851,90 +872,44 @@ out:
return ret;
}
-/**
- * __btrfs_write_out_cache - write out cached info to an inode
- * @root - the root the inode belongs to
- * @ctl - the free space cache we are going to write out
- * @block_group - the block_group for this cache if it belongs to a block_group
- * @trans - the trans handle
- * @path - the path to use
- * @offset - the offset for the key we'll insert
- *
- * This function writes out a free space cache struct to disk for quick recovery
- * on mount. This will return 0 if it was successfull in writing the cache out,
- * and -1 if it was not.
- */
-static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
- struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u64 offset)
+static noinline_for_stack
+int write_cache_extent_entries(struct io_ctl *io_ctl,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ int *entries, int *bitmaps,
+ struct list_head *bitmap_list)
{
- struct btrfs_free_space_header *header;
- struct extent_buffer *leaf;
- struct rb_node *node;
- struct list_head *pos, *n;
- struct extent_state *cached_state = NULL;
- struct btrfs_free_cluster *cluster = NULL;
- struct extent_io_tree *unpin = NULL;
- struct io_ctl io_ctl;
- struct list_head bitmap_list;
- struct btrfs_key key;
- u64 start, extent_start, extent_end, len;
- int entries = 0;
- int bitmaps = 0;
int ret;
- int err = -1;
-
- INIT_LIST_HEAD(&bitmap_list);
-
- if (!i_size_read(inode))
- return -1;
-
- ret = io_ctl_init(&io_ctl, inode, root);
- if (ret)
- return -1;
+ struct btrfs_free_cluster *cluster = NULL;
+ struct rb_node *node = rb_first(&ctl->free_space_offset);
/* Get the cluster for this block_group if it exists */
- if (block_group && !list_empty(&block_group->cluster_list))
+ if (block_group && !list_empty(&block_group->cluster_list)) {
cluster = list_entry(block_group->cluster_list.next,
struct btrfs_free_cluster,
block_group_list);
+ }
- /* Lock all pages first so we can lock the extent safely. */
- io_ctl_prepare_pages(&io_ctl, inode, 0);
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state);
-
- node = rb_first(&ctl->free_space_offset);
if (!node && cluster) {
node = rb_first(&cluster->root);
cluster = NULL;
}
- /* Make sure we can fit our crcs into the first page */
- if (io_ctl.check_crcs &&
- (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
- goto out_nospc;
-
- io_ctl_set_generation(&io_ctl, trans->transid);
-
/* Write out the extent entries */
while (node) {
struct btrfs_free_space *e;
e = rb_entry(node, struct btrfs_free_space, offset_index);
- entries++;
+ *entries += 1;
- ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
+ ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
e->bitmap);
if (ret)
- goto out_nospc;
+ goto fail;
if (e->bitmap) {
- list_add_tail(&e->list, &bitmap_list);
- bitmaps++;
+ list_add_tail(&e->list, bitmap_list);
+ *bitmaps += 1;
}
node = rb_next(node);
if (!node && cluster) {
@@ -942,136 +917,289 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
cluster = NULL;
}
}
+ return 0;
+fail:
+ return -ENOSPC;
+}
+
+static noinline_for_stack int
+update_cache_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path, u64 offset,
+ int entries, int bitmaps)
+{
+ struct btrfs_key key;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
+ goto fail;
+ }
+ leaf = path->nodes[0];
+ if (ret > 0) {
+ struct btrfs_key found_key;
+ ASSERT(path->slots[0]);
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+ found_key.offset != offset) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ NULL, GFP_NOFS);
+ btrfs_release_path(path);
+ goto fail;
+ }
+ }
+
+ BTRFS_I(inode)->generation = trans->transid;
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ btrfs_set_free_space_entries(leaf, header, entries);
+ btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+ btrfs_set_free_space_generation(leaf, header, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ return 0;
+
+fail:
+ return -1;
+}
+
+static noinline_for_stack int
+write_pinned_extent_entries(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct io_ctl *io_ctl,
+ int *entries)
+{
+ u64 start, extent_start, extent_end, len;
+ struct extent_io_tree *unpin = NULL;
+ int ret;
+
+ if (!block_group)
+ return 0;
/*
* We want to add any pinned extents to our free space cache
* so we don't leak the space
- */
-
- /*
+ *
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
unpin = root->fs_info->pinned_extents;
- if (block_group)
- start = block_group->key.objectid;
+ start = block_group->key.objectid;
- while (block_group && (start < block_group->key.objectid +
- block_group->key.offset)) {
+ while (start < block_group->key.objectid + block_group->key.offset) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL);
- if (ret) {
- ret = 0;
- break;
- }
+ if (ret)
+ return 0;
/* This pinned extent is out of our range */
if (extent_start >= block_group->key.objectid +
block_group->key.offset)
- break;
+ return 0;
extent_start = max(extent_start, start);
extent_end = min(block_group->key.objectid +
block_group->key.offset, extent_end + 1);
len = extent_end - extent_start;
- entries++;
- ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
+ *entries += 1;
+ ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
start = extent_end;
}
+ return 0;
+}
+
+static noinline_for_stack int
+write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+ int ret;
+
/* Write out the bitmaps */
- list_for_each_safe(pos, n, &bitmap_list) {
+ list_for_each_safe(pos, n, bitmap_list) {
struct btrfs_free_space *entry =
list_entry(pos, struct btrfs_free_space, list);
- ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
+ ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
list_del_init(&entry->list);
}
- /* Zero out the rest of the pages just to make sure */
- io_ctl_zero_remaining_pages(&io_ctl);
-
- ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
- 0, i_size_read(inode), &cached_state);
- io_ctl_drop_pages(&io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ return 0;
+}
- if (ret)
- goto out;
+static int flush_dirty_cache(struct inode *inode)
+{
+ int ret;
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
- if (ret) {
+ if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
GFP_NOFS);
- goto out;
- }
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
- key.type = 0;
+ return ret;
+}
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
- GFP_NOFS);
- goto out;
+static void noinline_for_stack
+cleanup_write_cache_enospc(struct inode *inode,
+ struct io_ctl *io_ctl,
+ struct extent_state **cached_state,
+ struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+
+ list_for_each_safe(pos, n, bitmap_list) {
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+ list_del_init(&entry->list);
}
- leaf = path->nodes[0];
- if (ret > 0) {
- struct btrfs_key found_key;
- ASSERT(path->slots[0]);
- path->slots[0]--;
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
- found_key.offset != offset) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
- NULL, GFP_NOFS);
- btrfs_release_path(path);
+ io_ctl_drop_pages(io_ctl);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, cached_state,
+ GFP_NOFS);
+}
+
+/**
+ * __btrfs_write_out_cache - write out cached info to an inode
+ * @root - the root the inode belongs to
+ * @ctl - the free space cache we are going to write out
+ * @block_group - the block_group for this cache if it belongs to a block_group
+ * @trans - the trans handle
+ * @path - the path to use
+ * @offset - the offset for the key we'll insert
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount. This will return 0 if it was successfull in writing the cache out,
+ * and -1 if it was not.
+ */
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 offset)
+{
+ struct extent_state *cached_state = NULL;
+ struct io_ctl io_ctl;
+ LIST_HEAD(bitmap_list);
+ int entries = 0;
+ int bitmaps = 0;
+ int ret;
+
+ if (!i_size_read(inode))
+ return -1;
+
+ ret = io_ctl_init(&io_ctl, inode, root, 1);
+ if (ret)
+ return -1;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+ down_write(&block_group->data_rwsem);
+ spin_lock(&block_group->lock);
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ up_write(&block_group->data_rwsem);
+ BTRFS_I(inode)->generation = 0;
+ ret = 0;
goto out;
}
+ spin_unlock(&block_group->lock);
}
- BTRFS_I(inode)->generation = trans->transid;
- header = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_free_space_header);
- btrfs_set_free_space_entries(leaf, header, entries);
- btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
- btrfs_set_free_space_generation(leaf, header, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
+ /* Lock all pages first so we can lock the extent safely. */
+ io_ctl_prepare_pages(&io_ctl, inode, 0);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ 0, &cached_state);
+
+ io_ctl_set_generation(&io_ctl, trans->transid);
+
+ /* Write out the extent entries in the free space cache */
+ ret = write_cache_extent_entries(&io_ctl, ctl,
+ block_group, &entries, &bitmaps,
+ &bitmap_list);
+ if (ret)
+ goto out_nospc;
+
+ /*
+ * Some spaces that are freed in the current transaction are pinned,
+ * they will be added into free space cache after the transaction is
+ * committed, we shouldn't lose them.
+ */
+ ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+ if (ret)
+ goto out_nospc;
+
+ /* At last, we write out all the bitmaps. */
+ ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ if (ret)
+ goto out_nospc;
+
+ /* Zero out the rest of the pages just to make sure */
+ io_ctl_zero_remaining_pages(&io_ctl);
+
+ /* Everything is written out, now we dirty the pages in the file. */
+ ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ 0, i_size_read(inode), &cached_state);
+ if (ret)
+ goto out_nospc;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+ /*
+ * Release the pages and unlock the extent, we will flush
+ * them out later
+ */
+ io_ctl_drop_pages(&io_ctl);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
+ goto out;
- err = 0;
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
+ entries, bitmaps);
out:
io_ctl_free(&io_ctl);
- if (err) {
+ if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
- return err;
+ return ret;
out_nospc:
- list_for_each_safe(pos, n, &bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
- list_del_init(&entry->list);
- }
- io_ctl_drop_pages(&io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+
goto out;
}
@@ -1091,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
return 0;
}
+
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
spin_unlock(&block_group->lock);
inode = lookup_free_space_inode(root, block_group, path);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ab485e57b6f..888fbe19079 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -55,7 +55,7 @@ static int caching_kthread(void *data)
key.type = BTRFS_INODE_ITEM_KEY;
again:
/* need to make sure the commit_root doesn't disappear */
- mutex_lock(&root->fs_commit_mutex);
+ down_read(&fs_info->commit_root_sem);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -88,7 +88,7 @@ again:
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(path);
root->cache_progress = last;
- mutex_unlock(&root->fs_commit_mutex);
+ up_read(&fs_info->commit_root_sem);
schedule_timeout(1);
goto again;
} else
@@ -127,7 +127,7 @@ next:
btrfs_unpin_free_ino(root);
out:
wake_up(&root->cache_wait);
- mutex_unlock(&root->fs_commit_mutex);
+ up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
@@ -174,9 +174,13 @@ static void start_caching(struct btrfs_root *root)
BTRFS_LAST_FREE_OBJECTID - objectid + 1);
}
- tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
+ tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
root->root_key.objectid);
- BUG_ON(IS_ERR(tsk)); /* -ENOMEM */
+ if (IS_ERR(tsk)) {
+ btrfs_warn(root->fs_info, "failed to start inode caching task");
+ btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ "disabling inode map caching");
+ }
}
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -205,42 +209,28 @@ again:
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
-
again:
if (root->cached == BTRFS_CACHE_FINISHED) {
- __btrfs_add_free_space(ctl, objectid, 1);
+ __btrfs_add_free_space(pinned, objectid, 1);
} else {
- /*
- * If we are in the process of caching free ino chunks,
- * to avoid adding the same inode number to the free_ino
- * tree twice due to cross transaction, we'll leave it
- * in the pinned tree until a transaction is committed
- * or the caching work is done.
- */
-
- mutex_lock(&root->fs_commit_mutex);
+ down_write(&root->fs_info->commit_root_sem);
spin_lock(&root->cache_lock);
if (root->cached == BTRFS_CACHE_FINISHED) {
spin_unlock(&root->cache_lock);
- mutex_unlock(&root->fs_commit_mutex);
+ up_write(&root->fs_info->commit_root_sem);
goto again;
}
spin_unlock(&root->cache_lock);
start_caching(root);
- if (objectid <= root->cache_progress ||
- objectid >= root->highest_objectid)
- __btrfs_add_free_space(ctl, objectid, 1);
- else
- __btrfs_add_free_space(pinned, objectid, 1);
+ __btrfs_add_free_space(pinned, objectid, 1);
- mutex_unlock(&root->fs_commit_mutex);
+ up_write(&root->fs_info->commit_root_sem);
}
}
@@ -250,7 +240,7 @@ again:
* and others will just be dropped, because the commit root we were
* searching has changed.
*
- * Must be called with root->fs_commit_mutex held
+ * Must be called with root->fs_info->commit_root_sem held
*/
void btrfs_unpin_free_ino(struct btrfs_root *root)
{
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5c4ab9c1894..3668048e16f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -125,7 +125,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* the btree. The caller should have done a btrfs_drop_extents so that
* no overlapping inline items exist in the btree
*/
-static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int extent_inserted,
struct btrfs_root *root, struct inode *inode,
u64 start, size_t size, size_t compressed_size,
@@ -394,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode,
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
+ /*
+ * skip compression for a small file range(<=blocksize) that
+ * isn't an inline extent, since it dosen't save disk space at all.
+ */
+ if ((end - start + 1) <= blocksize &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ goto cleanup_and_bail_uncompressed;
+
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
@@ -685,7 +693,7 @@ retry:
ret = btrfs_reserve_extent(root,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, alloc_hint, &ins, 1);
+ 0, alloc_hint, &ins, 1, 1);
if (ret) {
int i;
@@ -786,7 +794,7 @@ retry:
out:
return ret;
out_free_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
@@ -864,7 +872,8 @@ static noinline int cow_file_range(struct inode *inode,
if (btrfs_is_free_space_inode(inode)) {
WARN_ON_ONCE(1);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock;
}
num_bytes = ALIGN(end - start + 1, blocksize);
@@ -908,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
- &ins, 1);
+ &ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -986,7 +995,7 @@ out:
return ret;
out_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, locked_page,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
@@ -1075,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
- async_cow->work.func = async_cow_start;
- async_cow->work.ordered_func = async_cow_submit;
- async_cow->work.ordered_free = async_cow_free;
- async_cow->work.flags = 0;
+ btrfs_init_work(&async_cow->work, async_cow_start,
+ async_cow_submit, async_cow_free);
nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
- btrfs_queue_worker(&root->fs_info->delalloc_workers,
- &async_cow->work);
+ btrfs_queue_work(root->fs_info->delalloc_workers,
+ &async_cow->work);
if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
wait_event(root->fs_info->async_submit_wait,
@@ -1272,6 +1279,15 @@ next_slot:
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
/*
+ * if there are pending snapshots for this root,
+ * we fall into common COW way.
+ */
+ if (!nolock) {
+ err = btrfs_start_nocow_write(root);
+ if (!err)
+ goto out_check;
+ }
+ /*
* force cow if csum exists in the range.
* this ensure that csum for a given extent are
* either valid or do not exist.
@@ -1290,6 +1306,8 @@ next_slot:
out_check:
if (extent_end <= start) {
path->slots[0]++;
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto next_slot;
}
if (!nocow) {
@@ -1307,8 +1325,11 @@ out_check:
ret = cow_file_range(inode, locked_page,
cow_start, found_key.offset - 1,
page_started, nr_written, 1);
- if (ret)
+ if (ret) {
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto error;
+ }
cow_start = (u64)-1;
}
@@ -1355,8 +1376,11 @@ out_check:
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
- if (ret)
+ if (ret) {
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto error;
+ }
}
extent_clear_unlock_delalloc(inode, cur_offset,
@@ -1364,6 +1388,8 @@ out_check:
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC, PAGE_UNLOCK |
PAGE_SET_PRIVATE2);
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
cur_offset = extent_end;
if (cur_offset > end)
break;
@@ -1843,9 +1869,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
SetPageChecked(page);
page_cache_get(page);
- fixup->work.func = btrfs_writepage_fixup_worker;
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
- btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+ btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
return -EBUSY;
}
@@ -2239,6 +2265,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
return PTR_ERR(root);
}
+ if (btrfs_root_readonly(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
/* step 2: get inode */
key.objectid = backref->inum;
key.type = BTRFS_INODE_ITEM_KEY;
@@ -2568,6 +2599,21 @@ out_kfree:
return NULL;
}
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, start);
+ ASSERT(cache);
+
+ spin_lock(&cache->lock);
+ cache->delalloc_bytes -= len;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+}
+
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
@@ -2629,7 +2675,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
EXTENT_DEFRAG, 1, cached_state);
if (ret) {
u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
- if (last_snapshot >= BTRFS_I(inode)->generation)
+ if (0 && last_snapshot >= BTRFS_I(inode)->generation)
/* the inode is shared */
new = record_old_file_extents(inode, ordered_extent);
@@ -2647,6 +2693,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = NULL;
goto out_unlock;
}
+
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2666,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
+ if (!ret)
+ btrfs_release_delalloc_bytes(root,
+ ordered_extent->start,
+ ordered_extent->disk_len);
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -2718,7 +2769,7 @@ out:
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(root, ordered_extent->start,
- ordered_extent->disk_len);
+ ordered_extent->disk_len, 1);
}
@@ -2759,7 +2810,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workers *workers;
+ struct btrfs_workqueue *workers;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -2768,14 +2819,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
end - start + 1, uptodate))
return 0;
- ordered_extent->work.func = finish_ordered_fn;
- ordered_extent->work.flags = 0;
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
if (btrfs_is_free_space_inode(inode))
- workers = &root->fs_info->endio_freespace_worker;
+ workers = root->fs_info->endio_freespace_worker;
else
- workers = &root->fs_info->endio_write_workers;
- btrfs_queue_worker(workers, &ordered_extent->work);
+ workers = root->fs_info->endio_write_workers;
+ btrfs_queue_work(workers, &ordered_extent->work);
return 0;
}
@@ -2917,14 +2967,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
root->orphan_block_rsv = NULL;
spin_unlock(&root->orphan_lock);
- if (root->orphan_item_inserted &&
+ if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
btrfs_root_refs(&root->root_item) > 0) {
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
if (ret)
btrfs_abort_transaction(trans, root, ret);
else
- root->orphan_item_inserted = 0;
+ clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ &root->state);
}
if (block_rsv) {
@@ -3241,7 +3292,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
btrfs_block_rsv_release(root, root->orphan_block_rsv,
(u64)-1);
- if (root->orphan_block_rsv || root->orphan_item_inserted) {
+ if (root->orphan_block_rsv ||
+ test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
btrfs_end_transaction(trans, root);
@@ -3443,7 +3495,7 @@ cache_acl:
ret = btrfs_load_inode_props(inode, path);
if (ret)
btrfs_err(root->fs_info,
- "error loading props for ino %llu (root %llu): %d\n",
+ "error loading props for ino %llu (root %llu): %d",
btrfs_ino(inode),
root->root_key.objectid, ret);
}
@@ -3968,7 +4020,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* not block aligned since we will be keeping the last block of the
* extent just the way it is.
*/
- if (root->ref_cows || root == root->fs_info->tree_root)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)
btrfs_drop_extent_cache(inode, ALIGN(new_size,
root->sectorsize), (u64)-1, 0);
@@ -4061,7 +4114,9 @@ search_again:
extent_num_bytes);
num_dec = (orig_num_bytes -
extent_num_bytes);
- if (root->ref_cows && extent_start != 0)
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state) &&
+ extent_start != 0)
inode_sub_bytes(inode, num_dec);
btrfs_mark_buffer_dirty(leaf);
} else {
@@ -4075,7 +4130,8 @@ search_again:
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
if (extent_start != 0) {
found_extent = 1;
- if (root->ref_cows)
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
inode_sub_bytes(inode, num_dec);
}
}
@@ -4090,10 +4146,9 @@ search_again:
btrfs_file_extent_other_encoding(leaf, fi) == 0) {
u32 size = new_size - found_key.offset;
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
inode_sub_bytes(inode, item_end + 1 -
new_size);
- }
/*
* update the ram bytes to properly reflect
@@ -4103,7 +4158,8 @@ search_again:
size =
btrfs_file_extent_calc_inline_size(size);
btrfs_truncate_item(root, path, size, 1);
- } else if (root->ref_cows) {
+ } else if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state)) {
inode_sub_bytes(inode, item_end + 1 -
found_key.offset);
}
@@ -4125,8 +4181,9 @@ delete:
} else {
break;
}
- if (found_extent && (root->ref_cows ||
- root == root->fs_info->tree_root)) {
+ if (found_extent &&
+ (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)) {
btrfs_set_path_blocking(path);
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
@@ -4593,7 +4650,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct rb_node *node;
ASSERT(inode->i_state & I_FREEING);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
write_lock(&map_tree->lock);
while (!RB_EMPTY_ROOT(&map_tree->map)) {
@@ -4924,7 +4981,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
again:
@@ -5137,8 +5195,7 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
static void btrfs_dentry_release(struct dentry *dentry)
{
- if (dentry->d_fsdata)
- kfree(dentry->d_fsdata);
+ kfree(dentry->d_fsdata);
}
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
@@ -5154,7 +5211,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_CAST(inode);
}
- return d_splice_alias(inode, dentry);
+ return d_materialise_unique(dentry, inode);
}
unsigned char btrfs_filetype_table[] = {
@@ -5522,6 +5579,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
+ int nitems = name ? 2 : 1;
unsigned long ptr;
int ret;
@@ -5541,7 +5599,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
inode->i_ino = objectid;
- if (dir) {
+ if (dir && name) {
trace_btrfs_inode_request(dir);
ret = btrfs_set_inode_index(dir, index);
@@ -5550,6 +5608,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
iput(inode);
return ERR_PTR(ret);
}
+ } else if (dir) {
+ *index = 0;
}
/*
* index_cnt is ignored for everything but a dir,
@@ -5574,21 +5634,24 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
key[0].offset = 0;
- /*
- * Start new inodes with an inode_ref. This is slightly more
- * efficient for small numbers of hard links since they will
- * be packed into one item. Extended refs will kick in if we
- * add more hard links than can fit in the ref item.
- */
- key[1].objectid = objectid;
- btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
- key[1].offset = ref_objectid;
-
sizes[0] = sizeof(struct btrfs_inode_item);
- sizes[1] = name_len + sizeof(*ref);
+
+ if (name) {
+ /*
+ * Start new inodes with an inode_ref. This is slightly more
+ * efficient for small numbers of hard links since they will
+ * be packed into one item. Extended refs will kick in if we
+ * add more hard links than can fit in the ref item.
+ */
+ key[1].objectid = objectid;
+ btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+ key[1].offset = ref_objectid;
+
+ sizes[1] = name_len + sizeof(*ref);
+ }
path->leave_spinning = 1;
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
+ ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
if (ret != 0)
goto fail;
@@ -5601,12 +5664,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
- ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
- struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
- btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
- ptr = (unsigned long)(ref + 1);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ if (name) {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ }
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
@@ -5642,7 +5707,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
return inode;
fail:
- if (dir)
+ if (dir && name)
BTRFS_I(dir)->index_cnt--;
btrfs_free_path(path);
iput(inode);
@@ -5799,6 +5864,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
}
out_unlock:
btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -5872,6 +5938,7 @@ out_unlock:
inode_dec_link_count(inode);
iput(inode);
}
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
}
@@ -5925,11 +5992,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
err = btrfs_update_inode(trans, root, inode);
if (err)
goto fail;
+ if (inode->i_nlink == 1) {
+ /*
+ * If new hard link count is 1, it's a file created
+ * with open(2) O_TMPFILE flag.
+ */
+ err = btrfs_orphan_del(trans, inode);
+ if (err)
+ goto fail;
+ }
d_instantiate(dentry, inode);
btrfs_log_new_name(trans, inode, NULL, parent);
}
btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
@@ -5996,6 +6073,7 @@ out_fail:
btrfs_end_transaction(trans, root);
if (drop_on_err)
iput(inode);
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
}
@@ -6051,16 +6129,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
- if (ret) {
- char *kaddr = kmap_atomic(page);
- unsigned long copy_size = min_t(u64,
- PAGE_CACHE_SIZE - pg_offset,
- max_size - extent_offset);
- memset(kaddr + pg_offset, 0, copy_size);
- kunmap_atomic(kaddr);
- }
kfree(tmp);
- return 0;
+ return ret;
}
/*
@@ -6078,7 +6148,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
{
int ret;
int err = 0;
- u64 bytenr;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -6092,7 +6161,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_trans_handle *trans = NULL;
- int compress_type;
+ const bool new_inline = !page || create;
again:
read_lock(&em_tree->lock);
@@ -6166,7 +6235,6 @@ again:
found_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- compress_type = btrfs_file_extent_compression(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
@@ -6201,32 +6269,10 @@ next:
goto not_found_em;
}
- em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+ btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
+
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- em->start = extent_start;
- em->len = extent_end - extent_start;
- em->orig_start = extent_start -
- btrfs_file_extent_offset(leaf, item);
- em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
- item);
- bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
- if (bytenr == 0) {
- em->block_start = EXTENT_MAP_HOLE;
- goto insert;
- }
- if (compress_type != BTRFS_COMPRESS_NONE) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- em->block_start = bytenr;
- em->block_len = em->orig_block_len;
- } else {
- bytenr += btrfs_file_extent_offset(leaf, item);
- em->block_start = bytenr;
- em->block_len = em->len;
- if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
- }
goto insert;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
@@ -6235,12 +6281,8 @@ next:
size_t extent_offset;
size_t copy_size;
- em->block_start = EXTENT_MAP_INLINE;
- if (!page || create) {
- em->start = extent_start;
- em->len = extent_end - extent_start;
+ if (new_inline)
goto out;
- }
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_offset = page_offset(page) + pg_offset - extent_start;
@@ -6250,10 +6292,6 @@ next:
em->len = ALIGN(copy_size, root->sectorsize);
em->orig_block_len = em->len;
em->orig_start = em->start;
- if (compress_type) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
@@ -6261,7 +6299,10 @@ next:
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ err = ret;
+ goto out;
+ }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6297,8 +6338,6 @@ next:
set_extent_uptodate(io_tree, em->start,
extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
- } else {
- WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
}
not_found:
em->start = start;
@@ -6515,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
alloc_hint = get_extent_allocation_hint(inode, start, len);
ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
- alloc_hint, &ins, 1);
+ alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret);
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
ins.offset, ins.offset, ins.offset, 0);
if (IS_ERR(em)) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
return em;
}
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
free_extent_map(em);
return ERR_PTR(ret);
}
@@ -6550,6 +6589,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
int ret;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 disk_bytenr;
@@ -6626,6 +6666,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
if (btrfs_extent_readonly(root, disk_bytenr))
goto out;
+
+ num_bytes = min(offset + *len, extent_end) - offset;
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 range_end;
+
+ range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+ ret = test_range_bit(io_tree, offset, range_end,
+ EXTENT_DELALLOC, 0, NULL);
+ if (ret) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
btrfs_release_path(path);
/*
@@ -6654,7 +6708,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
disk_bytenr += backref_offset;
disk_bytenr += offset - key.offset;
- num_bytes = min(offset + *len, extent_end) - offset;
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out;
/*
@@ -6668,6 +6721,76 @@ out:
return ret;
}
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
+{
+ struct radix_tree_root *root = &inode->i_mapping->page_tree;
+ int found = false;
+ void **pagep = NULL;
+ struct page *page = NULL;
+ int start_idx;
+ int end_idx;
+
+ start_idx = start >> PAGE_CACHE_SHIFT;
+
+ /*
+ * end is the last byte in the last page. end == start is legal
+ */
+ end_idx = end >> PAGE_CACHE_SHIFT;
+
+ rcu_read_lock();
+
+ /* Most of the code in this while loop is lifted from
+ * find_get_page. It's been modified to begin searching from a
+ * page and return just the first page found in that range. If the
+ * found idx is less than or equal to the end idx then we know that
+ * a page exists. If no pages are found or if those pages are
+ * outside of the range then we're fine (yay!) */
+ while (page == NULL &&
+ radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
+ page = radix_tree_deref_slot(pagep);
+ if (unlikely(!page))
+ break;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ page = NULL;
+ continue;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so return it without
+ * attempting to raise page count.
+ */
+ page = NULL;
+ break; /* TODO: Is this relevant for this use case? */
+ }
+
+ if (!page_cache_get_speculative(page)) {
+ page = NULL;
+ continue;
+ }
+
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != *pagep)) {
+ page_cache_release(page);
+ page = NULL;
+ }
+ }
+
+ if (page) {
+ if (page->index <= end_idx)
+ found = true;
+ page_cache_release(page);
+ }
+
+ rcu_read_unlock();
+ return found;
+}
+
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct extent_state **cached_state, int writing)
{
@@ -6692,10 +6815,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* invalidate needs to happen so that reads after a write do not
* get stale data.
*/
- if (!ordered && (!writing ||
- !test_range_bit(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, EXTENT_UPTODATE, 0,
- *cached_state)))
+ if (!ordered &&
+ (!writing ||
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)))
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -7024,10 +7146,9 @@ again:
if (!ret)
goto out_test;
- ordered->work.func = finish_ordered_fn;
- ordered->work.flags = 0;
- btrfs_queue_worker(&root->fs_info->endio_write_workers,
- &ordered->work);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(root->fs_info->endio_write_workers,
+ &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -7078,7 +7199,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
* before atomic variable goto zero, we must make sure
* dip->errors is perceived to be set.
*/
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
}
/* if there are more bios still pending for this dio, just exit */
@@ -7258,7 +7379,7 @@ out_err:
* before atomic variable goto zero, we must
* make sure dip->errors is perceived to be set.
*/
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
if (atomic_dec_and_test(&dip->pending_bios))
bio_io_error(dip->orig_bio);
@@ -7335,7 +7456,7 @@ free_ordered:
if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len);
+ ordered->disk_len, 1);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
@@ -7343,39 +7464,30 @@ free_ordered:
}
static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ const struct iov_iter *iter, loff_t offset)
{
int seg;
int i;
- size_t size;
- unsigned long addr;
unsigned blocksize_mask = root->sectorsize - 1;
ssize_t retval = -EINVAL;
- loff_t end = offset;
if (offset & blocksize_mask)
goto out;
- /* Check the memory alignment. Blocks cannot straddle pages */
- for (seg = 0; seg < nr_segs; seg++) {
- addr = (unsigned long)iov[seg].iov_base;
- size = iov[seg].iov_len;
- end += size;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
-
- /* If this is a write we don't need to check anymore */
- if (rw & WRITE)
- continue;
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ goto out;
- /*
- * Check to make sure we don't have duplicate iov_base's in this
- * iovec, if so return EINVAL, otherwise we'll get csum errors
- * when reading back.
- */
- for (i = seg + 1; i < nr_segs; i++) {
- if (iov[seg].iov_base == iov[i].iov_base)
+ /* If this is a write we don't need to check anymore */
+ if (rw & WRITE)
+ return 0;
+ /*
+ * Check to make sure we don't have duplicate iov_base's in this
+ * iovec, if so return EINVAL, otherwise we'll get csum errors
+ * when reading back.
+ */
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
goto out;
}
}
@@ -7385,8 +7497,7 @@ out:
}
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -7396,23 +7507,22 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
bool relock = false;
ssize_t ret;
- if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
- offset, nr_segs))
+ if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
return 0;
atomic_inc(&inode->i_dio_count);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
/*
- * The generic stuff only does filemap_write_and_wait_range, which isn't
- * enough if we've written compressed pages to this area, so we need to
- * call btrfs_wait_ordered_range to make absolutely sure that any
- * outstanding dirty pages are on disk.
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so
+ * we need to flush the dirty pages again to make absolutely sure
+ * that any outstanding dirty pages are on disk.
*/
- count = iov_length(iov, nr_segs);
- ret = btrfs_wait_ordered_range(inode, offset, count);
- if (ret)
- return ret;
+ count = iov_iter_count(iter);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_fdatawrite_range(inode->i_mapping, offset, count);
if (rw & WRITE) {
/*
@@ -7436,7 +7546,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = __blockdev_direct_IO(rw, iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ iter, offset, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, flags);
if (rw & WRITE) {
if (ret < 0 && ret != -EIOCBQUEUED)
@@ -7944,7 +8054,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
if (err)
btrfs_err(new_root->fs_info,
- "error inheriting subvolume %llu properties: %d\n",
+ "error inheriting subvolume %llu properties: %d",
new_root->root_key.objectid, err);
err = btrfs_update_inode(trans, new_root, inode);
@@ -8263,7 +8373,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = 0ULL;
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
/* force full log commit if subvolume involved. */
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
} else {
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
@@ -8404,7 +8514,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
work->inode = inode;
work->wait = wait;
work->delay_iput = delay_iput;
- work->work.func = btrfs_run_delalloc_work;
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -8419,7 +8529,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
+ int nr)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -8431,6 +8542,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
+ mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
@@ -8456,19 +8568,16 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
goto out;
}
list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
-
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ ret++;
+ if (nr != -1 && ret >= nr)
+ goto out;
cond_resched();
spin_lock(&root->delalloc_lock);
}
spin_unlock(&root->delalloc_lock);
- list_for_each_entry_safe(work, next, &works, list) {
- list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
- }
- return 0;
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
@@ -8480,6 +8589,7 @@ out:
list_splice_tail(&splice, &root->delalloc_inodes);
spin_unlock(&root->delalloc_lock);
}
+ mutex_unlock(&root->delalloc_mutex);
return ret;
}
@@ -8490,7 +8600,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return -EROFS;
- ret = __start_delalloc_inodes(root, delay_iput);
+ ret = __start_delalloc_inodes(root, delay_iput, -1);
+ if (ret > 0)
+ ret = 0;
/*
* the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
@@ -8507,7 +8619,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
return ret;
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -8518,9 +8631,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
INIT_LIST_HEAD(&splice);
+ mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
- while (!list_empty(&splice)) {
+ while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_fs_root(root);
@@ -8529,15 +8643,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = __start_delalloc_inodes(root, delay_iput);
+ ret = __start_delalloc_inodes(root, delay_iput, nr);
btrfs_put_fs_root(root);
- if (ret)
+ if (ret < 0)
goto out;
+ if (nr != -1) {
+ nr -= ret;
+ WARN_ON(nr < 0);
+ }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
+ ret = 0;
atomic_inc(&fs_info->async_submit_draining);
while (atomic_read(&fs_info->nr_async_submits) ||
atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8665,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
atomic_read(&fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&fs_info->async_submit_draining);
- return 0;
out:
if (!list_empty_careful(&splice)) {
spin_lock(&fs_info->delalloc_root_lock);
list_splice_tail(&splice, &fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
}
+ mutex_unlock(&fs_info->delalloc_root_mutex);
return ret;
}
@@ -8708,7 +8827,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
cur_bytes = max(cur_bytes, min_size);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
- *alloc_hint, &ins, 1);
+ *alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -8722,7 +8841,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
BTRFS_FILE_EXTENT_PREALLOC);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid,
- ins.offset);
+ ins.offset, 0);
btrfs_abort_transaction(trans, root, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -8832,6 +8951,66 @@ static int btrfs_permission(struct inode *inode, int mask)
return generic_permission(inode, mask);
}
+static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ u64 objectid;
+ u64 index;
+ int ret = 0;
+
+ /*
+ * 5 units required for adding orphan entry
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_find_free_ino(root, &objectid);
+ if (ret)
+ goto out;
+
+ inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ btrfs_ino(dir), objectid, mode, &index);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ goto out;
+ }
+
+ ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ if (ret)
+ goto out;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto out;
+
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret)
+ goto out;
+
+ d_tmpfile(dentry, inode);
+ mark_inode_dirty(inode);
+
+out:
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ iput(inode);
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
+
+ return ret;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -8852,6 +9031,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
+ .tmpfile = btrfs_tmpfile,
};
static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b0134892dc7..47aceb494d1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -58,6 +58,33 @@
#include "dev-replace.h"
#include "props.h"
#include "sysfs.h"
+#include "qgroup.h"
+
+#ifdef CONFIG_64BIT
+/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
+ * structures are incorrect, as the timespec structure from userspace
+ * is 4 bytes too small. We define these alternatives here to teach
+ * the kernel about the 32-bit struct packing.
+ */
+struct btrfs_ioctl_timespec_32 {
+ __u64 sec;
+ __u32 nsec;
+} __attribute__ ((__packed__));
+
+struct btrfs_ioctl_received_subvol_args_32 {
+ char uuid[BTRFS_UUID_SIZE]; /* in */
+ __u64 stransid; /* in */
+ __u64 rtransid; /* out */
+ struct btrfs_ioctl_timespec_32 stime; /* in */
+ struct btrfs_ioctl_timespec_32 rtime; /* out */
+ __u64 flags; /* in */
+ __u64 reserved[16]; /* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+ struct btrfs_ioctl_received_subvol_args_32)
+#endif
+
static int btrfs_clone(struct inode *src, struct inode *inode,
u64 off, u64 olen, u64 olen_aligned, u64 destoff);
@@ -109,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
void btrfs_update_iflags(struct inode *inode)
{
struct btrfs_inode *ip = BTRFS_I(inode);
-
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ unsigned int new_fl = 0;
if (ip->flags & BTRFS_INODE_SYNC)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (ip->flags & BTRFS_INODE_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (ip->flags & BTRFS_INODE_APPEND)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (ip->flags & BTRFS_INODE_NOATIME)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (ip->flags & BTRFS_INODE_DIRSYNC)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+
+ set_mask_bits(&inode->i_flags,
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
+ new_fl);
}
/*
@@ -585,6 +615,23 @@ fail:
return ret;
}
+static void btrfs_wait_nocow_write(struct btrfs_root *root)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(&root->subv_writers->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ writers = percpu_counter_sum(&root->subv_writers->counter);
+ if (writers)
+ schedule();
+
+ finish_wait(&root->subv_writers->wait, &wait);
+ } while (writers);
+}
+
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct dentry *dentry, char *name, int namelen,
u64 *async_transid, bool readonly,
@@ -595,18 +642,24 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_trans_handle *trans;
int ret;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ atomic_inc(&root->will_be_snapshoted);
+ smp_mb__after_atomic();
+ btrfs_wait_nocow_write(root);
+
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
- return ret;
+ goto out;
btrfs_wait_ordered_extents(root, -1);
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot)
- return -ENOMEM;
+ if (!pending_snapshot) {
+ ret = -ENOMEM;
+ goto out;
+ }
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +676,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
&pending_snapshot->qgroup_reserved,
false);
if (ret)
- goto out;
+ goto free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -662,6 +715,35 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto fail;
+ /*
+ * If orphan cleanup did remove any orphans, it means the tree was
+ * modified and therefore the commit root is not the same as the
+ * current root anymore. This is a problem, because send uses the
+ * commit root and therefore can see inode items that don't exist
+ * in the current root anymore, and for example make calls to
+ * btrfs_iget, which will do tree lookups based on the current root
+ * and not on the commit root. Those lookups will fail, returning a
+ * -ESTALE error, and making send fail with that error. So make sure
+ * a send does not see any orphans we have just removed, and that it
+ * will see the same inodes regardless of whether a transaction
+ * commit happened before it started (meaning that the commit root
+ * will be the same as the current root) or not.
+ */
+ if (readonly && pending_snapshot->snap->node !=
+ pending_snapshot->snap->commit_root) {
+ trans = btrfs_join_transaction(pending_snapshot->snap);
+ if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
+ ret = PTR_ERR(trans);
+ goto fail;
+ }
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans,
+ pending_snapshot->snap);
+ if (ret)
+ goto fail;
+ }
+ }
+
inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -674,8 +756,10 @@ fail:
btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
-out:
+free:
kfree(pending_snapshot);
+out:
+ atomic_dec(&root->will_be_snapshoted);
return ret;
}
@@ -884,12 +968,14 @@ static int find_new_extents(struct btrfs_root *root,
min_key.type = BTRFS_EXTENT_DATA_KEY;
min_key.offset = *off;
- path->keep_locks = 1;
-
while (1) {
+ path->keep_locks = 1;
ret = btrfs_search_forward(root, &min_key, path, newer_than);
if (ret != 0)
goto none;
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+process_slot:
if (min_key.objectid != ino)
goto none;
if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +994,12 @@ static int find_new_extents(struct btrfs_root *root,
return 0;
}
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(leaf)) {
+ btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
+ goto process_slot;
+ }
+
if (min_key.offset == (u64)-1)
goto none;
@@ -935,10 +1027,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
read_unlock(&em_tree->lock);
if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + len - 1;
+
/* get the big lock and read metadata off disk */
- lock_extent(io_tree, start, start + len - 1);
+ lock_extent_bits(io_tree, start, end, 0, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- unlock_extent(io_tree, start, start + len - 1);
+ unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
if (IS_ERR(em))
return NULL;
@@ -957,7 +1052,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return false;
next = defrag_lookup_extent(inode, em->start + em->len);
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
+ (em->block_start + em->block_len == next->block_start))
ret = false;
free_extent_map(next);
@@ -1076,10 +1172,12 @@ again:
page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
while (1) {
- lock_extent(tree, page_start, page_end);
+ lock_extent_bits(tree, page_start, page_end,
+ 0, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
- unlock_extent(tree, page_start, page_end);
+ unlock_extent_cached(tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
if (!ordered)
break;
@@ -1356,8 +1454,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
}
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+ if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
/* the filemap_flush will queue IO into the worker threads, but
@@ -1403,6 +1505,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
char *sizestr;
+ char *retptr;
char *devstr = NULL;
int ret = 0;
int mod = 0;
@@ -1432,11 +1535,12 @@ static noinline int btrfs_ioctl_resize(struct file *file,
sizestr = vol_args->name;
devstr = strchr(sizestr, ':');
if (devstr) {
- char *end;
sizestr = devstr + 1;
*devstr = '\0';
devstr = vol_args->name;
- devid = simple_strtoull(devstr, &end, 10);
+ ret = kstrtoull(devstr, 10, &devid);
+ if (ret)
+ goto out_free;
if (!devid) {
ret = -EINVAL;
goto out_free;
@@ -1470,8 +1574,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
mod = 1;
sizestr++;
}
- new_size = memparse(sizestr, NULL);
- if (new_size == 0) {
+ new_size = memparse(sizestr, &retptr);
+ if (*retptr != '\0' || new_size == 0) {
ret = -EINVAL;
goto out_free;
}
@@ -1492,7 +1596,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = old_size - new_size;
} else if (mod > 0) {
if (new_size > ULLONG_MAX - old_size) {
- ret = -EINVAL;
+ ret = -ERANGE;
goto out_free;
}
new_size = old_size + new_size;
@@ -1573,7 +1677,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(src_inode)->root->fs_info,
"Snapshot src from another FS");
- ret = -EINVAL;
+ ret = -EXDEV;
} else if (!inode_owner_or_capable(src_inode)) {
/*
* Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1901,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
if (key.objectid == root->root_key.objectid) {
- ret = -ENOTEMPTY;
+ ret = -EPERM;
+ btrfs_err(root->fs_info, "deleting default subvolume "
+ "%llu is not allowed", key.objectid);
goto out;
}
btrfs_release_path(path);
@@ -1854,7 +1960,8 @@ static noinline int copy_to_sk(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- char *buf,
+ size_t *buf_size,
+ char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
{
@@ -1886,13 +1993,25 @@ static noinline int copy_to_sk(struct btrfs_root *root,
if (!key_in_sk(key, sk))
continue;
- if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+ if (sizeof(sh) + item_len > *buf_size) {
+ if (*num_found) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * return one empty item back for v1, which does not
+ * handle -EOVERFLOW
+ */
+
+ *buf_size = sizeof(sh) + item_len;
item_len = 0;
+ ret = -EOVERFLOW;
+ }
- if (sizeof(sh) + item_len + *sk_offset >
- BTRFS_SEARCH_ARGS_BUFSIZE) {
+ if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
ret = 1;
- goto overflow;
+ goto out;
}
sh.objectid = key->objectid;
@@ -1902,20 +2021,33 @@ static noinline int copy_to_sk(struct btrfs_root *root,
sh.transid = found_transid;
/* copy search result header */
- memcpy(buf + *sk_offset, &sh, sizeof(sh));
+ if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += sizeof(sh);
if (item_len) {
- char *p = buf + *sk_offset;
+ char __user *up = ubuf + *sk_offset;
/* copy the item */
- read_extent_buffer(leaf, p,
- item_off, item_len);
+ if (read_extent_buffer_to_user(leaf, up,
+ item_off, item_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += item_len;
}
(*num_found)++;
- if (*num_found >= sk->nr_items)
- break;
+ if (ret) /* -EOVERFLOW from above */
+ goto out;
+
+ if (*num_found >= sk->nr_items) {
+ ret = 1;
+ goto out;
+ }
}
advance_key:
ret = 0;
@@ -1930,22 +2062,37 @@ advance_key:
key->objectid++;
} else
ret = 1;
-overflow:
+out:
+ /*
+ * 0: all items from this leaf copied, continue with next
+ * 1: * more items can be copied, but unused buffer is too small
+ * * all items were found
+ * Either way, it will stops the loop which iterates to the next
+ * leaf
+ * -EOVERFLOW: item was to large for buffer
+ * -EFAULT: could not copy extent buffer back to userspace
+ */
return ret;
}
static noinline int search_ioctl(struct inode *inode,
- struct btrfs_ioctl_search_args *args)
+ struct btrfs_ioctl_search_key *sk,
+ size_t *buf_size,
+ char __user *ubuf)
{
struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_ioctl_search_key *sk = &args->key;
struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
int ret;
int num_found = 0;
unsigned long sk_offset = 0;
+ if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
+ *buf_size = sizeof(struct btrfs_ioctl_search_header);
+ return -EOVERFLOW;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1979,14 +2126,15 @@ static noinline int search_ioctl(struct inode *inode,
ret = 0;
goto err;
}
- ret = copy_to_sk(root, path, &key, sk, args->buf,
+ ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
- if (ret || num_found >= sk->nr_items)
+ if (ret)
break;
}
- ret = 0;
+ if (ret > 0)
+ ret = 0;
err:
sk->nr_items = num_found;
btrfs_free_path(path);
@@ -1996,22 +2144,73 @@ err:
static noinline int btrfs_ioctl_tree_search(struct file *file,
void __user *argp)
{
- struct btrfs_ioctl_search_args *args;
- struct inode *inode;
- int ret;
+ struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_key sk;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- args = memdup_user(argp, sizeof(*args));
- if (IS_ERR(args))
- return PTR_ERR(args);
+ uargs = (struct btrfs_ioctl_search_args __user *)argp;
+
+ if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
+ return -EFAULT;
+
+ buf_size = sizeof(uargs->buf);
inode = file_inode(file);
- ret = search_ioctl(inode, args);
- if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+
+ /*
+ * In the origin implementation an overflow is handled by returning a
+ * search header with a len of zero, so reset ret.
+ */
+ if (ret == -EOVERFLOW)
+ ret = 0;
+
+ if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
ret = -EFAULT;
- kfree(args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+ void __user *argp)
+{
+ struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 args;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
+ const size_t buf_limit = 16 * 1024 * 1024;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* copy search header and buffer size */
+ uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
+ if (copy_from_user(&args, uarg, sizeof(args)))
+ return -EFAULT;
+
+ buf_size = args.buf_size;
+
+ if (buf_size < sizeof(struct btrfs_ioctl_search_header))
+ return -EOVERFLOW;
+
+ /* limit result size to 16MB */
+ if (buf_size > buf_limit)
+ buf_size = buf_limit;
+
+ inode = file_inode(file);
+ ret = search_ioctl(inode, &args.key, &buf_size,
+ (char *)(&uarg->buf[0]));
+ if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
+ ret = -EFAULT;
+ else if (ret == -EOVERFLOW &&
+ copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
+ ret = -EFAULT;
+
return ret;
}
@@ -2147,6 +2346,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv block_rsv;
+ u64 root_flags;
u64 qgroup_reserved;
int namelen;
int ret;
@@ -2168,6 +2368,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out;
+
err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
if (err == -EINTR)
goto out_drop_write;
@@ -2229,6 +2430,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
}
mutex_lock(&inode->i_mutex);
+
+ /*
+ * Don't allow to delete a subvolume with send in progress. This is
+ * inside the i_mutex so the error handling that has to drop the bit
+ * again is not run concurrently.
+ */
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ if (dest->send_in_progress == 0) {
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ } else {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(root->fs_info,
+ "Attempt to delete subvolume %llu during send",
+ dest->root_key.objectid);
+ err = -EPERM;
+ goto out_dput;
+ }
+
err = d_invalidate(dentry);
if (err)
goto out_unlock;
@@ -2274,7 +2496,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_item.drop_level = 0;
btrfs_set_root_refs(&dest->root_item, 0);
- if (!xchg(&dest->orphan_item_inserted, 1)) {
+ if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
root->fs_info->tree_root,
dest->root_key.objectid);
@@ -2317,11 +2539,19 @@ out_release:
out_up_write:
up_write(&root->fs_info->subvol_sem);
out_unlock:
+ if (err) {
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ }
mutex_unlock(&inode->i_mutex);
if (!err) {
shrink_dcache_sb(root->fs_info->sb);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
+ ASSERT(dest->send_in_progress == 0);
/* the last ref */
if (dest->cache_inode) {
@@ -2485,9 +2715,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
int ret = 0;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
if (!fi_args)
return -ENOMEM;
@@ -2502,6 +2729,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
}
mutex_unlock(&fs_devices->device_list_mutex);
+ fi_args->nodesize = root->fs_info->super_copy->nodesize;
+ fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
+ fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
+
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
@@ -2517,9 +2748,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
int ret = 0;
char *s_uuid = NULL;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
@@ -2597,10 +2825,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
ordered = btrfs_lookup_first_ordered_extent(inode,
off + len - 1);
- if (!ordered &&
+ if ((!ordered ||
+ ordered->file_offset + ordered->len <= off ||
+ ordered->file_offset >= off + len) &&
!test_range_bit(&BTRFS_I(inode)->io_tree, off,
- off + len - 1, EXTENT_DELALLOC, 0, NULL))
+ off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
break;
+ }
unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
if (ordered)
btrfs_put_ordered_extent(ordered);
@@ -2840,6 +3073,129 @@ out:
return ret;
}
+/* Helper to check and see if this root currently has a ref on the given disk
+ * bytenr. If it does then we need to update the quota for this root. This
+ * doesn't do anything if quotas aren't enabled.
+ */
+static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 disko)
+{
+ struct seq_list tree_mod_seq_elem = {};
+ struct ulist *roots;
+ struct ulist_iterator uiter;
+ struct ulist_node *root_node = NULL;
+ int ret;
+
+ if (!root->fs_info->quota_enabled)
+ return 1;
+
+ btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+ ret = btrfs_find_all_roots(trans, root->fs_info, disko,
+ tree_mod_seq_elem.seq, &roots);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ ULIST_ITER_INIT(&uiter);
+ while ((root_node = ulist_next(roots, &uiter))) {
+ if (root_node->val == root->objectid) {
+ ret = 1;
+ break;
+ }
+ }
+ ulist_free(roots);
+out:
+ btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+ return ret;
+}
+
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ u64 endoff,
+ const u64 destoff,
+ const u64 olen)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ /*
+ * We round up to the block size at eof when determining which
+ * extents to clone above, but shouldn't round up the file size.
+ */
+ if (endoff > destoff + olen)
+ endoff = destoff + olen;
+ if (endoff > inode->i_size)
+ btrfs_i_size_write(inode, endoff);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans, root);
+out:
+ return ret;
+}
+
+static void clone_update_extent_map(struct inode *inode,
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_path *path,
+ const u64 hole_offset,
+ const u64 hole_len)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ return;
+ }
+
+ if (path) {
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
+ em->generation = -1;
+ if (btrfs_file_extent_type(path->nodes[0], fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ } else {
+ em->start = hole_offset;
+ em->len = hole_len;
+ em->ram_bytes = em->len;
+ em->orig_start = hole_offset;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->block_len = 0;
+ em->orig_block_len = 0;
+ em->compress_type = BTRFS_COMPRESS_NONE;
+ em->generation = trans->transid;
+ }
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ }
+
+ if (unlikely(ret))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+}
+
/**
* btrfs_clone() - clone a range from inode file to another
*
@@ -2852,7 +3208,8 @@ out:
* @destoff: Offset within @inode to start clone
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
- u64 off, u64 olen, u64 olen_aligned, u64 destoff)
+ const u64 off, const u64 olen, const u64 olen_aligned,
+ const u64 destoff)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path = NULL;
@@ -2863,7 +3220,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u32 nritems;
int slot;
int ret;
- u64 len = olen_aligned;
+ int no_quota;
+ const u64 len = olen_aligned;
+ u64 last_disko = 0;
+ u64 last_dest_end = destoff;
ret = -ENOMEM;
buf = vmalloc(btrfs_level_size(root, 0));
@@ -2880,7 +3240,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
/* clone data */
key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
+ key.offset = off;
while (1) {
/*
@@ -2892,9 +3252,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
0, 0);
if (ret < 0)
goto out;
+ /*
+ * First search, if no extent item that starts at offset off was
+ * found but the previous item is an extent item, it's possible
+ * it might overlap our target range, therefore process it.
+ */
+ if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0] - 1);
+ if (key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
+ no_quota = 1;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
@@ -2919,7 +3291,7 @@ process_slot:
u64 disko = 0, diskl = 0;
u64 datao = 0, datal = 0;
u8 comp;
- u64 endoff;
+ u64 drop_start;
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
@@ -2940,10 +3312,16 @@ process_slot:
extent);
}
- if (key.offset + datal <= off ||
- key.offset >= off + len - 1) {
+ /*
+ * The first search might have left us at an extent
+ * item that ends before our target range's start, can
+ * happen if we have holes and NO_HOLES feature enabled.
+ */
+ if (key.offset + datal <= off) {
path->slots[0]++;
goto process_slot;
+ } else if (key.offset >= off + len) {
+ break;
}
size = btrfs_item_size_nr(leaf, slot);
@@ -2962,6 +3340,18 @@ process_slot:
new_key.offset = destoff;
/*
+ * Deal with a hole that doesn't have an extent item
+ * that represents it (NO_HOLES feature enabled).
+ * This hole is either in the middle of the cloning
+ * range or at the beginning (fully overlaps it or
+ * partially overlaps it).
+ */
+ if (new_key.offset != last_dest_end)
+ drop_start = last_dest_end;
+ else
+ drop_start = new_key.offset;
+
+ /*
* 1 - adjusting old extent (we may have to split it)
* 1 - add new extent
* 1 - inode update
@@ -2979,23 +3369,24 @@ process_slot:
* | ------------- extent ------------- |
*/
- /* substract range b */
+ /* subtract range b */
if (key.offset + datal > off + len)
datal = off + len - key.offset;
- /* substract range a */
+ /* subtract range a */
if (off > key.offset) {
datao += off - key.offset;
datal -= off - key.offset;
}
ret = btrfs_drop_extents(trans, root, inode,
- new_key.offset,
+ drop_start,
new_key.offset + datal,
1);
if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans,
+ root, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3026,6 +3417,28 @@ process_slot:
datao);
btrfs_set_file_extent_num_bytes(leaf, extent,
datal);
+
+ /*
+ * We need to look up the roots that point at
+ * this bytenr and see if the new root does. If
+ * it does not we need to make sure we update
+ * quotas appropriately.
+ */
+ if (disko && root != BTRFS_I(src)->root &&
+ disko != last_disko) {
+ no_quota = check_ref(trans, root,
+ disko);
+ if (no_quota < 0) {
+ btrfs_abort_transaction(trans,
+ root,
+ ret);
+ btrfs_end_transaction(trans,
+ root);
+ ret = no_quota;
+ goto out;
+ }
+ }
+
if (disko) {
inode_add_bytes(inode, datal);
ret = btrfs_inc_extent_ref(trans, root,
@@ -3033,7 +3446,7 @@ process_slot:
root->root_key.objectid,
btrfs_ino(inode),
new_key.offset - datao,
- 0);
+ no_quota);
if (ret) {
btrfs_abort_transaction(trans,
root,
@@ -3047,6 +3460,8 @@ process_slot:
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
u64 trim = 0;
+ u64 aligned_end = 0;
+
if (off > key.offset) {
skip = off - key.offset;
new_key.offset += skip;
@@ -3063,13 +3478,16 @@ process_slot:
size -= skip + trim;
datal -= skip + trim;
+ aligned_end = ALIGN(new_key.offset + datal,
+ root->sectorsize);
ret = btrfs_drop_extents(trans, root, inode,
- new_key.offset,
- new_key.offset + datal,
+ drop_start,
+ aligned_end,
1);
if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans,
+ root, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3098,38 +3516,62 @@ process_slot:
inode_add_bytes(inode, datal);
}
+ /* If we have an implicit hole (NO_HOLES feature). */
+ if (drop_start < new_key.offset)
+ clone_update_extent_map(inode, trans,
+ NULL, drop_start,
+ new_key.offset - drop_start);
+
+ clone_update_extent_map(inode, trans, path, 0, 0);
+
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-
- /*
- * we round up to the block size at eof when
- * determining which extents to clone above,
- * but shouldn't round up the file size
- */
- endoff = new_key.offset + datal;
- if (endoff > destoff+olen)
- endoff = destoff+olen;
- if (endoff > inode->i_size)
- btrfs_i_size_write(inode, endoff);
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- btrfs_end_transaction(trans, root);
+ last_dest_end = new_key.offset + datal;
+ ret = clone_finish_inode_update(trans, inode,
+ last_dest_end,
+ destoff, olen);
+ if (ret)
goto out;
- }
- ret = btrfs_end_transaction(trans, root);
+ if (new_key.offset + datal >= destoff + len)
+ break;
}
btrfs_release_path(path);
key.offset++;
}
ret = 0;
+ if (last_dest_end < destoff + len) {
+ /*
+ * We have an implicit hole (NO_HOLES feature is enabled) that
+ * fully or partially overlaps our cloning range at its end.
+ */
+ btrfs_release_path(path);
+
+ /*
+ * 1 - remove extent(s)
+ * 1 - inode update
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_drop_extents(trans, root, inode,
+ last_dest_end, destoff + len, 1);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ clone_update_extent_map(inode, trans, NULL, last_dest_end,
+ destoff + len - last_dest_end);
+ ret = clone_finish_inode_update(trans, inode, destoff + len,
+ destoff, olen);
+ }
+
out:
- btrfs_release_path(path);
btrfs_free_path(path);
vfree(buf);
return ret;
@@ -3153,8 +3595,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* decompress into destination's address_space (the file offset
* may change, so source mapping won't do), then recompress (or
* otherwise reinsert) a subrange.
- * - allow ranges within the same file to be cloned (provided
- * they don't overlap)?
+ *
+ * - split destination inode's inline extents. The inline extents can
+ * be either compressed or non-compressed.
*/
/* the destination must be opened for writing */
@@ -3240,15 +3683,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
goto out_unlock;
}
- /* truncate page cache pages from target inode range */
- truncate_inode_pages_range(&inode->i_data, destoff,
- PAGE_CACHE_ALIGN(destoff + len) - 1);
+ /*
+ * Lock the target range too. Right after we replace the file extent
+ * items in the fs tree (which now point to the cloned data), we might
+ * have a worker replace them with extent items relative to a write
+ * operation that was issued before this clone operation (i.e. confront
+ * with inode.c:btrfs_finish_ordered_io).
+ */
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
- lock_extent_range(src, off, len);
+ lock_extent_range(src, lock_start, lock_len);
+ } else {
+ lock_extent_range(src, off, len);
+ lock_extent_range(inode, destoff, len);
+ }
ret = btrfs_clone(src, inode, off, olen, len, destoff);
- unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_end = max_t(u64, off, destoff) + len - 1;
+
+ unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
+ } else {
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
+ destoff + len - 1);
+ }
+ /*
+ * Truncate page cache pages so that future reads will see the cloned
+ * data immediately and not the previous data.
+ */
+ truncate_inode_pages_range(&inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len) - 1);
out_unlock:
if (!same_inode) {
if (inode < src) {
@@ -3465,6 +3934,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
+ /*
+ * Global block reserve, exported as a space_info
+ */
+ slot_count++;
+
/* space_slots == 0 means they are asking for a count */
if (space_args.space_slots == 0) {
space_args.total_spaces = slot_count;
@@ -3523,6 +3997,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
+ /*
+ * Add global block reserve
+ */
+ if (slot_count) {
+ struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
+
+ spin_lock(&block_rsv->lock);
+ space.total_bytes = block_rsv->size;
+ space.used_bytes = block_rsv->size - block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+ space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
+ memcpy(dest, &space, sizeof(space));
+ space_args.total_spaces++;
+ }
+
user_dest = (struct btrfs_ioctl_space_info __user *)
(arg + sizeof(struct btrfs_ioctl_space_args));
@@ -3537,20 +4026,6 @@ out:
return ret;
}
-static long btrfs_ioctl_global_rsv(struct btrfs_root *root, void __user *arg)
-{
- struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
- u64 reserved;
-
- spin_lock(&block_rsv->lock);
- reserved = block_rsv->reserved;
- spin_unlock(&block_rsv->lock);
-
- if (arg && copy_to_user(arg, &reserved, sizeof(reserved)))
- return -EFAULT;
- return 0;
-}
-
/*
* there are many ways the trans_start and trans_end ioctls can lead
* to deadlocks. They should only be used by applications that
@@ -4367,10 +4842,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
return btrfs_qgroup_wait_for_completion(root->fs_info);
}
-static long btrfs_ioctl_set_received_subvol(struct file *file,
- void __user *arg)
+static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct btrfs_ioctl_received_subvol_args *sa)
{
- struct btrfs_ioctl_received_subvol_args *sa = NULL;
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
@@ -4398,13 +4872,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
- sa = memdup_user(arg, sizeof(*sa));
- if (IS_ERR(sa)) {
- ret = PTR_ERR(sa);
- sa = NULL;
- goto out;
- }
-
/*
* 1 - root item
* 2 - uuid items (received uuid + subvol uuid)
@@ -4458,14 +4925,90 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
+out:
+ up_write(&root->fs_info->subvol_sem);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+#ifdef CONFIG_64BIT
+static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+ struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+ int ret = 0;
+
+ args32 = memdup_user(arg, sizeof(*args32));
+ if (IS_ERR(args32)) {
+ ret = PTR_ERR(args32);
+ args32 = NULL;
+ goto out;
+ }
+
+ args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ if (!args64) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+ args64->stransid = args32->stransid;
+ args64->rtransid = args32->rtransid;
+ args64->stime.sec = args32->stime.sec;
+ args64->stime.nsec = args32->stime.nsec;
+ args64->rtime.sec = args32->rtime.sec;
+ args64->rtime.nsec = args32->rtime.nsec;
+ args64->flags = args32->flags;
+
+ ret = _btrfs_ioctl_set_received_subvol(file, args64);
+ if (ret)
+ goto out;
+
+ memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+ args32->stransid = args64->stransid;
+ args32->rtransid = args64->rtransid;
+ args32->stime.sec = args64->stime.sec;
+ args32->stime.nsec = args64->stime.nsec;
+ args32->rtime.sec = args64->rtime.sec;
+ args32->rtime.nsec = args64->rtime.nsec;
+ args32->flags = args64->flags;
+
+ ret = copy_to_user(arg, args32, sizeof(*args32));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(args32);
+ kfree(args64);
+ return ret;
+}
+#endif
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args *sa = NULL;
+ int ret = 0;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ sa = NULL;
+ goto out;
+ }
+
+ ret = _btrfs_ioctl_set_received_subvol(file, sa);
+
+ if (ret)
+ goto out;
+
ret = copy_to_user(arg, sa, sizeof(*sa));
if (ret)
ret = -EFAULT;
out:
kfree(sa);
- up_write(&root->fs_info->subvol_sem);
- mnt_drop_write_file(file);
return ret;
}
@@ -4525,7 +5068,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
spin_lock(&root->fs_info->super_lock);
strcpy(super_block->label, label);
spin_unlock(&root->fs_info->super_lock);
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
out_unlock:
mnt_drop_write_file(file);
@@ -4668,7 +5211,7 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
if (ret)
return ret;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -4689,7 +5232,7 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
btrfs_set_super_incompat_flags(super_block, newflags);
spin_unlock(&root->fs_info->super_lock);
- return btrfs_end_transaction(trans, root);
+ return btrfs_commit_transaction(trans, root);
}
long btrfs_ioctl(struct file *file, unsigned int
@@ -4749,6 +5292,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(file, argp);
+ case BTRFS_IOC_TREE_SEARCH_V2:
+ return btrfs_ioctl_tree_search_v2(file, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(file, argp);
case BTRFS_IOC_INO_PATHS:
@@ -4757,12 +5302,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_logical_to_ino(root, argp);
case BTRFS_IOC_SPACE_INFO:
return btrfs_ioctl_space_info(root, argp);
- case BTRFS_IOC_GLOBAL_RSV:
- return btrfs_ioctl_global_rsv(root, argp);
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
if (ret)
return ret;
ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4786,6 +5329,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_balance_progress(root, argp);
case BTRFS_IOC_SET_RECEIVED_SUBVOL:
return btrfs_ioctl_set_received_subvol(file, argp);
+#ifdef CONFIG_64BIT
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+ return btrfs_ioctl_set_received_subvol_32(file, argp);
+#endif
case BTRFS_IOC_SEND:
return btrfs_ioctl_send(file, argp);
case BTRFS_IOC_GET_DEV_STATS:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 01277b8f237..5665d214924 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
*/
void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
if (rw == BTRFS_WRITE_LOCK) {
if (atomic_read(&eb->blocking_writers) == 0) {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+
if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
@@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
again:
+ BUG_ON(!atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner);
+
read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers) &&
current->pid == eb->lock_owner) {
@@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers))
return 0;
- read_lock(&eb->lock);
+ if (!read_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
@@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers))
return 0;
- write_lock(&eb->lock);
+
+ if (!write_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
@@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
@@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
@@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
BUG_ON(blockers > 1);
btrfs_assert_tree_locked(eb);
+ eb->lock_owner = 0;
atomic_dec(&eb->write_locks);
if (blockers) {
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b47f669aca7..dfad8514f0d 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -143,7 +143,7 @@ static int lzo_compress_pages(struct list_head *ws,
if (ret != LZO_E_OK) {
printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws,
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws,
/* we're making it bigger, give up */
if (tot_in > 8192 && tot_in < tot_out) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -335,7 +335,7 @@ cont:
break;
if (page_in_index + 1 >= total_pages_in) {
- ret = -1;
+ ret = -EIO;
goto done;
}
@@ -358,7 +358,7 @@ cont:
kunmap(pages_in[page_in_index - 1]);
if (ret != LZO_E_OK) {
printk(KERN_WARNING "BTRFS: decompress failed\n");
- ret = -1;
+ ret = -EIO;
break;
}
@@ -402,12 +402,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
printk(KERN_WARNING "BTRFS: decompress failed!\n");
- ret = -1;
+ ret = -EIO;
goto out;
}
if (out_len < start_byte) {
- ret = -1;
+ ret = -EIO;
goto out;
}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e..7187b14faa6 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
- "%llu\n", offset);
+ "%llu", offset);
}
/*
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
- if (entry->bytes_left == 0)
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
@@ -410,10 +413,13 @@ have_entry:
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
- if (entry->bytes_left == 0)
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
@@ -424,27 +430,48 @@ out:
}
/* Needs to either be called under a log transaction or the log_mutex */
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_ordered_extent *ordered;
struct rb_node *n;
- int index = log->log_transid % 2;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
- spin_lock(&log->log_extents_lock[index]);
- if (list_empty(&ordered->log_list)) {
- list_add_tail(&ordered->log_list, &log->logged_list[index]);
- atomic_inc(&ordered->refs);
- }
- spin_unlock(&log->log_extents_lock[index]);
+ if (!list_empty(&ordered->log_list))
+ continue;
+ list_add_tail(&ordered->log_list, logged_list);
+ atomic_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
}
+void btrfs_put_logged_extents(struct list_head *logged_list)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ while (!list_empty(logged_list)) {
+ ordered = list_first_entry(logged_list,
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log)
+{
+ int index = log->log_transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ list_splice_tail(logged_list, &log->logged_list[index]);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
struct btrfs_ordered_extent *ordered;
@@ -457,8 +484,19 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
log_list);
list_del_init(&ordered->log_list);
spin_unlock_irq(&log->log_extents_lock[index]);
+
+ if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ struct inode *inode = ordered->inode;
+ u64 start = ordered->file_offset;
+ u64 end = ordered->file_offset + ordered->len - 1;
+
+ WARN_ON(!inode);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ }
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
+
btrfs_put_ordered_extent(ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
@@ -577,7 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
INIT_LIST_HEAD(&splice);
INIT_LIST_HEAD(&works);
- mutex_lock(&root->fs_info->ordered_operations_mutex);
+ mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
list_splice_init(&root->ordered_extents, &splice);
while (!list_empty(&splice) && nr) {
@@ -588,10 +626,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
atomic_inc(&ordered->refs);
spin_unlock(&root->ordered_extent_lock);
- ordered->flush_work.func = btrfs_run_ordered_extent_work;
+ btrfs_init_work(&ordered->flush_work,
+ btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &ordered->flush_work);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &ordered->flush_work);
cond_resched();
spin_lock(&root->ordered_extent_lock);
@@ -608,7 +647,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
btrfs_put_ordered_extent(ordered);
cond_resched();
}
- mutex_unlock(&root->fs_info->ordered_operations_mutex);
+ mutex_unlock(&root->ordered_extent_mutex);
return count;
}
@@ -621,6 +660,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
INIT_LIST_HEAD(&splice);
+ mutex_lock(&fs_info->ordered_operations_mutex);
spin_lock(&fs_info->ordered_root_lock);
list_splice_init(&fs_info->ordered_roots, &splice);
while (!list_empty(&splice) && nr) {
@@ -643,6 +683,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
}
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
+ mutex_unlock(&fs_info->ordered_operations_mutex);
}
/*
@@ -704,8 +745,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
goto out;
}
list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
cond_resched();
spin_lock(&root->fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac2..246897058ef 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list);
+void btrfs_put_logged_extents(struct list_head *logged_list);
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log);
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6efd70d3b64..9626b4ad3b9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb,
btrfs_extent_data_ref_count(eb, ref));
}
-static void print_extent_item(struct extent_buffer *eb, int slot)
+static void print_extent_item(struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
- int type;
u32 item_size = btrfs_item_size_nr(eb, slot);
u64 flags;
u64 offset;
@@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
flags);
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if ((type == BTRFS_EXTENT_ITEM_KEY) &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)(ei + 1);
btrfs_tree_block_key(eb, info, &key);
@@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_disk_root_refs(l, ri));
break;
case BTRFS_EXTENT_ITEM_KEY:
- print_extent_item(l, i);
+ case BTRFS_METADATA_ITEM_KEY:
+ print_extent_item(l, i, type);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
printk(KERN_INFO "\t\ttree block backref\n");
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d74..98cb6b2630f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -32,6 +32,7 @@
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
+#include "qgroup.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -84,8 +85,8 @@ struct btrfs_qgroup {
/*
* temp variables for accounting operations
*/
- u64 tag;
- u64 refcnt;
+ u64 old_refcnt;
+ u64 new_refcnt;
};
/*
@@ -98,6 +99,9 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
+#define ptr_to_u64(x) ((u64)(uintptr_t)x)
+#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
+
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
@@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info,
return -ENOENT;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl)
+{
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup)
+ return -EINVAL;
+ if (qgroup->rfer != rfer || qgroup->excl != excl)
+ return -EINVAL;
+ return 0;
+}
+#endif
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
+ return 0;
+#endif
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -669,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
int ret;
int slot;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
key.objectid = 0;
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroup->qgroupid;
@@ -1174,33 +1201,198 @@ out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
+static int comp_oper(struct btrfs_qgroup_operation *oper1,
+ struct btrfs_qgroup_operation *oper2)
+{
+ if (oper1->bytenr < oper2->bytenr)
+ return -1;
+ if (oper1->bytenr > oper2->bytenr)
+ return 1;
+ if (oper1->seq < oper2->seq)
+ return -1;
+ if (oper1->seq > oper2->seq)
+ return -1;
+ if (oper1->ref_root < oper2->ref_root)
+ return -1;
+ if (oper1->ref_root > oper2->ref_root)
+ return 1;
+ if (oper1->type < oper2->type)
+ return -1;
+ if (oper1->type > oper2->type)
+ return 1;
+ return 0;
+}
+
+static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct btrfs_qgroup_operation *cur;
+ int cmp;
+
+ spin_lock(&fs_info->qgroup_op_lock);
+ p = &fs_info->qgroup_op_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
+ cmp = comp_oper(cur, oper);
+ if (cmp < 0) {
+ p = &(*p)->rb_right;
+ } else if (cmp) {
+ p = &(*p)->rb_left;
+ } else {
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&oper->n, parent, p);
+ rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return 0;
+}
/*
- * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
- * the modification into a list that's later used by btrfs_end_transaction to
- * pass the recorded modifications on to btrfs_qgroup_account_ref.
+ * Record a quota operation for processing later on.
+ * @trans: the transaction we are adding the delayed op to.
+ * @fs_info: the fs_info for this fs.
+ * @ref_root: the root of the reference we are acting on,
+ * @bytenr: the bytenr we are acting on.
+ * @num_bytes: the number of bytes in the reference.
+ * @type: the type of operation this is.
+ * @mod_seq: do we need to get a sequence number for looking up roots.
+ *
+ * We just add it to our trans qgroup_ref_list and carry on and process these
+ * operations in order at some later point. If the reference root isn't a fs
+ * root then we don't bother with doing anything.
+ *
+ * MUST BE HOLDING THE REF LOCK.
*/
int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type, int mod_seq)
{
- struct qgroup_update *u;
+ struct btrfs_qgroup_operation *oper;
+ int ret;
+
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ return 0;
- BUG_ON(!trans->delayed_ref_elem.seq);
- u = kmalloc(sizeof(*u), GFP_NOFS);
- if (!u)
+ oper = kmalloc(sizeof(*oper), GFP_NOFS);
+ if (!oper)
return -ENOMEM;
- u->node = node;
- u->extent_op = extent_op;
- list_add_tail(&u->list, &trans->qgroup_ref_list);
+ oper->ref_root = ref_root;
+ oper->bytenr = bytenr;
+ oper->num_bytes = num_bytes;
+ oper->type = type;
+ oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
+ INIT_LIST_HEAD(&oper->elem.list);
+ oper->elem.seq = 0;
+ ret = insert_qgroup_oper(fs_info, oper);
+ if (ret) {
+ /* Shouldn't happen so have an assert for developers */
+ ASSERT(0);
+ kfree(oper);
+ return ret;
+ }
+ list_add_tail(&oper->list, &trans->qgroup_ref_list);
+
+ if (mod_seq)
+ btrfs_get_tree_mod_seq(fs_info, &oper->elem);
return 0;
}
-static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- u64 seq)
+/*
+ * The easy accounting, if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their refrence and
+ * exclusive counts adjusted.
+ */
+static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct btrfs_qgroup *qgroup;
+ struct ulist *tmp;
+ struct btrfs_qgroup_list *glist;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int sign = 0;
+ int ret = 0;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ spin_lock(&fs_info->qgroup_lock);
+ if (!fs_info->quota_root)
+ goto out;
+ qgroup = find_qgroup_rb(fs_info, oper->ref_root);
+ if (!qgroup)
+ goto out;
+ switch (oper->type) {
+ case BTRFS_QGROUP_OPER_ADD_EXCL:
+ sign = 1;
+ break;
+ case BTRFS_QGROUP_OPER_SUB_EXCL:
+ sign = -1;
+ break;
+ default:
+ ASSERT(0);
+ }
+ qgroup->rfer += sign * oper->num_bytes;
+ qgroup->rfer_cmpr += sign * oper->num_bytes;
+
+ WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
+ qgroup->excl += sign * oper->num_bytes;
+ qgroup->excl_cmpr += sign * oper->num_bytes;
+
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Get all of the parent groups that contain this qgroup */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+
+ /* Iterate all of the parents and adjust their reference counts */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ qgroup = u64_to_ptr(unode->aux);
+ qgroup->rfer += sign * oper->num_bytes;
+ qgroup->rfer_cmpr += sign * oper->num_bytes;
+ qgroup->excl += sign * oper->num_bytes;
+ if (sign < 0)
+ WARN_ON(qgroup->excl < oper->num_bytes);
+ qgroup->excl_cmpr += sign * oper->num_bytes;
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(tmp);
+ return ret;
+}
+
+/*
+ * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
+ * properly.
+ */
+static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, struct ulist *tmp,
+ struct ulist *roots, struct ulist *qgroups,
+ u64 seq, int *old_roots, int rescan)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1211,256 +1403,551 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
+ /* We don't count our current root here */
+ if (unode->val == root_to_skip)
+ continue;
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
+ /*
+ * We could have a pending removal of this same ref so we may
+ * not have actually found our ref root when doing
+ * btrfs_find_all_roots, so we need to keep track of how many
+ * old roots we find in case we removed ours and added a
+ * different one at the same time. I don't think this could
+ * happen in practice but that sort of thinking leads to pain
+ * and suffering and to the dark side.
+ */
+ (*old_roots)++;
ulist_reinit(tmp);
- /* XXX id not needed */
- ret = ulist_add(tmp, qg->qgroupid,
- (u64)(uintptr_t)qg, GFP_ATOMIC);
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
if (ret < 0)
return ret;
ULIST_ITER_INIT(&tmp_uiter);
while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
- if (qg->refcnt < seq)
- qg->refcnt = seq + 1;
+ qg = u64_to_ptr(tmp_unode->aux);
+ /*
+ * We use this sequence number to keep from having to
+ * run the whole list and 0 out the refcnt every time.
+ * We basically use sequnce as the known 0 count and
+ * then add 1 everytime we see a qgroup. This is how we
+ * get how many of the roots actually point up to the
+ * upper level qgroups in order to determine exclusive
+ * counts.
+ *
+ * For rescan we want to set old_refcnt to seq so our
+ * exclusive calculations end up correct.
+ */
+ if (rescan)
+ qg->old_refcnt = seq;
+ else if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
else
- ++qg->refcnt;
+ qg->old_refcnt++;
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
ret = ulist_add(tmp, glist->group->qgroupid,
- (u64)(uintptr_t)glist->group,
+ ptr_to_u64(glist->group),
GFP_ATOMIC);
if (ret < 0)
return ret;
}
}
}
+ return 0;
+}
+
+/*
+ * We need to walk forward in our operation tree and account for any roots that
+ * were deleted after we made this operation.
+ */
+static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct ulist *tmp,
+ struct ulist *qgroups, u64 seq,
+ int *old_roots)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_operation *tmp_oper;
+ struct rb_node *n;
+ int ret;
+
+ ulist_reinit(tmp);
+ /*
+ * We only walk forward in the tree since we're only interested in
+ * removals that happened _after_ our operation.
+ */
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ return 0;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ while (tmp_oper->bytenr == oper->bytenr) {
+ /*
+ * If it's not a removal we don't care, additions work out
+ * properly with our refcnt tracking.
+ */
+ if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
+ tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
+ goto next;
+ qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
+ if (!qg)
+ goto next;
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret) {
+ if (ret < 0)
+ return ret;
+ /*
+ * We only want to increase old_roots if this qgroup is
+ * not already in the list of qgroups. If it is already
+ * there then that means it must have been re-added or
+ * the delete will be discarded because we had an
+ * existing ref that we haven't looked up yet. In this
+ * case we don't want to increase old_roots. So if ret
+ * == 1 then we know that this is the first time we've
+ * seen this qgroup and we can bump the old_roots.
+ */
+ (*old_roots)++;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+next:
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&tmp_oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ break;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ }
+
+ /* Ok now process the qgroups we found */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
return 0;
}
-static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- u64 seq, int sgn, u64 num_bytes,
- struct btrfs_qgroup *qgroup)
+/* Add refcnt for the newly added reference. */
+static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct btrfs_qgroup *qgroup,
+ struct ulist *tmp, struct ulist *qgroups,
+ u64 seq)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
int ret;
ulist_reinit(tmp);
- ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
+ ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
if (ret < 0)
return ret;
-
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(tmp, &uiter))) {
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
- if (qg->refcnt < seq) {
- /* not visited by step 1 */
- qg->rfer += sgn * num_bytes;
- qg->rfer_cmpr += sgn * num_bytes;
- if (roots->nnodes == 0) {
- qg->excl += sgn * num_bytes;
- qg->excl_cmpr += sgn * num_bytes;
- }
- qgroup_dirty(fs_info, qg);
- }
- WARN_ON(qg->tag >= seq);
- qg->tag = seq;
+ struct btrfs_qgroup_list *glist;
+ qg = u64_to_ptr(unode->aux);
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ } else {
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ }
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
if (ret < 0)
return ret;
}
}
-
return 0;
}
-static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- u64 seq, int sgn, u64 num_bytes)
+/*
+ * This adjusts the counters for all referenced qgroups if need be.
+ */
+static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, u64 num_bytes,
+ struct ulist *qgroups, u64 seq,
+ int old_roots, int new_roots, int rescan)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- struct ulist_node *tmp_unode;
- struct ulist_iterator tmp_uiter;
- int ret;
+ u64 cur_new_count, cur_old_count;
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- qg = find_qgroup_rb(fs_info, unode->val);
- if (!qg)
- continue;
+ while ((unode = ulist_next(qgroups, &uiter))) {
+ bool dirty = false;
- ulist_reinit(tmp);
- ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
- if (ret < 0)
- return ret;
+ qg = u64_to_ptr(unode->aux);
+ /*
+ * Wasn't referenced before but is now, add to the reference
+ * counters.
+ */
+ if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+ qg->rfer += num_bytes;
+ qg->rfer_cmpr += num_bytes;
+ dirty = true;
+ }
- ULIST_ITER_INIT(&tmp_uiter);
- while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
- struct btrfs_qgroup_list *glist;
+ /*
+ * Was referenced before but isn't now, subtract from the
+ * reference counters.
+ */
+ if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+ qg->rfer -= num_bytes;
+ qg->rfer_cmpr -= num_bytes;
+ dirty = true;
+ }
- qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
- if (qg->tag == seq)
- continue;
+ if (qg->old_refcnt < seq)
+ cur_old_count = 0;
+ else
+ cur_old_count = qg->old_refcnt - seq;
+ if (qg->new_refcnt < seq)
+ cur_new_count = 0;
+ else
+ cur_new_count = qg->new_refcnt - seq;
- if (qg->refcnt - seq == roots->nnodes) {
- qg->excl -= sgn * num_bytes;
- qg->excl_cmpr -= sgn * num_bytes;
- qgroup_dirty(fs_info, qg);
- }
+ /*
+ * If our refcount was the same as the roots previously but our
+ * new count isn't the same as the number of roots now then we
+ * went from having a exclusive reference on this range to not.
+ */
+ if (old_roots && cur_old_count == old_roots &&
+ (cur_new_count != new_roots || new_roots == 0)) {
+ WARN_ON(cur_new_count != new_roots && new_roots == 0);
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group,
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
+ /*
+ * If we didn't reference all the roots before but now we do we
+ * have an exclusive reference to this range.
+ */
+ if ((!old_roots || (old_roots && cur_old_count != old_roots))
+ && cur_new_count == new_roots) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
}
- }
+ if (dirty)
+ qgroup_dirty(fs_info, qg);
+ }
return 0;
}
/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
+ * If we removed a data extent and there were other references for that bytenr
+ * then we need to lookup all referenced roots to make sure we still don't
+ * reference this bytenr. If we do then we can just discard this operation.
*/
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op)
+static int check_existing_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
{
- struct btrfs_root *quota_root;
- u64 ref_root;
- struct btrfs_qgroup *qgroup;
struct ulist *roots = NULL;
- u64 seq;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
int ret = 0;
- int sgn;
- if (!fs_info->quota_enabled)
- return 0;
-
- BUG_ON(!fs_info->quota_root);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+ oper->elem.seq, &roots);
+ if (ret < 0)
+ return ret;
+ ret = 0;
- if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
- node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- struct btrfs_delayed_tree_ref *ref;
- ref = btrfs_delayed_node_to_tree_ref(node);
- ref_root = ref->root;
- } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
- node->type == BTRFS_SHARED_DATA_REF_KEY) {
- struct btrfs_delayed_data_ref *ref;
- ref = btrfs_delayed_node_to_data_ref(node);
- ref_root = ref->root;
- } else {
- BUG();
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ if (unode->val == oper->ref_root) {
+ ret = 1;
+ break;
+ }
}
+ ulist_free(roots);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
- if (!is_fstree(ref_root)) {
- /*
- * non-fs-trees are not being accounted
- */
- return 0;
- }
+ return ret;
+}
- switch (node->action) {
- case BTRFS_ADD_DELAYED_REF:
- case BTRFS_ADD_DELAYED_EXTENT:
- sgn = 1;
- seq = btrfs_tree_mod_seq_prev(node->seq);
- break;
- case BTRFS_DROP_DELAYED_REF:
- sgn = -1;
- seq = node->seq;
- break;
- case BTRFS_UPDATE_DELAYED_HEAD:
- return 0;
- default:
- BUG();
- }
+/*
+ * If we share a reference across multiple roots then we may need to adjust
+ * various qgroups referenced and exclusive counters. The basic premise is this
+ *
+ * 1) We have seq to represent a 0 count. Instead of looping through all of the
+ * qgroups and resetting their refcount to 0 we just constantly bump this
+ * sequence number to act as the base reference count. This means that if
+ * anybody is equal to or below this sequence they were never referenced. We
+ * jack this sequence up by the number of roots we found each time in order to
+ * make sure we don't have any overlap.
+ *
+ * 2) We first search all the roots that reference the area _except_ the root
+ * we're acting on currently. This makes up the old_refcnt of all the qgroups
+ * before.
+ *
+ * 3) We walk all of the qgroups referenced by the root we are currently acting
+ * on, and will either adjust old_refcnt in the case of a removal or the
+ * new_refcnt in the case of an addition.
+ *
+ * 4) Finally we walk all the qgroups that are referenced by this range
+ * including the root we are acting on currently. We will adjust the counters
+ * based on the number of roots we had and will have after this operation.
+ *
+ * Take this example as an illustration
+ *
+ * [qgroup 1/0]
+ * / | \
+ * [qg 0/0] [qg 0/1] [qg 0/2]
+ * \ | /
+ * [ extent ]
+ *
+ * Say we are adding a reference that is covered by qg 0/0. The first step
+ * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
+ * old_roots being 2. Because it is adding new_roots will be 1. We then go
+ * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
+ * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
+ * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
+ * reference and thus must add the size to the referenced bytes. Everything
+ * else is the same so nothing else changes.
+ */
+static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist *qgroups, *tmp;
+ struct btrfs_qgroup *qgroup;
+ struct seq_list elem = {};
+ u64 seq;
+ int old_roots = 0;
+ int new_roots = 0;
+ int ret = 0;
- mutex_lock(&fs_info->qgroup_rescan_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
- mutex_unlock(&fs_info->qgroup_rescan_lock);
+ if (oper->elem.seq) {
+ ret = check_existing_refs(trans, fs_info, oper);
+ if (ret < 0)
+ return ret;
+ if (ret)
return 0;
- }
}
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- /*
- * the delayed ref sequence number we pass depends on the direction of
- * the operation. for add operations, we pass
- * tree_mod_log_prev_seq(node->seq) to skip
- * the delayed ref's current sequence number, because we need the state
- * of the tree before the add operation. for delete operations, we pass
- * (node->seq) to include the delayed ref's current sequence number,
- * because we need the state of the tree after the delete operation.
- */
- ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
- if (ret < 0)
- return ret;
-
- spin_lock(&fs_info->qgroup_lock);
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ return -ENOMEM;
- quota_root = fs_info->quota_root;
- if (!quota_root)
- goto unlock;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp) {
+ ulist_free(qgroups);
+ return -ENOMEM;
+ }
- qgroup = find_qgroup_rb(fs_info, ref_root);
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
+ &roots);
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ if (ret < 0) {
+ ulist_free(qgroups);
+ ulist_free(tmp);
+ return ret;
+ }
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, oper->ref_root);
if (!qgroup)
- goto unlock;
+ goto out;
+ seq = fs_info->qgroup_seq;
/*
- * step 1: for each old ref, visit all nodes once and inc refcnt
+ * So roots is the list of all the roots currently pointing at the
+ * bytenr, including the ref we are adding if we are adding, or not if
+ * we are removing a ref. So we pass in the ref_root to skip that root
+ * in our calculations. We set old_refnct and new_refcnt cause who the
+ * hell knows what everything looked like before, and it doesn't matter
+ * except...
*/
- ulist_reinit(fs_info->qgroup_ulist);
- seq = fs_info->qgroup_seq;
- fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+ ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
+ seq, &old_roots, 0);
+ if (ret < 0)
+ goto out;
- ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
- seq);
- if (ret)
- goto unlock;
+ /*
+ * Now adjust the refcounts of the qgroups that care about this
+ * reference, either the old_count in the case of removal or new_count
+ * in the case of an addition.
+ */
+ ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
+ seq);
+ if (ret < 0)
+ goto out;
/*
- * step 2: walk from the new root
+ * ...in the case of removals. If we had a removal before we got around
+ * to processing this operation then we need to find that guy and count
+ * his references as if they really existed so we don't end up screwing
+ * up the exclusive counts. Then whenever we go to process the delete
+ * everything will be grand and we can account for whatever exclusive
+ * changes need to be made there. We also have to pass in old_roots so
+ * we have an accurate count of the roots as it pertains to this
+ * operations view of the world.
*/
- ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
- seq, sgn, node->num_bytes, qgroup);
- if (ret)
- goto unlock;
+ ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
+ &old_roots);
+ if (ret < 0)
+ goto out;
/*
- * step 3: walk again from old refs
+ * We are adding our root, need to adjust up the number of roots,
+ * otherwise old_roots is the number of roots we want.
*/
- ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
- seq, sgn, node->num_bytes);
- if (ret)
- goto unlock;
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ new_roots = old_roots + 1;
+ } else {
+ new_roots = old_roots;
+ old_roots++;
+ }
+ fs_info->qgroup_seq += old_roots + 1;
-unlock:
+
+ /*
+ * And now the magic happens, bless Arne for having a pretty elegant
+ * solution for this.
+ */
+ qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
+ qgroups, seq, old_roots, new_roots, 0);
+out:
spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(qgroups);
ulist_free(roots);
+ ulist_free(tmp);
+ return ret;
+}
+
+/*
+ * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
+ * from the fs. First, all roots referencing the extent are searched, and
+ * then the space is accounted accordingly to the different roots. The
+ * accounting algorithm works in 3 steps documented inline.
+ */
+static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ int ret = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ BUG_ON(!fs_info->quota_root);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ ASSERT(is_fstree(oper->ref_root));
+
+ switch (oper->type) {
+ case BTRFS_QGROUP_OPER_ADD_EXCL:
+ case BTRFS_QGROUP_OPER_SUB_EXCL:
+ ret = qgroup_excl_accounting(fs_info, oper);
+ break;
+ case BTRFS_QGROUP_OPER_ADD_SHARED:
+ case BTRFS_QGROUP_OPER_SUB_SHARED:
+ ret = qgroup_shared_accounting(trans, fs_info, oper);
+ break;
+ default:
+ ASSERT(0);
+ }
+ return ret;
+}
+
+/*
+ * Needs to be called everytime we run delayed refs, even if there is an error
+ * in order to cleanup outstanding operations.
+ */
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup_operation *oper;
+ int ret = 0;
+ while (!list_empty(&trans->qgroup_ref_list)) {
+ oper = list_first_entry(&trans->qgroup_ref_list,
+ struct btrfs_qgroup_operation, list);
+ list_del_init(&oper->list);
+ if (!ret || !trans->aborted)
+ ret = btrfs_qgroup_account(trans, fs_info, oper);
+ spin_lock(&fs_info->qgroup_op_lock);
+ rb_erase(&oper->n, &fs_info->qgroup_op_tree);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+ kfree(oper);
+ }
return ret;
}
@@ -1509,8 +1996,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
}
ret = 0;
}
@@ -1629,8 +2116,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
goto unlock;
- dstgroup->rfer = srcgroup->rfer - level_size;
- dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
+
+ /*
+ * We call inherit after we clone the root in order to make sure
+ * our counts don't go crazy, so at this point the only
+ * difference between the two roots should be the root node.
+ */
+ dstgroup->rfer = srcgroup->rfer;
+ dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
+ dstgroup->excl = level_size;
+ dstgroup->excl_cmpr = level_size;
srcgroup->excl = level_size;
srcgroup->excl_cmpr = level_size;
qgroup_dirty(fs_info, dstgroup);
@@ -1734,7 +2229,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ qg = u64_to_ptr(unode->aux);
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qg->reserved + (s64)qg->rfer + num_bytes >
@@ -1766,7 +2261,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ qg = u64_to_ptr(unode->aux);
qg->reserved += num_bytes;
}
@@ -1812,7 +2307,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ qg = u64_to_ptr(unode->aux);
qg->reserved -= num_bytes;
@@ -1848,15 +2343,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans, struct ulist *tmp,
- struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans, struct ulist *qgroups,
+ struct ulist *tmp, struct extent_buffer *scratch_leaf)
{
struct btrfs_key found;
struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
struct seq_list tree_mod_seq_elem = {};
+ u64 num_bytes;
u64 seq;
+ int new_roots;
int slot;
int ret;
@@ -1897,8 +2392,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
mutex_unlock(&fs_info->qgroup_rescan_lock);
for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
- u64 num_bytes;
-
btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
if (found.type != BTRFS_EXTENT_ITEM_KEY &&
found.type != BTRFS_METADATA_ITEM_KEY)
@@ -1908,76 +2401,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
else
num_bytes = found.offset;
- ret = btrfs_find_all_roots(trans, fs_info, found.objectid,
- tree_mod_seq_elem.seq, &roots);
+ ulist_reinit(qgroups);
+ ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+ &roots);
if (ret < 0)
goto out;
spin_lock(&fs_info->qgroup_lock);
seq = fs_info->qgroup_seq;
fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
- ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
- if (ret) {
+ new_roots = 0;
+ ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
+ seq, &new_roots, 1);
+ if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
ulist_free(roots);
goto out;
}
- /*
- * step2 of btrfs_qgroup_account_ref works from a single root,
- * we're doing all at once here.
- */
- ulist_reinit(tmp);
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- struct btrfs_qgroup *qg;
-
- qg = find_qgroup_rb(fs_info, unode->val);
- if (!qg)
- continue;
-
- ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg,
- GFP_ATOMIC);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
- }
-
- /* this loop is similar to step 2 of btrfs_qgroup_account_ref */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
-
- qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
- qg->rfer += num_bytes;
- qg->rfer_cmpr += num_bytes;
- WARN_ON(qg->tag >= seq);
- if (qg->refcnt - seq == roots->nnodes) {
- qg->excl += num_bytes;
- qg->excl_cmpr += num_bytes;
- }
- qgroup_dirty(fs_info, qg);
-
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group,
- GFP_ATOMIC);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
- }
+ ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
+ seq, 0, new_roots, 1);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
}
-
spin_unlock(&fs_info->qgroup_lock);
ulist_free(roots);
- ret = 0;
}
-
out:
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
@@ -1990,13 +2441,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *tmp = NULL;
+ struct ulist *tmp = NULL, *qgroups = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
path = btrfs_alloc_path();
if (!path)
goto out;
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ goto out;
tmp = ulist_alloc(GFP_NOFS);
if (!tmp)
goto out;
@@ -2015,7 +2469,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = -EINTR;
} else {
err = qgroup_rescan_leaf(fs_info, path, trans,
- tmp, scratch_leaf);
+ qgroups, tmp, scratch_leaf);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2025,6 +2479,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
out:
kfree(scratch_leaf);
+ ulist_free(qgroups);
ulist_free(tmp);
btrfs_free_path(path);
@@ -2095,7 +2550,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
- fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+ btrfs_init_work(&fs_info->qgroup_rescan_work,
+ btrfs_qgroup_rescan_worker, NULL, NULL);
if (ret) {
err:
@@ -2158,8 +2614,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
return 0;
}
@@ -2190,6 +2646,6 @@ void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
new file mode 100644
index 00000000000..5952ff1fbd7
--- /dev/null
+++ b/fs/btrfs/qgroup.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_QGROUP__
+#define __BTRFS_QGROUP__
+
+/*
+ * A description of the operations, all of these operations only happen when we
+ * are adding the 1st reference for that subvolume in the case of adding space
+ * or on the last reference delete in the case of subtraction. The only
+ * exception is the last one, which is added for confusion.
+ *
+ * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
+ * one pointing at the bytes we are adding. This is called on the first
+ * allocation.
+ *
+ * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
+ * shared between subvols. This is called on the creation of a ref that already
+ * has refs from a different subvolume, so basically reflink.
+ *
+ * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
+ * one referencing the range.
+ *
+ * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
+ * refs with other subvolumes.
+ */
+enum btrfs_qgroup_operation_type {
+ BTRFS_QGROUP_OPER_ADD_EXCL,
+ BTRFS_QGROUP_OPER_ADD_SHARED,
+ BTRFS_QGROUP_OPER_SUB_EXCL,
+ BTRFS_QGROUP_OPER_SUB_SHARED,
+};
+
+struct btrfs_qgroup_operation {
+ u64 ref_root;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 seq;
+ enum btrfs_qgroup_operation_type type;
+ struct seq_list elem;
+ struct rb_node n;
+ struct list_head list;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type,
+ int mod_seq);
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
+#endif
+
+#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991..4a88f073fdd 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
- rbio->work.flags = 0;
- rbio->work.func = rmw_work;
+ btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
- btrfs_queue_worker(&rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
}
static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
- rbio->work.flags = 0;
- rbio->work.func = read_rebuild_work;
+ btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
- btrfs_queue_worker(&rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
}
/*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- plug->work.flags = 0;
- plug->work.func = unplug_work;
- btrfs_queue_worker(&plug->info->rmw_workers,
- &plug->work);
+ btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+ btrfs_queue_work(plug->info->rmw_workers,
+ &plug->work);
return;
}
run_plug(plug);
@@ -1959,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* pages are going to be uptodate.
*/
for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
- if (rbio->faila == stripe ||
- rbio->failb == stripe)
+ if (rbio->faila == stripe || rbio->failb == stripe) {
+ atomic_inc(&rbio->bbio->error);
continue;
+ }
for (pagenr = 0; pagenr < nr_pages; pagenr++) {
struct page *p;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3..09230cf3a24 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -428,8 +428,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
continue;
}
if (!dev->bdev) {
- /* cannot read ahead on missing device */
- continue;
+ /*
+ * cannot read ahead on missing device, but for RAID5/6,
+ * REQ_GET_READ_MIRRORS return 1. So don't skip missing
+ * device for such case.
+ */
+ if (nzones > 1)
+ continue;
}
if (dev_replace_is_ongoing &&
dev == fs_info->dev_replace.tgtdev) {
@@ -793,10 +798,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
/* FIXME we cannot handle this properly right now */
BUG();
}
- rmw->work.func = reada_start_machine_worker;
+ btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
rmw->fs_info = fs_info;
- btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
+ btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
}
#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40e..65245a07275 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -337,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
if (bnode->root)
fs_info = bnode->root->fs_info;
btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
- "found at offset %llu\n", bytenr);
+ "found at offset %llu", bytenr);
}
/*
@@ -528,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
reloc_root = root->reloc_root;
@@ -610,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc,
root = read_fs_root(rc->extent_root->fs_info, root_objectid);
BUG_ON(IS_ERR(root));
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
generation != btrfs_root_generation(&root->root_item))
return NULL;
@@ -887,7 +887,7 @@ again:
goto out;
}
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
cur->cowonly = 1;
if (btrfs_root_level(&root->root_item) == cur->level) {
@@ -954,7 +954,8 @@ again:
upper->bytenr = eb->start;
upper->owner = btrfs_header_owner(eb);
upper->level = lower->level + 1;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
upper->cowonly = 1;
/*
@@ -1258,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
if (rb_node) {
btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
"for start=%llu while inserting into relocation "
- "tree\n", node->bytenr);
+ "tree", node->bytenr);
kfree(node);
return -EEXIST;
}
@@ -2317,7 +2318,6 @@ void free_reloc_roots(struct list_head *list)
static noinline_for_stack
int merge_reloc_roots(struct reloc_control *rc)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_root *reloc_root;
u64 last_snap;
@@ -2375,26 +2375,6 @@ again:
list_add_tail(&reloc_root->root_list,
&reloc_roots);
goto out;
- } else if (!ret) {
- /*
- * recover the last snapshot tranid to avoid
- * the space balance break NOCOW.
- */
- root = read_fs_root(rc->extent_root->fs_info,
- objectid);
- if (IS_ERR(root))
- continue;
-
- trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
-
- /* Check if the fs/file tree was snapshoted or not. */
- if (btrfs_root_last_snapshot(&root->root_item) ==
- otransid - 1)
- btrfs_set_root_last_snapshot(&root->root_item,
- last_snap);
-
- btrfs_end_transaction(trans, root);
}
}
@@ -2462,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
next = walk_up_backref(next, edges, &index);
root = next->root;
BUG_ON(!root);
- BUG_ON(!root->ref_cows);
+ BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
@@ -2527,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
BUG_ON(!root);
/* no other choice for non-references counted tree */
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return root;
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
@@ -2914,14 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
- if (!root || root->ref_cows) {
+ if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = reserve_metadata_space(trans, rc, node);
if (ret)
goto out;
}
if (root) {
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
BUG_ON(node->new_bytenr);
BUG_ON(!list_empty(&node->list));
btrfs_record_root_in_trans(trans, root);
@@ -4248,7 +4228,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
rc->block_group->key.objectid, rc->block_group->flags);
- ret = btrfs_start_delalloc_roots(fs_info, 0);
+ ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
if (ret < 0) {
err = ret;
goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059d..360a728a639 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
#include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.offset++;
root = btrfs_read_fs_root(tree_root, &root_key);
- err = PTR_RET(root);
+ err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
} else if (err == -ENOENT) {
@@ -305,7 +306,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
break;
}
- root->orphan_item_inserted = 1;
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
if (err) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282e..b6d198f5181 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
atomic_inc(&fs_info->scrubs_running);
atomic_inc(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
+
+ /*
+ * check if @scrubs_running=@scrubs_paused condition
+ * inside wait_event() is not an atomic operation.
+ * which means we may inc/dec @scrub_running/paused
+ * at any time. Let's wake up @scrub_pause_wait as
+ * much as we can to let commit transaction blocked less.
+ */
+ wake_up(&fs_info->scrub_pause_wait);
+
atomic_inc(&sctx->workers_pending);
}
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
- sbio->work.func = scrub_bio_end_io_worker;
+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
+ NULL, NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -577,8 +588,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
- ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
- &ref_root, &ref_level);
+ ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
+ item_size, &ref_root,
+ &ref_level);
printk_in_rcu(KERN_WARNING
"BTRFS: %s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
@@ -706,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
out:
if (page)
put_page(page);
- if (inode)
- iput(inode);
+
+ iput(inode);
if (ret < 0)
return ret;
@@ -987,9 +999,10 @@ nodatasum_case:
fixup_nodatasum->root = fs_info->extent_root;
fixup_nodatasum->mirror_num = failed_mirror_index + 1;
scrub_pending_trans_workers_inc(sctx);
- fixup_nodatasum->work.func = scrub_fixup_nodatasum;
- btrfs_queue_worker(&fs_info->scrub_workers,
- &fixup_nodatasum->work);
+ btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
+ NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_workers,
+ &fixup_nodatasum->work);
goto out;
}
@@ -1603,8 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- sbio->work.func = scrub_wr_bio_end_io_worker;
- btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2085,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+ btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
}
static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2223,6 +2236,47 @@ behind_scrub_pages:
return 0;
}
+/*
+ * Given a physical address, this will calculate it's
+ * logical offset. if this is a parity stripe, it will return
+ * the most left data stripe's logical offset.
+ *
+ * return 0 if it is a data stripe, 1 means parity stripe.
+ */
+static int get_raid56_logic_offset(u64 physical, int num,
+ struct map_lookup *map, u64 *offset)
+{
+ int i;
+ int j = 0;
+ u64 stripe_nr;
+ u64 last_offset;
+ int stripe_index;
+ int rot;
+
+ last_offset = (physical - map->stripes[num].physical) *
+ nr_data_stripes(map);
+ *offset = last_offset;
+ for (i = 0; i < nr_data_stripes(map); i++) {
+ *offset = last_offset + i * map->stripe_len;
+
+ stripe_nr = *offset;
+ do_div(stripe_nr, map->stripe_len);
+ do_div(stripe_nr, nr_data_stripes(map));
+
+ /* Work out the disk rotation on this stripe-set */
+ rot = do_div(stripe_nr, map->num_stripes);
+ /* calculate which stripe this data locates */
+ rot += i;
+ stripe_index = rot % map->num_stripes;
+ if (stripe_index == num)
+ return 0;
+ if (stripe_index < num)
+ j++;
+ }
+ *offset = last_offset + j * map->stripe_len;
+ return 1;
+}
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
@@ -2244,6 +2298,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 physical;
u64 logical;
u64 logic_end;
+ u64 physical_end;
u64 generation;
int mirror_num;
struct reada_control *reada1;
@@ -2257,16 +2312,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 extent_len;
struct btrfs_device *extent_dev;
int extent_mirror_num;
- int stop_loop;
-
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
- if (num >= nr_data_stripes(map)) {
- return 0;
- }
- }
+ int stop_loop = 0;
nstripes = length;
+ physical = map->stripes[num].physical;
offset = 0;
do_div(nstripes, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
@@ -2284,6 +2333,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ get_raid56_logic_offset(physical, num, map, &offset);
+ increment = map->stripe_len * nr_data_stripes(map);
+ mirror_num = 1;
} else {
increment = map->stripe_len;
mirror_num = 1;
@@ -2307,7 +2361,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* to not hold off transaction commits
*/
logical = base + offset;
-
+ physical_end = physical + nstripes * map->stripe_len;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ get_raid56_logic_offset(physical_end, num,
+ map, &logic_end);
+ logic_end += base;
+ } else {
+ logic_end = logical + increment * nstripes;
+ }
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
@@ -2316,7 +2378,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
key_start.objectid = logical;
key_start.type = BTRFS_EXTENT_ITEM_KEY;
key_start.offset = (u64)0;
- key_end.objectid = base + offset + nstripes * increment;
+ key_end.objectid = logic_end;
key_end.type = BTRFS_METADATA_ITEM_KEY;
key_end.offset = (u64)-1;
reada1 = btrfs_reada_add(root, &key_start, &key_end);
@@ -2326,7 +2388,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
key_start.offset = logical;
key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key_end.type = BTRFS_EXTENT_CSUM_KEY;
- key_end.offset = base + offset + nstripes * increment;
+ key_end.offset = logic_end;
reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
if (!IS_ERR(reada1))
@@ -2344,11 +2406,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/*
* now find all extents for each stripe and scrub them
*/
- logical = base + offset;
- physical = map->stripes[num].physical;
- logic_end = logical + increment * nstripes;
ret = 0;
- while (logical < logic_end) {
+ while (physical < physical_end) {
+ /* for raid56, we skip parity stripe */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ ret = get_raid56_logic_offset(physical, num,
+ map, &logical);
+ logical += base;
+ if (ret)
+ goto skip;
+ }
/*
* canceled?
*/
@@ -2492,15 +2560,29 @@ again:
scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
- logical += increment;
- physical += map->stripe_len;
-
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ /*
+ * loop until we find next data stripe
+ * or we have finished all stripes.
+ */
+ do {
+ physical += map->stripe_len;
+ ret = get_raid56_logic_offset(
+ physical, num,
+ map, &logical);
+ logical += base;
+ } while (physical < physical_end && ret);
+ } else {
+ physical += map->stripe_len;
+ logical += increment;
+ }
if (logical < key.objectid + bytes) {
cond_resched();
goto again;
}
- if (logical >= logic_end) {
+ if (physical >= physical_end) {
stop_loop = 1;
break;
}
@@ -2509,6 +2591,7 @@ next:
path->slots[0]++;
}
btrfs_release_path(path);
+skip:
logical += increment;
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
@@ -2642,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
- if (found_key.offset + length <= start) {
- key.offset = found_key.offset + length;
- btrfs_release_path(path);
- continue;
- }
+ if (found_key.offset + length <= start)
+ goto skip;
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2657,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* the chunk from going away while we scrub it
*/
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
+
+ /* some chunks are removed but not committed to disk yet,
+ * continue scrubbing */
+ if (!cache)
+ goto skip;
+
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
@@ -2686,10 +2768,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ /*
+ * must be called before we decrease @scrub_paused.
+ * make sure we don't block transaction commit while
+ * we are waiting pending workers finished.
+ */
wait_event(sctx->list_wait,
atomic_read(&sctx->workers_pending) == 0);
- scrub_blocked_if_needed(fs_info);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+
+ mutex_lock(&fs_info->scrub_lock);
+ __scrub_blocked_if_needed(fs_info);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
btrfs_put_block_group(cache);
if (ret)
@@ -2706,7 +2801,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
-
+skip:
key.offset = found_key.offset + length;
btrfs_release_path(path);
}
@@ -2757,33 +2852,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
int ret = 0;
+ int flags = WQ_FREEZABLE | WQ_UNBOUND;
+ int max_active = fs_info->thread_pool_size;
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
- btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
- &fs_info->generic_worker);
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ 1, 4);
else
- btrfs_init_workers(&fs_info->scrub_workers, "scrub",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- fs_info->scrub_workers.idle_thresh = 4;
- ret = btrfs_start_workers(&fs_info->scrub_workers);
- if (ret)
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ max_active, 4);
+ if (!fs_info->scrub_workers) {
+ ret = -ENOMEM;
goto out;
- btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
- "scrubwrc",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- fs_info->scrub_wr_completion_workers.idle_thresh = 2;
- ret = btrfs_start_workers(
- &fs_info->scrub_wr_completion_workers);
- if (ret)
+ }
+ fs_info->scrub_wr_completion_workers =
+ btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ max_active, 2);
+ if (!fs_info->scrub_wr_completion_workers) {
+ ret = -ENOMEM;
goto out;
- btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
- &fs_info->generic_worker);
- ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
- if (ret)
+ }
+ fs_info->scrub_nocow_workers =
+ btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ if (!fs_info->scrub_nocow_workers) {
+ ret = -ENOMEM;
goto out;
+ }
}
++fs_info->scrub_workers_refcnt;
out:
@@ -2793,9 +2890,9 @@ out:
static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
if (--fs_info->scrub_workers_refcnt == 0) {
- btrfs_stop_workers(&fs_info->scrub_workers);
- btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
- btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
}
@@ -3106,10 +3203,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
nocow_ctx->len = len;
nocow_ctx->mirror_num = mirror_num;
nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
- nocow_ctx->work.func = copy_nocow_pages_worker;
+ btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
INIT_LIST_HEAD(&nocow_ctx->inodes);
- btrfs_queue_worker(&fs_info->scrub_nocow_workers,
- &nocow_ctx->work);
+ btrfs_queue_work(fs_info->scrub_nocow_workers,
+ &nocow_ctx->work);
return 0;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 730dce39585..6528aa66218 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -24,12 +24,12 @@
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
#include <linux/radix-tree.h>
-#include <linux/crc32c.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include "send.h"
#include "backref.h"
+#include "hash.h"
#include "locking.h"
#include "disk-io.h"
#include "btrfs_inode.h"
@@ -51,15 +51,18 @@ struct fs_path {
struct {
char *start;
char *end;
- char *prepared;
char *buf;
- int buf_len;
- unsigned int reversed:1;
- unsigned int virtual_mem:1;
+ unsigned short buf_len:15;
+ unsigned short reversed:1;
char inline_buf[];
};
- char pad[PAGE_SIZE];
+ /*
+ * Average path length does not exceed 200 bytes, we'll have
+ * better packing in the slab and higher chance to satisfy
+ * a allocation later during send.
+ */
+ char pad[256];
};
};
#define FS_PATH_INLINE_SIZE \
@@ -109,6 +112,7 @@ struct send_ctx {
int cur_inode_deleted;
u64 cur_inode_size;
u64 cur_inode_mode;
+ u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
struct list_head name_cache_list;
int name_cache_size;
+ struct file_ra_state ra;
+
char *read_buf;
/*
@@ -175,6 +181,47 @@ struct send_ctx {
* own move/rename can be performed.
*/
struct rb_root waiting_dir_moves;
+
+ /*
+ * A directory that is going to be rm'ed might have a child directory
+ * which is in the pending directory moves index above. In this case,
+ * the directory can only be removed after the move/rename of its child
+ * is performed. Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- c/ (ino 259)
+ * | |-- x/ (ino 260)
+ * |
+ * |-- y/ (ino 261)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- YY/ (ino 261)
+ * |-- x/ (ino 260)
+ *
+ * Sequence of steps that lead to the send snapshot:
+ * rm -f /a/b/c/foo.txt
+ * mv /a/b/y /a/b/YY
+ * mv /a/b/c/x /a/b/YY
+ * rmdir /a/b/c
+ *
+ * When the child is processed, its move/rename is delayed until its
+ * parent is processed (as explained above), but all other operations
+ * like update utimes, chown, chgrp, etc, are performed and the paths
+ * that it uses for those operations must use the orphanized name of
+ * its parent (the directory we're going to rm later), so we need to
+ * memorize that name.
+ *
+ * Indexed by the inode number of the directory to be deleted.
+ */
+ struct rb_root orphan_dirs;
};
struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
struct waiting_dir_move {
struct rb_node node;
u64 ino;
+ /*
+ * There might be some directory that could not be removed because it
+ * was waiting for this directory inode to be moved first. Therefore
+ * after this directory is moved, we can try to rmdir the ino rmdir_ino.
+ */
+ u64 rmdir_ino;
+};
+
+struct orphan_dir_info {
+ struct rb_node node;
+ u64 ino;
+ u64 gen;
};
struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
static int need_send_hole(struct send_ctx *sctx)
{
return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
if (!p)
return NULL;
p->reversed = 0;
- p->virtual_mem = 0;
p->buf = p->inline_buf;
p->buf_len = FS_PATH_INLINE_SIZE;
fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
{
if (!p)
return;
- if (p->buf != p->inline_buf) {
- if (p->virtual_mem)
- vfree(p->buf);
- else
- kfree(p->buf);
- }
+ if (p->buf != p->inline_buf)
+ kfree(p->buf);
kfree(p);
}
@@ -290,42 +349,33 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
+ if (len > PATH_MAX) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
path_len = p->end - p->start;
old_buf_len = p->buf_len;
- len = PAGE_ALIGN(len);
+ /*
+ * First time the inline_buf does not suffice
+ */
if (p->buf == p->inline_buf) {
- tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);
- if (!tmp_buf) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- p->virtual_mem = 1;
- }
- memcpy(tmp_buf, p->buf, p->buf_len);
- p->buf = tmp_buf;
- p->buf_len = len;
+ tmp_buf = kmalloc(len, GFP_NOFS);
+ if (tmp_buf)
+ memcpy(tmp_buf, p->buf, old_buf_len);
} else {
- if (p->virtual_mem) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- memcpy(tmp_buf, p->buf, p->buf_len);
- vfree(p->buf);
- } else {
- tmp_buf = krealloc(p->buf, len, GFP_NOFS);
- if (!tmp_buf) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- memcpy(tmp_buf, p->buf, p->buf_len);
- kfree(p->buf);
- p->virtual_mem = 1;
- }
- }
- p->buf = tmp_buf;
- p->buf_len = len;
+ tmp_buf = krealloc(p->buf, len, GFP_NOFS);
}
+ if (!tmp_buf)
+ return -ENOMEM;
+ p->buf = tmp_buf;
+ /*
+ * The real size of the buffer is bigger, this will let the fast path
+ * happen most of the time
+ */
+ p->buf_len = ksize(p->buf);
+
if (p->reversed) {
tmp_buf = p->buf + old_buf_len - path_len - 1;
p->end = p->buf + p->buf_len - 1;
@@ -338,7 +388,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
return 0;
}
-static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+ char **prepared)
{
int ret;
int new_len;
@@ -354,11 +405,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
if (p->start != p->end)
*--p->start = '/';
p->start -= name_len;
- p->prepared = p->start;
+ *prepared = p->start;
} else {
if (p->start != p->end)
*p->end++ = '/';
- p->prepared = p->end;
+ *prepared = p->end;
p->end += name_len;
*p->end = 0;
}
@@ -370,12 +421,12 @@ out:
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, name_len);
+ ret = fs_path_prepare_for_add(p, name_len, &prepared);
if (ret < 0)
goto out;
- memcpy(p->prepared, name, name_len);
- p->prepared = NULL;
+ memcpy(prepared, name, name_len);
out:
return ret;
@@ -384,12 +435,12 @@ out:
static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, p2->end - p2->start);
+ ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
if (ret < 0)
goto out;
- memcpy(p->prepared, p2->start, p2->end - p2->start);
- p->prepared = NULL;
+ memcpy(prepared, p2->start, p2->end - p2->start);
out:
return ret;
@@ -400,13 +451,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
unsigned long off, int len)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, len);
+ ret = fs_path_prepare_for_add(p, len, &prepared);
if (ret < 0)
goto out;
- read_extent_buffer(eb, p->prepared, off, len);
- p->prepared = NULL;
+ read_extent_buffer(eb, prepared, off, len);
out:
return ret;
@@ -450,6 +501,7 @@ static struct btrfs_path *alloc_path_for_send(void)
return NULL;
path->search_commit_root = 1;
path->skip_locking = 1;
+ path->need_commit_sem = 1;
return path;
}
@@ -620,7 +672,7 @@ static int send_cmd(struct send_ctx *sctx)
hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
hdr->crc = 0;
- crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -728,29 +780,22 @@ out:
/*
* Helper function to retrieve some fields from an inode item.
*/
-static int get_inode_info(struct btrfs_root *root,
- u64 ino, u64 *size, u64 *gen,
- u64 *mode, u64 *uid, u64 *gid,
- u64 *rdev)
+static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
+ u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
+ u64 *gid, u64 *rdev)
{
int ret;
struct btrfs_inode_item *ii;
struct btrfs_key key;
- struct btrfs_path *path;
-
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
if (ret) {
- ret = -ENOENT;
- goto out;
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
}
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -768,7 +813,22 @@ static int get_inode_info(struct btrfs_root *root,
if (rdev)
*rdev = btrfs_inode_rdev(path->nodes[0], ii);
-out:
+ return ret;
+}
+
+static int get_inode_info(struct btrfs_root *root,
+ u64 ino, u64 *size, u64 *gen,
+ u64 *mode, u64 *uid, u64 *gid,
+ u64 *rdev)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+ ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
+ rdev);
btrfs_free_path(path);
return ret;
}
@@ -915,9 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_dir_item *di;
struct btrfs_key di_key;
char *buf = NULL;
- char *buf2 = NULL;
int buf_len;
- int buf_virtual = 0;
u32 name_len;
u32 data_len;
u32 cur;
@@ -927,7 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
int num;
u8 type;
- buf_len = PAGE_SIZE;
+ if (found_key->type == BTRFS_XATTR_ITEM_KEY)
+ buf_len = BTRFS_MAX_XATTR_SIZE(root);
+ else
+ buf_len = PATH_MAX;
+
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -949,30 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
type = btrfs_dir_type(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- if (name_len + data_len > buf_len) {
- buf_len = PAGE_ALIGN(name_len + data_len);
- if (buf_virtual) {
- buf2 = vmalloc(buf_len);
- if (!buf2) {
- ret = -ENOMEM;
- goto out;
- }
- vfree(buf);
- } else {
- buf2 = krealloc(buf, buf_len, GFP_NOFS);
- if (!buf2) {
- buf2 = vmalloc(buf_len);
- if (!buf2) {
- ret = -ENOMEM;
- goto out;
- }
- kfree(buf);
- buf_virtual = 1;
- }
+ if (type == BTRFS_FT_XATTR) {
+ if (name_len > XATTR_NAME_MAX) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+ if (name_len + data_len > buf_len) {
+ ret = -E2BIG;
+ goto out;
+ }
+ } else {
+ /*
+ * Path too long
+ */
+ if (name_len + data_len > buf_len) {
+ ret = -ENAMETOOLONG;
+ goto out;
}
-
- buf = buf2;
- buf2 = NULL;
}
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1050,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- if (buf_virtual)
- vfree(buf);
- else
- kfree(buf);
+ kfree(buf);
return ret;
}
@@ -1066,6 +1118,7 @@ out:
struct backref_ctx {
struct send_ctx *sctx;
+ struct btrfs_path *path;
/* number of total found references */
u64 found;
@@ -1136,8 +1189,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
* There are inodes that have extents that lie behind its i_size. Don't
* accept clones from these extents.
*/
- ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
- NULL);
+ ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
+ NULL, NULL, NULL);
+ btrfs_release_path(bctx->path);
if (ret < 0)
return ret;
@@ -1216,12 +1270,17 @@ static int find_extent_clone(struct send_ctx *sctx,
if (!tmp_path)
return -ENOMEM;
+ /* We only use this path under the commit sem */
+ tmp_path->need_commit_sem = 0;
+
backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
if (!backref_ctx) {
ret = -ENOMEM;
goto out;
}
+ backref_ctx->path = tmp_path;
+
if (data_offset >= ino_size) {
/*
* There may be extents that lie behind the file's size.
@@ -1249,8 +1308,10 @@ static int find_extent_clone(struct send_ctx *sctx,
}
logical = disk_byte + btrfs_file_extent_offset(eb, fi);
+ down_read(&sctx->send_root->fs_info->commit_root_sem);
ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
&found_key, &flags);
+ up_read(&sctx->send_root->fs_info->commit_root_sem);
btrfs_release_path(tmp_path);
if (ret < 0)
@@ -1292,8 +1353,6 @@ static int find_extent_clone(struct send_ctx *sctx,
extent_item_pos = logical - found_key.objectid;
else
extent_item_pos = 0;
-
- extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(sctx->send_root->fs_info,
found_key.objectid, extent_item_pos, 1,
__iterate_backrefs, backref_ctx);
@@ -1306,7 +1365,7 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = -EIO;
btrfs_err(sctx->send_root->fs_info, "did not find backref in "
"send_root. inode=%llu, offset=%llu, "
- "disk_byte=%llu found extent=%llu\n",
+ "disk_byte=%llu found extent=%llu",
ino, data_offset, disk_byte, found_key.objectid);
goto out;
}
@@ -1332,6 +1391,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
}
if (cur_clone_root) {
+ if (compressed != BTRFS_COMPRESS_NONE) {
+ /*
+ * Offsets given by iterate_extent_inodes() are relative
+ * to the start of the extent, we need to add logical
+ * offset from the file extent item.
+ * (See why at backref.c:check_extent_in_eb())
+ */
+ cur_clone_root->offset += btrfs_file_extent_offset(eb,
+ fi);
+ }
*found = cur_clone_root;
ret = 0;
} else {
@@ -1408,11 +1477,7 @@ static int gen_unique_name(struct send_ctx *sctx,
while (1) {
len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
ino, gen, idx);
- if (len >= sizeof(tmp)) {
- /* should really not happen */
- ret = -EOVERFLOW;
- goto out;
- }
+ ASSERT(len < sizeof(tmp));
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1579,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
goto out;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.type == BTRFS_ROOT_ITEM_KEY) {
+ ret = -ENOENT;
+ goto out;
+ }
*found_inode = key.objectid;
*found_type = btrfs_dir_type(path->nodes[0], di);
@@ -1622,7 +1691,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
goto out;
}
- if (key.type == BTRFS_INODE_REF_KEY) {
+ if (found_key.type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
@@ -1644,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
goto out;
btrfs_release_path(path);
- ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
- goto out;
+ if (dir_gen) {
+ ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ }
*dir = parent_dir;
@@ -1663,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root,
int ret;
struct fs_path *tmp_name;
u64 tmp_dir;
- u64 tmp_dir_gen;
tmp_name = fs_path_alloc();
if (!tmp_name)
return -ENOMEM;
- ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
+ ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
if (ret < 0)
goto out;
@@ -1888,13 +1958,20 @@ static void name_cache_delete(struct send_ctx *sctx,
nce_head = radix_tree_lookup(&sctx->name_cache,
(unsigned long)nce->ino);
- BUG_ON(!nce_head);
+ if (!nce_head) {
+ btrfs_err(sctx->send_root->fs_info,
+ "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+ nce->ino, sctx->name_cache_size);
+ }
list_del(&nce->radix_list);
list_del(&nce->list);
sctx->name_cache_size--;
- if (list_empty(nce_head)) {
+ /*
+ * We may not get to the final release of nce_head if the lookup fails
+ */
+ if (nce_head && list_empty(nce_head)) {
radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
kfree(nce_head);
}
@@ -1967,18 +2044,14 @@ static void name_cache_free(struct send_ctx *sctx)
*/
static int __get_cur_name_and_parent(struct send_ctx *sctx,
u64 ino, u64 gen,
- int skip_name_cache,
u64 *parent_ino,
u64 *parent_gen,
struct fs_path *dest)
{
int ret;
int nce_ret;
- struct btrfs_path *path = NULL;
struct name_cache_entry *nce = NULL;
- if (skip_name_cache)
- goto get_ref;
/*
* First check if we already did a call to this function with the same
* ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2002,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
}
}
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
-
/*
* If the inode is not existent yet, add the orphan name and return 1.
* This should only happen for the parent dir that we determine in
@@ -2023,12 +2092,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
goto out_cache;
}
-get_ref:
/*
* Depending on whether the inode was already processed or not, use
* send_root or parent_root for ref lookup.
*/
- if (ino < sctx->send_progress && !skip_name_cache)
+ if (ino < sctx->send_progress)
ret = get_first_ref(sctx->send_root, ino,
parent_ino, parent_gen, dest);
else
@@ -2052,8 +2120,6 @@ get_ref:
goto out;
ret = 1;
}
- if (skip_name_cache)
- goto out;
out_cache:
/*
@@ -2084,7 +2150,6 @@ out_cache:
name_cache_clean_unused(sctx);
out:
- btrfs_free_path(path);
return ret;
}
@@ -2121,9 +2186,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_inode = 0;
u64 parent_gen = 0;
int stop = 0;
- u64 start_ino = ino;
- u64 start_gen = gen;
- int skip_name_cache = 0;
name = fs_path_alloc();
if (!name) {
@@ -2131,31 +2193,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
goto out;
}
- if (is_waiting_for_move(sctx, ino))
- skip_name_cache = 1;
-
-again:
dest->reversed = 1;
fs_path_reset(dest);
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
fs_path_reset(name);
- ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache,
- &parent_inode, &parent_gen, name);
+ if (is_waiting_for_rm(sctx, ino)) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(dest, name);
+ break;
+ }
+
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret)
+ stop = 1;
+ }
+
if (ret < 0)
goto out;
- if (ret)
- stop = 1;
-
- if (!skip_name_cache &&
- is_waiting_for_move(sctx, parent_inode)) {
- ino = start_ino;
- gen = start_gen;
- stop = 0;
- skip_name_cache = 1;
- goto again;
- }
ret = fs_path_add_path(dest, name);
if (ret < 0)
@@ -2419,10 +2483,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
if (!p)
return -ENOMEM;
- ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
- NULL, &rdev);
- if (ret < 0)
- goto out;
+ if (ino != sctx->cur_ino) {
+ ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+ NULL, NULL, &rdev);
+ if (ret < 0)
+ goto out;
+ } else {
+ gen = sctx->cur_inode_gen;
+ mode = sctx->cur_inode_mode;
+ rdev = sctx->cur_inode_rdev;
+ }
if (S_ISREG(mode)) {
cmd = BTRFS_SEND_C_MKFILE;
@@ -2502,17 +2572,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
while (1) {
- ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
- 1, 0);
- if (ret < 0)
- goto out;
- if (!ret) {
- eb = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(sctx->send_root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
}
- if (ret || found_key.objectid != key.objectid ||
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
goto out;
@@ -2527,8 +2606,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
goto out;
}
- key.offset = found_key.offset + 1;
- btrfs_release_path(path);
+ path->slots[0]++;
}
out:
@@ -2580,7 +2658,7 @@ struct recorded_ref {
* everything mixed. So we first record all refs and later process them.
* This function is a helper to record one ref.
*/
-static int record_ref(struct list_head *head, u64 dir,
+static int __record_ref(struct list_head *head, u64 dir,
u64 dir_gen, struct fs_path *path)
{
struct recorded_ref *ref;
@@ -2666,12 +2744,78 @@ out:
return ret;
}
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node **p = &sctx->orphan_dirs.rb_node;
+ struct rb_node *parent = NULL;
+ struct orphan_dir_info *entry, *odi;
+
+ odi = kmalloc(sizeof(*odi), GFP_NOFS);
+ if (!odi)
+ return ERR_PTR(-ENOMEM);
+ odi->ino = dir_ino;
+ odi->gen = 0;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (dir_ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(odi);
+ return entry;
+ }
+ }
+
+ rb_link_node(&odi->node, parent, p);
+ rb_insert_color(&odi->node, &sctx->orphan_dirs);
+ return odi;
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node *n = sctx->orphan_dirs.rb_node;
+ struct orphan_dir_info *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino)
+ n = n->rb_left;
+ else if (dir_ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+ return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+ struct orphan_dir_info *odi)
+{
+ if (!odi)
+ return;
+ rb_erase(&odi->node, &sctx->orphan_dirs);
+ kfree(odi);
+}
+
/*
* Returns 1 if a directory can be removed at this point in time.
* We check this by iterating all dir items and checking if the inode behind
* the dir item was already processed.
*/
-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ u64 send_progress)
{
int ret = 0;
struct btrfs_root *root = sctx->parent_root;
@@ -2694,31 +2838,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (!ret) {
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ struct waiting_dir_move *dm;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
}
- if (ret || found_key.objectid != key.objectid ||
- found_key.type != key.type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type)
break;
- }
di = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+ dm = get_waiting_dir_move(sctx, loc.objectid);
+ if (dm) {
+ struct orphan_dir_info *odi;
+
+ odi = add_orphan_dir_info(sctx, dir);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ dm->rmdir_ino = dir;
+ ret = 0;
+ goto out;
+ }
+
if (loc.objectid > send_progress) {
ret = 0;
goto out;
}
- btrfs_release_path(path);
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
ret = 1;
@@ -2730,19 +2895,9 @@ out:
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
{
- struct rb_node *n = sctx->waiting_dir_moves.rb_node;
- struct waiting_dir_move *entry;
+ struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
- while (n) {
- entry = rb_entry(n, struct waiting_dir_move, node);
- if (ino < entry->ino)
- n = n->rb_left;
- else if (ino > entry->ino)
- n = n->rb_right;
- else
- return 1;
- }
- return 0;
+ return entry != NULL;
}
static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2755,6 +2910,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
if (!dm)
return -ENOMEM;
dm->ino = ino;
+ dm->rmdir_ino = 0;
while (*p) {
parent = *p;
@@ -2774,35 +2930,43 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
return 0;
}
-#ifdef CONFIG_BTRFS_ASSERT
-
-static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
{
struct rb_node *n = sctx->waiting_dir_moves.rb_node;
struct waiting_dir_move *entry;
while (n) {
entry = rb_entry(n, struct waiting_dir_move, node);
- if (ino < entry->ino) {
+ if (ino < entry->ino)
n = n->rb_left;
- } else if (ino > entry->ino) {
+ else if (ino > entry->ino)
n = n->rb_right;
- } else {
- rb_erase(&entry->node, &sctx->waiting_dir_moves);
- kfree(entry);
- return 0;
- }
+ else
+ return entry;
}
- return -ENOENT;
+ return NULL;
}
-#endif
+static void free_waiting_dir_move(struct send_ctx *sctx,
+ struct waiting_dir_move *dm)
+{
+ if (!dm)
+ return;
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+}
-static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
+static int add_pending_dir_move(struct send_ctx *sctx,
+ u64 ino,
+ u64 ino_gen,
+ u64 parent_ino,
+ struct list_head *new_refs,
+ struct list_head *deleted_refs)
{
struct rb_node **p = &sctx->pending_dir_moves.rb_node;
struct rb_node *parent = NULL;
- struct pending_dir_move *entry, *pm;
+ struct pending_dir_move *entry = NULL, *pm;
struct recorded_ref *cur;
int exists = 0;
int ret;
@@ -2811,8 +2975,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
if (!pm)
return -ENOMEM;
pm->parent_ino = parent_ino;
- pm->ino = sctx->cur_ino;
- pm->gen = sctx->cur_inode_gen;
+ pm->ino = ino;
+ pm->gen = ino_gen;
INIT_LIST_HEAD(&pm->list);
INIT_LIST_HEAD(&pm->update_refs);
RB_CLEAR_NODE(&pm->node);
@@ -2830,12 +2994,12 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
}
}
- list_for_each_entry(cur, &sctx->deleted_refs, list) {
+ list_for_each_entry(cur, deleted_refs, list) {
ret = dup_ref(cur, &pm->update_refs);
if (ret < 0)
goto out;
}
- list_for_each_entry(cur, &sctx->new_refs, list) {
+ list_for_each_entry(cur, new_refs, list) {
ret = dup_ref(cur, &pm->update_refs);
if (ret < 0)
goto out;
@@ -2878,31 +3042,105 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
return NULL;
}
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+ u64 ino, u64 gen, u64 *ancestor_ino)
+{
+ int ret = 0;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ u64 start_ino = ino;
+
+ *ancestor_ino = 0;
+ while (ino != BTRFS_FIRST_FREE_OBJECTID) {
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino))
+ break;
+ if (is_waiting_for_move(sctx, ino)) {
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret < 0)
+ break;
+ if (parent_inode == start_ino) {
+ ret = 1;
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ break;
+ }
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+ return ret;
+}
+
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
struct fs_path *from_path = NULL;
struct fs_path *to_path = NULL;
+ struct fs_path *name = NULL;
u64 orig_progress = sctx->send_progress;
struct recorded_ref *cur;
+ u64 parent_ino, parent_gen;
+ struct waiting_dir_move *dm = NULL;
+ u64 rmdir_ino = 0;
int ret;
+ u64 ancestor = 0;
+ name = fs_path_alloc();
from_path = fs_path_alloc();
- if (!from_path)
- return -ENOMEM;
+ if (!name || !from_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
- sctx->send_progress = pm->ino;
- ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ rmdir_ino = dm->rmdir_ino;
+ free_waiting_dir_move(sctx, dm);
+
+ ret = get_first_ref(sctx->parent_root, pm->ino,
+ &parent_ino, &parent_gen, name);
if (ret < 0)
goto out;
- to_path = fs_path_alloc();
- if (!to_path) {
- ret = -ENOMEM;
+ ret = get_cur_path(sctx, parent_ino, parent_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ if (ret < 0)
goto out;
- }
sctx->send_progress = sctx->cur_ino + 1;
- ASSERT(del_waiting_dir_move(sctx, pm->ino) == 0);
+ ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+ if (ret) {
+ LIST_HEAD(deleted_refs);
+ ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+ ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+ &pm->update_refs, &deleted_refs);
+ if (ret < 0)
+ goto out;
+ if (rmdir_ino) {
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ dm->rmdir_ino = rmdir_ino;
+ }
+ goto out;
+ }
+ fs_path_reset(name);
+ to_path = name;
+ name = NULL;
ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
if (ret < 0)
goto out;
@@ -2911,6 +3149,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
if (ret < 0)
goto out;
+ if (rmdir_ino) {
+ struct orphan_dir_info *odi;
+
+ odi = get_orphan_dir_info(sctx, rmdir_ino);
+ if (!odi) {
+ /* already deleted */
+ goto finish;
+ }
+ ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ goto finish;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, name);
+ if (ret < 0)
+ goto out;
+ free_orphan_dir_info(sctx, odi);
+ }
+
+finish:
ret = send_utimes(sctx, pm->ino, pm->gen);
if (ret < 0)
goto out;
@@ -2920,12 +3187,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
* and old parent(s).
*/
list_for_each_entry(cur, &pm->update_refs, list) {
+ if (cur->dir == rmdir_ino)
+ continue;
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
}
out:
+ fs_path_free(name);
fs_path_free(from_path);
fs_path_free(to_path);
sctx->send_progress = orig_progress;
@@ -2994,76 +3264,74 @@ out:
static int wait_for_parent_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref)
{
- int ret;
+ int ret = 0;
u64 ino = parent_ref->dir;
u64 parent_ino_before, parent_ino_after;
- u64 new_gen, old_gen;
struct fs_path *path_before = NULL;
struct fs_path *path_after = NULL;
int len1, len2;
- if (parent_ref->dir <= sctx->cur_ino)
- return 0;
-
- if (is_waiting_for_move(sctx, ino))
- return 1;
-
- ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
- NULL, NULL, NULL, NULL);
- if (ret == -ENOENT)
- return 0;
- else if (ret < 0)
- return ret;
-
- ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen,
- NULL, NULL, NULL, NULL);
- if (ret < 0)
- return ret;
-
- if (new_gen != old_gen)
- return 0;
-
- path_before = fs_path_alloc();
- if (!path_before)
- return -ENOMEM;
-
- ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
- NULL, path_before);
- if (ret == -ENOENT) {
- ret = 0;
- goto out;
- } else if (ret < 0) {
- goto out;
- }
-
path_after = fs_path_alloc();
- if (!path_after) {
+ path_before = fs_path_alloc();
+ if (!path_after || !path_before) {
ret = -ENOMEM;
goto out;
}
- ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
- NULL, path_after);
- if (ret == -ENOENT) {
- ret = 0;
- goto out;
- } else if (ret < 0) {
- goto out;
- }
+ /*
+ * Our current directory inode may not yet be renamed/moved because some
+ * ancestor (immediate or not) has to be renamed/moved first. So find if
+ * such ancestor exists and make sure our own rename/move happens after
+ * that ancestor is processed.
+ */
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = 1;
+ break;
+ }
- len1 = fs_path_len(path_before);
- len2 = fs_path_len(path_after);
- if ((parent_ino_before != parent_ino_after) && (len1 != len2 ||
- memcmp(path_before->start, path_after->start, len1))) {
- ret = 1;
- goto out;
+ fs_path_reset(path_before);
+ fs_path_reset(path_after);
+
+ ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+ NULL, path_after);
+ if (ret < 0)
+ goto out;
+ ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+ NULL, path_before);
+ if (ret < 0 && ret != -ENOENT) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ ret = 1;
+ break;
+ }
+
+ len1 = fs_path_len(path_before);
+ len2 = fs_path_len(path_after);
+ if (ino > sctx->cur_ino &&
+ (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1))) {
+ ret = 1;
+ break;
+ }
+ ino = parent_ino_after;
}
- ret = 0;
out:
fs_path_free(path_before);
fs_path_free(path_after);
+ if (ret == 1) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ ino,
+ &sctx->new_refs,
+ &sctx->deleted_refs);
+ if (!ret)
+ ret = 1;
+ }
+
return ret;
}
@@ -3081,6 +3349,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
u64 ow_gen;
int did_overwrite = 0;
int is_orphan = 0;
+ u64 last_dir_ino_rm = 0;
verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
@@ -3219,9 +3488,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- if (wait_for_parent_move(sctx, cur)) {
- ret = add_pending_dir_move(sctx,
- cur->dir);
+ ret = wait_for_parent_move(sctx, cur);
+ if (ret < 0)
+ goto out;
+ if (ret) {
*pending_move = 1;
} else {
ret = send_rename(sctx, valid_path,
@@ -3251,7 +3521,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* later, we do this check again and rmdir it then if possible.
* See the use of check_dirs for more details.
*/
- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
@@ -3342,8 +3613,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- } else if (ret == inode_state_did_delete) {
- ret = can_rmdir(sctx, cur->dir, sctx->cur_ino);
+ } else if (ret == inode_state_did_delete &&
+ cur->dir != last_dir_ino_rm) {
+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
@@ -3354,6 +3627,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = send_rmdir(sctx, valid_path);
if (ret < 0)
goto out;
+ last_dir_ino_rm = cur->dir;
}
}
}
@@ -3367,9 +3641,8 @@ out:
return ret;
}
-static int __record_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
+ struct fs_path *name, void *ctx, struct list_head *refs)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -3380,7 +3653,7 @@ static int __record_new_ref(int num, u64 dir, int index,
if (!p)
return -ENOMEM;
- ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
+ ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
NULL, NULL);
if (ret < 0)
goto out;
@@ -3392,7 +3665,7 @@ static int __record_new_ref(int num, u64 dir, int index,
if (ret < 0)
goto out;
- ret = record_ref(&sctx->new_refs, dir, gen, p);
+ ret = __record_ref(refs, dir, gen, p);
out:
if (ret)
@@ -3400,37 +3673,23 @@ out:
return ret;
}
+static int __record_new_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ struct send_ctx *sctx = ctx;
+ return record_ref(sctx->send_root, num, dir, index, name,
+ ctx, &sctx->new_refs);
+}
+
+
static int __record_deleted_ref(int num, u64 dir, int index,
struct fs_path *name,
void *ctx)
{
- int ret = 0;
struct send_ctx *sctx = ctx;
- struct fs_path *p;
- u64 gen;
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
- ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, dir, gen, p);
- if (ret < 0)
- goto out;
- ret = fs_path_add_path(p, name);
- if (ret < 0)
- goto out;
-
- ret = record_ref(&sctx->deleted_refs, dir, gen, p);
-
-out:
- if (ret)
- fs_path_free(p);
- return ret;
+ return record_ref(sctx->parent_root, num, dir, index, name,
+ ctx, &sctx->deleted_refs);
}
static int record_new_ref(struct send_ctx *sctx)
@@ -3611,21 +3870,31 @@ static int process_all_refs(struct send_ctx *sctx,
root = sctx->parent_root;
cb = __record_deleted_ref;
} else {
- BUG();
+ btrfs_err(sctx->send_root->fs_info,
+ "Wrong command %d in process_all_refs", cmd);
+ ret = -EINVAL;
+ goto out;
}
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (ret)
- break;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ while (1) {
eb = path->nodes[0];
slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
@@ -3634,11 +3903,10 @@ static int process_all_refs(struct send_ctx *sctx,
break;
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
- btrfs_release_path(path);
if (ret < 0)
goto out;
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
btrfs_release_path(path);
@@ -3919,19 +4187,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ while (1) {
eb = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
@@ -3943,8 +4217,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
if (ret < 0)
goto out;
- btrfs_release_path(path);
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
out:
@@ -3983,6 +4256,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
goto out;
last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+
+ /* initial readahead */
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, inode->i_mapping);
+ btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
+ last_index - index + 1);
+
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
PAGE_CACHE_SIZE - pg_offset);
@@ -4166,6 +4446,9 @@ static int send_hole(struct send_ctx *sctx, u64 end)
p = fs_path_alloc();
if (!p)
return -ENOMEM;
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto tlv_put_failure;
memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
@@ -4173,9 +4456,6 @@ static int send_hole(struct send_ctx *sctx, u64 end)
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
@@ -4716,7 +4996,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (S_ISREG(sctx->cur_inode_mode)) {
if (need_send_hole(sctx)) {
- if (sctx->cur_inode_last_extent == (u64)-1) {
+ if (sctx->cur_inode_last_extent == (u64)-1 ||
+ sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
ret = get_last_extent(sctx, (u64)-1);
if (ret)
goto out;
@@ -4755,18 +5037,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
ret = apply_children_dir_moves(sctx);
if (ret)
goto out;
+ /*
+ * Need to send that every time, no matter if it actually
+ * changed between the two trees as we have done changes to
+ * the inode before. If our inode is a directory and it's
+ * waiting to be moved/renamed, we will send its utimes when
+ * it's moved/renamed, therefore we don't need to do it here.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
}
- /*
- * Need to send that every time, no matter if it actually
- * changed between the two trees as we have done changes to
- * the inode before.
- */
- sctx->send_progress = sctx->cur_ino + 1;
- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
- if (ret < 0)
- goto out;
-
out:
return ret;
}
@@ -4832,6 +5115,8 @@ static int changed_inode(struct send_ctx *sctx,
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
ret = send_create_inode_if_needed(sctx);
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4876,6 +5161,8 @@ static int changed_inode(struct send_ctx *sctx,
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
ret = send_create_inode_if_needed(sctx);
if (ret < 0)
goto out;
@@ -5116,37 +5403,15 @@ static int full_send_tree(struct send_ctx *sctx)
struct btrfs_path *path;
struct extent_buffer *eb;
int slot;
- u64 start_ctransid;
- u64 ctransid;
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
- spin_lock(&send_root->root_item_lock);
- start_ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_item_lock);
-
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- /*
- * Make sure the tree has not changed after re-joining. We detect this
- * by comparing start_ctransid and ctransid. They should always match.
- */
- spin_lock(&send_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_item_lock);
-
- if (ctransid != start_ctransid) {
- WARN(1, KERN_WARNING "BTRFS: the root that you're trying to "
- "send was modified in between. This is "
- "probably a bug.\n");
- ret = -EIO;
- goto out;
- }
-
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
goto out;
@@ -5227,7 +5492,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
*/
if (root->send_in_progress < 0)
btrfs_err(root->fs_info,
- "send_in_progres unbalanced %d root %llu\n",
+ "send_in_progres unbalanced %d root %llu",
root->send_in_progress, root->root_key.objectid);
spin_unlock(&root->root_item_lock);
}
@@ -5255,7 +5520,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
/*
* The subvolume must remain read-only during send, protect against
- * making it RW.
+ * making it RW. This also protects against deletion.
*/
spin_lock(&send_root->root_item_lock);
send_root->send_in_progress++;
@@ -5315,6 +5580,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
}
sctx->send_root = send_root;
+ /*
+ * Unlikely but possible, if the subvolume is marked for deletion but
+ * is slow to remove the directory entry, send can still be started
+ */
+ if (btrfs_root_dead(sctx->send_root)) {
+ ret = -EPERM;
+ goto out;
+ }
+
sctx->clone_roots_cnt = arg->clone_sources_count;
sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
@@ -5332,6 +5606,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
+ sctx->orphan_dirs = RB_ROOT;
sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
(arg->clone_sources_count + 1));
@@ -5403,7 +5678,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
spin_lock(&sctx->parent_root->root_item_lock);
sctx->parent_root->send_in_progress++;
- if (!btrfs_root_readonly(sctx->parent_root)) {
+ if (!btrfs_root_readonly(sctx->parent_root) ||
+ btrfs_root_dead(sctx->parent_root)) {
spin_unlock(&sctx->parent_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EPERM;
@@ -5427,7 +5703,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
NULL);
sort_clone_roots = 1;
+ current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
+ current->journal_info = NULL;
if (ret < 0)
goto out;
@@ -5469,6 +5747,16 @@ out:
kfree(dm);
}
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+ struct rb_node *n;
+ struct orphan_dir_info *odi;
+
+ n = rb_first(&sctx->orphan_dirs);
+ odi = rb_entry(n, struct orphan_dir_info, node);
+ free_orphan_dir_info(sctx, odi);
+ }
+
if (sort_clone_roots) {
for (i = 0; i < sctx->clone_roots_cnt; i++)
btrfs_root_dec_send_in_progress(
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c02f6335689..8e16bca69c5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,6 +66,8 @@
static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;
+static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+
static const char *btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -383,20 +385,6 @@ static match_table_t tokens = {
{Opt_err, NULL},
};
-#define btrfs_set_and_info(root, opt, fmt, args...) \
-{ \
- if (!btrfs_test_opt(root, opt)) \
- btrfs_info(root->fs_info, fmt, ##args); \
- btrfs_set_opt(root->fs_info->mount_opt, opt); \
-}
-
-#define btrfs_clear_and_info(root, opt, fmt, args...) \
-{ \
- if (btrfs_test_opt(root, opt)) \
- btrfs_info(root->fs_info, fmt, ##args); \
- btrfs_clear_opt(root->fs_info->mount_opt, opt); \
-}
-
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
@@ -523,7 +511,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
} else if (compress) {
if (!btrfs_test_opt(root, COMPRESS))
btrfs_info(root->fs_info,
- "btrfs: use %s compression\n",
+ "btrfs: use %s compression",
compress_type);
}
break;
@@ -534,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_ssd_spread:
btrfs_set_and_info(root, SSD_SPREAD,
"use spread ssd allocation scheme");
+ btrfs_set_opt(info->mount_opt, SSD);
break;
case Opt_nossd:
- btrfs_clear_and_info(root, NOSSD,
+ btrfs_set_and_info(root, NOSSD,
"not using ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, SSD);
break;
@@ -566,7 +555,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
kfree(num);
if (info->max_inline) {
- info->max_inline = max_t(u64,
+ info->max_inline = min_t(u64,
info->max_inline,
root->sectorsize);
}
@@ -592,8 +581,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
break;
case Opt_acl:
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
root->fs_info->sb->s_flags |= MS_POSIXACL;
break;
+#else
+ btrfs_err(root->fs_info,
+ "support for ACL not compiled in!");
+ ret = -EINVAL;
+ goto out;
+#endif
case Opt_noacl:
root->fs_info->sb->s_flags &= ~MS_POSIXACL;
break;
@@ -855,6 +851,7 @@ static struct dentry *get_default_root(struct super_block *sb,
struct btrfs_path *path;
struct btrfs_key location;
struct inode *inode;
+ struct dentry *dentry;
u64 dir_id;
int new = 0;
@@ -925,7 +922,13 @@ setup_root:
return dget(sb->s_root);
}
- return d_obtain_alias(inode);
+ dentry = d_obtain_alias(inode);
+ if (!IS_ERR(dentry)) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_DISCONNECTED;
+ spin_unlock(&dentry->d_lock);
+ }
+ return dentry;
}
static int btrfs_fill_super(struct super_block *sb,
@@ -1177,7 +1180,31 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
return ERR_PTR(-ENOMEM);
mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
newargs);
+
+ if (PTR_RET(mnt) == -EBUSY) {
+ if (flags & MS_RDONLY) {
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
+ newargs);
+ } else {
+ int r;
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
+ newargs);
+ if (IS_ERR(mnt)) {
+ kfree(newargs);
+ return ERR_CAST(mnt);
+ }
+
+ r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+ if (r < 0) {
+ /* FIXME: release vfsmount mnt ??*/
+ kfree(newargs);
+ return ERR_PTR(r);
+ }
+ }
+ }
+
kfree(newargs);
+
if (IS_ERR(mnt))
return ERR_CAST(mnt);
@@ -1298,13 +1325,6 @@ error_fs_info:
return ERR_PTR(error);
}
-static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
-{
- spin_lock_irq(&workers->lock);
- workers->max_workers = new_limit;
- spin_unlock_irq(&workers->lock);
-}
-
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
int new_pool_size, int old_pool_size)
{
@@ -1316,21 +1336,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info, "resize thread pool %d -> %d",
old_pool_size, new_pool_size);
- btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
- btrfs_set_max_workers(&fs_info->workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
- btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
- new_pool_size);
+ btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
+ new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
+ new_pool_size);
}
static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
@@ -1381,6 +1400,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
+ sync_filesystem(sb);
btrfs_remount_prepare(fs_info);
ret = btrfs_parse_options(root, data);
@@ -1401,6 +1421,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* this also happens on 'umount -rf' or on shutdown, when
* the filesystem is busy.
*/
+ cancel_work_sync(&fs_info->async_reclaim_work);
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
@@ -1447,7 +1468,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
/* recover relocation */
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(root);
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret)
goto restore;
@@ -1472,6 +1495,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~MS_RDONLY;
}
out:
+ wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
return 0;
@@ -1787,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
list_for_each_entry(dev, head, dev_list) {
if (dev->missing)
continue;
+ if (!dev->name)
+ continue;
if (!first_dev || dev->devid < first_dev->devid)
first_dev = dev;
}
@@ -1881,6 +1907,9 @@ static int btrfs_run_sanity_tests(void)
if (ret)
goto out;
ret = btrfs_test_inodes();
+ if (ret)
+ goto out;
+ ret = btrfs_test_qgroups();
out:
btrfs_destroy_test_fs();
return ret;
@@ -1996,7 +2025,7 @@ static void __exit exit_btrfs_fs(void)
btrfs_hash_exit();
}
-module_init(init_btrfs_fs)
+late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 782374d8fd1..78699364f53 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
#include <linux/kobject.h>
#include <linux/bug.h>
#include <linux/genhd.h>
+#include <linux/debugfs.h>
#include "ctree.h"
#include "disk-io.h"
@@ -253,6 +254,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
+#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
static ssize_t raid_bytes_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
@@ -265,7 +267,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
{
struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
struct btrfs_block_group_cache *block_group;
- int index = kobj - sinfo->block_group_kobjs;
+ int index = to_raid_kobj(kobj)->raid_type;
u64 val = 0;
down_read(&sinfo->groups_sem);
@@ -287,7 +289,7 @@ static struct attribute *raid_attributes[] = {
static void release_raid_kobj(struct kobject *kobj)
{
- kobject_put(kobj->parent);
+ kfree(to_raid_kobj(kobj));
}
struct kobj_type btrfs_raid_ktype = {
@@ -373,11 +375,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
struct btrfs_root *root = fs_info->fs_root;
int ret;
- if (len >= BTRFS_LABEL_SIZE) {
- pr_err("BTRFS: unable to set label with more than %d bytes\n",
- BTRFS_LABEL_SIZE - 1);
+ if (len >= BTRFS_LABEL_SIZE)
return -EINVAL;
- }
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans))
@@ -395,8 +394,48 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
}
BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
+static ssize_t btrfs_no_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ return -EPERM;
+}
+
+static ssize_t btrfs_nodesize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+}
+
+BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store);
+
+static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store);
+
+static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store);
+
static struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
+ BTRFS_ATTR_PTR(nodesize),
+ BTRFS_ATTR_PTR(sectorsize),
+ BTRFS_ATTR_PTR(clone_alignment),
NULL,
};
@@ -566,20 +605,52 @@ static void init_feature_attrs(void)
}
}
-static int add_device_membership(struct btrfs_fs_info *fs_info)
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
+{
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!fs_info->device_dir_kobj)
+ return -EINVAL;
+
+ if (one_device) {
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_info->device_dir_kobj,
+ disk_kobj->name);
+ }
+
+ return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
{
int error = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *dev;
- fs_info->device_dir_kobj = kobject_create_and_add("devices",
+ if (!fs_info->device_dir_kobj)
+ fs_info->device_dir_kobj = kobject_create_and_add("devices",
&fs_info->super_kobj);
+
if (!fs_info->device_dir_kobj)
return -ENOMEM;
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
- struct hd_struct *disk = dev->bdev->bd_part;
- struct kobject *disk_kobj = &part_to_dev(disk)->kobj;
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!dev->bdev)
+ continue;
+
+ if (one_device && one_device != dev)
+ continue;
+
+ disk = dev->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
error = sysfs_create_link(fs_info->device_dir_kobj,
disk_kobj, disk_kobj->name);
@@ -593,6 +664,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
/* /sys/fs/btrfs/ entry */
static struct kset *btrfs_kset;
+/* /sys/kernel/debug/btrfs */
+static struct dentry *btrfs_debugfs_root_dentry;
+
+/* Debugging tunables and exported data */
+u64 btrfs_debugfs_test;
+
int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
{
int error;
@@ -615,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
if (error)
goto failure;
- error = add_device_membership(fs_info);
+ error = btrfs_kobj_add_device(fs_info, NULL);
if (error)
goto failure;
@@ -636,27 +713,41 @@ failure:
return error;
}
+static int btrfs_init_debugfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
+ if (!btrfs_debugfs_root_dentry)
+ return -ENOMEM;
+
+ debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
+ &btrfs_debugfs_test);
+#endif
+ return 0;
+}
+
int btrfs_init_sysfs(void)
{
int ret;
+
btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
if (!btrfs_kset)
return -ENOMEM;
- init_feature_attrs();
+ ret = btrfs_init_debugfs();
+ if (ret)
+ return ret;
+ init_feature_attrs();
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
- if (ret) {
- kset_unregister(btrfs_kset);
- return ret;
- }
- return 0;
+ return ret;
}
void btrfs_exit_sysfs(void)
{
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d4..ac46df37504 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
#ifndef _BTRFS_SYSFS_H_
#define _BTRFS_SYSFS_H_
+/*
+ * Data exported through sysfs
+ */
+extern u64 btrfs_debugfs_test;
+
enum btrfs_feature_set {
FEAT_COMPAT,
FEAT_COMPAT_RO,
@@ -61,4 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 757ef00a75a..9626252ee6b 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -21,6 +21,9 @@
#include <linux/magic.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
static struct vfsmount *test_mnt = NULL;
@@ -72,3 +75,97 @@ void btrfs_destroy_test_fs(void)
kern_unmount(test_mnt);
unregister_filesystem(&test_type);
}
+
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
+{
+ struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+ GFP_NOFS);
+
+ if (!fs_info)
+ return fs_info;
+ fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+ GFP_NOFS);
+ if (!fs_info->fs_devices) {
+ kfree(fs_info);
+ return NULL;
+ }
+ fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
+ GFP_NOFS);
+ if (!fs_info->super_copy) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ if (init_srcu_struct(&fs_info->subvol_srcu)) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info->super_copy);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->qgroup_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ mutex_init(&fs_info->qgroup_rescan_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
+ fs_info->running_transaction = NULL;
+ fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_ulist = NULL;
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ return fs_info;
+}
+
+static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+
+ spin_lock(&fs_info->buffer_lock);
+restart:
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+ struct extent_buffer *eb;
+
+ eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+ if (!eb)
+ continue;
+ /* Shouldn't happen but that kind of thinking creates CVE's */
+ if (radix_tree_exception(eb)) {
+ if (radix_tree_deref_retry(eb))
+ goto restart;
+ continue;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+ free_extent_buffer_stale(eb);
+ spin_lock(&fs_info->buffer_lock);
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ btrfs_free_qgroup_config(fs_info);
+ btrfs_free_fs_roots(fs_info);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+}
+
+void btrfs_free_dummy_root(struct btrfs_root *root)
+{
+ if (!root)
+ return;
+ if (root->node)
+ free_extent_buffer(root->node);
+ if (root->fs_info)
+ btrfs_free_dummy_fs_info(root->fs_info);
+ kfree(root);
+}
+
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 312560a9123..fd395422448 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,13 +23,18 @@
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
+struct btrfs_root;
+
int btrfs_test_free_space_cache(void);
int btrfs_test_extent_buffer_operations(void);
int btrfs_test_extent_io(void);
int btrfs_test_inodes(void);
+int btrfs_test_qgroups(void);
int btrfs_init_test_fs(void);
void btrfs_destroy_test_fs(void);
struct inode *btrfs_new_test_inode(void);
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+void btrfs_free_dummy_root(struct btrfs_root *root);
#else
static inline int btrfs_test_free_space_cache(void)
{
@@ -54,6 +59,10 @@ static inline int btrfs_test_inodes(void)
{
return 0;
}
+static inline int btrfs_test_qgroups(void)
+{
+ return 0;
+}
#endif
#endif
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 397d1f99a8e..3ae0f5b8bb8 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -23,33 +23,6 @@
#include "../extent_io.h"
#include "../volumes.h"
-static struct btrfs_fs_info *alloc_dummy_fs_info(void)
-{
- struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
- GFP_NOFS);
- if (!fs_info)
- return fs_info;
- fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
- GFP_NOFS);
- if (!fs_info->fs_devices) {
- kfree(fs_info);
- return NULL;
- }
- return fs_info;
-}
-static void free_dummy_root(struct btrfs_root *root)
-{
- if (!root)
- return;
- if (root->fs_info) {
- kfree(root->fs_info->fs_devices);
- kfree(root->fs_info);
- }
- if (root->node)
- free_extent_buffer(root->node);
- kfree(root);
-}
-
static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
u64 ram_bytes, u64 offset, u64 disk_bytenr,
u64 disk_len, u32 type, u8 compression, int slot)
@@ -276,7 +249,7 @@ static noinline int test_btrfs_get_extent(void)
* We do this since btrfs_get_extent wants to assign em->bdev to
* root->fs_info->fs_devices->latest_bdev.
*/
- root->fs_info = alloc_dummy_fs_info();
+ root->fs_info = btrfs_alloc_dummy_fs_info();
if (!root->fs_info) {
test_msg("Couldn't allocate dummy fs info\n");
goto out;
@@ -837,7 +810,7 @@ out:
if (!IS_ERR(em))
free_extent_map(em);
iput(inode);
- free_dummy_root(root);
+ btrfs_free_dummy_root(root);
return ret;
}
@@ -864,7 +837,7 @@ static int test_hole_first(void)
goto out;
}
- root->fs_info = alloc_dummy_fs_info();
+ root->fs_info = btrfs_alloc_dummy_fs_info();
if (!root->fs_info) {
test_msg("Couldn't allocate dummy fs info\n");
goto out;
@@ -934,7 +907,7 @@ out:
if (!IS_ERR(em))
free_extent_map(em);
iput(inode);
- free_dummy_root(root);
+ btrfs_free_dummy_root(root);
return ret;
}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
new file mode 100644
index 00000000000..ec3dcb20235
--- /dev/null
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (C) 2013 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../transaction.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
+static void init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->transid = 1;
+ INIT_LIST_HEAD(&trans->qgroup_ref_list);
+ trans->type = __TRANS_DUMMY;
+}
+
+static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_tree_block_info *block_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ ins.objectid = bytenr;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
+ if (ret) {
+ test_msg("Couldn't insert ref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, item, 1);
+ btrfs_set_extent_generation(leaf, item, 1);
+ btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ block_info = (struct btrfs_tree_block_info *)(item + 1);
+ btrfs_set_tree_block_level(leaf, block_info, 1);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ if (parent > 0) {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_SHARED_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs + 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
+ if (ret)
+ test_msg("Failed to insert backref\n");
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Didn't find our key %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs - 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Couldn't find backref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int test_no_shared_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup basic add\n");
+ ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_item(root, 4096, 4096);
+ if (ret)
+ return -EINVAL;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't remove space from the qgroup %d\n", ret);
+ return -EINVAL;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Add a ref for two different roots to make sure the shared value comes out
+ * right, also remove one of the roots and make sure the exclusive count is
+ * adjusted properly.
+ */
+static int test_multiple_refs(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup multiple refs test\n");
+
+ /* We have 5 created already from the previous test */
+ ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = add_tree_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int btrfs_test_qgroups(void)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *tmp_root;
+ int ret = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ return PTR_ERR(root);
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Can't use bytenr 0, some things freak out
+ * *cough*backref walking code*cough*
+ */
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 8192;
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 5;
+ root->fs_info->fs_root = tmp_root;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 256;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ /* We are using this root as our extent root */
+ root->fs_info->extent_root = root;
+
+ /*
+ * Some of the paths we test assume we have a filled out fs_info, so we
+ * just need to addt he root in there so we don't panic.
+ */
+ root->fs_info->tree_root = root;
+ root->fs_info->quota_root = root;
+ root->fs_info->quota_enabled = 1;
+
+ test_msg("Running qgroup tests\n");
+ ret = test_no_shared_qgroup(root);
+ if (ret)
+ goto out;
+ ret = test_multiple_refs(root);
+out:
+ btrfs_free_dummy_root(root);
+ return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4..5f379affdf2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -31,6 +31,7 @@
#include "inode-map.h"
#include "volumes.h"
#include "dev-replace.h"
+#include "qgroup.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -75,10 +76,21 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
}
}
-static noinline void switch_commit_root(struct btrfs_root *root)
+static noinline void switch_commit_roots(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info)
{
- free_extent_buffer(root->commit_root);
- root->commit_root = btrfs_root_node(root);
+ struct btrfs_root *root, *tmp;
+
+ down_write(&fs_info->commit_root_sem);
+ list_for_each_entry_safe(root, tmp, &trans->switch_commits,
+ dirty_list) {
+ list_del_init(&root->dirty_list);
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+ if (is_fstree(root->objectid))
+ btrfs_unpin_free_ino(root);
+ }
+ up_write(&fs_info->commit_root_sem);
}
static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
@@ -208,6 +220,7 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->ordered_operations);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
+ INIT_LIST_HEAD(&cur_trans->switch_commits);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -229,18 +242,19 @@ loop:
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (root->ref_cows && root->last_trans < trans->transid) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
WARN_ON(root->commit_root != root->node);
/*
- * see below for in_trans_setup usage rules
+ * see below for IN_TRANS_SETUP usage rules
* we have the reloc mutex held now, so there
* is only one writer in this function
*/
- root->in_trans_setup = 1;
+ set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
- /* make sure readers find in_trans_setup before
+ /* make sure readers find IN_TRANS_SETUP before
* they find our root->last_trans update
*/
smp_wmb();
@@ -267,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* But, we have to set root->last_trans before we
* init the relocation root, otherwise, we trip over warnings
* in ctree.c. The solution used here is to flag ourselves
- * with root->in_trans_setup. When this is 1, we're still
+ * with root IN_TRANS_SETUP. When this is 1, we're still
* fixing up the reloc trees and everyone must wait.
*
* When this is zero, they can trust root->last_trans and fly
@@ -276,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* done before we pop in the zero below
*/
btrfs_init_reloc_root(trans, root);
- smp_wmb();
- root->in_trans_setup = 0;
+ smp_mb__before_atomic();
+ clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
return 0;
}
@@ -286,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
/*
- * see record_root_in_trans for comments about in_trans_setup usage
+ * see record_root_in_trans for comments about IN_TRANS_SETUP usage
* and barriers
*/
smp_rmb();
if (root->last_trans == trans->transid &&
- !root->in_trans_setup)
+ !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
mutex_lock(&root->fs_info->reloc_mutex);
@@ -353,7 +367,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
if (!root->fs_info->reloc_ctl ||
- !root->ref_cows ||
+ !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
root->reloc_root)
return false;
@@ -372,6 +386,9 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
bool reloc_reserved = false;
int ret;
+ /* Send isn't supposed to start transactions. */
+ ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
@@ -476,6 +493,7 @@ again:
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
may_wait_transaction(root, type)) {
+ current->journal_info = h;
btrfs_commit_transaction(h, root);
goto again;
}
@@ -682,20 +700,35 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
+ int must_run_delayed_refs = 0;
- if (--trans->use_count) {
+ if (trans->use_count > 1) {
+ trans->use_count--;
trans->block_rsv = trans->orig_rsv;
return 0;
}
- /*
- * do the qgroup accounting as early as possible
- */
- err = btrfs_delayed_refs_qgroup_accounting(trans, info);
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
+ trans->delayed_ref_updates = 0;
+ if (!trans->sync) {
+ must_run_delayed_refs =
+ btrfs_should_throttle_delayed_refs(trans, root);
+ cur = max_t(unsigned long, cur, 32);
+
+ /*
+ * don't make the caller wait if they are from a NOLOCK
+ * or ATTACH transaction, it will deadlock with commit
+ */
+ if (must_run_delayed_refs == 1 &&
+ (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
+ must_run_delayed_refs = 2;
+ }
+
if (trans->qgroup_reserved) {
/*
* the same root has to be passed here between start_transaction
@@ -705,16 +738,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->qgroup_reserved = 0;
}
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, root);
-
- trans->delayed_ref_updates = 0;
- if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
- cur = max_t(unsigned long, cur, 32);
- trans->delayed_ref_updates = 0;
- btrfs_run_delayed_refs(trans, root, cur);
- }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -731,17 +754,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
- if (throttle) {
- /*
- * We may race with somebody else here so end up having
- * to call end_transaction on ourselves again, so inc
- * our use_count.
- */
- trans->use_count++;
+ if (throttle)
return btrfs_commit_transaction(trans, root);
- } else {
+ else
wake_up_process(info->transaction_kthread);
- }
}
if (trans->type & __TRANS_FREEZABLE)
@@ -771,6 +787,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
assert_qgroups_uptodate(trans);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ if (must_run_delayed_refs) {
+ btrfs_async_run_delayed_refs(root, cur,
+ must_run_delayed_refs == 1);
+ }
return err;
}
@@ -925,9 +945,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
return ret;
}
- if (root != root->fs_info->extent_root)
- switch_commit_root(root);
-
return 0;
}
@@ -983,15 +1000,16 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
+ if (root != fs_info->extent_root)
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
}
- down_write(&fs_info->extent_commit_sem);
- switch_commit_root(fs_info->extent_root);
- up_write(&fs_info->extent_commit_sem);
-
+ list_add_tail(&fs_info->extent_root->dirty_list,
+ &trans->transaction->switch_commits);
btrfs_after_dev_replace_commit(fs_info);
return 0;
@@ -1044,15 +1062,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_save_ino_cache(root, trans);
/* see comments in should_cow_block() */
- root->force_cow = 0;
- smp_wmb();
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
if (root->commit_root != root->node) {
- mutex_lock(&root->fs_commit_mutex);
- switch_commit_root(root);
- btrfs_unpin_free_ino(root);
- mutex_unlock(&root->fs_commit_mutex);
-
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
btrfs_set_root_node(&root->root_item,
root->node);
}
@@ -1079,7 +1094,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
int ret;
- if (xchg(&root->defrag_running, 1))
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
return 0;
while (1) {
@@ -1102,7 +1117,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
break;
}
}
- root->defrag_running = 0;
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
return ret;
}
@@ -1166,12 +1181,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto no_free_objectid;
}
- pending->error = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (pending->error)
- goto no_free_objectid;
-
key.objectid = objectid;
key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1268,8 +1277,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
+ /*
+ * We need to flush delayed refs in order to make sure all of our quota
+ * operations have been done before we call btrfs_qgroup_inherit.
+ */
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
/* see comments in should_cow_block() */
- root->force_cow = 1;
+ set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_wmb();
btrfs_set_root_node(new_root_item, tmp);
@@ -1578,10 +1605,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
trace_btrfs_transaction_commit(root);
- btrfs_scrub_continue(root);
-
if (current->journal_info == trans)
current->journal_info = NULL;
+ btrfs_scrub_cancel(root->fs_info);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
@@ -1592,17 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_run_delayed_items(trans, root);
- /*
- * running the delayed items may have added new refs. account
- * them now so that they hinder processing of more delayed refs
- * as little as possible.
- */
- if (ret) {
- btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
- return ret;
- }
-
- ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
if (ret)
return ret;
@@ -1621,7 +1636,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
- return btrfs_start_delalloc_roots(fs_info, 1);
+ return btrfs_start_delalloc_roots(fs_info, 1, -1);
return 0;
}
@@ -1754,7 +1769,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
/* ->aborted might be set after the previous check, so check it */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
* the reloc mutex makes sure that we stop
@@ -1771,7 +1786,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = create_pending_snapshots(trans, root->fs_info);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1787,13 +1802,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_items(trans, root);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1823,7 +1838,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1844,7 +1859,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1855,7 +1870,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = cur_trans->aborted;
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
btrfs_prepare_extent_commit(trans, root);
@@ -1864,11 +1879,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&root->fs_info->tree_root->root_item,
root->fs_info->tree_root->node);
- switch_commit_root(root->fs_info->tree_root);
+ list_add_tail(&root->fs_info->tree_root->dirty_list,
+ &cur_trans->switch_commits);
btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
root->fs_info->chunk_root->node);
- switch_commit_root(root->fs_info->chunk_root);
+ list_add_tail(&root->fs_info->chunk_root->dirty_list,
+ &cur_trans->switch_commits);
+
+ switch_commit_roots(cur_trans, root->fs_info);
assert_qgroups_uptodate(trans);
update_super_roots(root);
@@ -1891,13 +1910,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
ret = write_ctree_super(trans, root, 0);
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1940,6 +1959,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
+scrub_continue:
+ btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -1977,19 +1998,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
}
root = list_first_entry(&fs_info->dead_roots,
struct btrfs_root, root_list);
- /*
- * Make sure root is not involved in send,
- * if we fail with first root, we return
- * directly rather than continue.
- */
- spin_lock(&root->root_item_lock);
- if (root->send_in_progress) {
- spin_unlock(&fs_info->trans_lock);
- spin_unlock(&root->root_item_lock);
- return 0;
- }
- spin_unlock(&root->root_item_lock);
-
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6ac037e9f9f..7dd558ed071 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -57,6 +57,7 @@ struct btrfs_transaction {
struct list_head pending_snapshots;
struct list_head ordered_operations;
struct list_head pending_chunks;
+ struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
@@ -68,6 +69,7 @@ struct btrfs_transaction {
#define __TRANS_ATTACH (1U << 10)
#define __TRANS_JOIN (1U << 11)
#define __TRANS_JOIN_NOLOCK (1U << 12)
+#define __TRANS_DUMMY (1U << 13)
#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
@@ -78,6 +80,8 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
__TRANS_ATTACH)
+#define BTRFS_SEND_TRANS_STUB 1
+
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 76928ca9774..a63719cc957 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -49,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
- if (root->ref_cows == 0)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
goto out;
if (btrfs_test_opt(root, SSD))
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e0..9e1f2cd5e67 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,13 +20,11 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
-#include "ctree.h"
-#include "transaction.h"
+#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
-#include "tree-log.h"
#include "hash.h"
/* magic values for the inode_only field in btrfs_log_inode:
@@ -136,43 +134,61 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
* syncing the tree wait for us to finish
*/
static int start_log_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
{
+ int index;
int ret;
- int err = 0;
mutex_lock(&root->log_mutex);
if (root->log_root) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ ret = -EAGAIN;
+ goto out;
+ }
if (!root->log_start_pid) {
root->log_start_pid = current->pid;
- root->log_multiple_pids = false;
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
} else if (root->log_start_pid != current->pid) {
- root->log_multiple_pids = true;
+ set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
mutex_unlock(&root->log_mutex);
return 0;
}
- root->log_multiple_pids = false;
- root->log_start_pid = current->pid;
+
+ ret = 0;
mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree) {
+ if (!root->fs_info->log_root_tree)
ret = btrfs_init_log_root_tree(trans, root->fs_info);
- if (ret)
- err = ret;
- }
- if (err == 0 && !root->log_root) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ if (ret)
+ goto out;
+
+ if (!root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
- err = ret;
+ goto out;
}
- mutex_unlock(&root->fs_info->tree_log_mutex);
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
+out:
mutex_unlock(&root->log_mutex);
- return err;
+ return ret;
}
/*
@@ -2359,8 +2375,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static int wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long transid)
+static void wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2375,36 +2391,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ if (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]))
schedule();
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
- } while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ } while (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]));
- return 0;
}
static void wait_for_writer(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
DEFINE_WAIT(wait);
- while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers)) {
+
+ while (atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers))
+ if (atomic_read(&root->log_writers))
schedule();
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
}
}
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ if (!ctx)
+ return;
+
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx->list);
+ mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Invoked in log mutex context, or be sure there is no other task which
+ * can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+ int index, int error)
+{
+ struct btrfs_log_ctx *ctx;
+
+ if (!error) {
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+ return;
+ }
+
+ list_for_each_entry(ctx, &root->log_ctxs[index], list)
+ ctx->log_ret = error;
+
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+}
+
/*
* btrfs_sync_log does sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
@@ -2418,7 +2461,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
* that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
int index1;
int index2;
@@ -2426,26 +2469,35 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
- unsigned long log_transid = 0;
+ int log_transid = 0;
+ struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
mutex_lock(&root->log_mutex);
- log_transid = root->log_transid;
- index1 = root->log_transid % 2;
+ log_transid = ctx->log_transid;
+ if (root->log_transid_committed >= log_transid) {
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+
+ index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(trans, root, root->log_transid);
+ wait_log_commit(trans, root, log_transid);
mutex_unlock(&root->log_mutex);
- return 0;
+ return ctx->log_ret;
}
+ ASSERT(log_transid == root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(trans, root, root->log_transid - 1);
+ wait_log_commit(trans, root, log_transid - 1);
+
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
- if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
+ if (!btrfs_test_opt(root, SSD) &&
+ test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
@@ -2456,7 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
/* bail out if we need to do a full commit */
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
ret = -EAGAIN;
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
@@ -2477,6 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
+ btrfs_set_log_full_commit(root->fs_info, trans);
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2486,7 +2539,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
- smp_mb();
/*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
@@ -2494,9 +2546,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
+ btrfs_init_log_ctx(&root_log_ctx);
+
mutex_lock(&log_root_tree->log_mutex);
atomic_inc(&log_root_tree->log_batch);
atomic_inc(&log_root_tree->log_writers);
+
+ index2 = log_root_tree->log_transid % 2;
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+ root_log_ctx.log_transid = log_root_tree->log_transid;
+
mutex_unlock(&log_root_tree->log_mutex);
ret = update_log_root(trans, log);
@@ -2509,13 +2568,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
+ if (!list_empty(&root_log_ctx.list))
+ list_del_init(&root_log_ctx.list);
+
blk_finish_plug(&plug);
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, root, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
- root->fs_info->last_trans_log_full_commit = trans->transid;
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2586,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- index2 = log_root_tree->log_transid % 2;
+ if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+
+ index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid);
+ root_log_ctx.log_transid);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = 0;
+ ret = root_log_ctx.log_ret;
goto out;
}
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid - 1);
+ root_log_ctx.log_transid - 1);
}
wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2649,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_header_level(log_root_tree->node));
log_root_tree->log_transid++;
- smp_mb();
-
mutex_unlock(&log_root_tree->log_mutex);
/*
@@ -2591,6 +2660,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
btrfs_abort_transaction(trans, root, ret);
goto out_wake_log_root;
}
@@ -2601,13 +2671,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
out_wake_log_root:
+ /*
+ * We needn't get log_mutex here because we are sure all
+ * the other tasks are blocked.
+ */
+ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ log_root_tree->log_transid_committed++;
atomic_set(&log_root_tree->log_commit[index2], 0);
- smp_mb();
+ mutex_unlock(&log_root_tree->log_mutex);
+
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
+ /* See above. */
+ btrfs_remove_all_log_ctxs(root, index1, ret);
+
+ mutex_lock(&root->log_mutex);
+ root->log_transid_committed++;
atomic_set(&root->log_commit[index1], 0);
- smp_mb();
+ mutex_unlock(&root->log_mutex);
+
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -2793,7 +2878,7 @@ fail:
out_unlock:
mutex_unlock(&BTRFS_I(dir)->log_mutex);
if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0)
btrfs_abort_transaction(trans, root, ret);
@@ -2826,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0 && ret != -ENOENT)
btrfs_abort_transaction(trans, root, ret);
@@ -3479,7 +3564,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_one_extent(struct btrfs_trans_handle *trans,
struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path)
+ struct extent_map *em, struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
@@ -3495,7 +3581,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
- int index = log->log_transid % 2;
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
int extent_inserted = 0;
@@ -3579,17 +3664,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
* First check and see if our csums are on our outstanding ordered
* extents.
*/
-again:
- spin_lock_irq(&log->log_extents_lock[index]);
- list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+ list_for_each_entry(ordered, logged_list, log_list) {
struct btrfs_ordered_sum *sum;
if (!mod_len)
break;
- if (ordered->inode != inode)
- continue;
-
if (ordered->file_offset + ordered->len <= mod_start ||
mod_start + mod_len <= ordered->file_offset)
continue;
@@ -3632,12 +3712,6 @@ again:
if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
&ordered->flags))
continue;
- atomic_inc(&ordered->refs);
- spin_unlock_irq(&log->log_extents_lock[index]);
- /*
- * we've dropped the lock, we must either break or
- * start over after this.
- */
if (ordered->csum_bytes_left) {
btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3721,11 @@ again:
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
- if (ret) {
- btrfs_put_ordered_extent(ordered);
+ if (ret)
goto unlocked;
- }
}
- btrfs_put_ordered_extent(ordered);
- goto again;
}
- spin_unlock_irq(&log->log_extents_lock[index]);
unlocked:
if (!mod_len || ret)
@@ -3694,7 +3763,8 @@ unlocked:
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -3752,7 +3822,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path);
+ ret = log_one_extent(trans, inode, root, em, path, logged_list);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -3788,6 +3858,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
+ LIST_HEAD(logged_list);
u64 last_extent = 0;
int err = 0;
int ret;
@@ -3836,7 +3907,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(log, inode);
+ btrfs_get_logged_extents(inode, &logged_list);
/*
* a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4033,8 @@ log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
+ ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+ &logged_list);
if (ret) {
err = ret;
goto out_unlock;
@@ -3987,8 +4059,10 @@ log_extents:
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
- if (err)
- btrfs_free_logged_extents(log, log->log_transid);
+ if (unlikely(err))
+ btrfs_put_logged_extents(&logged_list);
+ else
+ btrfs_submit_logged_extents(&logged_list, log);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
@@ -4048,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
* make sure any commits to the log are forced
* to be full commits
*/
- root->fs_info->last_trans_log_full_commit =
- trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 1;
break;
}
@@ -4079,7 +4152,8 @@ out:
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only)
+ struct dentry *parent, int exists_only,
+ struct btrfs_log_ctx *ctx)
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
@@ -4094,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
+ /*
+ * The prev transaction commit doesn't complete, we need do
+ * full commit by ourselves.
+ */
if (root->fs_info->last_trans_log_full_commit >
root->fs_info->last_trans_committed) {
ret = 1;
@@ -4116,9 +4194,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- ret = start_log_trans(trans, root);
+ ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_trans;
+ goto end_no_trans;
ret = btrfs_log_inode(trans, root, inode, inode_only);
if (ret)
@@ -4163,9 +4241,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
dput(old_parent);
if (ret < 0) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 1;
}
+
+ if (ret)
+ btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
end_no_trans:
return ret;
@@ -4178,12 +4259,14 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+ ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+ 0, ctx);
dput(parent);
return ret;
@@ -4420,6 +4503,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
root->fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+ return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a7..7f5b41bd537 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,17 +19,47 @@
#ifndef __TREE_LOG_
#define __TREE_LOG_
+#include "ctree.h"
+#include "transaction.h"
+
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+struct btrfs_log_ctx {
+ int log_ret;
+ int log_transid;
+ struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ INIT_LIST_HEAD(&ctx->list);
+}
+
+static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
+{
+ ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid;
+}
+
+static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
+{
+ return ACCESS_ONCE(fs_info->last_trans_log_full_commit) ==
+ trans->transid;
+}
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry);
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f8..6cb82f62cb7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,7 @@
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
+#include "sysfs.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -415,7 +416,8 @@ loop_lock:
device->running_pending = 1;
spin_unlock(&device->io_lock);
- btrfs_requeue_work(&device->work);
+ btrfs_queue_work(fs_info->submit_workers,
+ &device->work);
goto done;
}
/* unplug every 64 requests just for good measure */
@@ -447,6 +449,14 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+/*
+ * Add new device to list of registered devices
+ *
+ * Returns:
+ * 1 - first time device is seen
+ * 0 - device already known
+ * < 0 - error
+ */
static noinline int device_list_add(const char *path,
struct btrfs_super_block *disk_super,
u64 devid, struct btrfs_fs_devices **fs_devices_ret)
@@ -454,6 +464,7 @@ static noinline int device_list_add(const char *path,
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
struct rcu_string *name;
+ int ret = 0;
u64 found_transid = btrfs_super_generation(disk_super);
fs_devices = find_fsid(disk_super->fsid);
@@ -494,6 +505,7 @@ static noinline int device_list_add(const char *path,
fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
+ ret = 1;
device->fs_devices = fs_devices;
} else if (!device->name || strcmp(device->name->str, path)) {
name = rcu_string_strdup(path, GFP_NOFS);
@@ -512,7 +524,8 @@ static noinline int device_list_add(const char *path,
fs_devices->latest_trans = found_transid;
}
*fs_devices_ret = fs_devices;
- return 0;
+
+ return ret;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -542,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
* This is ok to do without rcu read locked because we hold the
* uuid mutex so nothing we touch in here is going to disappear.
*/
- name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
- if (!name) {
- kfree(device);
- goto error;
+ if (orig_dev->name) {
+ name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
}
- rcu_assign_pointer(device->name, name);
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
@@ -909,17 +924,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
- if (disk_super->label[0]) {
- if (disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
- printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
- } else {
- printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
- }
-
- printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
-
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+ if (ret > 0) {
+ if (disk_super->label[0]) {
+ if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
+ printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
+ } else {
+ printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
+ }
+
+ printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
+ ret = 0;
+ }
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
@@ -1438,6 +1455,22 @@ out:
return ret;
}
+/*
+ * Function to update ctime/mtime for a given device path.
+ * Mainly used for ctime/mtime based probe like libblkid.
+ */
+static void update_dev_time(char *path_name)
+{
+ struct file *filp;
+
+ filp = filp_open(path_name, O_RDWR, 0);
+ if (!filp)
+ return;
+ file_update_time(filp);
+ filp_close(filp, NULL);
+ return;
+}
+
static int btrfs_rm_dev_item(struct btrfs_root *root,
struct btrfs_device *device)
{
@@ -1647,8 +1680,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev == root->fs_info->fs_devices->latest_bdev)
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
- if (device->bdev)
+ if (device->bdev) {
device->fs_devices->open_devices--;
+ /* remove sysfs entry */
+ btrfs_kobj_rm_device(root->fs_info, device);
+ }
call_rcu(&device->rcu, free_device);
@@ -1660,11 +1696,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
struct btrfs_fs_devices *fs_devices;
fs_devices = root->fs_info->fs_devices;
while (fs_devices) {
- if (fs_devices->seed == cur_devices)
+ if (fs_devices->seed == cur_devices) {
+ fs_devices->seed = cur_devices->seed;
break;
+ }
fs_devices = fs_devices->seed;
}
- fs_devices->seed = cur_devices->seed;
cur_devices->seed = NULL;
lock_chunks(root);
__btrfs_close_devices(cur_devices);
@@ -1680,20 +1717,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
* remove it from the devices list and zero out the old super
*/
if (clear_super && disk_super) {
+ u64 bytenr;
+ int i;
+
/* make sure this device isn't detected as part of
* the FS anymore
*/
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
+
+ /* clear the mirror copies of super block on the disk
+ * being removed, 0th copy is been taken care above and
+ * the below would take of the rest
+ */
+ for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ i_size_read(bdev->bd_inode))
+ break;
+
+ brelse(bh);
+ bh = __bread(bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!bh)
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ continue;
+ }
+ memset(&disk_super->magic, 0,
+ sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ }
}
ret = 0;
- /* Notify udev that device has changed */
- if (bdev)
+ if (bdev) {
+ /* Notify udev that device has changed */
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
+ }
+
error_brelse:
brelse(bh);
if (bdev)
@@ -1869,7 +1941,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
- fs_devices->total_devices = 0;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
@@ -2078,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
+
+ /* add sysfs device entry */
+ btrfs_kobj_add_device(root->fs_info, device);
+
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
ret = init_first_rw_device(trans, root, device);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -2091,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
+
+ /* Sprouting would change fsid of the mounted root,
+ * so rename the fsid on the sysfs
+ */
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
+ root->fs_info->fsid);
+ if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
+ goto error_trans;
} else {
ret = btrfs_add_device(trans, root, device);
if (ret) {
@@ -2132,12 +2216,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_commit_transaction(trans, root);
}
+ /* Update ctime/mtime for libblkid */
+ update_dev_time(device_path);
return ret;
error_trans:
unlock_chunks(root);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
+ btrfs_kobj_rm_device(root->fs_info, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2476,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- kfree(map);
- em->bdev = NULL;
-
/* once for the tree */
free_extent_map(em);
/* once for us */
@@ -2908,6 +2992,16 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
}
+ /*
+ * limited by count, must be the last filter
+ */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
+ if (bargs->limit == 0)
+ return 0;
+ else
+ bargs->limit--;
+ }
+
return 1;
}
@@ -2930,6 +3024,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
int ret;
int enospc_errors = 0;
bool counting = true;
+ u64 limit_data = bctl->data.limit;
+ u64 limit_meta = bctl->meta.limit;
+ u64 limit_sys = bctl->sys.limit;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -2968,6 +3065,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
memset(&bctl->stat, 0, sizeof(bctl->stat));
spin_unlock(&fs_info->balance_lock);
again:
+ if (!counting) {
+ bctl->data.limit = limit_data;
+ bctl->meta.limit = limit_meta;
+ bctl->sys.limit = limit_sys;
+ }
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -3867,7 +3969,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
u8 *ptr;
array_size = btrfs_super_sys_array_size(super_copy);
- if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ if (array_size + item_size + sizeof(disk_key)
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
return -EFBIG;
ptr = super_copy->sys_chunk_array + array_size;
@@ -3972,6 +4075,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
+#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \
+ - sizeof(struct btrfs_item) \
+ - sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
+#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
+ - 2 * sizeof(struct btrfs_disk_key) \
+ - 2 * sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 start,
u64 type)
@@ -4021,6 +4134,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = 1024 * 1024 * 1024;
max_chunk_size = 10 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
@@ -4028,11 +4143,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
else
max_stripe_size = 256 * 1024 * 1024;
max_chunk_size = max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = 32 * 1024 * 1024;
max_chunk_size = 2 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
} else {
- btrfs_err(info, "invalid chunk type 0x%llx requested\n",
+ btrfs_err(info, "invalid chunk type 0x%llx requested",
type);
BUG_ON(1);
}
@@ -4199,9 +4318,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em = alloc_extent_map();
if (!em) {
+ kfree(map);
ret = -ENOMEM;
goto error;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = start;
em->len = num_bytes;
@@ -4244,7 +4365,6 @@ error_del_extent:
/* One for the tree reference */
free_extent_map(em);
error:
- kfree(map);
kfree(devices_info);
return ret;
}
@@ -4280,7 +4400,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
if (em->start != chunk_offset || em->len != chunk_size) {
btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
- " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+ " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
chunk_size, em->start, em->len);
free_extent_map(em);
return -EINVAL;
@@ -4456,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
write_unlock(&tree->map_tree.lock);
if (!em)
break;
- kfree(em->bdev);
/* once for us */
free_extent_map(em);
/* once for the tree */
@@ -4482,14 +4601,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* and exit, so return 1 so the callers don't try to use other copies.
*/
if (!em) {
- btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical,
+ btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
logical+len);
return 1;
}
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
- "%Lu-%Lu\n", logical, logical+len, em->start,
+ "%Lu-%Lu", logical, logical+len, em->start,
em->start + em->len);
free_extent_map(em);
return 1;
@@ -4670,7 +4789,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
- "found %Lu-%Lu\n", logical, em->start,
+ "found %Lu-%Lu", logical, em->start,
em->start + em->len);
free_extent_map(em);
return -EINVAL;
@@ -5260,9 +5379,19 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+{
+ if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
+ bio_endio_nodec(bio, err);
+ else
+ bio_endio(bio, err);
+ kfree(bbio);
+}
+
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
if (err) {
@@ -5270,7 +5399,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
- struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
@@ -5292,18 +5420,14 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (bio == bbio->orig_bio)
is_orig_bio = 1;
+ btrfs_bio_counter_dec(bbio->fs_info);
+
if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
bio = bbio->orig_bio;
}
- /*
- * We have original bio now. So increment bi_remaining to
- * account for it in endio
- */
- atomic_inc(&bio->bi_remaining);
-
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -5320,21 +5444,13 @@ static void btrfs_end_bio(struct bio *bio, int err)
set_bit(BIO_UPTODATE, &bio->bi_flags);
err = 0;
}
- kfree(bbio);
- bio_endio(bio, err);
+ btrfs_end_bbio(bbio, bio, err);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
-struct async_sched {
- struct bio *bio;
- int rw;
- struct btrfs_fs_info *info;
- struct btrfs_work work;
-};
-
/*
* see run_scheduled_bios for a description of why bios are collected for
* async submit.
@@ -5391,8 +5507,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
spin_unlock(&device->io_lock);
if (should_queue)
- btrfs_queue_worker(&root->fs_info->submit_workers,
- &device->work);
+ btrfs_queue_work(root->fs_info->submit_workers,
+ &device->work);
}
static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5563,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
}
#endif
bio->bi_bdev = dev->bdev;
+
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+
if (async)
btrfs_schedule_bio(root, dev, rw, bio);
else
@@ -5489,12 +5608,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ /* Shoud be the original bio. */
+ WARN_ON(bio != bbio->orig_bio);
+
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- kfree(bbio);
- bio_endio(bio, -EIO);
+
+ btrfs_end_bbio(bbio, bio, -EIO);
}
}
@@ -5515,28 +5637,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
length = bio->bi_iter.bi_size;
map_length = length;
+ btrfs_bio_counter_inc_blocked(root->fs_info);
ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
mirror_num, &raid_map);
- if (ret) /* -ENOMEM */
+ if (ret) {
+ btrfs_bio_counter_dec(root->fs_info);
return ret;
+ }
total_devs = bbio->num_stripes;
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
+ bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if (raid_map) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
- return raid56_parity_write(root, bio, bbio,
- raid_map, map_length);
+ ret = raid56_parity_write(root, bio, bbio,
+ raid_map, map_length);
} else {
- return raid56_parity_recover(root, bio, bbio,
- raid_map, map_length,
- mirror_num);
+ ret = raid56_parity_recover(root, bio, bbio,
+ raid_map, map_length,
+ mirror_num);
}
+ /*
+ * FIXME, replace dosen't support raid56 yet, please fix
+ * it in the future.
+ */
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
}
if (map_length < length) {
@@ -5571,6 +5703,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
+ bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
}
submit_stripe_bio(root, bbio, bio,
@@ -5578,6 +5711,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
async_submit);
dev_nr++;
}
+ btrfs_bio_counter_dec(root->fs_info);
return 0;
}
@@ -5666,7 +5800,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
- dev->work.func = pending_bios_fn;
+ btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
return dev;
}
@@ -5711,6 +5845,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
return -ENOMEM;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = logical;
em->len = length;
@@ -5735,7 +5870,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
uuid, NULL);
if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -5743,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev =
add_missing_dev(root, devid, uuid);
if (!map->stripes[i].dev) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -6035,10 +6168,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
- device->dev_root = fs_info->dev_root;
- mutex_unlock(&fs_devices->device_list_mutex);
+ while (fs_devices) {
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->dev_root = fs_info->dev_root;
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ fs_devices = fs_devices->seed;
+ }
}
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b37..2aaa00c4781 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -190,10 +190,14 @@ struct btrfs_bio_stripe {
struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1
+
struct btrfs_bio {
atomic_t stripes_pending;
+ struct btrfs_fs_info *fs_info;
bio_end_io_t *end_io;
struct bio *orig_bio;
+ unsigned long flags;
void *private;
atomic_t error;
int max_errors;
@@ -254,6 +258,7 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
+#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
/*
* Profile changing flags. When SOFT is set we won't relocate chunk if
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8e57191950c..b67d8fc8127 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -98,7 +98,7 @@ static int zlib_compress_pages(struct list_head *ws,
if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
printk(KERN_WARNING "BTRFS: deflateInit failed\n");
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws,
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
- ret = -1;
+ ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
@@ -128,7 +128,7 @@ static int zlib_compress_pages(struct list_head *ws,
printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
zlib_deflateEnd(&workspace->def_strm);
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
if (workspace->def_strm.total_in > 8192 &&
workspace->def_strm.total_in <
workspace->def_strm.total_out) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
/* we need another page for writing out. Test this
@@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws,
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
- ret = -1;
+ ret = -E2BIG;
goto out;
}
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
- ret = -1;
+ ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
@@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws,
zlib_deflateEnd(&workspace->def_strm);
if (ret != Z_STREAM_END) {
- ret = -1;
+ ret = -EIO;
goto out;
}
if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -253,7 +253,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
- return -1;
+ return -EIO;
}
while (workspace->inf_strm.total_in < srclen) {
ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
@@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
}
}
if (ret != Z_STREAM_END)
- ret = -1;
+ ret = -EIO;
else
ret = 0;
done:
@@ -337,7 +337,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
- return -1;
+ return -EIO;
}
while (bytes_left > 0) {
@@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
total_out = workspace->inf_strm.total_out;
if (total_out == buf_start) {
- ret = -1;
+ ret = -EIO;
break;
}
@@ -382,7 +382,7 @@ next:
}
if (ret != Z_STREAM_END && bytes_left != 0)
- ret = -1;
+ ret = -EIO;
else
ret = 0;