Diffstat (limited to 'fs/btrfs/extent_io.c')
 -rw-r--r--  fs/btrfs/extent_io.c | 3107
 1 file changed, 2053 insertions, 1054 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fcf77e1ded4..a389820d158 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4,7 +4,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> -#include <linux/module.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> @@ -14,21 +13,94 @@ #include <linux/cleancache.h> #include "extent_io.h" #include "extent_map.h" -#include "compat.h" #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" #include "check-integrity.h" +#include "locking.h" +#include "rcu-string.h" +#include "backref.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; +static struct bio_set *btrfs_bioset; +#ifdef CONFIG_BTRFS_DEBUG static LIST_HEAD(buffers); static LIST_HEAD(states); -#define LEAK_DEBUG 0 -#if LEAK_DEBUG static DEFINE_SPINLOCK(leak_lock); + +static inline +void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_add(new, head); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_del(struct list_head *entry) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_del(entry); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_check(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "BTRFS: state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + state->start, state->end, state->state, state->tree, + atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu " + "refs %d\n", + eb->start, eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } +} + +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct extent_io_tree *tree, u64 start, u64 end) +{ + struct inode *inode; + u64 isize; + + if (!tree->mapping) + return; + + inode = tree->mapping->host; + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + printk_ratelimited(KERN_DEBUG + "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + caller, btrfs_ino(inode), isize, start, end); + } +} +#else +#define btrfs_leak_debug_add(new, head) do {} while (0) +#define btrfs_leak_debug_del(entry) do {} while (0) +#define btrfs_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif #define BUFFER_LRU_MAX 64 @@ -43,6 +115,7 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; + unsigned long bio_flags; /* tells writepage not to lock the state bits for this range * it still does the unlocking @@ -53,75 +126,83 @@ struct extent_page_data { unsigned int sync_io:1; }; +static noinline void flush_write_bio(void *data); +static inline struct btrfs_fs_info * +tree_fs_info(struct extent_io_tree *tree) +{ + if (!tree->mapping) + return NULL; + return btrfs_sb(tree->mapping->host->i_sb); +} + int __init 
extent_io_init(void) { - extent_state_cache = kmem_cache_create("extent_state", + extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = kmem_cache_create("extent_buffers", + extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; + + btrfs_bioset = bioset_create(BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio)); + if (!btrfs_bioset) + goto free_buffer_cache; + + if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + goto free_bioset; + return 0; +free_bioset: + bioset_free(btrfs_bioset); + btrfs_bioset = NULL; + +free_buffer_cache: + kmem_cache_destroy(extent_buffer_cache); + extent_buffer_cache = NULL; + free_state_cache: kmem_cache_destroy(extent_state_cache); + extent_state_cache = NULL; return -ENOMEM; } void extent_io_exit(void) { - struct extent_state *state; - struct extent_buffer *eb; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "btrfs state leak: start %llu end %llu " - "state %lu in tree %p refs %d\n", - (unsigned long long)state->start, - (unsigned long long)state->end, - state->state, state->tree, atomic_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - - } + btrfs_leak_debug_check(); - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk(KERN_ERR "btrfs buffer leak start %llu len %lu " - "refs %d\n", (unsigned long long)eb->start, - eb->len, atomic_read(&eb->refs)); - list_del(&eb->leak_list); - kmem_cache_free(extent_buffer_cache, eb); - } + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. 
+ */ + rcu_barrier(); if (extent_state_cache) kmem_cache_destroy(extent_state_cache); if (extent_buffer_cache) kmem_cache_destroy(extent_buffer_cache); + if (btrfs_bioset) + bioset_free(btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, struct address_space *mapping) { tree->state = RB_ROOT; - INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - spin_lock_init(&tree->buffer_lock); tree->mapping = mapping; } static struct extent_state *alloc_extent_state(gfp_t mask) { struct extent_state *state; -#if LEAK_DEBUG - unsigned long flags; -#endif state = kmem_cache_alloc(extent_state_cache, mask); if (!state) @@ -129,13 +210,10 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->private = 0; state->tree = NULL; -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&state->leak_list, &states); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_add(&state->leak_list, &states); atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); return state; } @@ -144,26 +222,31 @@ void free_extent_state(struct extent_state *state) if (!state) return; if (atomic_dec_and_test(&state->refs)) { -#if LEAK_DEBUG - unsigned long flags; -#endif WARN_ON(state->tree); -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_del(&state->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_del(&state->leak_list); + trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) +static struct rb_node *tree_insert(struct rb_root *root, + struct rb_node *search_start, + u64 offset, + struct rb_node *node, + struct rb_node ***p_in, + struct rb_node **parent_in) { - struct rb_node **p = &root->rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct tree_entry *entry; + if (p_in && parent_in) { + p = *p_in; + parent = *parent_in; + goto do_insert; + } + + p = search_start ? 
&search_start : &root->rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -176,36 +259,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return parent; } - entry = rb_entry(node, struct tree_entry, rb_node); +do_insert: rb_link_node(node, parent, p); rb_insert_color(node, root); return NULL; } static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) + struct rb_node **prev_ret, + struct rb_node **next_ret, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node *n = root->rb_node; + struct rb_node **n = &root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; struct tree_entry *entry; struct tree_entry *prev_entry = NULL; - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - prev = n; + while (*n) { + prev = *n; + entry = rb_entry(prev, struct tree_entry, rb_node); prev_entry = entry; if (offset < entry->start) - n = n->rb_left; + n = &(*n)->rb_left; else if (offset > entry->end) - n = n->rb_right; + n = &(*n)->rb_right; else - return n; + return *n; } + if (p_ret) + *p_ret = n; + if (parent_ret) + *parent_ret = prev; + if (prev_ret) { orig_prev = prev; while (prev && offset > prev_entry->end) { @@ -227,18 +317,27 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, return NULL; } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +static inline struct rb_node * +tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_node *prev = NULL; struct rb_node *ret; - ret = __etree_search(tree, offset, &prev, NULL); + ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); if (!ret) return prev; return ret; } +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { @@ -292,21 +391,21 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, int *bits) + struct extent_state *state, unsigned long *bits) { if (tree->ops && tree->ops->set_bit_hook) tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, int *bits) + struct extent_state *state, unsigned long *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, int *bits); + struct extent_state *state, unsigned long *bits); /* * insert an extent_state struct into the tree. 
'bits' are set on the @@ -320,29 +419,27 @@ static void set_state_bits(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - int *bits) + struct rb_node ***p, + struct rb_node **parent, + unsigned long *bits) { struct rb_node *node; - if (end < start) { - printk(KERN_ERR "btrfs end < start %llu %llu\n", - (unsigned long long)end, - (unsigned long long)start); - WARN_ON(1); - } + if (end < start) + WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", + end, start); state->start = start; state->end = end; set_state_bits(tree, state, bits); - node = tree_insert(&tree->state, end, &state->rb_node); + node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - printk(KERN_ERR "btrfs found node %llu %llu on insert of " - "%llu %llu\n", (unsigned long long)found->start, - (unsigned long long)found->end, - (unsigned long long)start, (unsigned long long)end); + printk(KERN_ERR "BTRFS: found node %llu %llu on insert of " + "%llu %llu\n", + found->start, found->end, start, end); return -EEXIST; } state->tree = tree; @@ -383,7 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, + &prealloc->rb_node, NULL, NULL); if (node) { free_extent_state(prealloc); return -EEXIST; @@ -392,20 +490,28 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, return 0; } +static struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1), or - * forcibly remove the state from the tree (delete == 1). + * it will optionally wake up any one waiting on this state (wake == 1). 
* * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree */ -static int clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - int *bits, int wake) +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long *bits, int wake) { - int bits_to_clear = *bits & ~EXTENT_CTLBITS; - int ret = state->state & bits_to_clear; + struct extent_state *next; + unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -417,6 +523,7 @@ static int clear_state_bit(struct extent_io_tree *tree, if (wake) wake_up(&state->wq); if (state->state == 0) { + next = next_state(state); if (state->tree) { rb_erase(&state->rb_node, &tree->state); state->tree = NULL; @@ -426,8 +533,9 @@ static int clear_state_bit(struct extent_io_tree *tree, } } else { merge_state(tree, state); + next = next_state(state); } - return ret; + return next; } static struct extent_state * @@ -439,6 +547,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc) return prealloc; } +static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree_fs_info(tree), err, "Locking error: " + "Extent tree was modified by another " + "thread while locked."); +} + /* * clear some bits on a range in the tree. This may require splitting * or inserting elements in the tree, so the gfp mask is used to @@ -449,24 +564,26 @@ alloc_extent_state_atomic(struct extent_state *prealloc) * * the range [start, end] is inclusive. * - * This takes the tree lock, and returns < 0 on error, > 0 if any of the - * bits were already set, or zero if none of the bits were already set. + * This takes the tree lock, and returns 0 on success and < 0 on error. 
*/ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, + unsigned long bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *cached; struct extent_state *prealloc = NULL; - struct rb_node *next_node; struct rb_node *node; u64 last_end; int err; - int set = 0; int clear = 0; + btrfs_debug_check_extent_io_range(tree, start, end); + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + if (delete) bits |= ~EXTENT_CTLBITS; bits |= EXTENT_FIRST_DELALLOC; @@ -513,6 +630,12 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; + /* the state doesn't have the wanted bits, go ahead */ + if (!(state->state & bits)) { + state = next_state(state); + goto next; + } + /* * | ---- desired range ---- | * | state | or @@ -533,15 +656,15 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, &bits, wake); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; + state = clear_state_bit(tree, state, &bits, wake); + goto next; } goto search_again; } @@ -555,31 +678,25 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; } - if (state->end < end && prealloc && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - - set |= clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake); +next: if (last_end == (u64)-1) goto out; start = last_end + 1; - if (start <= end && next_node) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + if (start <= end && state && !need_resched()) + goto hit_next; goto search_again; out: @@ -587,7 +704,7 @@ out: if (prealloc) free_extent_state(prealloc); - return set; + return 0; search_again: if (start > end) @@ -598,8 +715,8 @@ search_again: goto again; } -static int wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) __releases(tree->lock) __acquires(tree->lock) { @@ -609,7 +726,6 @@ static int wait_on_state(struct extent_io_tree *tree, schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - return 0; } /* @@ -617,11 +733,14 @@ static int wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. 
* The tree lock is taken by this function */ -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits) { struct extent_state *state; struct rb_node *node; + btrfs_debug_check_extent_io_range(tree, start, end); + spin_lock(&tree->lock); again: while (1) { @@ -630,6 +749,7 @@ again: * our range starts */ node = tree_search(tree, start); +process_node: if (!node) break; @@ -650,18 +770,20 @@ again: if (start > end) break; - cond_resched_lock(&tree->lock); + if (!cond_resched_lock(&tree->lock)) { + node = rb_next(node); + goto process_node; + } } out: spin_unlock(&tree->lock); - return 0; } static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - int *bits) + unsigned long *bits) { - int bits_to_set = *bits & ~EXTENT_CTLBITS; + unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { @@ -682,15 +804,6 @@ static void cache_state(struct extent_state *state, } } -static void uncache_state(struct extent_state **cached_ptr) -{ - if (cached_ptr && (*cached_ptr)) { - struct extent_state *state = *cached_ptr; - *cached_ptr = NULL; - free_extent_state(state); - } -} - /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. @@ -702,17 +815,23 @@ static void uncache_state(struct extent_state **cached_ptr) * [start, end] is inclusive This takes the tree lock. */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask) +static int __must_check +__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits, unsigned long exclusive_bits, + u64 *failed_start, struct extent_state **cached_state, + gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; + btrfs_debug_check_extent_io_range(tree, start, end); + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { @@ -733,13 +852,17 @@ again: * this search will find all the extents that end after * our range starts. 
*/ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, &bits); + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); + if (err) + extent_io_tree_panic(tree, err); + + cache_state(prealloc, cached_state); prealloc = NULL; - BUG_ON(err == -EEXIST); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -754,7 +877,6 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - struct rb_node *next_node; if (state->state & exclusive_bits) { *failed_start = state->start; err = -EEXIST; @@ -762,20 +884,15 @@ hit_next: } set_state_bits(tree, state, &bits); - cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; goto search_again; } @@ -805,7 +922,9 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; @@ -816,6 +935,10 @@ hit_next: if (last_end == (u64)-1) goto out; start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; } goto search_again; } @@ -841,13 +964,10 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + NULL, NULL, &bits); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; @@ -869,7 +989,8 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); cache_state(prealloc, cached_state); @@ -896,13 +1017,24 @@ search_again: goto again; } +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits, u64 * failed_start, + struct extent_state **cached_state, gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, 0, failed_start, + cached_state, mask); +} + + /** - * convert_extent - convert all bits in a given range from one bit to another + * convert_extent_bit - convert all bits in a given range from one bit to + * another * @tree: the io tree to search * @start: the start offset in bytes * @end: the end offset in bytes (inclusive) * @bits: the bits to set in this range * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache * @mask: the allocation mask * * This will go through and set bits for the given range. If any states exist @@ -912,15 +1044,20 @@ search_again: * boundary bits like LOCK. 
*/ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, gfp_t mask) + unsigned long bits, unsigned long clear_bits, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; + btrfs_debug_check_extent_io_range(tree, start, end); + again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -929,20 +1066,32 @@ again: } spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + state->tree) { + node = &state->rb_node; + goto hit_next; + } + } + /* * this search will find all the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { err = -ENOMEM; goto out; } - err = insert_state(tree, prealloc, start, end, &bits); + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; - BUG_ON(err == -EEXIST); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -957,23 +1106,15 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - struct rb_node *next_node; - set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); - - merge_state(tree, state); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; goto search_again; } @@ -1000,17 +1141,21 @@ hit_next: goto out; } err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); prealloc = NULL; if (err) goto out; if (state->end <= end) { set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); - merge_state(tree, state); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; } goto search_again; } @@ -1039,13 +1184,10 @@ hit_next: * the later extent. 
*/ err = insert_state(tree, prealloc, start, this_end, - &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + NULL, NULL, &bits); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1064,12 +1206,12 @@ hit_next: } err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); clear_state_bit(tree, prealloc, &clear_bits, 0); - - merge_state(tree, prealloc); prealloc = NULL; goto out; } @@ -1096,19 +1238,19 @@ search_again: int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, NULL, mask); } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) + unsigned long bits, gfp_t mask) { - return set_extent_bit(tree, start, end, bits, 0, NULL, + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) + unsigned long bits, gfp_t mask) { return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } @@ -1118,7 +1260,15 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - 0, NULL, cached_state, mask); + NULL, cached_state, mask); +} + +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, @@ -1132,20 +1282,19 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); } int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, - NULL, cached_state, mask); + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + cached_state, mask); } -static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, - gfp_t mask) +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) { return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, cached_state, mask); @@ -1156,42 +1305,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, * us if waiting is desired. 
*/ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached_state, gfp_t mask) + unsigned long bits, struct extent_state **cached_state) { int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, - EXTENT_LOCKED, &failed_start, - cached_state, mask); - if (err == -EEXIST && (mask & __GFP_WAIT)) { + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS); + if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; - } else { + } else break; - } WARN_ON(start > end); } return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { - return lock_extent_bits(tree, start, end, 0, NULL, mask); + return lock_extent_bits(tree, start, end, 0, NULL); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; u64 failed_start; - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, mask); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, mask); + EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); return 0; } return 1; @@ -1204,10 +1351,43 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, mask); } -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) { return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - mask); + GFP_NOFS); +} + +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + return 0; } /* @@ -1221,7 +1401,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) while (index <= end_index) { page = find_get_page(tree->mapping, index); - BUG_ON(!page); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ set_page_writeback(page); page_cache_release(page); index++; @@ -1233,8 +1413,9 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) * return it. tree->lock must be held. 
NULL will returned if * nothing was found after 'start' */ -struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, int bits) +static struct extent_state * +find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, unsigned long bits) { struct rb_node *node; struct extent_state *state; @@ -1265,21 +1446,45 @@ out: * returned if we find something, and *start_ret and *end_ret are * set to reflect the state struct that was found. * - * If nothing was found, 1 is returned, < 0 on error + * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) + u64 *start_ret, u64 *end_ret, unsigned long bits, + struct extent_state **cached_state) { struct extent_state *state; + struct rb_node *n; int ret = 1; spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && state->tree) { + n = rb_next(&state->rb_node); + while (n) { + state = rb_entry(n, struct extent_state, + rb_node); + if (state->state & bits) + goto got_it; + n = rb_next(n); + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } + state = find_first_extent_bit_state(tree, start, bits); +got_it: if (state) { + cache_state(state, cached_state); *start_ret = state->start; *end_ret = state->end; ret = 0; } +out: spin_unlock(&tree->lock); return ret; } @@ -1333,20 +1538,20 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *end = state->end; cur_start = state->end + 1; node = rb_next(node); - if (!node) - break; total_bytes += state->end - state->start + 1; if (total_bytes >= max_bytes) break; + if (!node) + break; } out: spin_unlock(&tree->lock); return found; } -static noinline int __unlock_for_delalloc(struct inode *inode, - struct page *locked_page, - u64 start, u64 end) +static noinline void __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) { int ret; struct page *pages[16]; @@ -1356,7 +1561,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode, int i; if (index == locked_page->index && end_index == index) - return 0; + return; while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -1371,7 +1576,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode, index += ret; cond_resched(); } - return 0; } static noinline int lock_delalloc_pages(struct inode *inode, @@ -1442,11 +1646,10 @@ done: * * 1 is returned if we find something, 0 if nothing was in the tree */ -static noinline u64 find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, - u64 *start, u64 *end, - u64 max_bytes) +STATIC u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes) { u64 delalloc_start; u64 delalloc_end; @@ -1465,7 +1668,7 @@ again: *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return found; + return 0; } /* @@ -1478,10 +1681,9 @@ again: /* * make sure to limit the number of pages we try to lock down - * if we're looping. 
*/ - if (delalloc_end + 1 - delalloc_start > max_bytes && loops) - delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + if (delalloc_end + 1 - delalloc_start > max_bytes) + delalloc_end = delalloc_start + max_bytes - 1; /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, @@ -1491,9 +1693,9 @@ again: * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); + cached_state = NULL; if (!loops) { - unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); - max_bytes = PAGE_CACHE_SIZE - offset; + max_bytes = PAGE_CACHE_SIZE; loops = 1; goto again; } else { @@ -1501,11 +1703,10 @@ again: goto out_failed; } } - BUG_ON(ret); + BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, - 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -1525,31 +1726,21 @@ out_failed: return found; } -int extent_clear_unlock_delalloc(struct inode *inode, - struct extent_io_tree *tree, - u64 start, u64 end, struct page *locked_page, - unsigned long op) +int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + struct page *locked_page, + unsigned long clear_bits, + unsigned long page_ops) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; int ret; struct page *pages[16]; unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; - int clear_bits = 0; - - if (op & EXTENT_CLEAR_UNLOCK) - clear_bits |= EXTENT_LOCKED; - if (op & EXTENT_CLEAR_DIRTY) - clear_bits |= EXTENT_DIRTY; - - if (op & EXTENT_CLEAR_DELALLOC) - clear_bits |= EXTENT_DELALLOC; clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | - EXTENT_SET_PRIVATE2))) + if (page_ops == 0) return 0; while (nr_pages > 0) { @@ -1558,20 +1749,20 @@ int extent_clear_unlock_delalloc(struct inode *inode, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { - if (op & EXTENT_SET_PRIVATE2) + if (page_ops & PAGE_SET_PRIVATE2) SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (op & EXTENT_CLEAR_DIRTY) + if (page_ops & PAGE_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (op & EXTENT_SET_WRITEBACK) + if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); - if (op & EXTENT_END_WRITEBACK) + if (page_ops & PAGE_END_WRITEBACK) end_page_writeback(pages[i]); - if (op & EXTENT_CLEAR_UNLOCK_PAGE) + if (page_ops & PAGE_UNLOCK) unlock_page(pages[i]); page_cache_release(pages[i]); } @@ -1598,10 +1789,8 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 last = 0; int found = 0; - if (search_end <= cur_start) { - WARN_ON(1); + if (WARN_ON(search_end <= cur_start)) return 0; - } spin_lock(&tree->lock); if (cur_start == 0 && bits == EXTENT_DIRTY) { @@ -1648,7 +1837,7 @@ out: * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. 
*/ -int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) { struct rb_node *node; struct extent_state *state; @@ -1709,7 +1898,7 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled, struct extent_state *cached) + unsigned long bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; @@ -1762,39 +1951,12 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, * helper function to set a given page up to date if all the * extents in the tree for that page are up to date */ -static int check_page_uptodate(struct extent_io_tree *tree, - struct page *page) +static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); - return 0; -} - -/* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked - */ -static int check_page_locked(struct extent_io_tree *tree, - struct page *page) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) - unlock_page(page); - return 0; -} - -/* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback - */ -static int check_page_writeback(struct extent_io_tree *tree, - struct page *page) -{ - end_page_writeback(page); - return 0; } /* @@ -1830,56 +1992,52 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, if (ret) err = ret; - if (did_repair) { - ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); - if (ret && !err) - err = ret; - } + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; kfree(rec); return err; } -static void repair_io_failure_callback(struct bio *bio, int err) -{ - complete(bio->bi_private); -} - /* * this bypasses the standard btrfs submit functions deliberately, as * the standard behavior is to write all copies in a raid setup. here we only * want to write the one bad copy. so we do the mapping for ourselves and issue * submit_bio directly. - * to avoid any synchonization issues, wait for the data after writing, which + * to avoid any synchronization issues, wait for the data after writing, which * actually prevents the read that triggered the error from finishing. * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. 
*/ -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 length, u64 logical, struct page *page, int mirror_num) { struct bio *bio; struct btrfs_device *dev; - DECLARE_COMPLETION_ONSTACK(compl); u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; + ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); - bio = bio_alloc(GFP_NOFS, 1); + /* we can't repair anything in raid56 yet */ + if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) + return 0; + + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_private = &compl; - bio->bi_end_io = repair_io_failure_callback; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; map_length = length; - ret = btrfs_map_block(map_tree, WRITE, logical, + ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { bio_put(bio); @@ -1887,7 +2045,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, } BUG_ON(mirror_num != bbio->mirror_num); sector = bbio->stripes[mirror_num-1].physical >> 9; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; dev = bbio->stripes[mirror_num-1].dev; kfree(bbio); if (!dev || !dev->bdev || !dev->writeable) { @@ -1895,24 +2053,46 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return -EIO; } bio->bi_bdev = dev->bdev; - bio_add_page(bio, page, length, start-page_offset(page)); - btrfsic_submit_bio(WRITE_SYNC, bio); - wait_for_completion(&compl); + bio_add_page(bio, page, length, start - page_offset(page)); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { /* try to remap that extent elsewhere? 
*/ bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } - printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " - "sector %llu)\n", page->mapping->host->i_ino, start, - dev->name, sector); + printk_ratelimited_in_rcu(KERN_INFO + "BTRFS: read error corrected: ino %lu off %llu " + "(dev %s sector %llu)\n", page->mapping->host->i_ino, + start, rcu_str_deref(dev->name), sector); bio_put(bio); return 0; } +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num) +{ + u64 start = eb->start; + unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, + start, p, mirror_num); + if (ret) + break; + start += PAGE_CACHE_SIZE; + } + + return ret; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -1922,12 +2102,12 @@ static int clean_io_failure(u64 start, struct page *page) u64 private; u64 private_failure; struct io_failure_record *failrec; - struct btrfs_mapping_tree *map_tree; + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_state *state; int num_copies; int did_repair = 0; int ret; - struct inode *inode = page->mapping->host; private = 0; ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, @@ -1950,6 +2130,8 @@ static int clean_io_failure(u64 start, struct page *page) did_repair = 1; goto out; } + if (fs_info->sb->s_flags & MS_RDONLY) + goto out; spin_lock(&BTRFS_I(inode)->io_tree.lock); state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, @@ -1957,16 +2139,17 @@ static int clean_io_failure(u64 start, struct page *page) EXTENT_LOCKED); spin_unlock(&BTRFS_I(inode)->io_tree.lock); - if (state && state->start == failrec->start) { - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - num_copies = btrfs_num_copies(map_tree, failrec->logical, - failrec->len); + if (state && state->start <= failrec->start && + state->end >= failrec->start + failrec->len - 1) { + num_copies = btrfs_num_copies(fs_info, failrec->logical, + failrec->len); if (num_copies > 1) { - ret = repair_io_failure(map_tree, start, failrec->len, + ret = repair_io_failure(fs_info, start, failrec->len, failrec->logical, page, failrec->failed_mirror); did_repair = !ret; } + ret = 0; } out: @@ -1984,9 +2167,9 @@ out: * needed */ -static int bio_readpage_error(struct bio *failed_bio, struct page *page, - u64 start, u64 end, int failed_mirror, - struct extent_state *state) +static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int failed_mirror) { struct io_failure_record *failrec = NULL; u64 private; @@ -1996,6 +2179,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct bio *bio; + struct btrfs_io_bio *btrfs_failed_bio; + struct btrfs_io_bio *btrfs_bio; int num_copies; int ret; int read_mode; @@ -2022,13 +2207,13 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, return -EIO; } - if (em->start > start || em->start + em->len < start) { + if (em->start > start || em->start + em->len <= 
start) { free_extent_map(em); em = NULL; } read_unlock(&em_tree->lock); - if (!em || IS_ERR(em)) { + if (!em) { kfree(failrec); return -EIO; } @@ -2071,32 +2256,20 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, * clean_io_failure() clean all those errors at once. */ } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); if (num_copies == 1) { /* * we only have a single copy of the data, so don't bother with * all the retry and error correction code that follows. no * matter what the error is, it is very likely to persist. */ - pr_debug("bio_readpage_error: cannot repair, num_copies == 1. " - "state=%p, num_copies=%d, next_mirror %d, " - "failed_mirror %d\n", state, num_copies, - failrec->this_mirror, failed_mirror); + pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); free_io_failure(inode, failrec, 0); return -EIO; } - if (!state) { - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&tree->lock); - } - /* * there are two premises: * a) deliver good data to the caller @@ -2133,20 +2306,35 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, read_mode = READ_SYNC; } - if (!state || failrec->this_mirror > num_copies) { - pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " - "next_mirror %d, failed_mirror %d\n", state, + if (failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", num_copies, failrec->this_mirror, failed_mirror); free_io_failure(inode, failrec, 0); return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_private = state; + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + if (!bio) { + free_io_failure(inode, failrec, 0); + return -EIO; + } bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; + bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; + + btrfs_failed_bio = btrfs_io_bio(failed_bio); + if (btrfs_failed_bio->csum) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = btrfs_bio->csum_inline; + phy_offset >>= inode->i_sb->s_blocksize_bits; + phy_offset *= csum_size; + memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset, + csum_size); + } bio_add_page(bio, page, failrec->len, start - page_offset(page)); @@ -2154,13 +2342,38 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, failrec->this_mirror, num_copies, failrec->in_validation); - tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, - failrec->bio_flags, 0); - return 0; + ret = tree->ops->submit_bio_hook(inode, read_mode, bio, + failrec->this_mirror, + failrec->bio_flags, 0); + return ret; } /* lots and lots of room for performance fixes in the end_bio funcs */ +int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +{ + int uptodate = (err == 0); + struct extent_io_tree *tree; + int ret = 0; + + tree = 
&BTRFS_I(page->mapping->host)->io_tree; + + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate) { + ClearPageUptodate(page); + SetPageError(page); + ret = ret < 0 ? ret : -EIO; + mapping_set_error(page->mapping, ret); + } + return 0; +} + /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -2172,61 +2385,55 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, */ static void end_bio_extent_writepage(struct bio *bio, int err) { - int uptodate = err == 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_io_tree *tree; + struct bio_vec *bvec; u64 start; u64 end; - int whole_page; - int ret; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - tree = &BTRFS_I(page->mapping->host)->io_tree; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page write in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); } - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(bio, page, - start, end, NULL); - if (ret == 0) { - uptodate = (err == 0); - continue; - } - } + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); - ClearPageUptodate(page); - SetPageError(page); - } + if (end_extent_writepage(page, err, start, end)) + continue; - if (whole_page) - end_page_writeback(page); - else - check_page_writeback(tree, page); - } while (bvec >= bio->bi_io_vec); + end_page_writeback(page); + } bio_put(bio); } +static void +endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, + int uptodate) +{ + struct extent_state *cached = NULL; + u64 end = start + len - 1; + + if (uptodate && tree->track_uptodate) + set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); +} + /* * after a readpage IO is done, we need to: * clear the uptodate bits on error @@ -2240,62 +2447,74 @@ static void end_bio_extent_writepage(struct bio *bio, int err) */ static void end_bio_extent_readpage(struct bio *bio, int err) { + struct bio_vec *bvec; int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_io_bio 
*io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; + u64 offset = 0; u64 start; u64 end; - int whole_page; + u64 len; + u64 extent_start = 0; + u64 extent_len = 0; + int mirror; int ret; + int i; if (err) uptodate = 0; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - struct extent_state *cached = NULL; - struct extent_state *state; - - pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " - "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, - (long int)bio->bi_bdev); - tree = &BTRFS_I(page->mapping->host)->io_tree; - - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; - - if (++bvec <= bvec_end) - prefetchw(&bvec->bv_page->flags); - - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); - if (state && state->start == start) { - /* - * take a reference on the state, unlock will drop - * the ref - */ - cache_state(state, &cached); + struct inode *inode = page->mapping->host; + + pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " + "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, + io_bio->mirror_num); + tree = &BTRFS_I(inode)->io_tree; + + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page read in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); } - spin_unlock(&tree->lock); - if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { - ret = tree->ops->readpage_end_io_hook(page, start, end, - state); + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; + len = bvec->bv_len; + + mirror = io_bio->mirror_num; + if (likely(uptodate && tree->ops && + tree->ops->readpage_end_io_hook)) { + ret = tree->ops->readpage_end_io_hook(io_bio, offset, + page, start, end, + mirror); if (ret) uptodate = 0; else clean_io_failure(start, page); } - if (!uptodate) { - int failed_mirror; - failed_mirror = (int)(unsigned long)bio->bi_bdev; + + if (likely(uptodate)) + goto readpage_ok; + + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(page, mirror); + if (!ret && !err && + test_bit(BIO_UPTODATE, &bio->bi_flags)) + uptodate = 1; + } else { /* * The generic bio_readpage_error handles errors the * following way: If possible, new read requests are @@ -2306,77 +2525,121 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * can't handle the error it will return -EIO and we * remain responsible for that page. 
*/ - ret = bio_readpage_error(bio, page, start, end, - failed_mirror, NULL); + ret = bio_readpage_error(bio, offset, page, start, end, + mirror); if (ret == 0) { -error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) uptodate = 0; - uncache_state(&cached); continue; } - if (tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook( - bio, page, start, end, - failed_mirror, state); - if (ret == 0) - goto error_handled; - } } - - if (uptodate) { - set_extent_uptodate(tree, start, end, &cached, - GFP_ATOMIC); +readpage_ok: + if (likely(uptodate)) { + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + /* Zero out the end if this page straddles i_size */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index == end_index && offset) + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); } - unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); - - if (whole_page) { - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); + unlock_page(page); + offset += len; + + if (unlikely(!uptodate)) { + if (extent_len) { + endio_readpage_release_extent(tree, + extent_start, + extent_len, 1); + extent_start = 0; + extent_len = 0; } - unlock_page(page); + endio_readpage_release_extent(tree, start, + end - start + 1, 0); + } else if (!extent_len) { + extent_start = start; + extent_len = end + 1 - start; + } else if (extent_start + extent_len == start) { + extent_len += end + 1 - start; } else { - if (uptodate) { - check_page_uptodate(tree, page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - check_page_locked(tree, page); + endio_readpage_release_extent(tree, extent_start, + extent_len, uptodate); + extent_start = start; + extent_len = end + 1 - start; } - } while (bvec <= bvec_end); + } + if (extent_len) + endio_readpage_release_extent(tree, extent_start, extent_len, + uptodate); + if (io_bio->end_io) + io_bio->end_io(io_bio, err); bio_put(bio); } +/* + * this allocates from the btrfs_bioset. 
We're returning a bio right now + * but you can call btrfs_io_bio for the appropriate container_of magic + */ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags) { + struct btrfs_io_bio *btrfs_bio; struct bio *bio; - bio = bio_alloc(gfp_flags, nr_vecs); + bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); + while (!bio && (nr_vecs /= 2)) { + bio = bio_alloc_bioset(gfp_flags, + nr_vecs, btrfs_bioset); + } } if (bio) { - bio->bi_size = 0; bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; } return bio; } -static int submit_one_bio(int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); +} + + +/* this also allocates from the btrfs_bioset */ +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + struct btrfs_io_bio *btrfs_bio; + struct bio *bio; + + bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); + if (bio) { + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return bio; +} + + +static int __must_check submit_one_bio(int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -2384,7 +2647,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, struct extent_io_tree *tree = bio->bi_private; u64 start; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; @@ -2402,6 +2665,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, return ret; } +static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, + unsigned long offset, size_t size, struct bio *bio, + unsigned long bio_flags) +{ + int ret = 0; + if (tree->ops && tree->ops->merge_bio_hook) + ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, + bio_flags); + BUG_ON(ret < 0); + return ret; + +} + static int submit_extent_page(int rw, struct extent_io_tree *tree, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2424,18 +2700,17 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret && *bio_ret) { bio = *bio_ret; if (old_compressed) - contig = bio->bi_sector == sector; + contig = bio->bi_iter.bi_sector == sector; else - contig = bio->bi_sector + (bio->bi_size >> 9) == - sector; + contig = bio_end_sector(bio) == sector; if (prev_bio_flags != bio_flags || !contig || - (tree->ops && tree->ops->merge_bio_hook && - tree->ops->merge_bio_hook(page, offset, page_size, bio, - bio_flags)) || + merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); + if (ret < 0) + return ret; bio = NULL; } else { return 0; @@ -2462,6 +2737,18 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } +static void attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); 
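		/*
		 * Editorial note, not part of the original patch: the page
		 * reference taken by page_cache_get() above pairs with the
		 * "One for the page private" page_cache_release() in
		 * btrfs_release_extent_buffer_page() once the private pointer
		 * is cleared again.
		 */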
+ set_page_private(page, (unsigned long)eb); + } else { + WARN_ON(page->private != (unsigned long)eb); + } +} + void set_page_extent_mapped(struct page *page) { if (!PagePrivate(page)) { @@ -2471,25 +2758,48 @@ void set_page_extent_mapped(struct page *page) } } -static void set_page_extent_head(struct page *page, unsigned long len) +static struct extent_map * +__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, + u64 start, u64 len, get_extent_t *get_extent, + struct extent_map **em_cached) { - WARN_ON(!PagePrivate(page)); - set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); -} + struct extent_map *em; + if (em_cached && *em_cached) { + em = *em_cached; + if (extent_map_in_tree(em) && start >= em->start && + start < extent_map_end(em)) { + atomic_inc(&em->refs); + return em; + } + + free_extent_map(em); + *em_cached = NULL; + } + + em = get_extent(inode, page, pg_offset, start, len, 0); + if (em_cached && !IS_ERR_OR_NULL(em)) { + BUG_ON(*em_cached); + atomic_inc(&em->refs); + *em_cached = em; + } + return em; +} /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io * handlers) + * XXX JDM: This needs looking at to ensure proper page locking */ -static int __extent_read_full_page(struct extent_io_tree *tree, - struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags) +static int __do_readpage(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) { struct inode *inode = page->mapping->host; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; @@ -2500,68 +2810,64 @@ static int __extent_read_full_page(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; - struct btrfs_ordered_extent *ordered; int ret; int nr = 0; + int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; size_t pg_offset = 0; size_t iosize; size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = 0; + unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; set_page_extent_mapped(page); + end = page_end; if (!PageUptodate(page)) { if (cleancache_get_page(page) == 0) { BUG_ON(blocksize != PAGE_SIZE); + unlock_extent(tree, start, end); goto out; } } - end = page_end; - while (1) { - lock_extent(tree, start, end, GFP_NOFS); - ordered = btrfs_lookup_ordered_extent(inode, start); - if (!ordered) - break; - unlock_extent(tree, start, end, GFP_NOFS); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - if (page->index == last_byte >> PAGE_CACHE_SHIFT) { char *userpage; size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); if (zero_offset) { iosize = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); } } while (cur <= end) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + if (cur >= last_byte) { char *userpage; struct extent_state *cached = NULL; iosize = PAGE_CACHE_SIZE - pg_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); 
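		/*
		 * Editorial note, not part of the original patch: this branch
		 * covers the tail of the page beyond the last byte of the file;
		 * the range is zeroed and marked uptodate in the io tree
		 * without submitting any read bio.
		 */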
flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); + if (!parent_locked) + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); break; } - em = get_extent(inode, page, pg_offset, cur, - end - cur + 1, 0); + em = __get_extent_map(inode, page, pg_offset, cur, + end - cur + 1, get_extent, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - unlock_extent(tree, cur, end, GFP_NOFS); + if (!parent_locked) + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -2569,14 +2875,14 @@ static int __extent_read_full_page(struct extent_io_tree *tree, BUG_ON(end < cur); if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag = EXTENT_BIO_COMPRESSED; + this_bio_flag |= EXTENT_BIO_COMPRESSED; extent_set_compress_type(&this_bio_flag, em->compress_type); } iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; sector = em->block_start >> 9; @@ -2596,10 +2902,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, char *userpage; struct extent_state *cached = NULL; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -2613,7 +2919,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2623,31 +2930,28 @@ static int __extent_read_full_page(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; } - ret = 0; - if (tree->ops && tree->ops->readpage_io_hook) { - ret = tree->ops->readpage_io_hook(page, cur, - cur + iosize - 1); - } - if (!ret) { - unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; - pnr -= page->index; - ret = submit_extent_page(READ, tree, page, + pnr -= page->index; + ret = submit_extent_page(rw, tree, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); + if (!ret) { nr++; *bio_flags = this_bio_flag; - } - if (ret) + } else { SetPageError(page); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); + } cur = cur + iosize; pg_offset += iosize; } @@ -2660,6 +2964,104 @@ out: return 0; } +static inline void __do_contiguous_readpages(struct extent_io_tree *tree, + struct page *pages[], int nr_pages, + u64 start, u64 end, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode; + struct btrfs_ordered_extent *ordered; + int index; + + inode = pages[0]->mapping->host; + while (1) { + lock_extent(tree, start, end); + ordered = 
btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + for (index = 0; index < nr_pages; index++) { + __do_readpage(tree, pages[index], get_extent, em_cached, bio, + mirror_num, bio_flags, rw); + page_cache_release(pages[index]); + } +} + +static void __extent_readpages(struct extent_io_tree *tree, + struct page *pages[], + int nr_pages, get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + u64 start = 0; + u64 end = 0; + u64 page_start; + int index; + int first_index = 0; + + for (index = 0; index < nr_pages; index++) { + page_start = page_offset(pages[index]); + if (!end) { + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } else if (end + 1 == page_start) { + end += PAGE_CACHE_SIZE; + } else { + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, + bio, mirror_num, bio_flags, + rw); + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } + } + + if (end) + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, bio, + mirror_num, bio_flags, rw); +} + +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode = page->mapping->host; + struct btrfs_ordered_extent *ordered; + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret; + + while (1) { + lock_extent(tree, start, end); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, + bio_flags, rw); + return ret; +} + int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num) { @@ -2668,7 +3070,21 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, int ret; ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, - &bio_flags); + &bio_flags, READ); + if (bio) + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + return ret; +} + +int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num) +{ + struct bio *bio = NULL; + unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; + int ret; + + ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, + &bio_flags, READ); if (bio) ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; @@ -2685,144 +3101,143 @@ static noinline void update_nr_written(struct page *page, } /* - * the writepage semantics are similar to regular writepage. extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback. Then the lock bits are removed - * and the end_io handler clears the writeback ranges + * helper for __extent_writepage, doing all of the delayed allocation setup. + * + * This returns 1 if our fill_delalloc function did all the work required + * to write the page (copy into inline extent). 
In this case the IO has + * been started and the page is already unlocked. + * + * This returns 0 if all went well (page still locked) + * This returns < 0 if there were errors (page still locked) */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) +static noinline_for_stack int writepage_delalloc(struct inode *inode, + struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + u64 delalloc_start, + unsigned long *nr_written) { - struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; struct extent_io_tree *tree = epd->tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 delalloc_start; + u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; + u64 nr_delalloc; + u64 delalloc_to_write = 0; + u64 delalloc_end = 0; + int ret; + int page_started = 0; + + if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) + return 0; + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + nr_written); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + /* fill_delalloc should be return < 0 for error + * but just in case, we use > 0 here meaning the + * IO is started, so we don't want to return > 0 + * unless things are going well. + */ + ret = ret < 0 ? ret : -EIO; + goto done; + } + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= *nr_written; + return 1; + } + + ret = 0; + +done: + return ret; +} + +/* + * helper for __extent_writepage. This calls the writepage start hooks, + * and does the loop to map the page into extents and bios. 
+ * + * We return 1 if the IO is started and the page is unlocked, + * 0 if all went well (page still locked) + * < 0 if there were errors (page still locked) + */ +static noinline_for_stack int __extent_writepage_io(struct inode *inode, + struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd, + loff_t i_size, + unsigned long nr_written, + int write_flags, int *nr_ret) +{ + struct extent_io_tree *tree = epd->tree; + u64 start = page_offset(page); u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; - u64 last_byte = i_size_read(inode); u64 block_start; u64 iosize; sector_t sector; struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; - int ret; - int nr = 0; size_t pg_offset = 0; size_t blocksize; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - u64 nr_delalloc; - u64 delalloc_end; - int page_started; - int compressed; - int write_flags; - unsigned long nr_written = 0; - bool fill_delalloc = true; - - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; - else - write_flags = WRITE; - - trace___extent_writepage(page, inode, wbc); - - WARN_ON(!PageLocked(page)); - - ClearPageError(page); - - pg_offset = i_size & (PAGE_CACHE_SIZE - 1); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0); - unlock_page(page); - return 0; - } - - if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + pg_offset, 0, - PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage, KM_USER0); - flush_dcache_page(page); - } - pg_offset = 0; - - set_page_extent_mapped(page); - - if (!tree->ops || !tree->ops->fill_delalloc) - fill_delalloc = false; - - delalloc_start = start; - delalloc_end = 0; - page_started = 0; - if (!epd->extent_locked && fill_delalloc) { - u64 delalloc_to_write = 0; - /* - * make sure the wbc mapping index is at least updated - * to this page. - */ - update_nr_written(page, wbc, 0); - - while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, - page, - &delalloc_start, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc == 0) { - delalloc_start = delalloc_end + 1; - continue; - } - tree->ops->fill_delalloc(inode, page, delalloc_start, - delalloc_end, &page_started, - &nr_written); - /* - * delalloc_end is already one less than the total - * length, so we don't subtract one from - * PAGE_CACHE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - delalloc_start = delalloc_end + 1; - } - if (wbc->nr_to_write < delalloc_to_write) { - int thresh = 8192; - - if (delalloc_to_write < thresh * 2) - thresh = delalloc_to_write; - wbc->nr_to_write = min_t(u64, delalloc_to_write, - thresh); - } + int ret = 0; + int nr = 0; + bool compressed; - /* did the fill delalloc function already unlock and start - * the IO? - */ - if (page_started) { - ret = 0; - /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write. 
- */ - wbc->nr_to_write -= nr_written; - goto done_unlocked; - } - } if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); - ret = 0; + ret = 1; goto done_unlocked; } } @@ -2834,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, update_nr_written(page, wbc, nr_written + 1); end = page_end; - if (last_byte <= start) { + if (i_size <= start) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -2844,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, blocksize = inode->i_sb->s_blocksize; while (cur <= end) { - if (cur >= last_byte) { + u64 em_end; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -2854,14 +3270,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); + ret = PTR_ERR_OR_ZERO(em); break; } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + em_end = extent_map_end(em); + BUG_ON(em_end <= cur); BUG_ON(end < cur); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = min(em_end - cur, end - cur + 1); + iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; block_start = em->block_start; @@ -2896,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset += iosize; continue; } - /* leave this out until we have a page_mkwrite call */ - if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0, NULL)) { - cur = cur + iosize; - pg_offset += iosize; - continue; - } if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, @@ -2913,14 +3324,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret) { SetPageError(page); } else { - unsigned long max_nr = end_index + 1; + unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - printk(KERN_ERR "btrfs warning page %lu not " - "writeback, cur %llu end %llu\n", - page->index, (unsigned long long)cur, - (unsigned long long)end); + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", + page->index, cur, end); } ret = submit_extent_page(write_flags, tree, page, @@ -2936,20 +3346,391 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, nr++; } done: + *nr_ret = nr; + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. 
Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + u64 start = page_offset(page); + u64 page_end = start + PAGE_CACHE_SIZE - 1; + int ret; + int nr = 0; + size_t pg_offset = 0; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC; + else + write_flags = WRITE; + + trace___extent_writepage(page, inode, wbc); + + WARN_ON(!PageLocked(page)); + + ClearPageError(page); + + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage); + flush_dcache_page(page); + } + + pg_offset = 0; + + set_page_extent_mapped(page); + + ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + + ret = __extent_writepage_io(inode, page, wbc, epd, + i_size, nr_written, write_flags, &nr); + if (ret == 1) + goto done_unlocked; + +done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ set_page_writeback(page); end_page_writeback(page); } + if (PageError(page)) { + ret = ret < 0 ? ret : -EIO; + end_extent_writepage(page, ret, start, page_end); + } unlock_page(page); + return ret; done_unlocked: + return 0; +} - /* drop our reference on any cached states */ - free_extent_state(cached_state); +static int eb_wait(void *word) +{ + io_schedule(); return 0; } +void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, + TASK_UNINTERRUPTIBLE); +} + +static noinline_for_stack int +lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) +{ + unsigned long i, num_pages; + int flush = 0; + int ret = 0; + + if (!btrfs_try_tree_write_lock(eb)) { + flush = 1; + flush_write_bio(epd); + btrfs_tree_lock(eb); + } + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); + if (!epd->sync_io) + return 0; + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + while (1) { + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) + break; + btrfs_tree_unlock(eb); + } + } + + /* + * We need to do this to prevent races in people who check if the eb is + * under IO since we can end up having no IO bits set for a short period + * of time. 
+ */ + spin_lock(&eb->refs_lock); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + spin_unlock(&eb->refs_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); + ret = 1; + } else { + spin_unlock(&eb->refs_lock); + } + + btrfs_tree_unlock(eb); + + if (!ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + if (!trylock_page(p)) { + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + lock_page(p); + } + } + + return ret; +} + +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + +static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +{ + struct bio_vec *bvec; + struct extent_buffer *eb; + int i, done; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + done = atomic_dec_and_test(&eb->io_pages); + + if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + ClearPageUptodate(page); + SetPageError(page); + } + + end_page_writeback(page); + + if (!done) + continue; + + end_extent_buffer_writeback(eb); + } + + bio_put(bio); +} + +static noinline_for_stack int write_one_eb(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct block_device *bdev = fs_info->fs_devices->latest_bdev; + struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + u64 offset = eb->start; + unsigned long i, num_pages; + unsigned long bio_flags = 0; + int rw = (epd->sync_io ? 
WRITE_SYNC : WRITE) | REQ_META; + int ret = 0; + + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + atomic_set(&eb->io_pages, num_pages); + if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) + bio_flags = EXTENT_BIO_TREE_LOG; + + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + clear_page_dirty_for_io(p); + set_page_writeback(p); + ret = submit_extent_page(rw, tree, p, offset >> 9, + PAGE_CACHE_SIZE, 0, bdev, &epd->bio, + -1, end_bio_extent_buffer_writepage, + 0, epd->bio_flags, bio_flags); + epd->bio_flags = bio_flags; + if (ret) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + SetPageError(p); + if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) + end_extent_buffer_writeback(eb); + ret = -EIO; + break; + } + offset += PAGE_CACHE_SIZE; + update_nr_written(p, wbc, 1); + unlock_page(p); + } + + if (unlikely(ret)) { + for (; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + unlock_page(p); + } + } + + return ret; +} + +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct extent_buffer *eb, *prev_eb = NULL; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, + }; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int tag; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!PagePrivate(page)) + continue; + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + break; + } + + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + continue; + } + + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON + * but no sense in crashing the users box for something + * we can survive anyway. + */ + if (WARN_ON(!eb)) { + spin_unlock(&mapping->private_lock); + continue; + } + + if (eb == prev_eb) { + spin_unlock(&mapping->private_lock); + continue; + } + + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) + continue; + + prev_eb = eb; + ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + if (!ret) { + free_extent_buffer(eb); + continue; + } + + ret = write_one_eb(eb, fs_info, wbc, &epd); + if (ret) { + done = 1; + free_extent_buffer(eb); + break; + } + free_extent_buffer(eb); + + /* + * the filesystem may choose to bump up nr_to_write. 
+ * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + flush_write_bio(&epd); + return ret; +} + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -2971,8 +3752,10 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, writepage_t writepage, void *data, void (*flush_fn)(void *)) { + struct inode *inode = mapping->host; int ret = 0; int done = 0; + int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; @@ -2981,6 +3764,18 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, int scanned = 0; int tag; + /* + * We have to hold onto the inode so that ordered extents can do their + * work when the IO finishes. The alternative to this is failing to add + * an ordered extent if the igrab() fails there and that is a huge pain + * to deal with, so instead just hold onto the inode throughout the + * writepages operation. If it fails here we are freeing up the inode + * anyway and we'd rather not waste our time writing out stuff that is + * going to be truncated anyway. + */ + if (!igrab(inode)) + return 0; + pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ @@ -3013,15 +3808,9 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && - tree->ops->write_cache_pages_lock_hook) { - tree->ops->write_cache_pages_lock_hook(page, - data, flush_fn); - } else { - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); } if (unlikely(page->mapping != mapping)) { @@ -3053,8 +3842,8 @@ retry: unlock_page(page); ret = 0; } - if (ret) - done = 1; + if (!err && ret < 0) + err = ret; /* * the filesystem may choose to bump up nr_to_write. 
@@ -3066,7 +3855,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!scanned && !done && !err) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -3075,16 +3864,21 @@ retry: index = 0; goto retry; } - return ret; + btrfs_add_delayed_iput(inode); + return err; } static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { + int rw = WRITE; + int ret; + if (epd->sync_io) - submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); - else - submit_one_bio(WRITE, epd->bio, 0, 0); + rw = WRITE_SYNC; + + ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); + BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } } @@ -3106,6 +3900,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; ret = __extent_writepage(page, wbc, &epd); @@ -3130,6 +3925,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, + .bio_flags = 0, }; struct writeback_control wbc_writepages = { .sync_mode = mode, @@ -3169,6 +3965,7 @@ int extent_writepages(struct extent_io_tree *tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; ret = extent_write_cache_pages(tree, mapping, wbc, @@ -3186,22 +3983,39 @@ int extent_readpages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned page_idx; unsigned long bio_flags = 0; + struct page *pagepool[16]; + struct page *page; + struct extent_map *em_cached = NULL; + int nr = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); + page = list_entry(pages->prev, struct page, lru); prefetchw(&page->flags); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, + if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { - __extent_read_full_page(tree, page, get_extent, - &bio, 0, &bio_flags); + page_cache_release(page); + continue; } - page_cache_release(page); + + pagepool[nr++] = page; + if (nr < ARRAY_SIZE(pagepool)) + continue; + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); + nr = 0; } + if (nr) + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); + + if (em_cached) + free_extent_map(em_cached); + BUG_ON(!list_empty(pages)); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + return submit_one_bio(READ, bio, 0, bio_flags); return 0; } @@ -3214,15 +4028,15 @@ int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset) { struct extent_state *cached_state = NULL; - u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; - start += (offset + blocksize - 1) & ~(blocksize - 1); + start += ALIGN(offset, blocksize); if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, start, end, 0, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -3236,11 +4050,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, * are locked or under IO and drops the related state bits if it is safe * to drop the page. 
*/ -int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask) +static int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, + struct page *page, gfp_t mask) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; int ret = 1; @@ -3279,7 +4093,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, gfp_t mask) { struct extent_map *em; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if ((mask & __GFP_WAIT) && @@ -3289,7 +4103,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); - if (IS_ERR_OR_NULL(em)) { + if (!em) { write_unlock(&map->lock); break; } @@ -3333,11 +4147,11 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (offset >= last) return NULL; - while(1) { + while (1) { len = last - offset; if (len == 0) break; - len = (len + sectorsize - 1) & ~(sectorsize - 1); + len = ALIGN(len, sectorsize); em = get_extent(inode, NULL, 0, offset, len, 0); if (IS_ERR_OR_NULL(em)) return em; @@ -3357,6 +4171,19 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } +static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx) +{ + unsigned long cnt = *((unsigned long *)ctx); + + cnt++; + *((unsigned long *)ctx) = cnt; + + /* Now we're sure that the extent is shared. */ + if (cnt > 1) + return 1; + return 0; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -3373,12 +4200,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_file_extent_item *item; int end = 0; u64 em_start = 0; u64 em_len = 0; u64 em_end = 0; - unsigned long emflags; if (len == 0) return -EINVAL; @@ -3403,8 +4228,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } WARN_ON(!ret); path->slots[0]--; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); @@ -3423,7 +4246,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last = found_key.offset; last_for_get_extent = last + 1; } - btrfs_free_path(path); + btrfs_release_path(path); /* * we might have some extents allocated but more delalloc past those @@ -3435,8 +4258,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, - &cached_state, GFP_NOFS); + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, + &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); @@ -3448,7 +4271,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - u64 offset_in_extent; + u64 offset_in_extent = 0; /* break if the extent we found is outside the range */ if (em->start >= max || extent_map_end(em) < off) @@ -3464,12 +4287,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* * record the offset from the start of the 
extent - * for adjusting the disk offset below + * for adjusting the disk offset below. Only do this if the + * extent isn't compressed since our in ram offset may be past + * what we have actually allocated on disk. */ - offset_in_extent = em_start - em->start; + if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; - emflags = em->flags; disko = 0; flags = 0; @@ -3490,7 +4315,24 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { + unsigned long ref_cnt = 0; + disko = em->block_start + offset_in_extent; + + /* + * As btrfs supports shared space, this information + * can be exported to userspace tools via + * flag FIEMAP_EXTENT_SHARED. + */ + ret = iterate_inodes_from_logical( + em->block_start, + BTRFS_I(inode)->root->fs_info, + path, count_ext_ref, &ref_cnt); + if (ret < 0 && ret != -ENOENT) + goto out_free; + + if (ref_cnt > 1) + flags |= FIEMAP_EXTENT_SHARED; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; @@ -3522,57 +4364,101 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, out_free: free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + btrfs_free_path(path); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state, GFP_NOFS); return ret; } -inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) +static void __free_extent_buffer(struct extent_buffer *eb) { - struct page *p; - struct address_space *mapping; + btrfs_leak_debug_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); +} - if (i == 0) - return eb->first_page; - i += eb->start >> PAGE_CACHE_SHIFT; - mapping = eb->first_page->mapping; - if (!mapping) - return NULL; +int extent_buffer_under_io(struct extent_buffer *eb) +{ + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); +} - /* - * extent_buffer_page is only called after pinning the page - * by increasing the reference count. So we know the page must - * be in the radix tree. - */ - rcu_read_lock(); - p = radix_tree_lookup(&mapping->page_tree, i); - rcu_read_unlock(); +/* + * Helper for releasing extent buffer page. + */ +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, + unsigned long start_idx) +{ + unsigned long index; + unsigned long num_pages; + struct page *page; + int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + BUG_ON(extent_buffer_under_io(eb)); + + num_pages = num_extent_pages(eb->start, eb->len); + index = start_idx + num_pages; + if (start_idx >= index) + return; - return p; + do { + index--; + page = extent_buffer_page(eb, index); + if (page && mapped) { + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. 
+ */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + spin_unlock(&page->mapping->private_lock); + + } + if (page) { + /* One for when we alloced the page */ + page_cache_release(page); + } + } while (index != start_idx); } -inline unsigned long num_extent_pages(u64 start, u64 len) +/* + * Helper for releasing the extent buffer. + */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); + btrfs_release_extent_buffer_page(eb, 0); + __free_extent_buffer(eb); } -static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, - unsigned long len, - gfp_t mask) +static struct extent_buffer * +__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, + unsigned long len, gfp_t mask) { struct extent_buffer *eb = NULL; -#if LEAK_DEBUG - unsigned long flags; -#endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); if (eb == NULL) return NULL; eb->start = start; eb->len = len; + eb->fs_info = fs_info; + eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3584,63 +4470,194 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&eb->leak_list, &buffers); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_add(&eb->leak_list, &buffers); + + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); + atomic_set(&eb->io_pages, 0); + + /* + * Sanity checks, currently the maximum is 64k covered by 16x 4k pages + */ + BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE + > MAX_INLINE_EXTENT_BUFFER_SIZE); + BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); return eb; } -static void __free_extent_buffer(struct extent_buffer *eb) +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) { -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - kmem_cache_free(extent_buffer_cache, eb); + unsigned long i; + struct page *p; + struct extent_buffer *new; + unsigned long num_pages = num_extent_pages(src->start, src->len); + + new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); + if (new == NULL) + return NULL; + + for (i = 0; i < num_pages; i++) { + p = alloc_page(GFP_NOFS); + if (!p) { + btrfs_release_extent_buffer(new); + return NULL; + } + attach_extent_buffer_page(new, p); + WARN_ON(PageDirty(p)); + SetPageUptodate(p); + new->pages[i] = p; + } + + copy_extent_buffer(new, src, 0, 0, src->len); + set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); + set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); + + return new; } -/* - * Helper for releasing extent buffer page. 
- */ -static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, - unsigned long start_idx) +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) { - unsigned long index; - struct page *page; + struct extent_buffer *eb; + unsigned long num_pages = num_extent_pages(0, len); + unsigned long i; - if (!eb->first_page) - return; + eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); + if (!eb) + return NULL; - index = num_extent_pages(eb->start, eb->len); - if (start_idx >= index) + for (i = 0; i < num_pages; i++) { + eb->pages[i] = alloc_page(GFP_NOFS); + if (!eb->pages[i]) + goto err; + } + set_extent_buffer_uptodate(eb); + btrfs_set_header_nritems(eb, 0); + set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + return eb; +err: + for (; i > 0; i--) + __free_page(eb->pages[i - 1]); + __free_extent_buffer(eb); + return NULL; +} + +static void check_buffer_tree_ref(struct extent_buffer *eb) +{ + int refs; + /* the ref bit is tricky. We have to make sure it is set + * if we have the buffer dirty. Otherwise the + * code to free a buffer can end up dropping a dirty + * page + * + * Once the ref bit is set, it won't go away while the + * buffer is dirty or in writeback, and it also won't + * go away while we have the reference count on the + * eb bumped. + * + * We can't just set the ref bit without bumping the + * ref on the eb because free_extent_buffer might + * see the ref bit and try to clear it. If this happens + * free_extent_buffer might end up dropping our original + * ref by mistake and freeing the page before we are able + * to add one more ref. + * + * So bump the ref count first, then set the bit. If someone + * beat us to it, drop the ref we added. + */ + refs = atomic_read(&eb->refs); + if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) return; - do { - index--; - page = extent_buffer_page(eb, index); - if (page) - page_cache_release(page); - } while (index != start_idx); + spin_lock(&eb->refs_lock); + if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_inc(&eb->refs); + spin_unlock(&eb->refs_lock); } -/* - * Helper for releasing the extent buffer. 
- */ -static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) +static void mark_extent_buffer_accessed(struct extent_buffer *eb, + struct page *accessed) { - btrfs_release_extent_buffer_page(eb, 0); - __free_extent_buffer(eb); + unsigned long num_pages, i; + + check_buffer_tree_ref(eb); + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + if (p != accessed) + mark_page_accessed(p); + } } -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0) +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + mark_extent_buffer_accessed(eb, NULL); + return eb; + } + rcu_read_unlock(); + + return NULL; +} + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) +{ + struct extent_buffer *eb, *exists = NULL; + int ret; + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + eb = alloc_dummy_extent_buffer(start, len); + if (!eb) + return NULL; + eb->fs_info = fs_info; +again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * We will free dummy extent buffer's if they come into + * free_extent_buffer with a ref count of 2, but if we are using this we + * want the buffers to stay in memory until we're done with them, so + * bump the ref count again. 
+ */ + atomic_inc(&eb->refs); + return eb; +free_eb: + btrfs_release_extent_buffer(eb); + return exists; +} +#endif + +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); unsigned long i; @@ -3648,49 +4665,53 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *eb; struct extent_buffer *exists = NULL; struct page *p; - struct address_space *mapping = tree->mapping; + struct address_space *mapping = fs_info->btree_inode->i_mapping; int uptodate = 1; int ret; - rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - mark_page_accessed(eb->first_page); + eb = find_extent_buffer(fs_info, start); + if (eb) return eb; - } - rcu_read_unlock(); - eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); if (!eb) return NULL; - if (page0) { - eb->first_page = page0; - i = 1; - index++; - page_cache_get(page0); - mark_page_accessed(page0); - set_page_extent_mapped(page0); - set_page_extent_head(page0, len); - uptodate = PageUptodate(page0); - } else { - i = 0; - } - for (; i < num_pages; i++, index++) { + for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS); - if (!p) { - WARN_ON(1); + if (!p) goto free_eb; + + spin_lock(&mapping->private_lock); + if (PagePrivate(p)) { + /* + * We could have already allocated an eb for this page + * and attached one so lets see if we can get a ref on + * the existing eb, and if we can we know it's good and + * we can just return that one, else we know we can just + * overwrite page->private. + */ + exists = (struct extent_buffer *)p->private; + if (atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + page_cache_release(p); + mark_extent_buffer_accessed(exists, p); + goto free_eb; + } + + /* + * Do this so attach doesn't complain and we need to + * drop the ref the old guy had. 
+ */ + ClearPagePrivate(p); + WARN_ON(PageDirty(p)); + page_cache_release(p); } - set_page_extent_mapped(p); - mark_page_accessed(p); - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); - } + attach_extent_buffer_page(eb, p); + spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); + eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -3698,31 +4719,29 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * see below about how we avoid a nasty race with release page * and why we unlock later */ - if (i != 0) - unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - +again: ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto free_eb; - spin_lock(&tree->buffer_lock); - ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); if (ret == -EEXIST) { - exists = radix_tree_lookup(&tree->buffer, - start >> PAGE_CACHE_SHIFT); - /* add one reference for the caller */ - atomic_inc(&exists->refs); - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); - goto free_eb; + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; } /* add one reference for the tree */ - atomic_inc(&eb->refs); - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * there is a race where release page may have @@ -3733,52 +4752,111 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * after the extent buffer is in the radix tree so * it doesn't get lost */ - set_page_extent_mapped(eb->first_page); - set_page_extent_head(eb->first_page, eb->len); - if (!page0) - unlock_page(eb->first_page); + SetPageChecked(eb->pages[0]); + for (i = 1; i < num_pages; i++) { + p = extent_buffer_page(eb, i); + ClearPageChecked(p); + unlock_page(p); + } + unlock_page(eb->pages[0]); return eb; free_eb: - if (eb->first_page && !page0) - unlock_page(eb->first_page); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + unlock_page(eb->pages[i]); + } - if (!atomic_dec_and_test(&eb->refs)) - return exists; + WARN_ON(!atomic_dec_and_test(&eb->refs)); btrfs_release_extent_buffer(eb); return exists; } -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len) +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) { - struct extent_buffer *eb; + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); - rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - mark_page_accessed(eb->first_page); - return eb; + __free_extent_buffer(eb); +} + +/* Expects to have eb->eb_lock already held */ +static int release_extent_buffer(struct extent_buffer *eb) +{ + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { + struct btrfs_fs_info *fs_info = eb->fs_info; + + spin_unlock(&eb->refs_lock); + + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&fs_info->buffer_lock); + } else { + spin_unlock(&eb->refs_lock); + } + + /* Should 
be safe to release our pages at this point */ + btrfs_release_extent_buffer_page(eb, 0); + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return 1; } - rcu_read_unlock(); + spin_unlock(&eb->refs_lock); - return NULL; + return 0; } void free_extent_buffer(struct extent_buffer *eb) { + int refs; + int old; if (!eb) return; - if (!atomic_dec_and_test(&eb->refs)) + while (1) { + refs = atomic_read(&eb->refs); + if (refs <= 3) + break; + old = atomic_cmpxchg(&eb->refs, refs, refs - 1); + if (old == refs) + return; + } + + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) + atomic_dec(&eb->refs); + + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. + */ + release_extent_buffer(eb); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) return; - WARN_ON(1); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb); } -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +void clear_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; @@ -3794,10 +4872,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, lock_page(page); WARN_ON(!PagePrivate(page)); - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3809,54 +4883,36 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, ClearPageError(page); unlock_page(page); } - return 0; + WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; int was_dirty = 0; + check_buffer_tree_ref(eb); + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + for (i = 0; i < num_pages; i++) - __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_page_dirty(extent_buffer_page(eb, i)); return was_dirty; } -static int __eb_straddles_pages(u64 start, u64 len) -{ - if (len < PAGE_CACHE_SIZE) - return 1; - if (start & (PAGE_CACHE_SIZE - 1)) - return 1; - if ((start + len) & (PAGE_CACHE_SIZE - 1)) - return 1; - return 0; -} - -static int eb_straddles_pages(struct extent_buffer *eb) -{ - return __eb_straddles_pages(eb->start, eb->len); -} - -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state) +int clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; - num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - if (eb_straddles_pages(eb)) { - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - } + num_pages = num_extent_pages(eb->start, 
eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3865,92 +4921,24 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, return 0; } -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - - if (eb_straddles_pages(eb)) { - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); - } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - check_page_uptodate(tree, page); - continue; - } SetPageUptodate(page); } return 0; } -int extent_range_uptodate(struct extent_io_tree *tree, - u64 start, u64 end) -{ - struct page *page; - int ret; - int pg_uptodate = 1; - int uptodate; - unsigned long index; - - if (__eb_straddles_pages(start, end - start + 1)) { - ret = test_range_bit(tree, start, end, - EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; - } - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - page = find_get_page(tree->mapping, index); - if (!page) - return 1; - uptodate = PageUptodate(page); - page_cache_release(page); - if (!uptodate) { - pg_uptodate = 0; - break; - } - start += PAGE_CACHE_SIZE; - } - return pg_uptodate; -} - -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state) +int extent_buffer_uptodate(struct extent_buffer *eb) { - int ret = 0; - unsigned long num_pages; - unsigned long i; - struct page *page; - int pg_uptodate = 1; - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return 1; - - if (eb_straddles_pages(eb)) { - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; - } - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - pg_uptodate = 0; - break; - } - } - return pg_uptodate; + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); } int read_extent_buffer_pages(struct extent_io_tree *tree, @@ -3964,21 +4952,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int ret = 0; int locked_pages = 0; int all_uptodate = 1; - int inc_all_pages = 0; unsigned long num_pages; + unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (eb_straddles_pages(eb)) { - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; - } - } - if (start) { WARN_ON(start < eb->start); start_i = (start >> PAGE_CACHE_SHIFT) - @@ -3997,8 +4978,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, lock_page(page); } locked_pages++; - if (!PageUptodate(page)) + if (!PageUptodate(page)) { + num_reads++; all_uptodate = 0; + } } if (all_uptodate) { if (start_i == 0) @@ -4006,24 +4989,17 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->read_mirror = 0; + atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - - WARN_ON(!PagePrivate(page)); - - set_page_extent_mapped(page); - if 
(i == 0) - set_page_extent_head(page, eb->len); - - if (inc_all_pages) - page_cache_get(page); if (!PageUptodate(page)) { - if (start_i == 0) - inc_all_pages = 1; ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, - mirror_num, &bio_flags); + mirror_num, &bio_flags, + READ | REQ_META); if (err) ret = err; } else { @@ -4031,8 +5007,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } } - if (bio) - submit_one_bio(READ, bio, mirror_num, bio_flags); + if (bio) { + err = submit_one_bio(READ | REQ_META, bio, mirror_num, + bio_flags); + if (err) + return err; + } if (ret || wait != WAIT_COMPLETE) return ret; @@ -4044,8 +5024,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ret = -EIO; } - if (!ret) - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -4074,7 +5052,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4090,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } } +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char __user *dst = (char __user *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = page_address(page); + if (copy_to_user(dst, kaddr + offset, cur)) { + ret = -EFAULT; + break; + } + + dst += cur; + len -= cur; + offset = 0; + i++; + } + + return ret; +} + int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **map, unsigned long *map_start, @@ -4115,10 +5130,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } if (start + min_len > eb->len) { - printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " - "wanted %lu %lu\n", (unsigned long long)eb->start, - eb->len, start, min_len); - WARN_ON(1); + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", + eb->start, eb->len, start, min_len); return -EINVAL; } @@ -4145,7 +5159,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4179,7 +5193,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4209,7 +5223,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4240,7 +5254,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, WARN_ON(src->len != dst_len); offset = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(dst, i); @@ -4258,23 +5272,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -static void move_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = page_address(dst_page); - if (dst_page == src_page) { - memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); - } else { - char *src_kaddr = page_address(src_page); - char *p = dst_kaddr + dst_off + len; - char *s = src_kaddr + src_off + len; - - while (len--) - *--p = *--s; - } -} - static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) { unsigned long distance = (src > dst) ? src - dst : dst - src; @@ -4287,15 +5284,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page, { char *dst_kaddr = page_address(dst_page); char *src_kaddr; + int must_memmove = 0; if (dst_page != src_page) { src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; - BUG_ON(areas_overlap(src_off, dst_off, len)); + if (areas_overlap(src_off, dst_off, len)) + must_memmove = 1; } - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + if (must_memmove) + memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); + else + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, @@ -4309,21 +5311,21 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu dst len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu dst len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } while (len > 0) { dst_off_in_page = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); src_off_in_page = (start_offset + src_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; @@ -4356,16 +5358,16 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - if (!areas_overlap(src_offset, dst_offset, len)) { + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } @@ -4374,13 +5376,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, src_i = (start_offset + 
src_end) >> PAGE_CACHE_SHIFT; dst_off_in_page = (start_offset + dst_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); src_off_in_page = (start_offset + src_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - move_pages(extent_buffer_page(dst, dst_i), + copy_pages(extent_buffer_page(dst, dst_i), extent_buffer_page(dst, src_i), dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); @@ -4391,47 +5393,44 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +int try_release_extent_buffer(struct page *page) { - struct extent_buffer *eb = - container_of(head, struct extent_buffer, rcu_head); - - btrfs_release_extent_buffer(eb); -} - -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); struct extent_buffer *eb; - int ret = 1; - spin_lock(&tree->buffer_lock); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!eb) { - spin_unlock(&tree->buffer_lock); - return ret; + /* + * We need to make sure noboody is attaching this page to an eb right + * now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + return 1; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; - } + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); /* - * set @eb->refs to 0 if it is already 1, and then release the @eb. - * Or go back. + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. */ - if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { - ret = 0; - goto out; + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; } + spin_unlock(&page->mapping->private_lock); - radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); -out: - spin_unlock(&tree->buffer_lock); + /* + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; + } - /* at this point we can safely release the extent buffer */ - if (atomic_read(&eb->refs) == 0) - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return ret; + return release_extent_buffer(eb); } |
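
A few of the patterns introduced above are easier to see in isolation. The reworked free_extent_buffer() first tries to drop a reference without taking eb->refs_lock, looping on atomic_cmpxchg() for as long as the count stays above 3, the point below which the DUMMY/STALE special cases can apply. The following is a minimal user-space sketch of that lock-free fast path; it uses C11 atomics in place of the kernel's atomic_t, and fast_drop_ref() and the explicit threshold parameter are illustrative names, not kernel interfaces.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Lock-free fast path modelled on the patch's free_extent_buffer():
 * keep CAS-decrementing while the count is high enough that this drop
 * cannot be the one that needs the refs_lock-protected slow path
 * (the patch uses a threshold of 3).
 */
static bool fast_drop_ref(atomic_int *refs, int threshold)
{
        int cur = atomic_load(refs);

        while (cur > threshold) {
                /* On failure, cur is reloaded with the current value. */
                if (atomic_compare_exchange_weak(refs, &cur, cur - 1))
                        return true;    /* dropped without taking a lock */
        }
        return false;                   /* caller must use the locked slow path */
}

int main(void)
{
        atomic_int refs = 6;

        while (fast_drop_ref(&refs, 3))
                ;
        printf("refs left for the slow path: %d\n", atomic_load(&refs));
        return 0;
}

Once fast_drop_ref() reports failure, the kernel code takes refs_lock and resolves the final references under the lock, exactly as free_extent_buffer() does above.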
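
The new read_extent_buffer_to_user() copies the buffer out one page at a time: the first chunk begins at (start_offset + start) & (PAGE_CACHE_SIZE - 1) inside page (start_offset + start) >> PAGE_CACHE_SHIFT, every later chunk begins at offset 0, and a failed copy_to_user() maps to -EFAULT. The sketch below replays only that chunking arithmetic over ordinary arrays; copy_out(), read_chunked() and the 4 KiB page size are stand-ins assumed for illustration, not kernel APIs.

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Stand-in for copy_to_user(); here both sides are ordinary memory. */
static int copy_out(char *dst, const char *src, size_t len)
{
        memcpy(dst, src, len);
        return 0;               /* copy_to_user() returns bytes NOT copied */
}

/*
 * Chunking pattern from read_extent_buffer_to_user(): eb_start is the
 * buffer's byte offset (only its low bits matter here), pages[] are the
 * page-sized blocks backing the buffer.
 */
static int read_chunked(char **pages, unsigned long eb_start, char *dst,
                        unsigned long start, unsigned long len)
{
        unsigned long start_offset = eb_start & (PAGE_SIZE - 1);
        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
        size_t offset = (start_offset + start) & (PAGE_SIZE - 1);

        while (len > 0) {
                size_t cur = len < PAGE_SIZE - offset ? len : PAGE_SIZE - offset;

                if (copy_out(dst, pages[i] + offset, cur))
                        return -1;      /* -EFAULT in the kernel version */
                dst += cur;
                len -= cur;
                offset = 0;             /* later pages are read from their start */
                i++;
        }
        return 0;
}

int main(void)
{
        static char p0[PAGE_SIZE], p1[PAGE_SIZE];
        char *pages[] = { p0, p1 };
        char out[9] = { 0 };

        memset(p0, 'A', PAGE_SIZE);
        memset(p1, 'B', PAGE_SIZE);

        /* 8 bytes straddling the page boundary of a page-aligned buffer. */
        read_chunked(pages, 0, out, PAGE_SIZE - 4, 8);
        printf("%s\n", out);            /* AAAABBBB */
        return 0;
}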
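
Finally, copy_pages() now drops the old byte-reversed move_pages() helper: when source and destination fall in the same page, areas_overlap() decides between memcpy() and memmove(), and memmove_extent_buffer() only takes the backwards page-by-page path when dst_offset >= src_offset. A small stand-alone sketch of that dispatch, assuming a 4 KiB page and the hypothetical wrapper copy_within_pages(), is:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL

/* Same test the patch keeps: two ranges of length len overlap iff the
 * distance between their starts is smaller than len. */
static int areas_overlap(unsigned long src, unsigned long dst,
                         unsigned long len)
{
        unsigned long distance = (src > dst) ? src - dst : dst - src;

        return distance < len;
}

/* Sketch of the reworked copy_pages() dispatch: memcpy() when the pages
 * differ or the ranges are disjoint, memmove() for an in-page overlap. */
static void copy_within_pages(char *dst_page, char *src_page,
                              unsigned long dst_off, unsigned long src_off,
                              unsigned long len)
{
        int must_memmove = dst_page == src_page &&
                           areas_overlap(src_off, dst_off, len);

        if (must_memmove)
                memmove(dst_page + dst_off, src_page + src_off, len);
        else
                memcpy(dst_page + dst_off, src_page + src_off, len);
}

int main(void)
{
        static char page[PAGE_SIZE] = "abcdefghij";

        /* Overlapping move inside one page must take the memmove() path. */
        copy_within_pages(page, page, 2, 0, 8);
        printf("%.10s\n", page);        /* ababcdefgh */
        return 0;
}

Letting memmove() handle the overlapping case gives the same safety as the removed hand-rolled backwards copy with less code.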
