diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-04-01 19:43:53 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-04-01 19:43:53 -0700 |
commit | b33ce442993865180292df2a314ee5251ba38b50 (patch) | |
tree | 004b703ea3cd19c932393086fe9cde96e8db8de7 /drivers/md | |
parent | 7a48837732f87a574ee3e1855927dc250117f565 (diff) | |
parent | e84987a1f941b8e2e3173bb38510ddf25cc8c7f0 (diff) |
Merge branch 'for-3.15/drivers' of git://git.kernel.dk/linux-block
Pull block driver update from Jens Axboe:
"On top of the core pull request, here's the pull request for the
driver related changes for 3.15. It contains:
- Improvements for msi-x registration for block drivers (mtip32xx,
skd, cciss, nvme) from Alexander Gordeev.
- A round of cleanups and improvements for drbd from Andreas
Gruenbacher and Rashika Kheria.
- A round of clanups and improvements for bcache from Kent.
- Removal of sleep_on() and friends in DAC960, ataflop, swim3 from
Arnd Bergmann.
- Bug fix for a bug in the mtip32xx async completion code from Sam
Bradshaw.
- Bug fix for accidentally bouncing IO on 32-bit platforms with
mtip32xx from Felipe Franciosi"
* 'for-3.15/drivers' of git://git.kernel.dk/linux-block: (103 commits)
bcache: remove nested function usage
bcache: Kill bucket->gc_gen
bcache: Kill unused freelist
bcache: Rework btree cache reserve handling
bcache: Kill btree_io_wq
bcache: btree locking rework
bcache: Fix a race when freeing btree nodes
bcache: Add a real GC_MARK_RECLAIMABLE
bcache: Add bch_keylist_init_single()
bcache: Improve priority_stats
bcache: Better alloc tracepoints
bcache: Kill dead cgroup code
bcache: stop moving_gc marking buckets that can't be moved.
bcache: Fix moving_pred()
bcache: Fix moving_gc deadlocking with a foreground write
bcache: Fix discard granularity
bcache: Fix another bug recovering from unclean shutdown
bcache: Fix a bug recovering from unclean shutdown
bcache: Fix a journalling reclaim after recovery bug
bcache: Fix a null ptr deref in journal replay
...
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/Kconfig | 8 | ||||
-rw-r--r-- | drivers/md/bcache/alloc.c | 173 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 56 | ||||
-rw-r--r-- | drivers/md/bcache/bset.c | 4 | ||||
-rw-r--r-- | drivers/md/bcache/bset.h | 6 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 592 | ||||
-rw-r--r-- | drivers/md/bcache/btree.h | 12 | ||||
-rw-r--r-- | drivers/md/bcache/extents.c | 36 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 46 | ||||
-rw-r--r-- | drivers/md/bcache/journal.h | 1 | ||||
-rw-r--r-- | drivers/md/bcache/movinggc.c | 18 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 201 | ||||
-rw-r--r-- | drivers/md/bcache/request.h | 19 | ||||
-rw-r--r-- | drivers/md/bcache/stats.c | 3 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 64 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 155 | ||||
-rw-r--r-- | drivers/md/bcache/trace.c | 2 |
17 files changed, 629 insertions, 767 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 2638417b19a..4d200883c50 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG Keeps all active closures in a linked list and provides a debugfs interface to list them, which makes it possible to see asynchronous operations that get stuck. - -# cgroup code needs to be updated: -# -#config CGROUP_BCACHE -# bool "Cgroup controls for bcache" -# depends on BCACHE && BLK_CGROUP -# ---help--- -# TODO diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index c0d37d08244..443d03fbac4 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); - if (CACHE_SYNC(&ca->set->sb)) { - ca->need_save_prio = max(ca->need_save_prio, - bucket_disk_gen(b)); - WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); - } - return ret; } @@ -120,51 +114,45 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) mutex_unlock(&c->bucket_lock); } -/* Allocation */ +/* + * Background allocation thread: scans for buckets to be invalidated, + * invalidates them, rewrites prios/gens (marking them as invalidated on disk), + * then optionally issues discard commands to the newly free buckets, then puts + * them on the various freelists. + */ static inline bool can_inc_bucket_gen(struct bucket *b) { - return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && - bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; + return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX; } -bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) +bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { - BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); - - if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { - unsigned i; - - for (i = 0; i < RESERVE_NONE; i++) - if (!fifo_full(&ca->free[i])) - goto add; + BUG_ON(!ca->set->gc_mark_valid); - return false; - } -add: - b->prio = 0; - - if (can_inc_bucket_gen(b) && - fifo_push(&ca->unused, b - ca->buckets)) { - atomic_inc(&b->pin); - return true; - } - - return false; -} - -static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) -{ - return GC_MARK(b) == GC_MARK_RECLAIMABLE && + return (!GC_MARK(b) || + GC_MARK(b) == GC_MARK_RECLAIMABLE) && !atomic_read(&b->pin) && can_inc_bucket_gen(b); } -static void invalidate_one_bucket(struct cache *ca, struct bucket *b) +void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) { + lockdep_assert_held(&ca->set->bucket_lock); + BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE); + + if (GC_SECTORS_USED(b)) + trace_bcache_invalidate(ca, b - ca->buckets); + bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); +} + +static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + __bch_invalidate_one_bucket(ca, b); + fifo_push(&ca->free_inc, b - ca->buckets); } @@ -195,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca) ca->heap.used = 0; for_each_bucket(b, ca) { - /* - * If we fill up the unused list, if we then return before - * adding anything to the free_inc list we'll skip writing - * prios/gens and just go back to allocating from the unused - * list: - */ - if (fifo_full(&ca->unused)) - return; - - if (!can_invalidate_bucket(ca, b)) - continue; - - if (!GC_SECTORS_USED(b) && - bch_bucket_add_unused(ca, b)) + if (!bch_can_invalidate_bucket(ca, b)) continue; if (!heap_full(&ca->heap)) @@ -233,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca) return; } - invalidate_one_bucket(ca, b); + bch_invalidate_one_bucket(ca, b); } } @@ -249,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca) b = ca->buckets + ca->fifo_last_bucket++; - if (can_invalidate_bucket(ca, b)) - invalidate_one_bucket(ca, b); + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); if (++checked >= ca->sb.nbuckets) { ca->invalidate_needs_gc = 1; @@ -274,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca) b = ca->buckets + n; - if (can_invalidate_bucket(ca, b)) - invalidate_one_bucket(ca, b); + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); if (++checked >= ca->sb.nbuckets / 2) { ca->invalidate_needs_gc = 1; @@ -287,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca) static void invalidate_buckets(struct cache *ca) { - if (ca->invalidate_needs_gc) - return; + BUG_ON(ca->invalidate_needs_gc); switch (CACHE_REPLACEMENT(&ca->sb)) { case CACHE_REPLACEMENT_LRU: @@ -301,8 +275,6 @@ static void invalidate_buckets(struct cache *ca) invalidate_buckets_random(ca); break; } - - trace_bcache_alloc_invalidate(ca); } #define allocator_wait(ca, cond) \ @@ -350,17 +322,10 @@ static int bch_allocator_thread(void *arg) * possibly issue discards to them, then we add the bucket to * the free list: */ - while (1) { + while (!fifo_empty(&ca->free_inc)) { long bucket; - if ((!atomic_read(&ca->set->prio_blocked) || - !CACHE_SYNC(&ca->set->sb)) && - !fifo_empty(&ca->unused)) - fifo_pop(&ca->unused, bucket); - else if (!fifo_empty(&ca->free_inc)) - fifo_pop(&ca->free_inc, bucket); - else - break; + fifo_pop(&ca->free_inc, bucket); if (ca->discard) { mutex_unlock(&ca->set->bucket_lock); @@ -371,6 +336,7 @@ static int bch_allocator_thread(void *arg) } allocator_wait(ca, bch_allocator_push(ca, bucket)); + wake_up(&ca->set->btree_cache_wait); wake_up(&ca->set->bucket_wait); } @@ -380,9 +346,9 @@ static int bch_allocator_thread(void *arg) * them to the free_inc list: */ +retry_invalidate: allocator_wait(ca, ca->set->gc_mark_valid && - (ca->need_save_prio > 64 || - !ca->invalidate_needs_gc)); + !ca->invalidate_needs_gc); invalidate_buckets(ca); /* @@ -390,13 +356,28 @@ static int bch_allocator_thread(void *arg) * new stuff to them: */ allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); - if (CACHE_SYNC(&ca->set->sb) && - (!fifo_empty(&ca->free_inc) || - ca->need_save_prio > 64)) + if (CACHE_SYNC(&ca->set->sb)) { + /* + * This could deadlock if an allocation with a btree + * node locked ever blocked - having the btree node + * locked would block garbage collection, but here we're + * waiting on garbage collection before we invalidate + * and free anything. + * + * But this should be safe since the btree code always + * uses btree_check_reserve() before allocating now, and + * if it fails it blocks without btree nodes locked. + */ + if (!fifo_full(&ca->free_inc)) + goto retry_invalidate; + bch_prio_write(ca); + } } } +/* Allocation */ + long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) { DEFINE_WAIT(w); @@ -408,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) fifo_pop(&ca->free[reserve], r)) goto out; - if (!wait) + if (!wait) { + trace_bcache_alloc_fail(ca, reserve); return -1; + } do { prepare_to_wait(&ca->set->bucket_wait, &w, @@ -425,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) out: wake_up_process(ca->alloc_thread); + trace_bcache_alloc(ca, reserve); + if (expensive_debug_checks(ca->set)) { size_t iter; long i; @@ -438,8 +423,6 @@ out: BUG_ON(i == r); fifo_for_each(i, &ca->free_inc, iter) BUG_ON(i == r); - fifo_for_each(i, &ca->unused, iter) - BUG_ON(i == r); } b = ca->buckets + r; @@ -461,17 +444,19 @@ out: return r; } +void __bch_bucket_free(struct cache *ca, struct bucket *b) +{ + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); +} + void bch_bucket_free(struct cache_set *c, struct bkey *k) { unsigned i; - for (i = 0; i < KEY_PTRS(k); i++) { - struct bucket *b = PTR_BUCKET(c, k, i); - - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); - SET_GC_SECTORS_USED(b, 0); - bch_bucket_add_unused(PTR_CACHE(c, k, i), b); - } + for (i = 0; i < KEY_PTRS(k); i++) + __bch_bucket_free(PTR_CACHE(c, k, i), + PTR_BUCKET(c, k, i)); } int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, @@ -709,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca) ca->alloc_thread = k; return 0; } - -int bch_cache_allocator_init(struct cache *ca) -{ - /* - * Reserve: - * Prio/gen writes first - * Then 8 for btree allocations - * Then half for the moving garbage collector - */ -#if 0 - ca->watermark[WATERMARK_PRIO] = 0; - - ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); - - ca->watermark[WATERMARK_MOVINGGC] = 8 + - ca->watermark[WATERMARK_METADATA]; - - ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + - ca->watermark[WATERMARK_MOVINGGC]; -#endif - return 0; -} diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index a4c7306ff43..82c9c5d3525 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -195,9 +195,7 @@ struct bucket { atomic_t pin; uint16_t prio; uint8_t gen; - uint8_t disk_gen; uint8_t last_gc; /* Most out of date gen in the btree */ - uint8_t gc_gen; uint16_t gc_mark; /* Bitfield used by GC. See below for field */ }; @@ -207,9 +205,9 @@ struct bucket { */ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); -#define GC_MARK_RECLAIMABLE 0 -#define GC_MARK_DIRTY 1 -#define GC_MARK_METADATA 2 +#define GC_MARK_RECLAIMABLE 1 +#define GC_MARK_DIRTY 2 +#define GC_MARK_METADATA 3 #define GC_SECTORS_USED_SIZE 13 #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); @@ -426,14 +424,9 @@ struct cache { * their new gen to disk. After prio_write() finishes writing the new * gens/prios, they'll be moved to the free list (and possibly discarded * in the process) - * - * unused: GC found nothing pointing into these buckets (possibly - * because all the data they contained was overwritten), so we only - * need to discard them before they can be moved to the free list. */ DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); - DECLARE_FIFO(long, unused); size_t fifo_last_bucket; @@ -443,12 +436,6 @@ struct cache { DECLARE_HEAP(struct bucket *, heap); /* - * max(gen - disk_gen) for all buckets. When it gets too big we have to - * call prio_write() to keep gens from wrapping. - */ - uint8_t need_save_prio; - - /* * If nonzero, we know we aren't going to find any buckets to invalidate * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu @@ -562,19 +549,16 @@ struct cache_set { struct list_head btree_cache_freed; /* Number of elements in btree_cache + btree_cache_freeable lists */ - unsigned bucket_cache_used; + unsigned btree_cache_used; /* * If we need to allocate memory for a new btree node and that * allocation fails, we can cannibalize another node in the btree cache - * to satisfy the allocation. However, only one thread can be doing this - * at a time, for obvious reasons - try_harder and try_wait are - * basically a lock for this that we can wait on asynchronously. The - * btree_root() macro releases the lock when it returns. + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: */ - struct task_struct *try_harder; - wait_queue_head_t try_wait; - uint64_t try_harder_start; + wait_queue_head_t btree_cache_wait; + struct task_struct *btree_cache_alloc_lock; /* * When we free a btree node, we increment the gen of the bucket the @@ -603,7 +587,7 @@ struct cache_set { uint16_t min_prio; /* - * max(gen - gc_gen) for all buckets. When it gets too big we have to gc + * max(gen - last_gc) for all buckets. When it gets too big we have to gc * to keep gens from wrapping around. */ uint8_t need_gc; @@ -628,6 +612,8 @@ struct cache_set { /* Number of moving GC bios in flight */ struct semaphore moving_in_flight; + struct workqueue_struct *moving_gc_wq; + struct btree *root; #ifdef CONFIG_BCACHE_DEBUG @@ -667,7 +653,6 @@ struct cache_set { struct time_stats btree_gc_time; struct time_stats btree_split_time; struct time_stats btree_read_time; - struct time_stats try_harder_time; atomic_long_t cache_read_races; atomic_long_t writeback_keys_done; @@ -850,9 +835,6 @@ static inline bool cached_dev_get(struct cached_dev *dc) /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree (last_gc). - * - * bucket_disk_gen() returns the difference between the current gen and the gen - * on disk; they're both used to make sure gens don't wrap around. */ static inline uint8_t bucket_gc_gen(struct bucket *b) @@ -860,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) return b->gen - b->last_gc; } -static inline uint8_t bucket_disk_gen(struct bucket *b) -{ - return b->gen - b->disk_gen; -} - #define BUCKET_GC_GEN_MAX 96U -#define BUCKET_DISK_GEN_MAX 64U #define kobj_attribute_write(n, fn) \ static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) @@ -899,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); -bool bch_bucket_add_unused(struct cache *, struct bucket *); -long bch_bucket_alloc(struct cache *, unsigned, bool); +bool bch_can_invalidate_bucket(struct cache *, struct bucket *); +void __bch_invalidate_one_bucket(struct cache *, struct bucket *); + +void __bch_bucket_free(struct cache *, struct bucket *); void bch_bucket_free(struct cache_set *, struct bkey *); +long bch_bucket_alloc(struct cache *, unsigned, bool); int __bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, @@ -954,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *); void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); -int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); int bch_debug_init(struct kobject *); void bch_request_exit(void); int bch_request_init(void); -void bch_btree_exit(void); -int bch_btree_init(void); #endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 3f74b4b0747..54541641530 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) for (k = i->start; k < bset_bkey_last(i); k = next) { next = bkey_next(k); - printk(KERN_ERR "block %u key %li/%u: ", set, - (uint64_t *) k - i->d, i->keys); + printk(KERN_ERR "block %u key %u/%u: ", set, + (unsigned) ((u64 *) k - i->d), i->keys); if (b->ops->key_dump) b->ops->key_dump(b, k); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 003260f4ddf..5f6728d5d4d 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l) l->top_p = l->keys_p = l->inline_keys; } +static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k) +{ + l->keys = k; + l->top = bkey_next(k); +} + static inline void bch_keylist_push(struct keylist *l) { l->top = bkey_next(l->top); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5f9c2a665ca..7347b610096 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -68,15 +68,11 @@ * alloc_bucket() cannot fail. This should be true but is not completely * obvious. * - * Make sure all allocations get charged to the root cgroup - * * Plugging? * * If data write is less than hard sector size of ssd, round up offset in open * bucket to the next whole sector * - * Also lookup by cgroup in get_open_bucket() - * * Superblock needs to be fleshed out for multiple cache devices * * Add a sysfs tunable for the number of writeback IOs in flight @@ -97,8 +93,6 @@ #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -static struct workqueue_struct *btree_io_wq; - #define insert_lock(s, b) ((b)->level <= (s)->lock) /* @@ -123,7 +117,7 @@ static struct workqueue_struct *btree_io_wq; ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ if (!IS_ERR(_child)) { \ _child->parent = (b); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ @@ -152,17 +146,12 @@ static struct workqueue_struct *btree_io_wq; _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ if (_r == -EINTR) \ schedule(); \ - bch_cannibalize_unlock(c); \ - if (_r == -ENOSPC) { \ - wait_event((c)->try_wait, \ - !(c)->try_harder); \ - _r = -EINTR; \ - } \ } while (_r == -EINTR); \ \ - finish_wait(&(c)->bucket_wait, &(op)->wait); \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ _r; \ }) @@ -171,6 +160,20 @@ static inline struct bset *write_block(struct btree *b) return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); } +static void bch_btree_init_next(struct btree *b) +{ + /* If not a leaf node, always sort */ + if (b->level && b->keys.nsets) + bch_btree_sort(&b->keys, &b->c->sort); + else + bch_btree_sort_lazy(&b->keys, &b->c->sort); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); + +} + /* Btree key manipulation */ void bkey_put(struct cache_set *c, struct bkey *k) @@ -352,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl) btree_complete_write(b, w); if (btree_node_dirty(b)) - queue_delayed_work(btree_io_wq, &b->work, - msecs_to_jiffies(30000)); + schedule_delayed_work(&b->work, 30 * HZ); closure_return_with_destructor(cl, btree_node_write_unlock); } @@ -442,10 +444,12 @@ static void do_btree_node_write(struct btree *b) } } -void bch_btree_node_write(struct btree *b, struct closure *parent) +void __bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = btree_bset_last(b); + lockdep_assert_held(&b->write_lock); + trace_bcache_btree_write(b); BUG_ON(current->bio_list); @@ -469,23 +473,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); b->written += set_blocks(i, block_bytes(b->c)); +} - /* If not a leaf node, always sort */ - if (b->level && b->keys.nsets) - bch_btree_sort(&b->keys, &b->c->sort); - else - bch_btree_sort_lazy(&b->keys, &b->c->sort); +void bch_btree_node_write(struct btree *b, struct closure *parent) +{ + unsigned nsets = b->keys.nsets; + + lockdep_assert_held(&b->lock); + + __bch_btree_node_write(b, parent); /* * do verify if there was more than one set initially (i.e. we did a * sort) and we sorted down to a single set: */ - if (i != b->keys.set->data && !b->keys.nsets) + if (nsets && !b->keys.nsets) bch_btree_verify(b); - if (b->written < btree_blocks(b)) - bch_bset_init_next(&b->keys, write_block(b), - bset_magic(&b->c->sb)); + bch_btree_init_next(b); } static void bch_btree_node_write_sync(struct btree *b) @@ -493,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b) struct closure cl; closure_init_stack(&cl); + + mutex_lock(&b->write_lock); bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + closure_sync(&cl); } @@ -501,11 +510,10 @@ static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); - rw_lock(true, b, b->level); - + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - rw_unlock(true, b); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); } static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) @@ -513,11 +521,13 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) struct bset *i = btree_bset_last(b); struct btree_write *w = btree_current_write(b); + lockdep_assert_held(&b->write_lock); + BUG_ON(!b->written); BUG_ON(!i->keys); if (!btree_node_dirty(b)) - queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); + schedule_delayed_work(&b->work, 30 * HZ); set_btree_node_dirty(b); @@ -548,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) #define mca_reserve(c) (((c->root && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ - max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) + max_t(int, 0, c->btree_cache_used - mca_reserve(c)) static void mca_data_free(struct btree *b) { @@ -556,7 +566,7 @@ static void mca_data_free(struct btree *b) bch_btree_keys_free(&b->keys); - b->c->bucket_cache_used--; + b->c->btree_cache_used--; list_move(&b->list, &b->c->btree_cache_freed); } @@ -581,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) ilog2(b->c->btree_pages), btree_order(k)), gfp)) { - b->c->bucket_cache_used++; + b->c->btree_cache_used++; list_move(&b->list, &b->c->btree_cache); } else { list_move(&b->list, &b->c->btree_cache_freed); @@ -597,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); + mutex_init(&b->write_lock); + lockdep_set_novalidate_class(&b->write_lock); INIT_LIST_HEAD(&b->list); INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; @@ -630,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) up(&b->io_mutex); } + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write_sync(b); + __bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); /* wait for any in flight btree write */ down(&b->io_mutex); @@ -654,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, if (c->shrinker_disabled) return SHRINK_STOP; - if (c->try_harder) + if (c->btree_cache_alloc_lock) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ @@ -686,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, } } - for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { + for (i = 0; (nr--) && i < c->btree_cache_used; i++) { if (list_empty(&c->btree_cache)) goto out; @@ -715,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, if (c->shrinker_disabled) return 0; - if (c->try_harder) + if (c->btree_cache_alloc_lock) return 0; return mca_can_free(c) * c->btree_pages; @@ -819,17 +835,30 @@ out: return b; } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) +static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) +{ + struct task_struct *old; + + old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); + if (old && old != current) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + return -EINTR; + } + + return 0; +} + +static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, + struct bkey *k) { struct btree *b; trace_bcache_btree_cache_cannibalize(c); - if (!c->try_harder) { - c->try_harder = current; - c->try_harder_start = local_clock(); - } else if (c->try_harder != current) - return ERR_PTR(-ENOSPC); + if (mca_cannibalize_lock(c, op)) + return ERR_PTR(-EINTR); list_for_each_entry_reverse(b, &c->btree_cache, list) if (!mca_reap(b, btree_order(k), false)) @@ -839,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) if (!mca_reap(b, btree_order(k), true)) return b; + WARN(1, "btree cache cannibalize failed\n"); return ERR_PTR(-ENOMEM); } @@ -850,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) */ static void bch_cannibalize_unlock(struct cache_set *c) { - if (c->try_harder == current) { - bch_time_stats_update(&c->try_harder_time, c->try_harder_start); - c->try_harder = NULL; - wake_up(&c->try_wait); + if (c->btree_cache_alloc_lock == current) { + c->btree_cache_alloc_lock = NULL; + wake_up(&c->btree_cache_wait); } } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) +static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level) { struct btree *b; @@ -920,7 +950,7 @@ err: if (b) rw_unlock(true, b); - b = mca_cannibalize(c, k); + b = mca_cannibalize(c, op, k); if (!IS_ERR(b)) goto out; @@ -936,8 +966,8 @@ err: * The btree node will have either a read or a write lock held, depending on * level and op->lock. */ -struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, - int level, bool write) +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write) { int i = 0; struct btree *b; @@ -951,7 +981,7 @@ retry: return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level); + b = mca_alloc(c, op, k, level); mutex_unlock(&c->bucket_lock); if (!b) @@ -997,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) struct btree *b; mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level); + b = mca_alloc(c, NULL, k, level); mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { @@ -1010,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) static void btree_node_free(struct btree *b) { - unsigned i; - trace_bcache_btree_node_free(b); BUG_ON(b == b->c->root); + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); + mutex_unlock(&b->write_lock); + cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); - - bch_inc_gen(PTR_CACHE(b->c, &b->key, i), - PTR_BUCKET(b->c, &b->key, i)); - } - bch_bucket_free(b->c, &b->key); mca_bucket_free(b); mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) +struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.k |