diff options
Diffstat (limited to 'drivers/md/bcache')
| -rw-r--r-- | drivers/md/bcache/Kconfig | 8 | ||||
| -rw-r--r-- | drivers/md/bcache/Makefile | 5 | ||||
| -rw-r--r-- | drivers/md/bcache/alloc.c | 190 | ||||
| -rw-r--r-- | drivers/md/bcache/bcache.h | 115 | ||||
| -rw-r--r-- | drivers/md/bcache/bset.c | 919 | ||||
| -rw-r--r-- | drivers/md/bcache/bset.h | 446 | ||||
| -rw-r--r-- | drivers/md/bcache/btree.c | 1125 | ||||
| -rw-r--r-- | drivers/md/bcache/btree.h | 83 | ||||
| -rw-r--r-- | drivers/md/bcache/closure.h | 2 | ||||
| -rw-r--r-- | drivers/md/bcache/debug.c | 240 | ||||
| -rw-r--r-- | drivers/md/bcache/debug.h | 27 | ||||
| -rw-r--r-- | drivers/md/bcache/extents.c | 620 | ||||
| -rw-r--r-- | drivers/md/bcache/extents.h | 13 | ||||
| -rw-r--r-- | drivers/md/bcache/journal.c | 61 | ||||
| -rw-r--r-- | drivers/md/bcache/journal.h | 1 | ||||
| -rw-r--r-- | drivers/md/bcache/movinggc.c | 18 | ||||
| -rw-r--r-- | drivers/md/bcache/request.c | 230 | ||||
| -rw-r--r-- | drivers/md/bcache/request.h | 19 | ||||
| -rw-r--r-- | drivers/md/bcache/stats.c | 3 | ||||
| -rw-r--r-- | drivers/md/bcache/super.c | 83 | ||||
| -rw-r--r-- | drivers/md/bcache/sysfs.c | 193 | ||||
| -rw-r--r-- | drivers/md/bcache/trace.c | 2 | ||||
| -rw-r--r-- | drivers/md/bcache/util.h | 8 |
23 files changed, 2400 insertions, 2011 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 2638417b19a..4d200883c50 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG Keeps all active closures in a linked list and provides a debugfs interface to list them, which makes it possible to see asynchronous operations that get stuck. - -# cgroup code needs to be updated: -# -#config CGROUP_BCACHE -# bool "Cgroup controls for bcache" -# depends on BCACHE && BLK_CGROUP -# ---help--- -# TODO diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 0e9c82523be..c488b846f83 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_BCACHE) += bcache.o -bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ - movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o +bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index bcfd96e2121..443d03fbac4 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); - if (CACHE_SYNC(&ca->set->sb)) { - ca->need_save_prio = max(ca->need_save_prio, - bucket_disk_gen(b)); - WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); - } - return ret; } @@ -120,56 +114,63 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) mutex_unlock(&c->bucket_lock); } -/* Allocation */ +/* + * Background allocation thread: scans for buckets to be invalidated, + * invalidates them, rewrites prios/gens (marking them as invalidated on disk), + * then optionally issues discard commands to the newly free buckets, then puts + * them on the various freelists. + */ static inline bool can_inc_bucket_gen(struct bucket *b) { - return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && - bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; + return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX; } -bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) +bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { - BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); - - if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { - unsigned i; - - for (i = 0; i < RESERVE_NONE; i++) - if (!fifo_full(&ca->free[i])) - goto add; - - return false; - } -add: - b->prio = 0; - - if (can_inc_bucket_gen(b) && - fifo_push(&ca->unused, b - ca->buckets)) { - atomic_inc(&b->pin); - return true; - } - - return false; -} + BUG_ON(!ca->set->gc_mark_valid); -static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) -{ - return GC_MARK(b) == GC_MARK_RECLAIMABLE && + return (!GC_MARK(b) || + GC_MARK(b) == GC_MARK_RECLAIMABLE) && !atomic_read(&b->pin) && can_inc_bucket_gen(b); } -static void invalidate_one_bucket(struct cache *ca, struct bucket *b) +void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) { + lockdep_assert_held(&ca->set->bucket_lock); + BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE); + + if (GC_SECTORS_USED(b)) + trace_bcache_invalidate(ca, b - ca->buckets); + bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); +} + +static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + __bch_invalidate_one_bucket(ca, b); + fifo_push(&ca->free_inc, b - ca->buckets); } -#define bucket_prio(b) \ - (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) +/* + * Determines what order we're going to reuse buckets, smallest bucket_prio() + * first: we also take into account the number of sectors of live data in that + * bucket, and in order for that multiply to make sense we have to scale bucket + * + * Thus, we scale the bucket priorities so that the bucket with the smallest + * prio is worth 1/8th of what INITIAL_PRIO is worth. + */ + +#define bucket_prio(b) \ +({ \ + unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) @@ -182,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca) ca->heap.used = 0; for_each_bucket(b, ca) { - /* - * If we fill up the unused list, if we then return before - * adding anything to the free_inc list we'll skip writing - * prios/gens and just go back to allocating from the unused - * list: - */ - if (fifo_full(&ca->unused)) - return; - - if (!can_invalidate_bucket(ca, b)) - continue; - - if (!GC_SECTORS_USED(b) && - bch_bucket_add_unused(ca, b)) + if (!bch_can_invalidate_bucket(ca, b)) continue; if (!heap_full(&ca->heap)) @@ -220,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca) return; } - invalidate_one_bucket(ca, b); + bch_invalidate_one_bucket(ca, b); } } @@ -236,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca) b = ca->buckets + ca->fifo_last_bucket++; - if (can_invalidate_bucket(ca, b)) - invalidate_one_bucket(ca, b); + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); if (++checked >= ca->sb.nbuckets) { ca->invalidate_needs_gc = 1; @@ -261,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca) b = ca->buckets + n; - if (can_invalidate_bucket(ca, b)) - invalidate_one_bucket(ca, b); + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); if (++checked >= ca->sb.nbuckets / 2) { ca->invalidate_needs_gc = 1; @@ -274,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca) static void invalidate_buckets(struct cache *ca) { - if (ca->invalidate_needs_gc) - return; + BUG_ON(ca->invalidate_needs_gc); switch (CACHE_REPLACEMENT(&ca->sb)) { case CACHE_REPLACEMENT_LRU: @@ -288,8 +275,6 @@ static void invalidate_buckets(struct cache *ca) invalidate_buckets_random(ca); break; } - - trace_bcache_alloc_invalidate(ca); } #define allocator_wait(ca, cond) \ @@ -337,17 +322,10 @@ static int bch_allocator_thread(void *arg) * possibly issue discards to them, then we add the bucket to * the free list: */ - while (1) { + while (!fifo_empty(&ca->free_inc)) { long bucket; - if ((!atomic_read(&ca->set->prio_blocked) || - !CACHE_SYNC(&ca->set->sb)) && - !fifo_empty(&ca->unused)) - fifo_pop(&ca->unused, bucket); - else if (!fifo_empty(&ca->free_inc)) - fifo_pop(&ca->free_inc, bucket); - else - break; + fifo_pop(&ca->free_inc, bucket); if (ca->discard) { mutex_unlock(&ca->set->bucket_lock); @@ -358,6 +336,7 @@ static int bch_allocator_thread(void *arg) } allocator_wait(ca, bch_allocator_push(ca, bucket)); + wake_up(&ca->set->btree_cache_wait); wake_up(&ca->set->bucket_wait); } @@ -367,9 +346,9 @@ static int bch_allocator_thread(void *arg) * them to the free_inc list: */ +retry_invalidate: allocator_wait(ca, ca->set->gc_mark_valid && - (ca->need_save_prio > 64 || - !ca->invalidate_needs_gc)); + !ca->invalidate_needs_gc); invalidate_buckets(ca); /* @@ -377,13 +356,28 @@ static int bch_allocator_thread(void *arg) * new stuff to them: */ allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); - if (CACHE_SYNC(&ca->set->sb) && - (!fifo_empty(&ca->free_inc) || - ca->need_save_prio > 64)) + if (CACHE_SYNC(&ca->set->sb)) { + /* + * This could deadlock if an allocation with a btree + * node locked ever blocked - having the btree node + * locked would block garbage collection, but here we're + * waiting on garbage collection before we invalidate + * and free anything. + * + * But this should be safe since the btree code always + * uses btree_check_reserve() before allocating now, and + * if it fails it blocks without btree nodes locked. + */ + if (!fifo_full(&ca->free_inc)) + goto retry_invalidate; + bch_prio_write(ca); + } } } +/* Allocation */ + long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) { DEFINE_WAIT(w); @@ -395,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) fifo_pop(&ca->free[reserve], r)) goto out; - if (!wait) + if (!wait) { + trace_bcache_alloc_fail(ca, reserve); return -1; + } do { prepare_to_wait(&ca->set->bucket_wait, &w, @@ -412,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) out: wake_up_process(ca->alloc_thread); + trace_bcache_alloc(ca, reserve); + if (expensive_debug_checks(ca->set)) { size_t iter; long i; @@ -425,8 +423,6 @@ out: BUG_ON(i == r); fifo_for_each(i, &ca->free_inc, iter) BUG_ON(i == r); - fifo_for_each(i, &ca->unused, iter) - BUG_ON(i == r); } b = ca->buckets + r; @@ -448,17 +444,19 @@ out: return r; } +void __bch_bucket_free(struct cache *ca, struct bucket *b) +{ + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); +} + void bch_bucket_free(struct cache_set *c, struct bkey *k) { unsigned i; - for (i = 0; i < KEY_PTRS(k); i++) { - struct bucket *b = PTR_BUCKET(c, k, i); - - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); - SET_GC_SECTORS_USED(b, 0); - bch_bucket_add_unused(PTR_CACHE(c, k, i), b); - } + for (i = 0; i < KEY_PTRS(k); i++) + __bch_bucket_free(PTR_CACHE(c, k, i), + PTR_BUCKET(c, k, i)); } int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, @@ -696,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca) ca->alloc_thread = k; return 0; } - -int bch_cache_allocator_init(struct cache *ca) -{ - /* - * Reserve: - * Prio/gen writes first - * Then 8 for btree allocations - * Then half for the moving garbage collector - */ -#if 0 - ca->watermark[WATERMARK_PRIO] = 0; - - ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); - - ca->watermark[WATERMARK_MOVINGGC] = 8 + - ca->watermark[WATERMARK_METADATA]; - - ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + - ca->watermark[WATERMARK_MOVINGGC]; -#endif - return 0; -} diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d955a493461..d2ebcf32309 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -187,6 +187,7 @@ #include <linux/types.h> #include <linux/workqueue.h> +#include "bset.h" #include "util.h" #include "closure.h" @@ -194,9 +195,7 @@ struct bucket { atomic_t pin; uint16_t prio; uint8_t gen; - uint8_t disk_gen; uint8_t last_gc; /* Most out of date gen in the btree */ - uint8_t gc_gen; uint16_t gc_mark; /* Bitfield used by GC. See below for field */ }; @@ -206,10 +205,12 @@ struct bucket { */ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); -#define GC_MARK_RECLAIMABLE 0 -#define GC_MARK_DIRTY 1 -#define GC_MARK_METADATA 2 -BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); +#define GC_MARK_RECLAIMABLE 1 +#define GC_MARK_DIRTY 2 +#define GC_MARK_METADATA 3 +#define GC_SECTORS_USED_SIZE 13 +#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); #include "journal.h" @@ -423,14 +424,9 @@ struct cache { * their new gen to disk. After prio_write() finishes writing the new * gens/prios, they'll be moved to the free list (and possibly discarded * in the process) - * - * unused: GC found nothing pointing into these buckets (possibly - * because all the data they contained was overwritten), so we only - * need to discard them before they can be moved to the free list. */ DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); - DECLARE_FIFO(long, unused); size_t fifo_last_bucket; @@ -440,12 +436,6 @@ struct cache { DECLARE_HEAP(struct bucket *, heap); /* - * max(gen - disk_gen) for all buckets. When it gets too big we have to - * call prio_write() to keep gens from wrapping. - */ - uint8_t need_save_prio; - - /* * If nonzero, we know we aren't going to find any buckets to invalidate * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu @@ -559,19 +549,16 @@ struct cache_set { struct list_head btree_cache_freed; /* Number of elements in btree_cache + btree_cache_freeable lists */ - unsigned bucket_cache_used; + unsigned btree_cache_used; /* * If we need to allocate memory for a new btree node and that * allocation fails, we can cannibalize another node in the btree cache - * to satisfy the allocation. However, only one thread can be doing this - * at a time, for obvious reasons - try_harder and try_wait are - * basically a lock for this that we can wait on asynchronously. The - * btree_root() macro releases the lock when it returns. + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: */ - struct task_struct *try_harder; - wait_queue_head_t try_wait; - uint64_t try_harder_start; + wait_queue_head_t btree_cache_wait; + struct task_struct *btree_cache_alloc_lock; /* * When we free a btree node, we increment the gen of the bucket the @@ -600,7 +587,7 @@ struct cache_set { uint16_t min_prio; /* - * max(gen - gc_gen) for all buckets. When it gets too big we have to gc + * max(gen - last_gc) for all buckets. When it gets too big we have to gc * to keep gens from wrapping around. */ uint8_t need_gc; @@ -625,10 +612,13 @@ struct cache_set { /* Number of moving GC bios in flight */ struct semaphore moving_in_flight; + struct workqueue_struct *moving_gc_wq; + struct btree *root; #ifdef CONFIG_BCACHE_DEBUG struct btree *verify_data; + struct bset *verify_ondisk; struct mutex verify_lock; #endif @@ -644,13 +634,7 @@ struct cache_set { */ mempool_t *fill_iter; - /* - * btree_sort() is a merge sort and requires temporary space - single - * element mempool - */ - struct mutex sort_lock; - struct bset *sort; - unsigned sort_crit_factor; + struct bset_sort_state sort; /* List of buckets we're currently writing data to */ struct list_head data_buckets; @@ -666,11 +650,9 @@ struct cache_set { unsigned congested_read_threshold_us; unsigned congested_write_threshold_us; - struct time_stats sort_time; struct time_stats btree_gc_time; struct time_stats btree_split_time; struct time_stats btree_read_time; - struct time_stats try_harder_time; atomic_long_t cache_read_races; atomic_long_t writeback_keys_done; @@ -684,9 +666,9 @@ struct cache_set { unsigned error_decay; unsigned short journal_delay_ms; + bool expensive_debug_checks; unsigned verify:1; unsigned key_merging_disabled:1; - unsigned expensive_debug_checks:1; unsigned gc_always_rewrite:1; unsigned shrinker_disabled:1; unsigned copy_gc_enabled:1; @@ -708,13 +690,8 @@ struct bbio { struct bio bio; }; -static inline unsigned local_clock_us(void) -{ - return local_clock() >> 10; -} - #define BTREE_PRIO USHRT_MAX -#define INITIAL_PRIO 32768 +#define INITIAL_PRIO 32768U #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) #define btree_blocks(b) \ @@ -727,17 +704,6 @@ static inline unsigned local_clock_us(void) #define bucket_bytes(c) ((c)->sb.bucket_size << 9) #define block_bytes(c) ((c)->sb.block_size << 9) -#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) -#define set_bytes(i) __set_bytes(i, i->keys) - -#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) -#define set_blocks(i, c) __set_blocks(i, (i)->keys, c) - -#define node(i, j) ((struct bkey *) ((i)->d + (j))) -#define end(i) node(i, (i)->keys) - -#define btree_data_space(b) (PAGE_SIZE << (b)->page_order) - #define prios_per_bucket(c) \ ((bucket_bytes(c) - sizeof(struct prio_set)) / \ sizeof(struct bucket_disk)) @@ -780,20 +746,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); } -/* Btree key macros */ +static inline uint8_t gen_after(uint8_t a, uint8_t b) +{ + uint8_t r = a - b; + return r > 128U ? 0 : r; +} -static inline void bkey_init(struct bkey *k) +static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, + unsigned i) { - *k = ZERO_KEY; + return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); } +static inline bool ptr_available(struct cache_set *c, const struct bkey *k, + unsigned i) +{ + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); +} + +/* Btree key macros */ + /* * This is used for various on disk data structures - cache_sb, prio_set, bset, * jset: The checksum is _always_ the first 8 bytes of these structs */ #define csum_set(i) \ bch_crc64(((void *) (i)) + sizeof(uint64_t), \ - ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) + ((void *) bset_bkey_last(i)) - \ + (((void *) (i)) + sizeof(uint64_t))) /* Error handling macros */ @@ -848,16 +828,13 @@ static inline bool cached_dev_get(struct cached_dev *dc) return false; /* Paired with the mb in cached_dev_attach */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return true; } /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree (last_gc). - * - * bucket_disk_gen() returns the difference between the current gen and the gen - * on disk; they're both used to make sure gens don't wrap around. */ static inline uint8_t bucket_gc_gen(struct bucket *b) @@ -865,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) return b->gen - b->last_gc; } -static inline uint8_t bucket_disk_gen(struct bucket *b) -{ - return b->gen - b->disk_gen; -} - #define BUCKET_GC_GEN_MAX 96U -#define BUCKET_DISK_GEN_MAX 64U #define kobj_attribute_write(n, fn) \ static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) @@ -904,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); -bool bch_bucket_add_unused(struct cache *, struct bucket *); -long bch_bucket_alloc(struct cache *, unsigned, bool); +bool bch_can_invalidate_bucket(struct cache *, struct bucket *); +void __bch_invalidate_one_bucket(struct cache *, struct bucket *); + +void __bch_bucket_free(struct cache *, struct bucket *); void bch_bucket_free(struct cache_set *, struct bkey *); +long bch_bucket_alloc(struct cache *, unsigned, bool); int __bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, @@ -959,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *); void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); -int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); int bch_debug_init(struct kobject *); void bch_request_exit(void); int bch_request_init(void); -void bch_btree_exit(void); -int bch_btree_init(void); #endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e51a739f751..54541641530 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -5,30 +5,134 @@ * Copyright 2012 Google, Inc. */ -#include "bcache.h" -#include "btree.h" -#include "debug.h" +#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#include "util.h" +#include "bset.h" + +#include <linux/console.h> #include <linux/random.h> #include <linux/prefetch.h> +#ifdef CONFIG_BCACHE_DEBUG + +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) +{ + struct bkey *k, *next; + + for (k = i->start; k < bset_bkey_last(i); k = next) { + next = bkey_next(k); + + printk(KERN_ERR "block %u key %u/%u: ", set, + (unsigned) ((u64 *) k - i->d), i->keys); + + if (b->ops->key_dump) + b->ops->key_dump(b, k); + else + printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); + + if (next < bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? + &START_KEY(next) : next) > 0) + printk(KERN_ERR "Key skipped backwards\n"); + } +} + +void bch_dump_bucket(struct btree_keys *b) +{ + unsigned i; + + console_lock(); + for (i = 0; i <= b->nsets; i++) + bch_dump_bset(b, b->set[i].data, + bset_sector_offset(b, b->set[i].data)); + console_unlock(); +} + +int __bch_count_data(struct btree_keys *b) +{ + unsigned ret = 0; + struct btree_iter iter; + struct bkey *k; + + if (b->ops->is_extents) + for_each_key(b, k, &iter) + ret += KEY_SIZE(k); + return ret; +} + +void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) +{ + va_list args; + struct bkey *k, *p = NULL; + struct btree_iter iter; + const char *err; + + for_each_key(b, k, &iter) { + if (b->ops->is_extents) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; + + if (bch_ptr_invalid(b, k)) + continue; + + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; + + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; + } + p = k; + } +#if 0 + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; +#endif + return; +bug: + bch_dump_bucket(b); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + panic("bch_check_keys error: %s:\n", err); +} + +static void bch_btree_iter_next_check(struct btree_iter *iter) +{ + struct bkey *k = iter->data->k, *next = bkey_next(k); + + if (next < iter->data->end && + bkey_cmp(k, iter->b->ops->is_extents ? + &START_KEY(next) : next) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } +} + +#else + +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} + +#endif + /* Keylists */ -int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) +int __bch_keylist_realloc(struct keylist *l, unsigned u64s) { size_t oldsize = bch_keylist_nkeys(l); - size_t newsize = oldsize + 2 + nptrs; + size_t newsize = oldsize + u64s; uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; uint64_t *new_keys; - /* The journalling code doesn't handle the case where the keys to insert - * is bigger than an empty write: If we just return -ENOMEM here, - * bio_insert() and bio_invalidate() will insert the keys created so far - * and finish the rest when the keylist is empty. - */ - if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) - return -ENOMEM; - newsize = roundup_pow_of_two(newsize); if (newsize <= KEYLIST_INLINE || @@ -71,140 +175,6 @@ void bch_keylist_pop_front(struct keylist *l) bch_keylist_bytes(l)); } -/* Pointer validation */ - -static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); - size_t bucket = PTR_BUCKET_NR(c, k, i); - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - - if (KEY_SIZE(k) + r > c->sb.bucket_size || - bucket < ca->sb.first_bucket || - bucket >= ca->sb.nbuckets) - return true; - } - - return false; -} - -bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) -{ - char buf[80]; - - if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) - goto bad; - - if (__ptr_invalid(c, k)) - goto bad; - - return false; -bad: - bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); - return true; -} - -bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) -{ - char buf[80]; - - if (!KEY_SIZE(k)) - return true; - - if (KEY_SIZE(k) > KEY_OFFSET(k)) - goto bad; - - if (__ptr_invalid(c, k)) - goto bad; - - return false; -bad: - bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); - return true; -} - -static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, - unsigned ptr) -{ - struct bucket *g = PTR_BUCKET(b->c, k, ptr); - char buf[80]; - - if (mutex_trylock(&b->c->bucket_lock)) { - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto err; - - } else { - if (g->prio == BTREE_PRIO) - goto err; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_DIRTY) - goto err; - } - mutex_unlock(&b->c->bucket_lock); - } - - return false; -err: - mutex_unlock(&b->c->bucket_lock); - bch_bkey_to_text(buf, sizeof(buf), k); - btree_bug(b, -"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); - return true; -} - -bool bch_ptr_bad(struct btree *b, const struct bkey *k) -{ - struct bucket *g; - unsigned i, stale; - - if (!bkey_cmp(k, &ZERO_KEY) || - !KEY_PTRS(k) || - bch_ptr_invalid(b, k)) - return true; - - for (i = 0; i < KEY_PTRS(k); i++) - if (!ptr_available(b->c, k, i)) - return true; - - if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) - return false; - - for (i = 0; i < KEY_PTRS(k); i++) { - g = PTR_BUCKET(b->c, k, i); - stale = ptr_stale(b->c, k, i); - - btree_bug_on(stale > 96, b, - "key too stale: %i, need_gc %u", - stale, b->c->need_gc); - - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), - b, "stale dirty pointer"); - - if (stale) - return true; - - if (expensive_debug_checks(b->c) && - ptr_bad_expensive_checks(b, k, i)) - return true; - } - - return false; -} - /* Key/pointer manipulation */ void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, @@ -259,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k) return true; } -static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +/* Auxiliary search trees */ + +/* 32 bits total: */ +#define BKEY_MID_BITS 3 +#define BKEY_EXPONENT_BITS 7 +#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS) +#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) + +struct bkey_float { + unsigned exponent:BKEY_EXPONENT_BITS; + unsigned m:BKEY_MID_BITS; + unsigned mantissa:BKEY_MANTISSA_BITS; +} __packed; + +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. + */ + +#define BSET_CACHELINE 128 + +/* Space required for the btree node keys */ +static inline size_t btree_keys_bytes(struct btree_keys *b) { - return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & - ~((uint64_t)1 << 63); + return PAGE_SIZE << b->page_order; } -/* Tries to merge l and r: l should be lower than r - * Returns true if we were able to merge. If we did merge, l will be the merged - * key, r will be untouched. - */ -bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) +static inline size_t btree_keys_cachelines(struct btree_keys *b) { - unsigned i; + return btree_keys_bytes(b) / BSET_CACHELINE; +} - if (key_merging_disabled(b->c)) - return false; +/* Space required for the auxiliary search trees */ +static inline size_t bset_tree_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(struct bkey_float); +} - if (KEY_PTRS(l) != KEY_PTRS(r) || - KEY_DIRTY(l) != KEY_DIRTY(r) || - bkey_cmp(l, &START_KEY(r))) - return false; +/* Space required for the prev pointers */ +static inline size_t bset_prev_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(uint8_t); +} - for (i = 0; i < KEY_PTRS(l); i++) - if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || - PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) - return false; +/* Memory allocation */ - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { - SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); - SET_KEY_SIZE(l, USHRT_MAX); +void bch_btree_keys_free(struct btree_keys *b) +{ + struct bset_tree *t = b->set; - bch_cut_front(l, r); - return false; - } + if (bset_prev_bytes(b) < PAGE_SIZE) + kfree(t->prev); + else + free_pages((unsigned long) t->prev, + get_order(bset_prev_bytes(b))); - if (KEY_CSUM(l)) { - if (KEY_CSUM(r)) - l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); - else - SET_KEY_CSUM(l, 0); - } + if (bset_tree_bytes(b) < PAGE_SIZE) + kfree(t->tree); + else + free_pages((unsigned long) t->tree, + get_order(bset_tree_bytes(b))); - SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); - SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + free_pages((unsigned long) t->data, b->page_order); - return true; + t->prev = NULL; + t->tree = NULL; + t->data = NULL; } +EXPORT_SYMBOL(bch_btree_keys_free); + +int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) +{ + struct bset_tree *t = b->set; + + BUG_ON(t->data); + + b->page_order = page_order; + + t->data = (void *) __get_free_pages(gfp, b->page_order); + if (!t->data) + goto err; + + t->tree = bset_tree_bytes(b) < PAGE_SIZE + ? kmalloc(bset_tree_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); + if (!t->tree) + goto err; + + t->prev = bset_prev_bytes(b) < PAGE_SIZE + ? kmalloc(bset_prev_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); + if (!t->prev) + goto err; + + return 0; +err: + bch_btree_keys_free(b); + return -ENOMEM; +} +EXPORT_SYMBOL(bch_btree_keys_alloc); + +void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks) +{ + unsigned i; + + b->ops = ops; + b->expensive_debug_checks = expensive_debug_checks; + b->nsets = 0; + b->last_set_unwritten = 0; + + /* XXX: shouldn't be needed */ + for (i = 0; i < MAX_BSETS; i++) + b->set[i].size = 0; + /* + * Second loop starts at 1 because b->keys[0]->data is the memory we + * allocated + */ + for (i = 1; i < MAX_BSETS; i++) + b->set[i].data = NULL; +} +EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ @@ -459,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) return ((void *) k - (void *) t->data) / BSET_CACHELINE; } -static unsigned bkey_to_cacheline_offset(struct bkey *k) +static unsigned bkey_to_cacheline_offset(struct bset_tree *t, + unsigned cacheline, + struct bkey *k) { - return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); + return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); } static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) @@ -508,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j) : tree_to_prev_bkey(t, j >> ffs(j)); struct bkey *r = is_power_of_2(j + 1) - ? node(t->data, t->data->keys - bkey_u64s(&t->end)) + ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end)) : tree_to_bkey(t, j >> (ffz(j) + 1)); BUG_ON(m < l || m > r); @@ -532,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j) f->exponent = 127; } -static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) { - if (t != b->sets) { + if (t != b->set) { unsigned j = roundup(t[-1].size, 64 / sizeof(struct bkey_float)); @@ -542,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t) t->prev = t[-1].prev + j; } - while (t < b->sets + MAX_BSETS) + while (t < b->set + MAX_BSETS) t++->size = 0; } -static void bset_build_unwritten_tree(struct btree *b) +static void bch_bset_build_unwritten_tree(struct btree_keys *b) { - struct bset_tree *t = b->sets + b->nsets; + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(b->last_set_unwritten); + b->last_set_unwritten = 1; bset_alloc_tree(b, t); - if (t->tree != b->sets->tree + bset_tree_space(b)) { - t->prev[0] = bkey_to_cacheline_offset(t->data->start); + if (t->tree != b->set->tree + btree_keys_cachelines(b)) { + t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start); t->size = 1; } } -static void bset_build_written_tree(struct btree *b) +void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) { - struct bset_tree *t = b->sets + b->nsets; - struct bkey *k = t->data->start; + if (i != b->set->data) { + b->set[++b->nsets].data = i; + i->seq = b->set->data->seq; + } else + get_random_bytes(&i->seq, sizeof(uint64_t)); + + i->magic = magic; + i->version = 0; + i->keys = 0; + + bch_bset_build_unwritten_tree(b); +} +EXPORT_SYMBOL(bch_bset_init_next); + +void bch_bset_build_written_tree(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + struct bkey *prev = NULL, *k = t->data->start; unsigned j, cacheline = 1; + b->last_set_unwritten = 0; + bset_alloc_tree(b, t); t->size = min_t(unsigned, - bkey_to_cacheline(t, end(t->data)), - b->sets->tree + bset_tree_space(b) - t->tree); + bkey_to_cacheline(t, bset_bkey_last(t->data)), + b->set->tree + btree_keys_cachelines(b) - t->tree); if (t->size < 2) { t->size = 0; @@ -581,16 +656,14 @@ static void bset_build_written_tree(struct btree *b) for (j = inorder_next(0, t->size); j; j = inorder_next(j, t->size)) { - while (bkey_to_cacheline(t, k) != cacheline) - k = bkey_next(k); + while (bkey_to_cacheline(t, k) < cacheline) + prev = k, k = bkey_next(k); - t->prev[j] = bkey_u64s(k); - k = bkey_next(k); - cacheline++; - t->tree[j].m = bkey_to_cacheline_offset(k); + t->prev[j] = bkey_u64s(prev); + t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); } - while (bkey_next(k) != end(t->data)) + while (bkey_next(k) != bset_bkey_last(t->data)) k = bkey_next(k); t->end = *k; @@ -601,14 +674,17 @@ static void bset_build_written_tree(struct btree *b) j = inorder_next(j, t->size)) make_bfloat(t, j); } +EXPORT_SYMBOL(bch_bset_build_written_tree); + +/* Insert */ -void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) +void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) { struct bset_tree *t; unsigned inorder, j = 1; - for (t = b->sets; t <= &b->sets[b->nsets]; t++) - if (k < end(t->data)) + for (t = b->set; t <= bset_tree_last(b); t++) + if (k < bset_bkey_last(t->data)) goto found_set; BUG(); @@ -621,7 +697,7 @@ found_set: if (k == t->data->start) goto fix_left; - if (bkey_next(k) == end(t->data)) { + if (bkey_next(k) == bset_bkey_last(t->data)) { t->end = *k; goto fix_right; } @@ -646,10 +722,12 @@ fix_right: do { j = j * 2 + 1; } while (j < t->size); } +EXPORT_SYMBOL(bch_bset_fix_invalidated_key); -void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) +static void bch_bset_fix_lookup_table(struct btree_keys *b, + struct bset_tree *t, + struct bkey *k) { - struct bset_tree *t = &b->sets[b->nsets]; unsigned shift = bkey_u64s(k); unsigned j = bkey_to_cacheline(t, k); @@ -661,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) * lookup table for the first key that is strictly greater than k: * it's either k's cacheline or the next one */ - if (j < t->size && - table_to_bkey(t, j) <= k) + while (j < t->size && + table_to_bkey(t, j) <= k) j++; /* Adjust all the lookup table entries, and find a new key for any that @@ -677,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) while (k < cacheline_to_bkey(t, j, 0)) k = bkey_next(k); - t->prev[j] = bkey_to_cacheline_offset(k); + t->prev[j] = bkey_to_cacheline_offset(t, j, k); } } - if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) + if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) return; /* Possibly add a new entry to the end of the lookup table */ for (k = table_to_bkey(t, t->size - 1); - k != end(t->data); + k != bset_bkey_last(t->data); k = bkey_next(k)) if (t->size == bkey_to_cacheline(t, k)) { - t->prev[t->size] = bkey_to_cacheline_offset(k); + t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); t->size++; } } -void bch_bset_init_next(struct btree *b) +/* + * Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. + */ +bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) { - struct bset *i = write_block(b); + if (!b->ops->key_merge) + return false; - if (i != b->sets[0].data) { - b->sets[++b->nsets].data = i; - i->seq = b->sets[0].data->seq; - } else - get_random_bytes(&i->seq, sizeof(uint64_t)); + /* + * Generic header checks + * Assumes left and right are in order + * Left and right must be exactly aligned + */ + if (!bch_bkey_equal_header(l, r) || + bkey_cmp(l, &START_KEY(r))) + return false; - i->magic = bset_magic(&b->c->sb); - i->version = 0; - i->keys = 0; + return b->ops->key_merge(b, l, r); +} +EXPORT_SYMBOL(bch_bkey_try_merge); + +void bch_bset_insert(struct btree_keys *b, struct bkey *where, + struct bkey *insert) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(!b->last_set_unwritten); + BUG_ON(bset_byte_offset(b, t->data) + + __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > + PAGE_SIZE << b->page_order); + + memmove((uint64_t *) where + bkey_u64s(insert), + where, + (void *) bset_bkey_last(t->data) - (void *) where); + + t->data->keys += bkey_u64s(insert); + bkey_copy(where, insert); + bch_bset_fix_lookup_table(b, t, where); +} +EXPORT_SYMBOL(bch_bset_insert); + +unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k, + struct bkey *replace_key) +{ + unsigned status = BTREE_INSERT_STATUS_NO_INSERT; + struct bset *i = bset_tree_last(b)->data; + struct bkey *m, *prev = NULL; + struct btree_iter iter; - bset_build_unwritten_tree(b); + BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); + + m = bch_btree_iter_init(b, &iter, b->ops->is_extents + ? PRECEDING_KEY(&START_KEY(k)) + : PRECEDING_KEY(k)); + + if (b->ops->insert_fixup(b, k, &iter, replace_key)) + return status; + + status = BTREE_INSERT_STATUS_INSERT; + + while (m != bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) + prev = m, m = bkey_next(m); + + /* prev is in the tree, if we merge we're done */ + status = BTREE_INSERT_STATUS_BACK_MERGE; + if (prev && + bch_bkey_try_merge(b, prev, k)) + goto merged; +#if 0 + status = BTREE_INSERT_STATUS_OVERWROTE; + if (m != bset_bkey_last(i) && + KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) + goto copy; +#endif + status = BTREE_INSERT_STATUS_FRONT_MERGE; + if (m != bset_bkey_last(i) && + bch_bkey_try_merge(b, k, m)) + goto copy; + + bch_bset_insert(b, m, k); +copy: bkey_copy(m, k); +merged: + return status; } +EXPORT_SYMBOL(bch_btree_insert_key); + +/* Lookup */ struct bset_search_iter { struct bkey *l, *r; }; -static struct bset_search_iter bset_search_write_set(struct btree *b, - struct bset_tree *t, +static struct bset_search_iter bset_search_write_set(struct bset_tree *t, const struct bkey *search) { unsigned li = 0, ri = t->size; - BUG_ON(!b->nsets && - t->size < bkey_to_cacheline(t, end(t->data))); - while (li + 1 != ri) { unsigned m = (li + ri) >> 1; @@ -736,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b, return (struct bset_search_iter) { table_to_bkey(t, li), - ri < t->size ? table_to_bkey(t, ri) : end(t->data) + ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data) }; } -static struct bset_search_iter bset_search_tree(struct btree *b, - struct bset_tree *t, +static struct bset_search_iter bset_search_tree(struct bset_tree *t, const struct bkey *search) { struct bkey *l, *r; @@ -788,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, f = &t->tree[inorder_next(j, t->size)]; r = cacheline_to_bkey(t, inorder, f->m); } else - r = end(t->data); + r = bset_bkey_last(t->data); } else { r = cacheline_to_bkey(t, inorder, f->m); @@ -802,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, return (struct bset_search_iter) {l, r}; } -struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, +struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search) { struct bset_search_iter i; @@ -824,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, if (unlikely(!t->size)) { i.l = t->data->start; - i.r = end(t->data); + i.r = bset_bkey_last(t->data); } else if (bset_written(b, t)) { /* * Each node in the auxiliary search tree covers a certain range @@ -834,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, */ if (unlikely(bkey_cmp(search, &t->end) >= 0)) - return end(t->data); + return bset_bkey_last(t->data); if (unlikely(bkey_cmp(search, t->data->start) < 0)) return t->data->start; - i = bset_search_tree(b, t, search); - } else - i = bset_search_write_set(b, t, search); + i = bset_search_tree(t, search); + } else { + BUG_ON(!b->nsets && + t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); + + i = bset_search_write_set(t, search); + } - if (expensive_debug_checks(b->c)) { + if (btree_keys_expensive_checks(b)) { BUG_ON(bset_written(b, t) && i.l != t->data->start && bkey_cmp(tree_to_prev_bkey(t, inorder_to_tree(bkey_to_cacheline(t, i.l), t)), search) > 0); - BUG_ON(i.r != end(t->data) && + BUG_ON(i.r != bset_bkey_last(t->data) && bkey_cmp(i.r, search) <= 0); } @@ -860,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, return i.l; } +EXPORT_SYMBOL(__bch_bset_search); /* Btree iterator */ -/* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. - */ +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); + static inline bool btree_iter_cmp(struct btree_iter_set l, struct btree_iter_set r) { - int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); - - return c ? c > 0 : l.k < r.k; + return bkey_cmp(l.k, r.k) > 0; } static inline bool btree_iter_end(struct btree_iter *iter) @@ -892,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, btree_iter_cmp)); } -struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, - struct bkey *search, struct bset_tree *start) +static struct bkey *__bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; iter->size = ARRAY_SIZE(iter->data); @@ -903,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, iter->b = b; #endif - for (; start <= &b->sets[b->nsets]; start++) { + for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, end(start->data)); + bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_next(struct btree_iter *iter) +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search) +{ + return __bch_btree_iter_init(b, iter, search, b->set); +} +EXPORT_SYMBOL(bch_btree_iter_init); + +static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, + btree_iter_cmp_fn *cmp) { struct btree_iter_set unused; struct bkey *ret = NULL; @@ -928,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) } if (iter->data->k == iter->data->end) - heap_pop(iter, unused, btree_iter_cmp); + heap_pop(iter, unused, cmp); else - heap_sift(iter, 0, btree_iter_cmp); + heap_sift(iter, 0, cmp); } return ret; } +struct bkey *bch_btree_iter_next(struct btree_iter *iter) +{ + return __bch_btree_iter_next(iter, btree_iter_cmp); + +} +EXPORT_SYMBOL(bch_btree_iter_next); + struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, - struct btree *b, ptr_filter_fn fn) + struct btree_keys *b, ptr_filter_fn fn) { struct bkey *ret; @@ -950,79 +1114,50 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, /* Mergesort */ -static void sort_key_next(struct btree_iter *iter, - struct btree_iter_set *i) +void bch_bset_sort_state_free(struct bset_sort_state *state) { - i->k = bkey_next(i->k); - - if (i->k == i->end) - *i = iter->data[--iter->used]; + if (state->pool) + mempool_destroy(state->pool); } -static struct bkey *btree_sort_fixup(struct btree_iter *iter, struct bkey *tmp) +int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) { - while (iter->used > 1) { - struct btree_iter_set *top = iter->data, *i = top + 1; + spin_lock_init(&state->time.lock); - if (iter->used > 2 && - btree_iter_cmp(i[0], i[1])) - i++; - - if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) - break; + state->page_order = page_order; + state->crit_factor = int_sqrt(1 << page_order); - if (!KEY_SIZE(i->k)) { - sort_key_next(iter, i); - heap_sift(iter, i - top, btree_iter_cmp); - continue; - } - - if (top->k > i->k) { - if (bkey_cmp(top->k, i->k) >= 0) - sort_key_next(iter, i); - else - bch_cut_front(top->k, i->k); - - heap_sift(iter, i - top, btree_iter_cmp); - } else { - /* can't happen because of comparison func */ - BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); - - if (bkey_cmp(i->k, top->k) < 0) { - bkey_copy(tmp, top->k); - - bch_cut_back(&START_KEY(i->k), tmp); - bch_cut_front(i->k, top->k); - heap_sift(iter, 0, btree_iter_cmp); - - return tmp; - } else { - bch_cut_back(&START_KEY(i->k), top->k); - } - } - } + state->pool = mempool_create_page_pool(1, page_order); + if (!state->pool) + return -ENOMEM; - return NULL; + return 0; } +EXPORT_SYMBOL(bch_bset_sort_state_init); -static void btree_mergesort(struct btree *b, struct bset *out, +static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) { + int i; struct bkey *k, *last = NULL; BKEY_PADDED(k) tmp; - bool (*bad)(struct btree *, const struct bkey *) = remove_stale + bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; + /* Heapify the iterator, using our comparison function */ + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, b->ops->sort_cmp); + while (!btree_iter_end(iter)) { - if (fixup && !b->level) - k = btree_sort_fixup(iter, &tmp.k); + if (b->ops->sort_fixup && fixup) + k = b->ops->sort_fixup(iter, &tmp.k); else k = NULL; if (!k) - k = bch_btree_iter_next(iter); + k = __bch_btree_iter_next(iter, b->ops->sort_cmp); if (bad(b, k)) continue; @@ -1030,8 +1165,7 @@ static void btree_mergesort(struct btree *b, struct bset *out, if (!last) { last = out->start; bkey_copy(last, k); - } else if (b->level || - !bch_bkey_try_merge(b, last, k)) { + } else if (!bch_bkey_try_merge(b, last, k)) { last = bkey_next(last); bkey_copy(last, k); } @@ -1042,27 +1176,30 @@ static void btree_mergesort(struct btree *b, struct bset *out, pr_debug("sorted %i keys", out->keys); } -static void __btree_sort(struct btree *b, struct btree_iter *iter, - unsigned start, unsigned order, bool fixup) +static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, + unsigned start, unsigned order, bool fixup, + struct bset_sort_state *state) { uint64_t start_time; - bool remove_stale = !b->written; + bool used_mempool = false; struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, order); if (!out) { - mutex_lock(&b->c->sort_lock); - out = b->c->sort; - order = ilog2(bucket_pages(b->c)); + struct page *outp; + + BUG_ON(order > state->page_order); + + outp = mempool_alloc(state->pool, GFP_NOIO); + out = page_address(outp); + used_mempool = true; + order = state->page_order; } start_time = local_clock(); - btree_mergesort(b, out, iter, fixup, remove_stale); + btree_mergesort(b, out, iter, fixup, false); b->nsets = start; - if (!fixup && !start && b->written) - bch_btree_verify(b, out); - if (!start && order == b->page_order) { /* * Our temporary buffer is the same size as the btree node's @@ -1070,84 +1207,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, * memcpy() */ - out->magic = bset_magic(&b->c->sb); - out->seq = b->sets[0].data->seq; - out->version = b->sets[0].data->version; - swap(out, b->sets[0].data); - - if (b->c->sort == b->sets[0].data) - b->c->sort = out; + out->magic = b->set->data->magic; + out->seq = b->set->data->seq; + out->version = b->set->data->version; + swap(out, b->set->data); } else { - b->sets[start].data->keys = out->keys; - memcpy(b->sets[start].data->start, out->start, - (void *) end(out) - (void *) out->start); + b->set[start].data->keys = out->keys; + memcpy(b->set[start].data->start, out->start, + (void *) bset_bkey_last(out) - (void *) out->start); } - if (out == b->c->sort) - mutex_unlock(&b->c->sort_lock); + if (used_mempool) + mempool_free(virt_to_page(out), state->pool); else free_pages((unsigned long) out, order); - if (b->written) - bset_build_written_tree(b); + bch_bset_build_written_tree(b); if (!start) - bch_time_stats_update(&b->c->sort_time, start_time); + bch_time_stats_update(&state->time, start_time); } -void bch_btree_sort_partial(struct btree *b, unsigned start) +void bch_btree_sort_partial(struct btree_keys *b, unsigned start, + struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; struct btree_iter iter; int oldsize = bch_count_data(b); - __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); - - BUG_ON(b->sets[b->nsets].data == write_block(b) && - (b->sets[b->nsets].size || b->nsets)); - + __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned i; for (i = start; i <= b->nsets; i++) - keys += b->sets[i].data->keys; + keys += b->set[i].data->keys; - order = roundup_pow_of_two(__set_bytes(b->sets->data, - keys)) / PAGE_SIZE; - if (order) - order = ilog2(order); + order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false); + __btree_sort(b, &iter, start, order, false, state); - EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); + EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } +EXPORT_SYMBOL(bch_btree_sort_partial); -void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) +void bch_btree_sort_and_fix_extents(struct btree_keys *b, + struct btree_iter *iter, + struct bset_sort_state *state) { - BUG_ON(!b->written); - __btree_sort(b, iter, 0, b->page_order, true); + __btree_sort(b, iter, 0, b->page_order, true, state); } -void bch_btree_sort_into(struct btree *b, struct btree *new) +void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, + struct bset_sort_state *state) { uint64_t start_time = local_clock(); struct btree_iter iter; bch_btree_iter_init(b, &iter, NULL); - btree_mergesort(b, new->sets->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter, false, true); - bch_time_stats_update(&b->c->sort_time, start_time); + bch_time_stats_update(&state->time, start_time); - bkey_copy_key(&new->key, &b->key); - new->sets->size = 0; + new->set->size = 0; // XXX: why? } #define SORT_CRIT (4096 / sizeof(uint64_t)) -void bch_btree_sort_lazy(struct btree *b) +void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) { unsigned crit = SORT_CRIT; int i; @@ -1156,50 +1285,32 @@ void bch_btree_sort_lazy(struct btree *b) if (!b->nsets) goto out; - /* If not a leaf node, always sort */ - if (b->level) { - bch_btree_sort(b); - return; - } - for (i = b->nsets - 1; i >= 0; --i) { - crit *= b->c->sort_crit_factor; + crit *= state->crit_factor; - if (b->sets[i].data->keys < crit) { - bch_btree_sort_partial(b, i); + if (b->set[i].data->keys < crit) { + bch_btree_sort_partial(b, i, state); return; } } /* Sort if we'd overflow */ if (b->nsets + 1 == MAX_BSETS) { - bch_btree_sort(b); + bch_btree_sort(b, state); return; } out: - bset_build_written_tree(b); + bch_bset_build_written_tree(b); } +EXPORT_SYMBOL(bch_btree_sort_lazy); -/* Sysfs stuff */ - -struct bset_stats { - struct btree_op op; - size_t nodes; - size_t sets_written, sets_unwritten; - size_t bytes_written, bytes_unwritten; - size_t floats, failed; -}; - -static int btree_bset_stats(struct btree_op *op, struct btree *b) +void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { - struct bset_stats *stats = container_of(op, struct bset_stats, op); unsigned i; - stats->nodes++; - for (i = 0; i <= b->nsets; i++) { - struct bset_tree *t = &b->sets[i]; + struct bset_tree *t = &b->set[i]; size_t bytes = t->data->keys * sizeof(uint64_t); size_t j; @@ -1217,32 +1328,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b) stats->bytes_unwritten += bytes; } } - - return MAP_CONTINUE; -} - -int bch_bset_print_stats(struct cache_set *c, char *buf) -{ - struct bset_stats t; - int ret; - - memset(&t, 0, sizeof(struct bset_stats)); - bch_btree_op_init(&t.op, -1); - - ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); - if (ret < 0) - return ret; - - return snprintf(buf, PAGE_SIZE, - "btree nodes: %zu\n" - "written sets: %zu\n" - "unwritten sets: %zu\n" - "written key bytes: %zu\n" - "unwritten key bytes: %zu\n" - "floats: %zu\n" - "failed: %zu\n", - t.nodes, - t.sets_written, t.sets_unwritten, - t.bytes_written, t.bytes_unwritten, - t.floats, t.failed); } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 1d3c24f9fa0..5f6728d5d4d 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,7 +1,11 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H -#include <linux/slab.h> +#include <linux/bcache.h> +#include <linux/kernel.h> +#include <linux/types.h> + +#include "util.h" /* for time_stats */ /* * BKEYS: @@ -142,20 +146,13 @@ * first key in that range of bytes again. */ -/* Btree key comparison/iteration */ +struct btree_keys; +struct btree_iter; +struct btree_iter_set; +struct bkey_float; #define MAX_BSETS 4U -struct btree_iter { - size_t size, used; -#ifdef CONFIG_BCACHE_DEBUG - struct btree *b; -#endif - struct btree_iter_set { - struct bkey *k, *end; - } data[MAX_BSETS]; -}; - struct bset_tree { /* * We construct a binary tree in an array as if the array @@ -165,14 +162,14 @@ struct bset_tree { */ /* size of the binary tree and prev array */ - unsigned size; + unsigned size; /* function of size - precalculated for to_inorder() */ - unsigned extra; + unsigned extra; /* copy of the last key in the set */ - struct bkey end; - struct bkey_float *tree; + struct bkey end; + struct bkey_float *tree; /* * The nodes in the bset tree point to specific keys - this @@ -182,12 +179,219 @@ struct bset_tree { * to keep bkey_float to 4 bytes and prev isn't used in the fast * path. */ - uint8_t *prev; + uint8_t *prev; /* The actual btree node, with pointers to each sorted set */ - struct bset *data; + struct bset *data; }; +struct btree_keys_ops { + bool (*sort_cmp)(struct btree_iter_set, + struct btree_iter_set); + struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); + bool (*insert_fixup)(struct btree_keys *, struct bkey *, + struct btree_iter *, struct bkey *); + bool (*key_invalid)(struct btree_keys *, + const struct bkey *); + bool (*key_bad)(struct btree_keys *, const struct bkey *); + bool (*key_merge)(struct btree_keys *, + struct bkey *, struct bkey *); + void (*key_to_text)(char *, size_t, const struct bkey *); + void (*key_dump)(struct btree_keys *, const struct bkey *); + + /* + * Only used for deciding whether to use START_KEY(k) or just the key + * itself in a couple places + */ + bool is_extents; +}; + +struct btree_keys { + const struct btree_keys_ops *ops; + uint8_t page_order; + uint8_t nsets; + unsigned last_set_unwritten:1; + bool *expensive_debug_checks; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree set[MAX_BSETS]; +}; + +static inline struct bset_tree *bset_tree_last(struct btree_keys *b) +{ + return b->set + b->nsets; +} + +static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) +{ + return t <= b->set + b->nsets - b->last_set_unwritten; +} + +static inline bool bkey_written(struct btree_keys *b, struct bkey *k) +{ + return !b->last_set_unwritten || k < b->set[b->nsets].data->start; +} + +static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) +{ + return ((size_t) i) - ((size_t) b->set->data); +} + +static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) +{ + return bset_byte_offset(b, i) >> 9; +} + +#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) +#define set_bytes(i) __set_bytes(i, i->keys) + +#define __set_blocks(i, k, block_bytes) \ + DIV_ROUND_UP(__set_bytes(i, k), block_bytes) +#define set_blocks(i, block_bytes) \ + __set_blocks(i, (i)->keys, block_bytes) + +static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON((PAGE_SIZE << b->page_order) < + (bset_byte_offset(b, t->data) + set_bytes(t->data))); + + if (!b->last_set_unwritten) + return 0; + + return ((PAGE_SIZE << b->page_order) - + (bset_byte_offset(b, t->data) + set_bytes(t->data))) / + sizeof(u64); +} + +static inline struct bset *bset_next_set(struct btree_keys *b, + unsigned block_bytes) +{ + struct bset *i = bset_tree_last(b)->data; + + return ((void *) i) + roundup(set_bytes(i), block_bytes); +} + +void bch_btree_keys_free(struct btree_keys *); +int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); +void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, + bool *); + +void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); +void bch_bset_build_written_tree(struct btree_keys *); +void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); +bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); +void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); +unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, + struct bkey *); + +enum { + BTREE_INSERT_STATUS_NO_INSERT = 0, + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; + +/* Btree key iteration */ + +struct btree_iter { + size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree_keys *b; +#endif + struct btree_iter_set { + struct bkey *k, *end; + } data[MAX_BSETS]; +}; + +typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); + +struct bkey *bch_btree_iter_next(struct btree_iter *); +struct bkey *bch_btree_iter_next_filter(struct btree_iter *, + struct btree_keys *, ptr_filter_fn); + +void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); +struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, + struct bkey *); + +struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, + const struct bkey *); + +/* + * Returns the first key that is strictly greater than search + */ +static inline struct bkey *bch_bset_search(struct btree_keys *b, + struct bset_tree *t, + const struct bkey *search) +{ + return search ? __bch_bset_search(b, t, search) : t->data->start; +} + +#define for_each_key_filter(b, k, iter, filter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next_filter((iter), (b), filter));) + +#define for_each_key(b, k, iter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next(iter));) + +/* Sorting */ + +struct bset_sort_state { + mempool_t *pool; + + unsigned page_order; + unsigned crit_factor; + + struct time_stats time; +}; + +void bch_bset_sort_state_free(struct bset_sort_state *); +int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); +void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); +void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, + struct bset_sort_state *); +void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, + struct bset_sort_state *); +void bch_btree_sort_partial(struct btree_keys *, unsigned, + struct bset_sort_state *); + +static inline void bch_btree_sort(struct btree_keys *b, + struct bset_sort_state *state) +{ + bch_btree_sort_partial(b, 0, state); +} + +struct bset_stats { + size_t sets_written, sets_unwritten; + size_t bytes_written, bytes_unwritten; + size_t floats, failed; +}; + +void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); + +/* Bkey utility code */ + +#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) + +static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) +{ + return bkey_idx(i->start, idx); +} + +static inline void bkey_init(struct bkey *k) +{ + *k = ZERO_KEY; +} + static __always_inline int64_t bkey_cmp(const struct bkey *l, const struct bkey *r) { @@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); } +void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, + unsigned); +bool __bch_cut_front(const struct bkey *, struct bkey *); +bool __bch_cut_back(const struct bkey *, struct bkey *); + +static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, k) > 0); + return __bch_cut_front(where, k); +} + +static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); + return __bch_cut_back(where, k); +} + +#define PRECEDING_KEY(_k) \ +({ \ + struct bkey *_ret = NULL; \ + \ + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ + \ + if (!_ret->low) \ + _ret->high--; \ + _ret->low--; \ + } \ + \ + _ret; \ +}) + +static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_invalid(b, k); +} + +static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_bad(b, k); +} + +static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, + size_t size, const struct bkey *k) +{ + return b->ops->key_to_text(buf, size, k); +} + +static inline bool bch_bkey_equal_header(const struct bkey *l, + const struct bkey *r) +{ + return (KEY_DIRTY(l) == KEY_DIRTY(r) && + KEY_PTRS(l) == KEY_PTRS(r) && + KEY_CSUM(l) == KEY_CSUM(l)); +} + /* Keylists */ struct keylist { @@ -218,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l) l->top_p = l->keys_p = l->inline_keys; } +static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k) +{ + l->keys = k; + l->top = bkey_next(k); +} + static inline void bch_keylist_push(struct keylist *l) { l->top = bkey_next(l->top); @@ -257,136 +523,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l) struct bkey *bch_keylist_pop(struct keylist *); void bch_keylist_pop_front(struct keylist *); -int bch_keylist_realloc(struct keylist *, int, struct cache_set *); - -void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, - unsigned); -bool __bch_cut_front(const struct bkey *, struct bkey *); -bool __bch_cut_back(const struct bkey *, struct bkey *); - -static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) -{ - BUG_ON(bkey_cmp(where, k) > 0); - return __bch_cut_front(where, k); -} - -static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) -{ - BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); - return __bch_cut_back(where, k); -} - -const char *bch_ptr_status(struct cache_set *, const struct bkey *); -bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); -bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); - -bool bch_ptr_bad(struct btree *, const struct bkey *); - -static inline uint8_t gen_after(uint8_t a, uint8_t b) -{ - uint8_t r = a - b; - return r > 128U ? 0 : r; -} - -static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, - unsigned i) -{ - return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); -} - -static inline bool ptr_available(struct cache_set *c, const struct bkey *k, - unsigned i) -{ - return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); -} - +int __bch_keylist_realloc(struct keylist *, unsigned); -typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); - -struct bkey *bch_btree_iter_next(struct btree_iter *); -struct bkey *bch_btree_iter_next_filter(struct btree_iter *, - struct btree *, ptr_filter_fn); - -void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); -struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, - struct bkey *, struct bset_tree *); - -/* 32 bits total: */ -#define BKEY_MID_BITS 3 -#define BKEY_EXPONENT_BITS 7 -#define BKEY_MANTISSA_BITS 22 -#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) - -struct bkey_float { - unsigned exponent:BKEY_EXPONENT_BITS; - unsigned m:BKEY_MID_BITS; - unsigned mantissa:BKEY_MANTISSA_BITS; -} __packed; - -/* - * BSET_CACHELINE was originally intended to match the hardware cacheline size - - * it used to be 64, but I realized the lookup code would touch slightly less - * memory if it was 128. - * - * It definites the number of bytes (in struct bset) per struct bkey_float in - * the auxiliar search tree - when we're done searching the bset_float tree we - * have this many bytes left that we do a linear search over. - * - * Since (after level 5) every level of the bset_tree is on a new cacheline, - * we're touching one fewer cacheline in the bset tree in exchange for one more - * cacheline in the linear search - but the linear search might stop before it - * gets to the second cacheline. - */ +/* Debug stuff */ -#define BSET_CACHELINE 128 -#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) +#ifdef CONFIG_BCACHE_DEBUG -#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) -#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) +int __bch_count_data(struct btree_keys *); +void __bch_check_keys(struct btree_keys *, const char *, ...); +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); +void bch_dump_bucket(struct btree_keys *); -void bch_bset_init_next(struct btree *); +#else -void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); -void bch_bset_fix_lookup_table(struct btree *, struct bkey *); +static inline int __bch_count_data(struct btree_keys *b) { return -1; } +static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} +static inline void bch_dump_bucket(struct btree_keys *b) {} +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); -struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, - const struct bkey *); +#endif -/* - * Returns the first key that is strictly greater than search - */ -static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, - const struct bkey *search) +static inline bool btree_keys_expensive_checks(struct btree_keys *b) { - return search ? __bch_bset_search(b, t, search) : t->data->start; +#ifdef CONFIG_BCACHE_DEBUG + return *b->expensive_debug_checks; +#else + return false; +#endif } -#define PRECEDING_KEY(_k) \ -({ \ - struct bkey *_ret = NULL; \ - \ - if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ - _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ - \ - if (!_ret->low) \ - _ret->high--; \ - _ret->low--; \ - } \ - \ - _ret; \ -}) - -bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); -void bch_btree_sort_lazy(struct btree *); -void bch_btree_sort_into(struct btree *, struct btree *); -void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); -void bch_btree_sort_partial(struct btree *, unsigned); - -static inline void bch_btree_sort(struct btree *b) +static inline int bch_count_data(struct btree_keys *b) { - bch_btree_sort_partial(b, 0); + return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; } -int bch_bset_print_stats(struct cache_set *, char *); +#define bch_check_keys(b, ...) \ +do { \ + if (btree_keys_expensive_checks(b)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) #endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 8e2573a009f..7347b610096 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -23,7 +23,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "writeback.h" +#include "extents.h" #include <linux/slab.h> #include <linux/bitops.h> @@ -68,15 +68,11 @@ * alloc_bucket() cannot fail. This should be true but is not completely * obvious. * - * Make sure all allocations get charged to the root cgroup - * * Plugging? * * If data write is less than hard sector size of ssd, round up offset in open * bucket to the next whole sector * - * Also lookup by cgroup in get_open_bucket() - * * Superblock needs to be fleshed out for multiple cache devices * * Add a sysfs tunable for the number of writeback IOs in flight @@ -89,13 +85,6 @@ * Test module load/unload */ -enum { - BTREE_INSERT_STATUS_INSERT, - BTREE_INSERT_STATUS_BACK_MERGE, - BTREE_INSERT_STATUS_OVERWROTE, - BTREE_INSERT_STATUS_FRONT_MERGE, -}; - #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 @@ -104,16 +93,6 @@ enum { #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -static struct workqueue_struct *btree_io_wq; - -static inline bool should_split(struct btree *b) -{ - struct bset *i = write_block(b); - return b->written >= btree_blocks(b) || - (b->written + __set_blocks(i, i->keys + 15, b->c) - > btree_blocks(b)); -} - #define insert_lock(s, b) ((b)->level <= (s)->lock) /* @@ -138,7 +117,7 @@ static inline bool should_split(struct btree *b) ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ if (!IS_ERR(_child)) { \ _child->parent = (b); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ @@ -167,20 +146,34 @@ static inline bool should_split(struct btree *b) _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ if (_r == -EINTR) \ schedule(); \ - bch_cannibalize_unlock(c); \ - if (_r == -ENOSPC) { \ - wait_event((c)->try_wait, \ - !(c)->try_harder); \ - _r = -EINTR; \ - } \ } while (_r == -EINTR); \ \ - finish_wait(&(c)->bucket_wait, &(op)->wait); \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ _r; \ }) +static inline struct bset *write_block(struct btree *b) +{ + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); +} + +static void bch_btree_init_next(struct btree *b) +{ + /* If not a leaf node, always sort */ + if (b->level && b->keys.nsets) + bch_btree_sort(&b->keys, &b->c->sort); + else + bch_btree_sort_lazy(&b->keys, &b->c->sort); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); + +} + /* Btree key manipulation */ void bkey_put(struct cache_set *c, struct bkey *k) @@ -197,16 +190,16 @@ void bkey_put(struct cache_set *c, struct bkey *k) static uint64_t btree_csum_set(struct btree *b, struct bset *i) { uint64_t crc = b->key.ptr[0]; - void *data = (void *) i + 8, *end = end(i); + void *data = (void *) i + 8, *end = bset_bkey_last(i); crc = bch_crc64_update(crc, data, end - data); return crc ^ 0xffffffffffffffffULL; } -static void bch_btree_node_read_done(struct btree *b) +void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; - struct bset *i = b->sets[0].data; + struct bset *i = btree_bset_first(b); struct btree_iter *iter; iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); @@ -214,21 +207,22 @@ static void bch_btree_node_read_done(struct btree *b) iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->b = &b->keys; #endif if (!i->seq) goto err; for (; - b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; + b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; i = write_block(b)) { err = "unsupported bset version"; if (i->version > BCACHE_BSET_VERSION) goto err; err = "bad btree header"; - if (b->written + set_blocks(i, b->c) > btree_blocks(b)) + if (b->written + set_blocks(i, block_bytes(b->c)) > + btree_blocks(b)) goto err; err = "bad magic"; @@ -248,31 +242,32 @@ static void bch_btree_node_read_done(struct btree *b) } err = "empty set"; - if (i != b->sets[0].data && !i->keys) + if (i != b->keys.set[0].data && !i->keys) goto err; - bch_btree_iter_push(iter, i->start, end(i)); + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); - b->written += set_blocks(i, b->c); + b->written += set_blocks(i, block_bytes(b->c)); } err = "corrupted btree"; for (i = write_block(b); - bset_sector_offset(b, i) < KEY_SIZE(&b->key); + bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); i = ((void *) i) + block_bytes(b->c)) - if (i->seq == b->sets[0].data->seq) + if (i->seq == b->keys.set[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(b, iter); + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); - i = b->sets[0].data; + i = b->keys.set[0].data; err = "short btree key"; - if (b->sets[0].size && - bkey_cmp(&b->key, &b->sets[0].end) < 0) + if (b->keys.set[0].size && + bkey_cmp(&b->key, &b->keys.set[0].end) < 0) goto err; if (b->written < btree_blocks(b)) - bch_bset_init_next(b); + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); out: mempool_free(iter, b->c->fill_iter); return; @@ -290,7 +285,7 @@ static void btree_node_read_endio(struct bio *bio, int error) closure_put(cl); } -void bch_btree_node_read(struct btree *b) +static void bch_btree_node_read(struct btree *b) { uint64_t start_time = local_clock(); struct closure cl; @@ -306,7 +301,7 @@ void bch_btree_node_read(struct btree *b) bio->bi_end_io = btree_node_read_endio; bio->bi_private = &cl; - bch_bio_map(bio, b->sets[0].data); + bch_bio_map(bio, b->keys.set[0].data); bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); @@ -360,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl) btree_complete_write(b, w); if (btree_node_dirty(b)) - queue_delayed_work(btree_io_wq, &b->work, - msecs_to_jiffies(30000)); + schedule_delayed_work(&b->work, 30 * HZ); closure_return_with_destructor(cl, btree_node_write_unlock); } @@ -393,7 +387,7 @@ static void btree_node_write_endio(struct bio *bio, int error) static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io; - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); BKEY_PADDED(key) k; i->version = BCACHE_BSET_VERSION; @@ -405,7 +399,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; - b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c); + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); bch_bio_map(b->bio, i); /* @@ -424,7 +418,8 @@ static void do_btree_node_write(struct btree *b) */ bkey_copy(&k.key, &b->key); - SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + + bset_sector_offset(&b->keys, i)); if (!bio_alloc_pages(b->bio, GFP_NOIO)) { int j; @@ -449,17 +444,19 @@ static void do_btree_node_write(struct btree *b) } } -void bch_btree_node_write(struct btree *b, struct closure *parent) +void __bch_btree_node_write(struct btree *b, struct closure *parent) { - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); + + lockdep_assert_held(&b->write_lock); trace_bcache_btree_write(b); BUG_ON(current->bio_list); BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); - BUG_ON(b->sets->data->seq != i->seq); - bch_check_keys(b, "writing"); + BUG_ON(btree_bset_first(b)->seq != i->seq); + bch_check_keys(&b->keys, "writing"); cancel_delayed_work(&b->work); @@ -472,14 +469,28 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) do_btree_node_write(b); - b->written += set_blocks(i, b->c); - atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, + atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); - bch_btree_sort_lazy(b); + b->written += set_blocks(i, block_bytes(b->c)); +} - if (b->written < btree_blocks(b)) - bch_bset_init_next(b); +void bch_btree_node_write(struct btree *b, struct closure *parent) +{ + unsigned nsets = b->keys.nsets; + + lockdep_assert_held(&b->lock); + + __bch_btree_node_write(b, parent); + + /* + * do verify if there was more than one set initially (i.e. we did a + * sort) and we sorted down to a single set: + */ + if (nsets && !b->keys.nsets) + bch_btree_verify(b); + + bch_btree_init_next(b); } static void bch_btree_node_write_sync(struct btree *b) @@ -487,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b) struct closure cl; closure_init_stack(&cl); + + mutex_lock(&b->write_lock); bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + closure_sync(&cl); } @@ -495,23 +510,24 @@ static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); - rw_lock(true, b, b->level); - + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - rw_unlock(true, b); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); } static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); struct btree_write *w = btree_current_write(b); + lockdep_assert_held(&b->write_lock); + BUG_ON(!b->written); BUG_ON(!i->keys); if (!btree_node_dirty(b)) - queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); + schedule_delayed_work(&b->work, 30 * HZ); set_btree_node_dirty(b); @@ -539,54 +555,19 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) * mca -> memory cache */ -static void mca_reinit(struct btree *b) -{ - unsigned i; - - b->flags = 0; - b->written = 0; - b->nsets = 0; - - for (i = 0; i < MAX_BSETS; i++) - b->sets[i].size = 0; - /* - * Second loop starts at 1 because b->sets[0]->data is the memory we - * allocated - */ - for (i = 1; i < MAX_BSETS; i++) - b->sets[i].data = NULL; -} - #define mca_reserve(c) (((c->root && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ - max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) + max_t(int, 0, c->btree_cache_used - mca_reserve(c)) static void mca_data_free(struct btree *b) { - struct bset_tree *t = b->sets; - BUG_ON(b->io_mutex.count != 1); - if (bset_prev_bytes(b) < PAGE_SIZE) - kfree(t->prev); - else - free_pages((unsigned long) t->prev, - get_order(bset_prev_bytes(b))); - - if (bset_tree_bytes(b) < PAGE_SIZE) - kfree(t->tree); - else - free_pages((unsigned long) t->tree, - get_order(bset_tree_bytes(b))); + bch_btree_keys_free(&b->keys); - free_pages((unsigned long) t->data, b->page_order); - - t->prev = NULL; - t->tree = NULL; - t->data = NULL; + b->c->btree_cache_used--; list_move(&b->list, &b->c->btree_cache_freed); - b->c->bucket_cache_used--; } static void mca_bucket_free(struct btree *b) @@ -605,34 +586,16 @@ static unsigned btree_order(struct bkey *k) static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) { - struct bset_tree *t = b->sets; - BUG_ON(t->data); - - b->page_order = max_t(unsigned, - ilog2(b->c->btree_pages), - btree_order(k)); - - t->data = (void *) __get_free_pages(gfp, b->page_order); - if (!t->data) - goto err; - - t->tree = bset_tree_bytes(b) < PAGE_SIZE - ? kmalloc(bset_tree_bytes(b), gfp) - : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); - if (!t->tree) - goto err; - - t->prev = bset_prev_bytes(b) < PAGE_SIZE - ? kmalloc(bset_prev_bytes(b), gfp) - : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); - if (!t->prev) - goto err; - - list_move(&b->list, &b->c->btree_cache); - b->c->bucket_cache_used++; - return; -err: - mca_data_free(b); + if (!bch_btree_keys_alloc(&b->keys, + max_t(unsigned, + ilog2(b->c->btree_pages), + btree_order(k)), + gfp)) { + b->c->btree_cache_used++; + list_move(&b->list, &b->c->btree_cache); + } else { + list_move(&b->list, &b->c->btree_cache_freed); + } } static struct btree *mca_bucket_alloc(struct cache_set *c, @@ -644,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); + mutex_init(&b->write_lock); + lockdep_set_novalidate_class(&b->write_lock); INIT_LIST_HEAD(&b->list); INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; @@ -663,9 +628,9 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) if (!down_write_trylock(&b->lock)) return -ENOMEM; - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); - if (b->page_order < min_order) + if (b->keys.page_order < min_order) goto out_unlock; if (!flush) { @@ -677,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) up(&b->io_mutex); } + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write_sync(b); + __bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); /* wait for any in flight btree write */ down(&b->io_mutex); @@ -701,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, if (c->shrinker_disabled) return SHRINK_STOP; - if (c->try_harder) + if (c->btree_cache_alloc_lock) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ @@ -733,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, } } - for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { + for (i = 0; (nr--) && i < c->btree_cache_used; i++) { if (list_empty(&c->btree_cache)) goto out; @@ -762,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, if (c->shrinker_disabled) return 0; - if (c->try_harder) + if (c->btree_cache_alloc_lock) return 0; return mca_can_free(c) * c->btree_pages; @@ -782,6 +751,8 @@ void bch_btree_cache_free(struct cache_set *c) #ifdef CONFIG_BCACHE_DEBUG if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); + + free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); #endif list_splice(&c->btree_cache_freeable, @@ -822,10 +793,13 @@ int bch_btree_cache_alloc(struct cache_set *c) #ifdef CONFIG_BCACHE_DEBUG mutex_init(&c->verify_lock); + c->verify_ondisk = (void *) + __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); if (c->verify_data && - c->verify_data->sets[0].data) + c->verify_data->keys.set->data) list_del_init(&c->verify_data->list); else c->verify_data = NULL; @@ -861,17 +835,30 @@ out: return b; } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) +static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) +{ + struct task_struct *old; + + old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); + if (old && old != current) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + return -EINTR; + } + + return 0; +} + +static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, + struct bkey *k) { struct btree *b; trace_bcache_btree_cache_cannibalize(c); - if (!c->try_harder) { - c->try_harder = current; - c->try_harder_start = local_clock(); - } else if (c->try_harder != current) - return ERR_PTR(-ENOSPC); + if (mca_cannibalize_lock(c, op)) + return ERR_PTR(-EINTR); list_for_each_entry_reverse(b, &c->btree_cache, list) if (!mca_reap(b, btree_order(k), false)) @@ -881,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) if (!mca_reap(b, btree_order(k), true)) return b; + WARN(1, "btree cache cannibalize failed\n"); return ERR_PTR(-ENOMEM); } @@ -892,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) */ static void bch_cannibalize_unlock(struct cache_set *c) { - if (c->try_harder == current) { - bch_time_stats_update(&c->try_harder_time, c->try_harder_start); - c->try_harder = NULL; - wake_up(&c->try_wait); + if (c->btree_cache_alloc_lock == current) { + c->btree_cache_alloc_lock = NULL; + wake_up(&c->btree_cache_wait); } } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) +static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level) { struct btree *b; @@ -923,7 +911,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) list_for_each_entry(b, &c->btree_cache_freed, list) if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); - if (!b->sets[0].data) + if (!b->keys.set[0].data) goto err; else goto out; @@ -934,7 +922,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) goto err; BUG_ON(!down_write_trylock(&b->lock)); - if (!b->sets->data) + if (!b->keys.set->data) goto err; out: BUG_ON(b->io_mutex.count != 1); @@ -945,17 +933,24 @@ out: hlist_add_head_rcu(&b->hash, mca_hash(c, k)); lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); - b->level = level; b->parent = (void *) ~0UL; + b->flags = 0; + b->written = 0; + b->level = level; - mca_reinit(b); + if (!b->level) + bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, + &b->c->expensive_debug_checks); + else + bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, + &b->c->expensive_debug_checks); return b; err: if (b) rw_unlock(true, b); - b = mca_cannibalize(c, k); + b = mca_cannibalize(c, op, k); if (!IS_ERR(b)) goto out; @@ -971,8 +966,8 @@ err: * The btree node will have either a read or a write lock held, depending on * level and op->lock. */ -struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, - int level, bool write) +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write) { int i = 0; struct btree *b; @@ -986,7 +981,7 @@ retry: return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level); + b = mca_alloc(c, op, k, level); mutex_unlock(&c->bucket_lock); if (!b) @@ -1009,13 +1004,13 @@ retry: b->accessed = 1; - for (; i <= b->nsets && b->sets[i].size; i++) { - prefetch(b->sets[i].tree); - prefetch(b->sets[i].data); + for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { + prefetch(b->keys.set[i].tree); + prefetch(b->keys.set[i].data); } - for (; i <= b->nsets; i++) - prefetch(b->sets[i].data); + for (; i <= b->keys.nsets; i++) + prefetch(b->keys.set[i].data); if (btree_node_io_error(b)) { rw_unlock(write, b); @@ -1032,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) struct btree *b; mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level); + b = mca_alloc(c, NULL, k, level); mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { @@ -1045,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) static void btree_node_free(struct btree *b) { - unsigned i; - trace_bcache_btree_node_free(b); BUG_ON(b == b->c->root); + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); + mutex_unlock(&b->write_lock); + cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); - - bch_inc_gen(PTR_CACHE(b->c, &b->key, i), - PTR_BUCKET(b->c, &b->key, i)); - } - bch_bucket_free(b->c, &b->key); mca_bucket_free(b); mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) +struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL)) goto err; bkey_put(c, &k.key); SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); - b = mca_alloc(c, &k.key, level); + b = mca_alloc(c, op, &k.key, level); if (IS_ERR(b)) goto err_free; @@ -1095,7 +1085,7 @@ retry: } b->accessed = 1; - bch_bset_init_next(b); + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); mutex_unlock(&c->bucket_lock); @@ -1110,11 +1100,16 @@ err: return b; } -static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) +static struct btree *btree_node_alloc_replacement(struct btree *b, + struct btree_op *op) { - struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); - if (!IS_ERR_OR_NULL(n)) - bch_btree_sort_into(b, n); + struct btree *n = bch_btree_node_alloc(b->c, op, b->level); + if (!IS_ERR_OR_NULL(n)) { + mutex_lock(&n->write_lock); + bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); + bkey_copy_key(&n->key, &b->key); + mutex_unlock(&n->write_lock); + } return n; } @@ -1123,43 +1118,47 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) { unsigned i; + mutex_lock(&b->c->bucket_lock); + + atomic_inc(&b->c->prio_blocked); + bkey_copy(k, &b->key); bkey_copy_key(k, &ZERO_KEY); - for (i = 0; i < KEY_PTRS(k); i++) { - uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; - - SET_PTR_GEN(k, i, g); - } + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_GEN(k, i, + bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + PTR_BUCKET(b->c, &b->key, i))); - atomic_inc(&b->c->prio_blocked); + mutex_unlock(&b->c->bucket_lock); } static int btree_check_reserve(struct btree *b, struct btree_op *op) { struct cache_set *c = b->c; struct cache *ca; - unsigned i, reserve = c->root->level * 2 + 1; - int ret = 0; + unsigned i, reserve = (c->root->level - b->level) * 2 + 1; mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { if (op) - prepare_to_wait(&c->bucket_wait, &op->wait, + prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); - ret = -EINTR; - break; + mutex_unlock(&c->bucket_lock); + return -EINTR; } mutex_unlock(&c->bucket_lock); - return ret; + + return mca_cannibalize_lock(b->c, op); } /* Garbage collection */ -uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) +static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, + struct bkey *k) { uint8_t stale = 0; unsigned i; @@ -1179,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) g = PTR_BUCKET(c, k, i); - if (gen_after(g->gc_gen, PTR_GEN(k, i))) - g->gc_gen = PTR_GEN(k, i); + if (gen_after(g->last_gc, PTR_GEN(k, i))) + g->last_gc = PTR_GEN(k, i); if (ptr_stale(c, k, i)) { stale = max(stale, ptr_stale(c, k, i)); @@ -1196,11 +1195,13 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) SET_GC_MARK(g, GC_MARK_METADATA); else if (KEY_DIRTY(k)) SET_GC_MARK(g, GC_MARK_DIRTY); + else if (!GC_MARK(g)) + SET_GC_MARK(g, GC_MARK_RECLAIMABLE); /* guard against overflow */ SET_GC_SECTORS_USED(g, min_t(unsigned, GC_SECTORS_USED(g) + KEY_SIZE(k), - (1 << 14) - 1)); + MAX_GC_SECTORS_USED)); BUG_ON(!GC_SECTORS_USED(g)); } @@ -1210,6 +1211,26 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + !ptr_stale(c, k, i)) { + struct bucket *b = PTR_BUCKET(c, k, i); + + b->gen = PTR_GEN(k, i); + + if (level && bkey_cmp(k, &ZERO_KEY)) + b->prio = BTREE_PRIO; + else if (!level && b->prio == BTREE_PRIO) + b->prio = INITIAL_PRIO; + } + + __bch_btree_mark_key(c, level, k); +} + static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; @@ -1220,11 +1241,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) gc->nodes++; - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { stale = max(stale, btree_mark_key(b, k)); keys++; - if (bch_ptr_bad(b, k)) + if (bch_ptr_bad(&b->keys, k)) continue; gc->key_bytes += bkey_u64s(k); @@ -1234,9 +1255,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) gc->data += KEY_SIZE(k); } - for (t = b->sets; t <= &b->sets[b->nsets]; t++) + for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) btree_bug_on(t->size && - bset_written(b, t) && + bset_written(&b->keys, t) && bkey_cmp(&b->key, &t->end) < 0, b, "found short btree key in gc"); @@ -1263,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *, struct keylist *, atomic_t *, struct bkey *); static int btree_gc_coalesce(struct btree *b, struct btree_op *op, - struct keylist *keylist, struct gc_stat *gc, - struct gc_merge_info *r) + struct gc_stat *gc, struct gc_merge_info *r) { unsigned i, nodes = 0, keys = 0, blocks; struct btree *new_nodes[GC_MERGE_NODES]; + struct keylist keylist; struct closure cl; struct bkey *k; + bch_keylist_init(&keylist); + + if (btree_check_reserve(b, NULL)) + return 0; + memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); @@ -1280,28 +1306,42 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || - __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) + __set_blocks(b->keys.set[0].data, keys, + block_bytes(b->c)) > blocks * (nodes - 1)) return 0; for (i = 0; i < nodes; i++) { - new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); + new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); if (IS_ERR_OR_NULL(new_nodes[i])) goto out_nocoalesce; } + /* + * We have to check the reserve here, after we've allocated our new + * nodes, to make sure the insert below will succeed - we also check + * before as an optimization to potentially avoid a bunch of expensive + * allocs/sorts + */ + if (btree_check_reserve(b, NULL)) + goto out_nocoalesce; + + for (i = 0; i < nodes; i++) + mutex_lock(&new_nodes[i]->write_lock); + for (i = nodes - 1; i > 0; --i) { - struct bset *n1 = new_nodes[i]->sets->data; - struct bset *n2 = new_nodes[i - 1]->sets->data; + struct bset *n1 = btree_bset_first(new_nodes[i]); + struct bset *n2 = btree_bset_first(new_nodes[i - 1]); struct bkey *k, *last = NULL; keys = 0; if (i > 1) { for (k = n2->start; - k < end(n2); + k < bset_bkey_last(n2); k = bkey_next(k)) { if (__set_blocks(n1, n1->keys + keys + - bkey_u64s(k), b->c) > blocks) + bkey_u64s(k), + block_bytes(b->c)) > blocks) break; last = k; @@ -1317,7 +1357,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, * though) */ if (__set_blocks(n1, n1->keys + n2->keys, - b->c) > btree_blocks(new_nodes[i])) + block_bytes(b->c)) > + btree_blocks(new_nodes[i])) goto out_nocoalesce; keys = n2->keys; @@ -1325,47 +1366,54 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, last = &r->b->key; } - BUG_ON(__set_blocks(n1, n1->keys + keys, - b->c) > btree_blocks(new_nodes[i])); + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > + btree_blocks(new_nodes[i])); if (last) bkey_copy_key(&new_nodes[i]->key, last); - memcpy(end(n1), + memcpy(bset_bkey_last(n1), n2->start, - (void *) node(n2, keys) - (void *) n2->start); + (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); n1->keys += keys; r[i].keys = n1->keys; memmove(n2->start, - node(n2, keys), - (void *) end(n2) - (void *) node(n2, keys)); + bset_bkey_idx(n2, keys), + (void *) bset_bkey_last(n2) - + (void *) bset_bkey_idx(n2, keys)); n2->keys -= keys; - if (bch_keylist_realloc(keylist, - KEY_PTRS(&new_nodes[i]->key), b->c)) + if (__bch_keylist_realloc(&keylist, + bkey_u64s(&new_nodes[i]->key))) goto out_nocoalesce; bch_btree_node_write(new_nodes[i], &cl); - bch_keylist_add(keylist, &new_nodes[i]->key); + bch_keylist_add(&keylist, &new_nodes[i]->key); } - for (i = 0; i < nodes; i++) { - if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) - goto out_nocoalesce; + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); - make_btree_freeing_key(r[i].b, keylist->top); - bch_keylist_push(keylist); - } + closure_sync(&cl); /* We emptied out this node */ - BUG_ON(new_nodes[0]->sets->data->keys); + BUG_ON(btree_bset_first(new_nodes[0])->keys); btree_node_free(new_nodes[0]); rw_unlock(true, new_nodes[0]); - closure_sync(&cl); + for (i = 0; i < nodes; i++) { + if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) + goto out_nocoalesce; + + make_btree_freeing_key(r[i].b, keylist.top); + bch_keylist_push(&keylist); + } + + bch_btree_insert_node(b, op, &keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keylist)); for (i = 0; i < nodes; i++) { btree_node_free(r[i].b); @@ -1374,22 +1422,22 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, r[i].b = new_nodes[i]; } - bch_btree_insert_node(b, op, keylist, NULL, NULL); - BUG_ON(!bch_keylist_empty(keylist)); - memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); r[nodes - 1].b = ERR_PTR(-EINTR); trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; + bch_keylist_free(&keylist); + /* Invalidated our iterator */ return -EINTR; out_nocoalesce: closure_sync(&cl); + bch_keylist_free(&keylist); - while ((k = bch_keylist_pop(keylist))) + while ((k = bch_keylist_pop(&keylist))) if (!bkey_cmp(k, &ZERO_KEY)) atomic_dec(&b->c->prio_blocked); @@ -1401,13 +1449,49 @@ out_nocoalesce: return 0; } +static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, + struct btree *replace) +{ + struct keylist keys; + struct btree *n; + + if (btree_check_reserve(b, NULL)) + return 0; + + n = btree_node_alloc_replacement(replace, NULL); + + /* recheck reserve after allocating replacement node */ + if (btree_check_reserve(b, NULL)) { + btree_node_free(n); + rw_unlock(true, n); + return 0; + } + + bch_btree_node_write_sync(n); + + bch_keylist_init(&keys); + bch_keylist_add(&keys, &n->key); + + make_btree_freeing_key(replace, keys.top); + bch_keylist_push(&keys); + + bch_btree_insert_node(b, op, &keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); + + btree_node_free(replace); + rw_unlock(true, n); + + /* Invalidated our iterator */ + return -EINTR; +} + static unsigned btree_gc_count_keys(struct btree *b) { struct bkey *k; struct btree_iter iter; unsigned ret = 0; - for_each_key_filter(b, k, &iter, bch_ptr_bad) + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); return ret; @@ -1416,26 +1500,23 @@ static unsigned btree_gc_count_keys(struct btree *b) static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { - unsigned i; int ret = 0; bool should_rewrite; - struct btree *n; struct bkey *k; - struct keylist keys; struct btree_iter iter; struct gc_merge_info r[GC_MERGE_NODES]; - struct gc_merge_info *last = r + GC_MERGE_NODES - 1; + struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - bch_keylist_init(&keys); - bch_btree_iter_init(b, &iter, &b->c->gc_done); + bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); - for (i = 0; i < GC_MERGE_NODES; i++) - r[i].b = ERR_PTR(-EINTR); + for (i = r; i < r + ARRAY_SIZE(r); i++) + i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { - r->b = bch_btree_node_get(b->c, k, b->level - 1, true); + r->b = bch_btree_node_get(b->c, op, k, b->level - 1, + true); if (IS_ERR(r->b)) { ret = PTR_ERR(r->b); break; @@ -1443,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, r->keys = btree_gc_count_keys(r->b); - ret = btree_gc_coalesce(b, op, &keys, gc, r); + ret = btree_gc_coalesce(b, op, gc, r); if (ret) break; } @@ -1453,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!IS_ERR(last->b)) { should_rewrite = btree_gc_mark_node(last->b, gc); - if (should_rewrite && - !btree_check_reserve(b, NULL)) { - n = btree_node_alloc_replacement(last->b, - false); - - if (!IS_ERR_OR_NULL(n)) { - bch_btree_node_write_sync(n); - bch_keylist_add(&keys, &n->key); - - make_btree_freeing_key(last->b, - keys.top); - bch_keylist_push(&keys); - - btree_node_free(last->b); - - bch_btree_insert_node(b, op, &keys, - NULL, NULL); - BUG_ON(!bch_keylist_empty(&keys)); - - rw_unlock(true, last->b); - last->b = n; - - /* Invalidated our iterator */ - ret = -EINTR; + if (should_rewrite) { + ret = btree_gc_rewrite_node(b, op, last->b); + if (ret) break; - } } if (last->b->level) { @@ -1493,8 +1552,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, * Must flush leaf nodes before gc ends, since replace * operations aren't journalled */ + mutex_lock(&last->b->write_lock); if (btree_node_dirty(last->b)) bch_btree_node_write(last->b, writes); + mutex_unlock(&last->b->write_lock); rw_unlock(true, last->b); } @@ -1507,15 +1568,15 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, } } - for (i = 0; i < GC_MERGE_NODES; i++) - if (!IS_ERR_OR_NULL(r[i].b)) { - if (btree_node_dirty(r[i].b)) - bch_btree_node_write(r[i].b, writes); - rw_unlock(true, r[i].b); + for (i = r; i < r + ARRAY_SIZE(r); i++) + if (!IS_ERR_OR_NULL(i->b)) { + mutex_lock(&i->b->write_lock); + if (btree_node_dirty(i->b)) + bch_btree_node_write(i->b, writes); + mutex_unlock(&i->b->write_lock); + rw_unlock(true, i->b); } - bch_keylist_free(&keys); - return ret; } @@ -1528,10 +1589,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, should_rewrite = btree_gc_mark_node(b, gc); if (should_rewrite) { - n = btree_node_alloc_replacement(b, false); + n = btree_node_alloc_replacement(b, NULL); if (!IS_ERR_OR_NULL(n)) { bch_btree_node_write_sync(n); + bch_btree_set_root(n); btree_node_free(b); rw_unlock(true, n); @@ -1540,6 +1602,8 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, } } + __bch_btree_mark_key(b->c, b->level + 1, &b->key); + if (b->level) { ret = btree_gc_recurse(b, op, writes, gc); if (ret) @@ -1567,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(b, ca) { - b->gc_gen = b->gen; + b->last_gc = b->gen; if (!atomic_read(&b->pin)) { - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); } } @@ -1577,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c) mutex_unlock(&c->bucket_lock); } -size_t bch_btree_gc_finish(struct cache_set *c) +static size_t bch_btree_gc_finish(struct cache_set *c) { size_t available = 0; struct bucket *b; @@ -1590,11 +1654,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) c->gc_mark_valid = 1; c->need_gc = 0; - if (c->root) - for (i = 0; i < KEY_PTRS(&c->root->key); i++) - SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), - GC_MARK_METADATA); - for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), GC_MARK_METADATA); @@ -1634,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c) SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); for_each_bucket(b, ca) { - b->last_gc = b->gc_gen; c->need_gc = max(c->need_gc, bucket_gc_gen(b)); - if (!atomic_read(&b->pin) && - GC_MARK(b) == GC_MARK_RECLAIMABLE) { + if (atomic_read(&b->pin)) + continue; + + BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + + if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) available++; - if (!GC_SECTORS_USED(b)) - bch_bucket_add_unused(ca, b); - } } } @@ -1734,313 +1793,113 @@ int bch_gc_thread_start(struct cache_set *c) /* Initial partial gc */ -static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, - unsigned long **seen) +static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; - unsigned i; struct bkey *k, *p = NULL; - struct bucket *g; struct btree_iter iter; - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { - for (i = 0; i < KEY_PTRS(k); i++) { - if (!ptr_available(b->c, k, i)) - continue; - - g = PTR_BUCKET(b->c, k, i); - - if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), - seen[PTR_DEV(k, i)]) || - !ptr_stale(b->c, k, i)) { - g->gen = PTR_GEN(k, i); + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(b->c, b->level, k); - if (b->level) - g->prio = BTREE_PRIO; - else if (g->prio == BTREE_PRIO) - g->prio = INITIAL_PRIO; - } - } - - btree_mark_key(b, k); - } + bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad); if (k) btree_node_prefetch(b->c, k, b->level - 1); if (p) - ret = btree(check_recurse, p, b, op, seen); + ret = btree(check_recurse, p, b, op); p = k; } while (p && !ret); } - return 0; + return ret; } int bch_btree_check(struct cache_set *c) { - int ret = -ENOMEM; - unsigned i; - unsigned long *seen[MAX_CACHES_PER_SET]; struct btree_op op; - memset(seen, 0, sizeof(seen)); bch_btree_op_init(&op, SHRT_MAX); - for (i = 0; c->cache[i]; i++) { - size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); - seen[i] = kmalloc(n, GFP_KERNEL); - if (!seen[i]) - goto err; - - /* Disables the seen array until prio_read() uses it too */ - memset(seen[i], 0xFF, n); - } - - ret = btree_root(check_recurse, c, &op, seen); -err: - for (i = 0; i < MAX_CACHES_PER_SET; i++) - kfree(seen[i]); - return ret; -} - -/* Btree insertion */ - -static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) -{ - struct bset *i = b->sets[b->nsets].data; - - memmove((uint64_t *) where + bkey_u64s(insert), - where, - (void *) end(i) - (void *) where); - - i->keys += bkey_u64s(insert); - bkey_copy(where, insert); - bch_bset_fix_lookup_table(b, where); + return btree_root(check_recurse, c, &op); } -static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, - struct btree_iter *iter, - struct bkey *replace_key) +void bch_initial_gc_finish(struct cache_set *c) { - void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) - { - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - offset, -sectors); - } - - uint64_t old_offset; - unsigned old_size, sectors_found = 0; - - while (1) { - struct bkey *k = bch_btree_iter_next(iter); - if (!k || - bkey_cmp(&START_KEY(k), insert) >= 0) - break; - - if (bkey_cmp(k, &START_KEY(insert)) <= 0) - continue; - - old_offset = KEY_START(k); - old_size = KEY_SIZE(k); - - /* - * We might overlap with 0 size extents; we can't skip these - * because if they're in the set we're inserting to we have to - * adjust them so they don't overlap with the key we're - * inserting. But we don't want to check them for replace - * operations. - */ - - if (replace_key && KEY_SIZE(k)) { - /* - * k might have been split since we inserted/found the - * key we're replacing - */ - unsigned i; - uint64_t offset = KEY_START(k) - - KEY_START(replace_key); - - /* But it must be a subset of the replace key */ - if (KEY_START(k) < KEY_START(replace_key) || - KEY_OFFSET(k) > KEY_OFFSET(replace_key)) - goto check_failed; - - /* We didn't find a key that we were supposed to */ - if (KEY_START(k) > KEY_START(insert) + sectors_found) - goto check_failed; - - if (KEY_PTRS(k) != KEY_PTRS(replace_key) || - KEY_DIRTY(k) != KEY_DIRTY(replace_key)) - goto check_failed; - - /* skip past gen */ - offset <<= 8; - - BUG_ON(!KEY_PTRS(replace_key)); - - for (i = 0; i < KEY_PTRS(replace_key); i++) - if (k->ptr[i] != replace_key->ptr[i] + offset) - goto check_failed; - - sectors_found = KEY_OFFSET(k) - KEY_START(insert); - } + struct cache *ca; + struct bucket *b; + unsigned i; - if (bkey_cmp(insert, k) < 0 && - bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { - /* - * We overlapped in the middle of an existing key: that - * means we have to split the old key. But we have to do - * slightly different things depending on whether the - * old key has been written out yet. - */ + bch_btree_gc_finish(c); - struct bkey *top; - - subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); - - if (bkey_written(b, k)) { - /* - * We insert a new key to cover the top of the - * old key, and the old key is modified in place - * to represent the bottom split. - * - * It's completely arbitrary whether the new key - * is the top or the bottom, but it has to match - * up with what btree_sort_fixup() does - it - * doesn't check for this kind of overlap, it - * depends on us inserting a new key for the top - * here. - */ - top = bch_bset_search(b, &b->sets[b->nsets], - insert); - shift_keys(b, top, k); - } else { - BKEY_PADDED(key) temp; - bkey_copy(&temp.key, k); - shift_keys(b, k, &temp.key); - top = bkey_next(k); - } + mutex_lock(&c->bucket_lock); - bch_cut_front(insert, top); - bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(b, k); - return false; - } + /* + * We need to put some unused buckets directly on the prio freelist in + * order to get the allocator thread started - it needs freed buckets in + * order to rewrite the prios and gens, and it needs to rewrite prios + * and gens in order to free buckets. + * + * This is only safe for buckets that have no live data in them, which + * there should always be some of. + */ + for_each_cache(ca, c, i) { + for_each_bucket(b, ca) { + if (fifo_full(&ca->free[RESERVE_PRIO])) + break; - if (bkey_cmp(insert, k) < 0) { - bch_cut_front(insert, k); - } else { - if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) - old_offset = KEY_START(insert); - - if (bkey_written(b, k) && - bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { - /* - * Completely overwrote, so we don't have to - * invalidate the binary search tree - */ - bch_cut_front(k, k); - } else { - __bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(b, k); + if (bch_can_invalidate_bucket(ca, b) && + !GC_MARK(b)) { + __bch_invalidate_one_bucket(ca, b); + fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets); } } - - subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); - } - -check_failed: - if (replace_key) { - if (!sectors_found) { - return true; - } else if (sectors_found < KEY_SIZE(insert)) { - SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - - (KEY_SIZE(insert) - sectors_found)); - SET_KEY_SIZE(insert, sectors_found); - } } - return false; + mutex_unlock(&c->bucket_lock); } -static bool btree_insert_key(struct btree *b, struct btree_op *op, - struct bkey *k, struct bkey *replace_key) +/* Btree insertion */ + +static bool btree_insert_key(struct btree *b, struct bkey *k, + struct bkey *replace_key) { - struct bset *i = b->sets[b->nsets].data; - struct bkey *m, *prev; - unsigned status = BTREE_INSERT_STATUS_INSERT; + unsigned status; BUG_ON(bkey_cmp(k, &b->key) > 0); - BUG_ON(b->level && !KEY_PTRS(k)); - BUG_ON(!b->level && !KEY_OFFSET(k)); - - if (!b->level) { - struct btree_iter iter; - - /* - * bset_search() returns the first key that is strictly greater - * than the search key - but for back merging, we want to find - * the previous key. - */ - prev = NULL; - m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); - - if (fix_overlapping_extents(b, k, &iter, replace_key)) { - op->insert_collision = true; - return false; - } - - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - KEY_START(k), KEY_SIZE(k)); - while (m != end(i) && - bkey_cmp(k, &START_KEY(m)) > 0) - prev = m, m = bkey_next(m); + status = bch_btree_insert_key(&b->keys, k, replace_key); + if (status != BTREE_INSERT_STATUS_NO_INSERT) { + bch_check_keys(&b->keys, "%u for %s", status, + replace_key ? "replace" : "insert"); - if (key_merging_disabled(b->c)) - goto insert; - - /* prev is in the tree, if we merge we're done */ - status = BTREE_INSERT_STATUS_BACK_MERGE; - if (prev && - bch_bkey_try_merge(b, prev, k)) - goto merged; - - status = BTREE_INSERT_STATUS_OVERWROTE; - if (m != end(i) && - KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) - goto copy; - - status = BTREE_INSERT_STATUS_FRONT_MERGE; - if (m != end(i) && - bch_bkey_try_merge(b, k, m)) - goto copy; - } else { - BUG_ON(replace_key); - m = bch_bset_search(b, &b->sets[b->nsets], k); - } - -insert: shift_keys(b, m, k); -copy: bkey_copy(m, k); -merged: - bch_check_keys(b, "%u for %s", status, - replace_key ? "replace" : "insert"); + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); + return true; + } else + return false; +} - if (b->level && !KEY_OFFSET(k)) - btree_current_write(b)->prio_blocked++; +static size_t insert_u64s_remaining(struct btree *b) +{ + long ret = bch_btree_keys_u64s_remaining(&b->keys); - trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); + /* + * Might land in the middle of an existing extent and have to split it + */ + if (b->keys.ops->is_extents) + ret -= KEY_MAX_U64S; - return true; + return max(ret, 0L); } static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, @@ -2048,21 +1907,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, struct bkey *replace_key) { bool ret = false; - int oldsize = bch_count_data(b); + int oldsize = bch_count_data(&b->keys); while (!bch_keylist_empty(insert_keys)) { - struct bset *i = write_block(b); struct bkey *k = insert_keys->keys; - if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) - > btree_blocks(b)) + if (bkey_u64s(k) > insert_u64s_remaining(b)) break; if (bkey_cmp(k, &b->key) <= 0) { if (!b->level) bkey_put(b->c, k); - ret |= btree_insert_key(b, op, k, replace_key); + ret |= btree_insert_key(b, k, replace_key); bch_keylist_pop_front(insert_keys); } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { BKEY_PADDED(key) temp; @@ -2071,16 +1928,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, bch_cut_back(&b->key, &temp.key); bch_cut_front(&b->key, insert_keys->keys); - ret |= btree_insert_key(b, op, &temp.key, replace_key); + ret |= btree_insert_key(b, &temp.key, replace_key); break; } else { break; } } + if (!ret) + op->insert_collision = true; + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); - BUG_ON(bch_count_data(b) < oldsize); + BUG_ON(bch_count_data(&b->keys) < oldsize); return ret; } @@ -2097,31 +1957,38 @@ static int btree_split(struct btree *b, struct btree_op *op, closure_init_stack(&cl); bch_keylist_init(&parent_keys); - if (!b->level && - btree_check_reserve(b, op)) - return -EINTR; + if (btree_check_reserve(b, op)) { + if (!b->level) + return -EINTR; + else + WARN(1, "insufficient reserve for split\n"); + } - n1 = btree_node_alloc_replacement(b, true); + n1 = btree_node_alloc_replacement(b, op); if (IS_ERR(n1)) goto err; - split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; + split = set_blocks(btree_bset_first(n1), + block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; if (split) { unsigned keys = 0; - trace_bcache_btree_node_split(b, n1->sets[0].data->keys); + trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); - n2 = bch_btree_node_alloc(b->c, b->level, true); + n2 = bch_btree_node_alloc(b->c, op, b->level); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { - n3 = bch_btree_node_alloc(b->c, b->level + 1, true); + n3 = bch_btree_node_alloc(b->c, op, b->level + 1); if (IS_ERR(n3)) goto err_free2; } + mutex_lock(&n1->write_lock); + mutex_lock(&n2->write_lock); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); /* @@ -2129,62 +1996,64 @@ static int btree_split(struct btree *b, struct btree_op *op, * search tree yet */ - while (keys < (n1->sets[0].data->keys * 3) / 5) - keys += bkey_u64s(node(n1->sets[0].data, keys)); + while (keys < (btree_bset_first(n1)->keys * 3) / 5) + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), + keys)); - bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); - keys += bkey_u64s(node(n1->sets[0].data, keys)); + bkey_copy_key(&n1->key, + bset_bkey_idx(btree_bset_first(n1), keys)); + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); - n2->sets[0].data->keys = n1->sets[0].data->keys - keys; - n1->sets[0].data->keys = keys; + btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; + btree_bset_first(n1)->keys = keys; - memcpy(n2->sets[0].data->start, - end(n1->sets[0].data), - n2->sets[0].data->keys * sizeof(uint64_t)); + memcpy(btree_bset_first(n2)->start, + bset_bkey_last(btree_bset_first(n1)), + btree_bset_first(n2)->keys * sizeof(uint64_t)); bkey_copy_key(&n2->key, &b->key); bch_keylist_add(&parent_keys, &n2->key); bch_btree_node_write(n2, &cl); + mutex_unlock(&n2->write_lock); rw_unlock(true, n2); } else { - trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); + trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); + mutex_lock(&n1->write_lock); bch_btree_insert_keys(n1, op, insert_keys, replace_key); } bch_keylist_add(&parent_keys, &n1->key); bch_btree_node_write(n1, &cl); + mutex_unlock(&n1->write_lock); if (n3) { /* Depth increases, make a new root */ + mutex_lock(&n3->write_lock); bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op, &parent_keys, NULL); bch_btree_node_write(n3, &cl); + mutex_unlock(&n3->write_lock); closure_sync(&cl); bch_btree_set_root(n3); rw_unlock(true, n3); - - btree_node_free(b); } else if (!b->parent) { /* Root filled up but didn't need to be split */ closure_sync(&cl); bch_btree_set_root(n1); - - btree_node_free(b); } else { /* Split a non root node */ closure_sync(&cl); make_btree_freeing_key(b, parent_keys.top); bch_keylist_push(&parent_keys); - btree_node_free(b); - bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); BUG_ON(!bch_keylist_empty(&parent_keys)); } + btree_node_free(b); rw_unlock(true, n1); bch_time_stats_update(&b->c->btree_split_time, start_time); @@ -2199,7 +2068,7 @@ err_free1: btree_node_free(n1); rw_unlock(true, n1); err: - WARN(1, "bcache: btree split failed"); + WARN(1, "bcache: btree split failed (level %u)", b->level); if (n3 == ERR_PTR(-EAGAIN) || n2 == ERR_PTR(-EAGAIN) || @@ -2214,31 +2083,54 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, atomic_t *journal_ref, struct bkey *replace_key) { + struct closure cl; + BUG_ON(b->level && replace_key); - if (should_split(b)) { - if (current->bio_list) { - op->lock = b->c->root->level + 1; - return -EAGAIN; - } else if (op->lock <= b->c->root->level) { - op->lock = b->c->root->level + 1; - return -EINTR; - } else { - /* Invalidated all iterators */ - return btree_split(b, op, insert_keys, replace_key) ?: - -EINTR; - } - } else { - BUG_ON(write_block(b) != b->sets[b->nsets].data); + closure_init_stack(&cl); - if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { - if (!b->level) - bch_btree_leaf_dirty(b, journal_ref); - else - bch_btree_node_write_sync(b); - } + mutex_lock(&b->write_lock); - return 0; + if (write_block(b) != btree_bset_last(b) && + b->keys.last_set_unwritten) + bch_btree_init_next(b); /* just wrote a set */ + + if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { + mutex_unlock(&b->write_lock); + goto split; + } + + BUG_ON(write_block(b) != btree_bset_last(b)); + + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { + if (!b->level) + bch_btree_leaf_dirty(b, journal_ref); + else + bch_btree_node_write(b, &cl); + } + + mutex_unlock(&b->write_lock); + + /* wait for btree node write if necessary, after unlock */ + closure_sync(&cl); + + return 0; +split: + if (current->bio_list) { + op->lock = b->c->root->level + 1; + return -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + return -EINTR; + } else { + /* Invalidated all iterators */ + int ret = btree_split(b, op, insert_keys, replace_key); + + if (bch_keylist_empty(insert_keys)) + return 0; + else if (!ret) + return -EINTR; + return ret; } } @@ -2368,9 +2260,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, struct bkey *k; struct btree_iter iter; - bch_btree_iter_init(b, &iter, from); + bch_btree_iter_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, b, + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2401,9 +2293,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, struct bkey *k; struct btree_iter iter; - bch_btree_iter_init(b, &iter, from); + bch_btree_iter_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : btree(map_keys_recurse, k, b, op, from, fn, flags); @@ -2624,18 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf) spin_lock_init(&buf->lock); array_allocator_init(&buf->freelist); } - -void bch_btree_exit(void) -{ - if (btree_io_wq) - destroy_workqueue(btree_io_wq); -} - -int __init bch_btree_init(void) -{ - btree_io_wq = create_singlethread_workqueue("bch_btree_io"); - if (!btree_io_wq) - return -ENOMEM; - - return 0; -} diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 12c99b1a764..91dfa5e6968 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -127,20 +127,13 @@ struct btree { struct cache_set *c; struct btree *parent; + struct mutex write_lock; + unsigned long flags; uint16_t written; /* would be nice to kill */ uint8_t level; - uint8_t nsets; - uint8_t page_order; - - /* - * Set of sorted keys - the real btree node - plus a binary search tree - * - * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point - * to the memory we have allocated for this btree node. Additionally, - * set[0]->data points to the entire btree node as it exists on disk. - */ - struct bset_tree sets[MAX_BSETS]; + + struct btree_keys keys; /* For outstanding btree writes, used as a lock - protects write_idx */ struct closure io; @@ -180,44 +173,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b) return b->writes + (btree_node_write_idx(b) ^ 1); } -static inline unsigned bset_offset(struct btree *b, struct bset *i) -{ - return (((size_t) i) - ((size_t) b->sets->data)) >> 9; -} - static inline struct bset *btree_bset_first(struct btree *b) { - return b->sets->data; + return b->keys.set->data; } -static inline unsigned bset_byte_offset(struct btree *b, struct bset *i) +static inline struct bset *btree_bset_last(struct btree *b) { - return ((size_t) i) - ((size_t) b->sets->data); -} - -static inline unsigned bset_sector_offset(struct btree *b, struct bset *i) -{ - return (((void *) i) - ((void *) btree_bset_first(b))) >> 9; + return bset_tree_last(&b->keys)->data; } static inline unsigned bset_block_offset(struct btree *b, struct bset *i) { - return bset_sector_offset(b, i) >> b->c->block_bits; -} - -static inline struct bset *write_block(struct btree *b) -{ - return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); -} - -static inline bool bset_written(struct btree *b, struct bset_tree *t) -{ - return t->data < write_block(b); -} - -static inline bool bkey_written(struct btree *b, struct bkey *k) -{ - return k < write_block(b)->start; + return bset_sector_offset(&b->keys, i) >> b->c->block_bits; } static inline void set_gc_sectors(struct cache_set *c) @@ -225,21 +193,6 @@ static inline void set_gc_sectors(struct cache_set *c) atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); } -static inline struct bkey *bch_btree_iter_init(struct btree *b, - struct btree_iter *iter, - struct bkey *search) -{ - return __bch_btree_iter_init(b, iter, search, b->sets); -} - -static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) -{ - if (b->level) - return bch_btree_ptr_invalid(b->c, k); - else - return bch_extent_ptr_invalid(b->c, k); -} - void bkey_put(struct cache_set *c, struct bkey *k); /* Looping macros */ @@ -250,14 +203,6 @@ void bkey_put(struct cache_set *c, struct bkey *k); iter++) \ hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), b, filter));) - -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) - /* Recursing down the btree */ struct btree_op { @@ -292,12 +237,14 @@ static inline void rw_unlock(bool w, struct btree *b) (w ? up_write : up_read)(&b->lock); } -void bch_btree_node_read(struct btree *); +void bch_btree_node_read_done(struct btree *); +void __bch_btree_node_write(struct btree *, struct closure *); void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); -struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); +struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int); +struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, + struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); @@ -305,10 +252,10 @@ int bch_btree_insert(struct cache_set *, struct keylist *, atomic_t *, struct bkey *); int bch_gc_thread_start(struct cache_set *); -size_t bch_btree_gc_finish(struct cache_set *); +void bch_initial_gc_finish(struct cache_set *); void bch_moving_gc(struct cache_set *); int bch_btree_check(struct cache_set *); -uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); +void bch_initial_mark_key(struct cache_set *, int, struct bkey *); static inline void wake_up_gc(struct cache_set *c) { diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 7ef7461912b..a08e3eeac3c 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -243,7 +243,7 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, cl->fn = fn; cl->wq = wq; /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } static inline void closure_queue(struct closure *cl) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 473e8d5a7fe..8b1f1d5c181 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -8,6 +8,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include <linux/console.h> #include <linux/debugfs.h> @@ -17,147 +18,82 @@ static struct dentry *debug; -const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); - size_t bucket = PTR_BUCKET_NR(c, k, i); - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - - if (KEY_SIZE(k) + r > c->sb.bucket_size) - return "bad, length too big"; - if (bucket < ca->sb.first_bucket) - return "bad, short offset"; - if (bucket >= ca->sb.nbuckets) - return "bad, offset past end of device"; - if (ptr_stale(c, k, i)) - return "stale"; - } - - if (!bkey_cmp(k, &ZERO_KEY)) - return "bad, null key"; - if (!KEY_PTRS(k)) - return "bad, no pointers"; - if (!KEY_SIZE(k)) - return "zeroed key"; - return ""; -} - -int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) -{ - unsigned i = 0; - char *out = buf, *end = buf + size; - -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - - p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); - - if (KEY_PTRS(k)) - while (1) { - p("%llu:%llu gen %llu", - PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); - - if (++i == KEY_PTRS(k)) - break; - - p(", "); - } - - p("]"); - - if (KEY_DIRTY(k)) - p(" dirty"); - if (KEY_CSUM(k)) - p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); -#undef p - return out - buf; -} - #ifdef CONFIG_BCACHE_DEBUG -static void dump_bset(struct btree *b, struct bset *i) -{ - struct bkey *k, *next; - unsigned j; - char buf[80]; - - for (k = i->start; k < end(i); k = next) { - next = bkey_next(k); - - bch_bkey_to_text(buf, sizeof(buf), k); - printk(KERN_ERR "block %u key %zi/%u: %s", bset_block_offset(b, i), - (uint64_t *) k - i->d, i->keys, buf); - - for (j = 0; j < KEY_PTRS(k); j++) { - size_t n = PTR_BUCKET_NR(b->c, k, j); - printk(" bucket %zu", n); - - if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) - printk(" prio %i", - PTR_BUCKET(b->c, k, j)->prio); - } - - printk(" %s\n", bch_ptr_status(b->c, k)); - - if (next < end(i) && - bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) - printk(KERN_ERR "Key skipped backwards\n"); - } -} - -static void bch_dump_bucket(struct btree *b) -{ - unsigned i; - - console_lock(); - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); - console_unlock(); -} +#define for_each_written_bset(b, start, i) \ + for (i = (start); \ + (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ + i->seq == (start)->seq; \ + i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ + block_bytes(b->c)) -void bch_btree_verify(struct btree *b, struct bset *new) +void bch_btree_verify(struct btree *b) { struct btree *v = b->c->verify_data; - struct closure cl; - closure_init_stack(&cl); + struct bset *ondisk, *sorted, *inmemory; + struct bio *bio; - if (!b->c->verify) + if (!b->c->verify || !b->c->verify_ondisk) return; down(&b->io_mutex); mutex_lock(&b->c->verify_lock); + ondisk = b->c->verify_ondisk; + sorted = b->c->verify_data->keys.set->data; + inmemory = b->keys.set->data; + bkey_copy(&v->key, &b->key); v->written = 0; v->level = b->level; + v->keys.ops = b->keys.ops; + + bio = bch_bbio_alloc(b->c); + bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; + bch_bio_map(bio, sorted); - bch_btree_node_read(v); + submit_bio_wait(REQ_META|READ_SYNC, bio); + bch_bbio_free(bio, b->c); - if (new->keys != v->sets[0].data->keys || - memcmp(new->start, - v->sets[0].data->start, - (void *) end(new) - (void *) new->start)) { - unsigned i, j; + memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); + + bch_btree_node_read_done(v); + sorted = v->keys.set->data; + + if (inmemory->keys != sorted->keys || + memcmp(inmemory->start, + sorted->start, + (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { + struct bset *i; + unsigned j; console_lock(); - printk(KERN_ERR "*** original memory node:\n"); - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); + printk(KERN_ERR "*** in memory:\n"); + bch_dump_bset(&b->keys, inmemory, 0); - printk(KERN_ERR "*** sorted memory node:\n"); - dump_bset(b, new); + printk(KERN_ERR "*** read back in:\n"); + bch_dump_bset(&v->keys, sorted, 0); - printk(KERN_ERR "*** on disk node:\n"); - dump_bset(v, v->sets[0].data); + for_each_written_bset(b, ondisk, i) { + unsigned block = ((void *) i - (void *) ondisk) / + block_bytes(b->c); + + printk(KERN_ERR "*** on disk block %u:\n", block); + bch_dump_bset(&b->keys, i, block); + } - for (j = 0; j < new->keys; j++) - if (new->d[j] != v->sets[0].data->d[j]) + printk(KERN_ERR "*** block %zu not written\n", + ((void *) i - (void *) ondisk) / block_bytes(b->c)); + + for (j = 0; j < inmemory->keys; j++) + if (inmemory->d[j] != sorted->d[j]) break; + printk(KERN_ERR "b->written %u\n", b->written); + console_unlock(); panic("verify failed at %u\n", j); } @@ -204,74 +140,6 @@ out_put: bio_put(check); } -int __bch_count_data(struct btree *b) -{ - unsigned ret = 0; - struct btree_iter iter; - struct bkey *k; - - if (!b->level) - for_each_key(b, k, &iter) - ret += KEY_SIZE(k); - return ret; -} - -void __bch_check_keys(struct btree *b, const char *fmt, ...) -{ - va_list args; - struct bkey *k, *p = NULL; - struct btree_iter iter; - const char *err; - - for_each_key(b, k, &iter) { - if (!b->level) { - err = "Keys out of order"; - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) - goto bug; - - if (bch_ptr_invalid(b, k)) - continue; - - err = "Overlapping keys"; - if (p && bkey_cmp(p, &START_KEY(k)) > 0) - goto bug; - } else { - if (bch_ptr_bad(b, k)) - continue; - - err = "Duplicate keys"; - if (p && !bkey_cmp(p, k)) - goto bug; - } - p = k; - } - - err = "Key larger than btree node key"; - if (p && bkey_cmp(p, &b->key) > 0) - goto bug; - - return; -bug: - bch_dump_bucket(b); - - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - - panic("bcache error: %s:\n", err); -} - -void bch_btree_iter_next_check(struct btree_iter *iter) -{ - struct bkey *k = iter->data->k, *next = bkey_next(k); - - if (next < iter->data->end && - bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { - bch_dump_bucket(iter->b); - panic("Key skipped backwards\n"); - } -} - #endif #ifdef CONFIG_DEBUG_FS @@ -318,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (!w) break; - bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); + bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); bch_keybuf_del(&i->keys, w); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 2ede60e3187..1f63c195d24 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -1,47 +1,30 @@ #ifndef _BCACHE_DEBUG_H #define _BCACHE_DEBUG_H -/* Btree/bkey debug printing */ - -int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); +struct bio; +struct cached_dev; +struct cache_set; #ifdef CONFIG_BCACHE_DEBUG -void bch_btree_verify(struct btree *, struct bset *); +void bch_btree_verify(struct btree *); void bch_data_verify(struct cached_dev *, struct bio *); -int __bch_count_data(struct btree *); -void __bch_check_keys(struct btree *, const char *, ...); -void bch_btree_iter_next_check(struct btree_iter *); -#define EBUG_ON(cond) BUG_ON(cond) #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) #define bypass_torture_test(d) ((d)->bypass_torture_test) #else /* DEBUG */ -static inline void bch_btree_verify(struct btree *b, struct bset *i) {} +static inline void bch_btree_verify(struct btree *b) {} static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} -static inline int __bch_count_data(struct btree *b) { return -1; } -static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} -static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} -#define EBUG_ON(cond) do { if (cond); } while (0) #define expensive_debug_checks(c) 0 #define key_merging_disabled(c) 0 #define bypass_torture_test(d) 0 #endif -#define bch_count_data(b) \ - (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) - -#define bch_check_keys(b, ...) \ -do { \ - if (expensive_debug_checks((b)->c)) \ - __bch_check_keys(b, __VA_ARGS__); \ -} while (0) - #ifdef CONFIG_DEBUG_FS void bch_debug_init_cache_set(struct cache_set *); #else diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c new file mode 100644 index 00000000000..3a0de4cf977 --- /dev/null +++ b/drivers/md/bcache/extents.c @@ -0,0 +1,620 @@ +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/bcache.txt. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" +#include "writeback.h" + +static void sort_key_next(struct btree_iter *iter, + struct btree_iter_set *i) +{ + i->k = bkey_next(i->k); + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(l.k, r.k); + + return c ? c > 0 : l.k < r.k; +} + +static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size || + bucket < ca->sb.first_bucket || + bucket >= ca->sb.nbuckets) + return true; + } + + return false; +} + +/* Common among btree and extent ptrs */ + +static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size) + return "bad, length too big"; + if (bucket < ca->sb.first_bucket) + return "bad, short offset"; + if (bucket >= ca->sb.nbuckets) + return "bad, offset past end of device"; + if (ptr_stale(c, k, i)) + return "stale"; + } + + if (!bkey_cmp(k, &ZERO_KEY)) + return "bad, null key"; + if (!KEY_PTRS(k)) + return "bad, no pointers"; + if (!KEY_SIZE(k)) + return "zeroed key"; + return ""; +} + +void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) +{ + unsigned i = 0; + char *out = buf, *end = buf + size; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + + for (i = 0; i < KEY_PTRS(k); i++) { + if (i) + p(", "); + + if (PTR_DEV(k, i) == PTR_CHECK_DEV) + p("check dev"); + else + p("%llu:%llu gen %llu", PTR_DEV(k, i), + PTR_OFFSET(k, i), PTR_GEN(k, i)); + } + + p("]"); + + if (KEY_DIRTY(k)) + p(" dirty"); + if (KEY_CSUM(k)) + p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); +#undef p +} + +static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) +{ + struct btree *b = container_of(keys, struct btree, keys); + unsigned j; + char buf[80]; + + bch_extent_to_text(buf, sizeof(buf), k); + printk(" %s", buf); + + for (j = 0; j < KEY_PTRS(k); j++) { + size_t n = PTR_BUCKET_NR(b->c, k, j); + printk(" bucket %zu", n); + + if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) + printk(" prio %i", + PTR_BUCKET(b->c, k, j)->prio); + } + + printk(" %s\n", bch_ptr_status(b->c, k)); +} + +/* Btree ptrs */ + +bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + return __bch_btree_ptr_invalid(b->c, k); +} + +static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) +{ + unsigned i; + char buf[80]; + struct bucket *g; + + if (mutex_trylock(&b->c->bucket_lock)) { + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(b->c, k, i)) { + g = PTR_BUCKET(b->c, k, i); + + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + } + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu", + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g)); + return true; +} + +static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned i; + + if (!bkey_cmp(k, &ZERO_KEY) || + !KEY_PTRS(k) || + bch_ptr_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i) || + ptr_stale(b->c, k, i)) + return true; + + if (expensive_debug_checks(b->c) && + btree_ptr_bad_expensive(b, k)) + return true; + + return false; +} + +static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct btree *b = container_of(bk, struct btree, keys); + + if (!KEY_OFFSET(insert)) + btree_current_write(b)->prio_blocked++; + + return false; +} + +const struct btree_keys_ops bch_btree_keys_ops = { + .sort_cmp = bch_key_sort_cmp, + .insert_fixup = bch_btree_ptr_insert_fixup, + .key_invalid = bch_btree_ptr_invalid, + .key_bad = bch_btree_ptr_bad, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, +}; + +/* Extents */ + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + + return c ? c > 0 : l.k < r.k; +} + +static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, + struct bkey *tmp) +{ + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) + i++; + + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) + break; + + if (!KEY_SIZE(i->k)) { + sort_key_next(iter, i); + heap_sift(iter, i - top, bch_extent_sort_cmp); + continue; + } + + if (top->k > i->k) { + if (bkey_cmp(top->k, i->k) >= 0) + sort_key_next(iter, i); + else + bch_cut_front(top->k, i->k); + + heap_sift(iter, i - top, bch_extent_sort_cmp); + } else { + /* can't happen because of comparison func */ + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); + + if (bkey_cmp(i->k, top->k) < 0) { + bkey_copy(tmp, top->k); + + bch_cut_back(&START_KEY(i->k), tmp); + bch_cut_front(i->k, top->k); + heap_sift(iter, 0, bch_extent_sort_cmp); + + return tmp; + } else { + bch_cut_back(&START_KEY(i->k), top->k); + } + } + } + + return NULL; +} + +static void bch_subtract_dirty(struct bkey *k, + struct cache_set *c, + uint64_t offset, + int sectors) +{ + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(k), + offset, -sectors); +} + +static bool bch_extent_insert_fixup(struct btree_keys *b, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct cache_set *c = container_of(b, struct btree, keys)->c; + + uint64_t old_offset; + unsigned old_size, sectors_found = 0; + + BUG_ON(!KEY_OFFSET(insert)); + BUG_ON(!KEY_SIZE(insert)); + + while (1) { + struct bkey *k = bch_btree_iter_next(iter); + if (!k) + break; + + if (bkey_cmp(&START_KEY(k), insert) >= 0) { + if (KEY_SIZE(k)) + break; + else + continue; + } + + if (bkey_cmp(k, &START_KEY(insert)) <= 0) + continue; + + old_offset = KEY_START(k); + old_size = KEY_SIZE(k); + + /* + * We might overlap with 0 size extents; we can't skip these + * because if they're in the set we're inserting to we have to + * adjust them so they don't overlap with the key we're + * inserting. But we don't want to check them for replace + * operations. + */ + + if (replace_key && KEY_SIZE(k)) { + /* + * k might have been split since we inserted/found the + * key we're replacing + */ + unsigned i; + uint64_t offset = KEY_START(k) - + KEY_START(replace_key); + + /* But it must be a subset of the replace key */ + if (KEY_START(k) < KEY_START(replace_key) || + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) + goto check_failed; + + /* We didn't find a key that we were supposed to */ + if (KEY_START(k) > KEY_START(insert) + sectors_found) + goto check_failed; + + if (!bch_bkey_equal_header(k, replace_key)) + goto check_failed; + + /* skip past gen */ + offset <<= 8; + + BUG_ON(!KEY_PTRS(replace_key)); + + for (i = 0; i < KEY_PTRS(replace_key); i++) + if (k->ptr[i] != replace_key->ptr[i] + offset) + goto check_failed; + + sectors_found = KEY_OFFSET(k) - KEY_START(insert); + } + + if (bkey_cmp(insert, k) < 0 && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { + /* + * We overlapped in the middle of an existing key: that + * means we have to split the old key. But we have to do + * slightly different things depending on whether the + * old key has been written out yet. + */ + + struct bkey *top; + + bch_subtract_dirty(k, c, KEY_START(insert), + KEY_SIZE(insert)); + + if (bkey_written(b, k)) { + /* + * We insert a new key to cover the top of the + * old key, and the old key is modified in place + * to represent the bottom split. + * + * It's completely arbitrary whether the new key + * is the top or the bottom, but it has to match + * up with what btree_sort_fixup() does - it + * doesn't check for this kind of overlap, it + * depends on us inserting a new key for the top + * here. + */ + top = bch_bset_search(b, bset_tree_last(b), + insert); + bch_bset_insert(b, top, k); + } else { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, k); + bch_bset_insert(b, k, &temp.key); + top = bkey_next(k); + } + + bch_cut_front(insert, top); + bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + goto out; + } + + if (bkey_cmp(insert, k) < 0) { + bch_cut_front(insert, k); + } else { + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) + old_offset = KEY_START(insert); + + if (bkey_written(b, k) && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { + /* + * Completely overwrote, so we don't have to + * invalidate the binary search tree + */ + bch_cut_front(k, k); + } else { + __bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + } + } + + bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k)); + } + +check_failed: + if (replace_key) { + if (!sectors_found) { + return true; + } else if (sectors_found < KEY_SIZE(insert)) { + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - + (KEY_SIZE(insert) - sectors_found)); + SET_KEY_SIZE(insert, sectors_found); + } + } +out: + if (KEY_DIRTY(insert)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), + KEY_START(insert), + KEY_SIZE(insert)); + + return false; +} + +static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + char buf[80]; + + if (!KEY_SIZE(k)) + return true; + + if (KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (__ptr_invalid(b->c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); + return true; +} + +static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, + unsigned ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->c->gc_mark_valid && + (!GC_MARK(g) || + GC_MARK(g) == GC_MARK_METADATA || + (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k)))) + goto err; + + if (g->prio == BTREE_PRIO) + goto err; + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g)); + return true; +} + +static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + struct bucket *g; + unsigned i, stale; + + if (!KEY_PTRS(k) || + bch_extent_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i)) + return true; + + if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) + return false; + + for (i = 0; i < KEY_PTRS(k); i++) { + g = PTR_BUCKET(b->c, k, i); + stale = ptr_stale(b->c, k, i); + + btree_bug_on(stale > 96, b, + "key too stale: %i, need_gc %u", + stale, b->c->need_gc); + + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), + b, "stale dirty pointer"); + + if (stale) + return true; + + if (expensive_debug_checks(b->c) && + bch_extent_bad_expensive(b, k, i)) + return true; + } + + return false; +} + +static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +{ + return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & + ~((uint64_t)1 << 63); +} + +static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned i; + + if (key_merging_disabled(b->c)) + return false; + + for (i = 0; i < KEY_PTRS(l); i++) + if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) + return false; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { + SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); + SET_KEY_SIZE(l, USHRT_MAX); + + bch_cut_front(l, r); + return false; + } + + if (KEY_CSUM(l)) { + if (KEY_CSUM(r)) + l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); + else + SET_KEY_CSUM(l, 0); + } + + SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); + SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + + return true; +} + +const struct btree_keys_ops bch_extent_keys_ops = { + .sort_cmp = bch_extent_sort_cmp, + .sort_fixup = bch_extent_sort_fixup, + .insert_fixup = bch_extent_insert_fixup, + .key_invalid = bch_extent_invalid, + .key_bad = bch_extent_bad, + .key_merge = bch_extent_merge, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, + .is_extents = true, +}; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h new file mode 100644 index 00000000000..e4e23409782 --- /dev/null +++ b/drivers/md/bcache/extents.h @@ -0,0 +1,13 @@ +#ifndef _BCACHE_EXTENTS_H +#define _BCACHE_EXTENTS_H + +extern const struct btree_keys_ops bch_btree_keys_ops; +extern const struct btree_keys_ops bch_extent_keys_ops; + +struct bkey; +struct cache_set; + +void bch_extent_to_text(char *, size_t, const struct bkey *); +bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); + +#endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 9d32d579082..59e82021b5b 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -95,7 +95,7 @@ reread: left = ca->sb.bucket_size - offset; return ret; } - blocks = set_blocks(j, ca->set); + blocks = set_blocks(j, block_bytes(ca->set)); while (!list_empty(list)) { i = list_first_entry(list, @@ -237,8 +237,14 @@ bsearch: for (i = 0; i < ca->sb.njournal_buckets; i++) if (ja->seq[i] > seq) { seq = ja->seq[i]; - ja->cur_idx = ja->discard_idx = - ja->last_idx = i; + /* + * When journal_reclaim() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + ja->last_idx = ja->discard_idx = (i + 1) % + ca->sb.njournal_buckets; } } @@ -284,20 +290,15 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } for (k = i->j.start; - k < end(&i->j); + k < bset_bkey_last(&i->j); k = bkey_next(k)) { unsigned j; - for (j = 0; j < KEY_PTRS(k); j++) { - struct bucket *g = PTR_BUCKET(c, k, j); - atomic_inc(&g->pin); + for (j = 0; j < KEY_PTRS(k); j++) + if (ptr_available(c, k, j)) + atomic_inc(&PTR_BUCKET(c, k, j)->pin); - if (g->prio == BTREE_PRIO && - !ptr_stale(c, k, j)) - g->prio = INITIAL_PRIO; - } - - __bch_btree_mark_key(c, 0, k); + bch_initial_mark_key(c, 0, k); } } } @@ -312,8 +313,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) uint64_t start = i->j.last_seq, end = i->j.seq, n = start; struct keylist keylist; - bch_keylist_init(&keylist); - list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); @@ -322,12 +321,11 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) n, i->j.seq - 1, start, end); for (k = i->j.start; - k < end(&i->j); + k < bset_bkey_last(&i->j); k = bkey_next(k)) { trace_bcache_journal_replay_key(k); - bkey_copy(keylist.top, k); - bch_keylist_push(&keylist); + bch_keylist_init_single(&keylist, k); ret = bch_btree_insert(s, &keylist, i->pin, NULL); if (ret) @@ -383,16 +381,15 @@ retry: b = best; if (b) { - rw_lock(true, b, b->level); - + mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { - rw_unlock(true, b); + mutex_unlock(&b->write_lock); /* We raced */ goto retry; } - bch_btree_node_write(b, NULL); - rw_unlock(true, b); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); } } @@ -536,6 +533,7 @@ void bch_journal_next(struct journal *j) atomic_set(&fifo_back(&j->pin), 1); j->cur->data->seq = ++j->seq; + j->cur->dirty = false; j->cur->need_write = false; j->cur->data->keys = 0; @@ -579,7 +577,8 @@ static void journal_write_unlocked(struct closure *cl) struct cache *ca; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; + unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * + c->sb.block_size; struct bio *bio; struct bio_list list; @@ -595,7 +594,7 @@ static void journal_write_unlocked(struct closure *cl) continue_at(cl, journal_write, system_wq); } - c->journal.blocks_free -= set_blocks(w->data, c); + c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); w->data->btree_level = c->root->level; @@ -685,7 +684,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, struct journal_write *w = c->journal.cur; sectors = __set_blocks(w->data, w->data->keys + nkeys, - c) * c->sb.block_size; + block_bytes(c)) * c->sb.block_size; if (sectors <= min_t(size_t, c->journal.blocks_free * c->sb.block_size, @@ -730,7 +729,10 @@ static void journal_write_work(struct work_struct *work) struct cache_set, journal.work); spin_lock(&c->journal.lock); - journal_try_write(c); + if (c->journal.cur->dirty) + journal_try_write(c); + else + spin_unlock(&c->journal.lock); } /* @@ -751,7 +753,7 @@ atomic_t *bch_journal(struct cache_set *c, w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); - memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); + memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); w->data->keys += bch_keylist_nkeys(keys); ret = &fifo_back(&c->journal.pin); @@ -760,7 +762,8 @@ atomic_t *bch_journal(struct cache_set *c, if (parent) { closure_wait(&w->wait, parent); journal_try_write(c); - } else if (!w->need_write) { + } else if (!w->dirty) { + w->dirty = true; schedule_delayed_work(&c->journal.work, msecs_to_jiffies(c->journal_delay_ms)); spin_unlock(&c->journal.lock); diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 9180c446507..e3c39457afb 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -95,6 +95,7 @@ struct journal_write { struct cache_set *c; struct closure_waitlist wait; + bool dirty; bool need_write; }; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 9eb60d102de..cd7490311e5 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -24,12 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) moving_gc_keys); unsigned i; - for (i = 0; i < KEY_PTRS(k); i++) { - struct bucket *g = PTR_BUCKET(c, k, i); - - if (GC_MOVE(g)) + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + GC_MOVE(PTR_BUCKET(c, k, i))) return true; - } return false; } @@ -115,7 +113,7 @@ static void write_moving(struct closure *cl) closure_call(&op->cl, bch_data_insert, NULL, cl); } - continue_at(cl, write_moving_finish, system_wq); + continue_at(cl, write_moving_finish, op->wq); } static void read_moving_submit(struct closure *cl) @@ -125,7 +123,7 @@ static void read_moving_submit(struct closure *cl) bch_submit_bbio(bio, io->op.c, &io->w->key, 0); - continue_at(cl, write_moving, system_wq); + continue_at(cl, write_moving, io->op.wq); } static void read_moving(struct cache_set *c) @@ -160,6 +158,7 @@ static void read_moving(struct cache_set *c) io->w = w; io->op.inode = KEY_INODE(&w->key); io->op.c = c; + io->op.wq = c->moving_gc_wq; moving_init(io); bio = &io->bio.bio; @@ -216,7 +215,10 @@ void bch_moving_gc(struct cache_set *c) ca->heap.used = 0; for_each_bucket(b, ca) { - if (!GC_SECTORS_USED(b)) + if (GC_MARK(b) == GC_MARK_METADATA || + !GC_SECTORS_USED(b) || + GC_SECTORS_USED(b) == ca->sb.bucket_size || + atomic_read(&b->pin)) continue; if (!heap_full(&ca->heap)) { diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index cce02f19e6c..15fff4f68a7 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -12,11 +12,9 @@ #include "request.h" #include "writeback.h" -#include <linux/cgroup.h> #include <linux/module.h> #include <linux/hash.h> #include <linux/random.h> -#include "blk-cgroup.h" #include <trace/events/bcache.h> @@ -27,172 +25,13 @@ struct kmem_cache *bch_search_cache; static void bch_data_insert_start(struct closure *); -/* Cgroup interface */ - -#ifdef CONFIG_CGROUP_BCACHE -static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; - -static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) -{ - struct cgroup_subsys_state *css; - return cgroup && - (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) - ? container_of(css, struct bch_cgroup, css) - : &bcache_default_cgroup; -} - -struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) -{ - struct cgroup_subsys_state *css = bio->bi_css - ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) - : task_subsys_state(current, bcache_subsys_id); - - return css - ? container_of(css, struct bch_cgroup, css) - : &bcache_default_cgroup; -} - -static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) -{ - char tmp[1024]; - int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, - cgroup_to_bcache(cgrp)->cache_mode + 1); - - if (len < 0) - return len; - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} - -static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, - const char *buf) -{ - int v = bch_read_string_list(buf, bch_cache_modes); - if (v < 0) - return v; - - cgroup_to_bcache(cgrp)->cache_mode = v - 1; - return 0; -} - -static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) -{ - return cgroup_to_bcache(cgrp)->verify; -} - -static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) -{ - cgroup_to_bcache(cgrp)->verify = val; - return 0; -} - -static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_hits); -} - -static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_misses); -} - -static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, - struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_bypass_hits); -} - -static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, - struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_bypass_misses); -} - -static struct cftype bch_files[] = { - { - .name = "cache_mode", - .read = cache_mode_read, - .write_string = cache_mode_write, - }, - { - .name = "verify", - .read_u64 = bch_verify_read, - .write_u64 = bch_verify_write, - }, - { - .name = "cache_hits", - .read_u64 = bch_cache_hits_read, - }, - { - .name = "cache_misses", - .read_u64 = bch_cache_misses_read, - }, - { - .name = "cache_bypass_hits", - .read_u64 = bch_cache_bypass_hits_read, - }, - { - .name = "cache_bypass_misses", - .read_u64 = bch_cache_bypass_misses_read, - }, - { } /* terminate */ -}; - -static void init_bch_cgroup(struct bch_cgroup *cg) -{ - cg->cache_mode = -1; -} - -static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) -{ - struct bch_cgroup *cg; - - cg = kzalloc(sizeof(*cg), GFP_KERNEL); - if (!cg) - return ERR_PTR(-ENOMEM); - init_bch_cgroup(cg); - return &cg->css; -} - -static void bcachecg_destroy(struct cgroup *cgroup) -{ - struct bch_cgroup *cg = cgroup_to_bcache(cgroup); - free_css_id(&bcache_subsys, &cg->css); - kfree(cg); -} - -struct cgroup_subsys bcache_subsys = { - .create = bcachecg_create, - .destroy = bcachecg_destroy, - .subsys_id = bcache_subsys_id, - .name = "bcache", - .module = THIS_MODULE, -}; -EXPORT_SYMBOL_GPL(bcache_subsys); -#endif - static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) { -#ifdef CONFIG_CGROUP_BCACHE - int r = bch_bio_to_cgroup(bio)->cache_mode; - if (r >= 0) - return r; -#endif return BDEV_CACHE_MODE(&dc->sb); } static bool verify(struct cached_dev *dc, struct bio *bio) { -#ifdef CONFIG_CGROUP_BCACHE - if (bch_bio_to_cgroup(bio)->verify) - return true; -#endif return dc->verify; } @@ -249,12 +88,30 @@ static void bch_data_insert_keys(struct closure *cl) atomic_dec_bug(journal_ref); if (!op->insert_data_done) - continue_at(cl, bch_data_insert_start, bcache_wq); + continue_at(cl, bch_data_insert_start, op->wq); bch_keylist_free(&op->insert_keys); closure_return(cl); } +static int bch_keylist_realloc(struct keylist *l, unsigned u64s, + struct cache_set *c) +{ + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + u64s; + + /* + * The journalling code doesn't handle the case where the keys to insert + * is bigger than an empty write: If we just return -ENOMEM here, + * bio_insert() and bio_invalidate() will insert the keys created so far + * and finish the rest when the keylist is empty. + */ + if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) + return -ENOMEM; + + return __bch_keylist_realloc(l, u64s); +} + static void bch_data_invalidate(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); @@ -267,7 +124,7 @@ static void bch_data_invalidate(struct closure *cl) unsigned sectors = min(bio_sectors(bio), 1U << (KEY_SIZE_BITS - 1)); - if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) + if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) goto out; bio->bi_iter.bi_sector += sectors; @@ -280,7 +137,7 @@ static void bch_data_invalidate(struct closure *cl) op->insert_data_done = true; bio_put(bio); out: - continue_at(cl, bch_data_insert_keys, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); } static void bch_data_insert_error(struct closure *cl) @@ -323,7 +180,7 @@ static void bch_data_insert_endio(struct bio *bio, int error) if (op->writeback) op->error = error; else if (!op->replace) - set_closure_fn(cl, bch_data_insert_error, bcache_wq); + set_closure_fn(cl, bch_data_insert_error, op->wq); else set_closure_fn(cl, NULL, NULL); } @@ -336,14 +193,14 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (op->bypass) - return bch_data_invalidate(cl); - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { set_gc_sectors(op->c); wake_up_gc(op->c); } + if (op->bypass) + return bch_data_invalidate(cl); + /* * Journal writes are marked REQ_FLUSH; if the original write was a * flush, it'll wait on the journal write. @@ -357,9 +214,9 @@ static void bch_data_insert_start(struct closure *cl) /* 1 for the device pointer and 1 for the chksum */ if (bch_keylist_realloc(&op->insert_keys, - 1 + (op->csum ? 1 : 0), + 3 + (op->csum ? 1 : 0), op->c)) - continue_at(cl, bch_data_insert_keys, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); k = op->insert_keys.top; bkey_init(k); @@ -396,7 +253,7 @@ static void bch_data_insert_start(struct closure *cl) } while (n != bio); op->insert_data_done = true; - continue_at(cl, bch_data_insert_keys, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ BUG_ON(op->writeback); @@ -425,7 +282,7 @@ err: bio_put(bio); if (!bch_keylist_empty(&op->insert_keys)) - continue_at(cl, bch_data_insert_keys, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); else closure_return(cl); } @@ -807,6 +664,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.error = 0; s->iop.flags = 0; s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->iop.wq = bcache_wq; return s; } @@ -1186,22 +1044,13 @@ void bch_cached_dev_request_init(struct cached_dev *dc) static int flash_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { - struct bio_vec bv; - struct bvec_iter iter; - - /* Zero fill bio */ - - bio_for_each_segment(bv, bio, iter) { - unsigned j = min(bv.bv_len >> 9, sectors); - - void *p = kmap(bv.bv_page); - memset(p + bv.bv_offset, 0, j << 9); - kunmap(bv.bv_page); + unsigned bytes = min(sectors, bio_sectors(bio)) << 9; - sectors -= j; - } + swap(bio->bi_iter.bi_size, bytes); + zero_fill_bio(bio); + swap(bio->bi_iter.bi_size, bytes); - bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size)); + bio_advance(bio, bytes); if (!bio->bi_iter.bi_size) return MAP_DONE; @@ -1296,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d) void bch_request_exit(void) { -#ifdef CONFIG_CGROUP_BCACHE - cgroup_unload_subsys(&bcache_subsys); -#endif if (bch_search_cache) kmem_cache_destroy(bch_search_cache); } @@ -1309,11 +1155,5 @@ int __init bch_request_init(void) if (!bch_search_cache) return -ENOMEM; -#ifdef CONFIG_CGROUP_BCACHE - cgroup_load_subsys(&bcache_subsys); - init_bch_cgroup(&bcache_default_cgroup); - - cgroup_add_cftypes(&bcache_subsys, bch_files); -#endif return 0; } diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 39f21dbedc3..1ff36875c2b 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -1,12 +1,11 @@ #ifndef _BCACHE_REQUEST_H_ #define _BCACHE_REQUEST_H_ -#include <linux/cgroup.h> - struct data_insert_op { struct closure cl; struct cache_set *c; struct bio *bio; + struct workqueue_struct *wq; unsigned inode; uint16_t write_point; @@ -41,20 +40,4 @@ void bch_flash_dev_request_init(struct bcache_device *d); extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; -struct bch_cgroup { -#ifdef CONFIG_CGROUP_BCACHE - struct cgroup_subsys_state css; -#endif - /* - * We subtract one from the index into bch_cache_modes[], so that - * default == -1; this makes it so the rest match up with d->cache_mode, - * and we use d->cache_mode if cgrp->cache_mode < 0 - */ - short cache_mode; - bool verify; - struct cache_stat_collector stats; -}; - -struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio); - #endif /* _BCACHE_REQUEST_H_ */ diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 84d0782f702..0ca072c20d0 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -201,9 +201,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, struct cached_dev *dc = container_of(d, struct cached_dev, disk); mark_cache_stats(&dc->accounting.collector, hit, bypass); mark_cache_stats(&c->accounting.collector, hit, bypass); -#ifdef CONFIG_CGROUP_BCACHE - mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); -#endif } void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 677a604e7f3..926ded8ccbf 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -9,6 +9,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include "request.h" #include "writeback.h" @@ -383,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, break; } - bch_bkey_to_text(buf, sizeof(buf), k); + bch_extent_to_text(buf, sizeof(buf), k); pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) @@ -399,7 +400,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) { struct bkey *k = &j->uuid_bucket; - if (bch_btree_ptr_invalid(c, k)) + if (__bch_btree_ptr_invalid(c, k)) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); @@ -540,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) closure_sync(cl); } -#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \ - fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused) - void bch_prio_write(struct cache *ca) { int i; @@ -553,10 +551,6 @@ void bch_prio_write(struct cache *ca) lockdep_assert_held(&ca->set->bucket_lock); - for (b = ca->buckets; - b < ca->buckets + ca->sb.nbuckets; b++) - b->disk_gen = b->gen; - ca->disk_buckets->seq++; atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), @@ -600,14 +594,17 @@ void bch_prio_write(struct cache *ca) mutex_lock(&ca->set->bucket_lock); - ca->need_save_prio = 0; - /* * Don't want the old priorities to get garbage collected until after we * finish writing the new ones, and they're journalled */ - for (i = 0; i < prio_buckets(ca); i++) + for (i = 0; i < prio_buckets(ca); i++) { + if (ca->prio_last_buckets[i]) + __bch_bucket_free(ca, + &ca->buckets[ca->prio_last_buckets[i]]); + ca->prio_last_buckets[i] = ca->prio_buckets[i]; + } } static void prio_read(struct cache *ca, uint64_t bucket) @@ -638,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) } b->prio = le16_to_cpu(d->prio); - b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen; + b->gen = b->last_gc = d->gen; } } @@ -842,6 +839,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, q->limits.max_segment_size = UINT_MAX; q->limits.max_segments = BIO_MAX_PAGES; q->limits.max_discard_sectors = UINT_MAX; + q->limits.discard_granularity = 512; q->limits.io_min = block_size; q->limits.logical_block_size = block_size; q->limits.physical_block_size = block_size; @@ -1351,9 +1349,11 @@ static void cache_set_free(struct closure *cl) if (ca) kobject_put(&ca->kobj); + bch_bset_sort_state_free(&c->sort); free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); - free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); + if (c->moving_gc_wq) + destroy_workqueue(c->moving_gc_wq); if (c->bio_split) bioset_free(c->bio_split); if (c->fill_iter) @@ -1394,14 +1394,21 @@ static void cache_set_flush(struct closure *cl) list_add(&c->root->list, &c->btree_cache); /* Should skip this if we're unregistering because of an error */ - list_for_each_entry(b, &c->btree_cache, list) + list_for_each_entry(b, &c->btree_cache, list) { + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write(b, NULL); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); + } for_each_cache(ca, c, i) if (ca->alloc_thread) kthread_stop(ca->alloc_thread); + cancel_delayed_work_sync(&c->journal.work); + /* flush last journal entry if needed */ + c->journal.work.work.func(&c->journal.work.work); + closure_return(cl); } @@ -1477,25 +1484,20 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); - c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; + c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); - c->sort_crit_factor = int_sqrt(c->btree_pages); - sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); - init_waitqueue_head(&c->try_wait); + init_waitqueue_head(&c->btree_cache_wait); init_waitqueue_head(&c->bucket_wait); sema_init(&c->uuid_write_mutex, 1); - mutex_init(&c->sort_lock); - spin_lock_init(&c->sort_time.lock); spin_lock_init(&c->btree_gc_time.lock); spin_lock_init(&c->btree_split_time.lock); spin_lock_init(&c->btree_read_time.lock); - spin_lock_init(&c->try_harder_time.lock); bch_moving_init_cache_set(c); @@ -1519,11 +1521,12 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bucket_pages(c))) || !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || + !(c->moving_gc_wq = create_workqueue("bcache_gc")) || bch_journal_alloc(c) || bch_btree_cache_alloc(c) || - bch_open_buckets_alloc(c)) + bch_open_buckets_alloc(c) || + bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) goto err; c->congested_read_threshold_us = 2000; @@ -1579,11 +1582,11 @@ static void run_cache_set(struct cache_set *c) k = &j->btree_root; err = "bad btree root"; - if (bch_btree_ptr_invalid(c, k)) + if (__bch_btree_ptr_invalid(c, k)) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, k, j->btree_level, true); + c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true); if (IS_ERR_OR_NULL(c->root)) goto err; @@ -1599,7 +1602,7 @@ static void run_cache_set(struct cache_set *c) goto err; bch_journal_mark(c, &journal); - bch_btree_gc_finish(c); + bch_initial_gc_finish(c); pr_debug("btree_check() done"); /* @@ -1641,7 +1644,7 @@ static void run_cache_set(struct cache_set *c) ca->sb.d[j] = ca->sb.first_bucket + j; } - bch_btree_gc_finish(c); + bch_initial_gc_finish(c); err = "error starting allocator thread"; for_each_cache(ca, c, i) @@ -1658,12 +1661,14 @@ static void run_cache_set(struct cache_set *c) goto err; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, 0, true); + c->root = bch_btree_node_alloc(c, NULL, 0); if (IS_ERR_OR_NULL(c->root)) goto err; + mutex_lock(&c->root->write_lock); bkey_copy_key(&c->root->key, &MAX_KEY); bch_btree_node_write(c->root, &cl); + mutex_unlock(&c->root->write_lock); bch_btree_set_root(c->root); rw_unlock(true, c->root); @@ -1785,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj) vfree(ca->buckets); free_heap(&ca->heap); - free_fifo(&ca->unused); free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) @@ -1822,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || - !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || !init_heap(&ca->heap, free << 3, GFP_KERNEL) || !(ca->buckets = vzalloc(sizeof(struct bucket) * ca->sb.nbuckets)) || @@ -1837,13 +1840,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) for_each_bucket(b, ca) atomic_set(&b->pin, 0); - if (bch_cache_allocator_init(ca)) - goto err; - return 0; -err: - kobject_put(&ca->kobj); - return -ENOMEM; } static void register_cache(struct cache_sb *sb, struct page *sb_page, @@ -1872,7 +1869,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page, if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) goto err; + mutex_lock(&bch_register_lock); err = register_cache_set(ca); + mutex_unlock(&bch_register_lock); + if (err) goto err; @@ -1934,8 +1934,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY; - mutex_lock(&bch_register_lock); - if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) goto err; @@ -1968,7 +1966,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!dc) goto err_close; + mutex_lock(&bch_register_lock); register_bdev(sb, sb_page, bdev, dc); + mutex_unlock(&bch_register_lock); } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) @@ -1981,7 +1981,6 @@ out: put_page(sb_page); kfree(sb); kfree(path); - mutex_unlock(&bch_register_lock); module_put(THIS_MODULE); return ret; @@ -2060,7 +2059,6 @@ static void bcache_exit(void) { bch_debug_exit(); bch_request_exit(); - bch_btree_exit(); if (bcache_kobj) kobject_put(bcache_kobj); if (bcache_wq) @@ -2090,7 +2088,6 @@ static int __init bcache_init(void) if (!(bcache_wq = create_workqueue("bcache")) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || sysfs_create_files(bcache_kobj, files) || - bch_btree_init() || bch_request_init() || bch_debug_init(bcache_kobj)) goto err; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d5dd282b176..b3ff57d61dd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms); sysfs_time_stats_attribute(btree_split, sec, us); sysfs_time_stats_attribute(btree_sort, ms, us); sysfs_time_stats_attribute(btree_read, ms, us); -sysfs_time_stats_attribute(try_harder, ms, us); read_attribute(btree_nodes); read_attribute(btree_used_percent); @@ -400,81 +399,123 @@ static struct attribute *bch_flash_dev_files[] = { }; KTYPE(bch_flash_dev); -SHOW(__bch_cache_set) +struct bset_stats_op { + struct btree_op op; + size_t nodes; + struct bset_stats stats; +}; + +static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b) { - unsigned root_usage(struct cache_set *c) - { - unsigned bytes = 0; - struct bkey *k; - struct btree *b; - struct btree_iter iter; + struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); - goto lock_root; + op->nodes++; + bch_btree_keys_stats(&b->keys, &op->stats); - do { - rw_unlock(false, b); -lock_root: - b = c->root; - rw_lock(false, b, b->level); - } while (b != c->root); + return MAP_CONTINUE; +} - for_each_key_filter(b, k, &iter, bch_ptr_bad) - bytes += bkey_bytes(k); +static int bch_bset_print_stats(struct cache_set *c, char *buf) +{ + struct bset_stats_op op; + int ret; + + memset(&op, 0, sizeof(op)); + bch_btree_op_init(&op.op, -1); + ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats); + if (ret < 0) + return ret; + + return snprintf(buf, PAGE_SIZE, + "btree nodes: %zu\n" + "written sets: %zu\n" + "unwritten sets: %zu\n" + "written key bytes: %zu\n" + "unwritten key bytes: %zu\n" + "floats: %zu\n" + "failed: %zu\n", + op.nodes, + op.stats.sets_written, op.stats.sets_unwritten, + op.stats.bytes_written, op.stats.bytes_unwritten, + op.stats.floats, op.stats.failed); +} + +static unsigned bch_root_usage(struct cache_set *c) +{ + unsigned bytes = 0; + struct bkey *k; + struct btree *b; + struct btree_iter iter; + + goto lock_root; + + do { rw_unlock(false, b); +lock_root: + b = c->root; + rw_lock(false, b, b->level); + } while (b != c->root); - return (bytes * 100) / btree_bytes(c); - } + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) + bytes += bkey_bytes(k); - size_t cache_size(struct cache_set *c) - { - size_t ret = 0; - struct btree *b; + rw_unlock(false, b); - mutex_lock(&c->bucket_lock); - list_for_each_entry(b, &c->btree_cache, list) - ret += 1 << (b->page_order + PAGE_SHIFT); + return (bytes * 100) / btree_bytes(c); +} - mutex_unlock(&c->bucket_lock); - return ret; - } +static size_t bch_cache_size(struct cache_set *c) +{ + size_t ret = 0; + struct btree *b; - unsigned cache_max_chain(struct cache_set *c) - { - unsigned ret = 0; - struct hlist_head *h; + mutex_lock(&c->bucket_lock); + list_for_each_entry(b, &c->btree_cache, list) + ret += 1 << (b->keys.page_order + PAGE_SHIFT); - mutex_lock(&c->bucket_lock); + mutex_unlock(&c->bucket_lock); + return ret; +} - for (h = c->bucket_hash; - h < c->bucket_hash + (1 << BUCKET_HASH_BITS); - h++) { - unsigned i = 0; - struct hlist_node *p; +static unsigned bch_cache_max_chain(struct cache_set *c) +{ + unsigned ret = 0; + struct hlist_head *h; - hlist_for_each(p, h) - i++; + mutex_lock(&c->bucket_lock); - ret = max(ret, i); - } + for (h = c->bucket_hash; + h < c->bucket_hash + (1 << BUCKET_HASH_BITS); + h++) { + unsigned i = 0; + struct hlist_node *p; - mutex_unlock(&c->bucket_lock); - return ret; - } + hlist_for_each(p, h) + i++; - unsigned btree_used(struct cache_set *c) - { - return div64_u64(c->gc_stats.key_bytes * 100, - (c->gc_stats.nodes ?: 1) * btree_bytes(c)); + ret = max(ret, i); } - unsigned average_key_size(struct cache_set *c) - { - return c->gc_stats.nkeys - ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) - : 0; - } + mutex_unlock(&c->bucket_lock); + return ret; +} +static unsigned bch_btree_used(struct cache_set *c) +{ + return div64_u64(c->gc_stats.key_bytes * 100, + (c->gc_stats.nodes ?: 1) * btree_bytes(c)); +} + +static unsigned bch_average_key_size(struct cache_set *c) +{ + return c->gc_stats.nkeys + ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) + : 0; +} + +SHOW(__bch_cache_set) +{ struct cache_set *c = container_of(kobj, struct cache_set, kobj); sysfs_print(synchronous, CACHE_SYNC(&c->sb)); @@ -482,21 +523,20 @@ lock_root: sysfs_hprint(bucket_size, bucket_bytes(c)); sysfs_hprint(block_size, block_bytes(c)); sysfs_print(tree_depth, c->root->level); - sysfs_print(root_usage_percent, root_usage(c)); + sysfs_print(root_usage_percent, bch_root_usage(c)); - sysfs_hprint(btree_cache_size, cache_size(c)); - sysfs_print(btree_cache_max_chain, cache_max_chain(c)); + sysfs_hprint(btree_cache_size, bch_cache_size(c)); + sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c)); sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); - sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); + sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); - sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); - sysfs_print(btree_used_percent, btree_used(c)); + sysfs_print(btree_used_percent, bch_btree_used(c)); sysfs_print(btree_nodes, c->gc_stats.nodes); - sysfs_hprint(average_key_size, average_key_size(c)); + sysfs_hprint(average_key_size, bch_average_key_size(c)); sysfs_print(cache_read_races, atomic_long_read(&c->cache_read_races)); @@ -667,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = { sysfs_time_stats_attribute_list(btree_split, sec, us) sysfs_time_stats_attribute_list(btree_sort, ms, us) sysfs_time_stats_attribute_list(btree_read, ms, us) - sysfs_time_stats_attribute_list(try_harder, ms, us) &sysfs_btree_nodes, &sysfs_btree_used_percent, @@ -719,7 +758,9 @@ SHOW(__bch_cache) int cmp(const void *l, const void *r) { return *((uint16_t *) r) - *((uint16_t *) l); } - size_t n = ca->sb.nbuckets, i, unused, btree; + struct bucket *b; + size_t n = ca->sb.nbuckets, i; + size_t unused = 0, available = 0, dirty = 0, meta = 0; uint64_t sum = 0; /* Compute 31 quantiles */ uint16_t q[31], *p, *cached; @@ -730,6 +771,17 @@ SHOW(__bch_cache) return -ENOMEM; mutex_lock(&ca->set->bucket_lock); + for_each_bucket(b, ca) { + if (!GC_SECTORS_USED(b)) + unused++; + if (GC_MARK(b) == GC_MARK_RECLAIMABLE) + available++; + if (GC_MARK(b) == GC_MARK_DIRTY) + dirty++; + if (GC_MARK(b) == GC_MARK_METADATA) + meta++; + } + for (i = ca->sb.first_bucket; i < n; i++) p[i] = ca->buckets[i].prio; mutex_unlock(&ca->set->bucket_lock); @@ -744,10 +796,7 @@ SHOW(__bch_cache) while (cached < p + n && *cached == BTREE_PRIO) - cached++; - - btree = cached - p; - n -= btree; + cached++, n--; for (i = 0; i < n; i++) sum += INITIAL_PRIO - cached[i]; @@ -763,12 +812,16 @@ SHOW(__bch_cache) ret = scnprintf(buf, PAGE_SIZE, "Unused: %zu%%\n" + "Clean: %zu%%\n" + "Dirty: %zu%%\n" "Metadata: %zu%%\n" "Average: %llu\n" "Sectors per Q: %zu\n" "Quantiles: [", unused * 100 / (size_t) ca->sb.nbuckets, - btree * 100 / (size_t) ca->sb.nbuckets, sum, + available * 100 / (size_t) ca->sb.nbuckets, + dirty * 100 / (size_t) ca->sb.nbuckets, + meta * 100 / (size_t) ca->sb.nbuckets, sum, n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); for (i = 0; i < ARRAY_SIZE(q); i++) diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index adbc3df17a8..b7820b0d262 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -45,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1030c6020e9..ac7d0d1f70d 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -2,6 +2,7 @@ #ifndef _BCACHE_UTIL_H #define _BCACHE_UTIL_H +#include <linux/blkdev.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/llist.h> @@ -17,11 +18,13 @@ struct closure; #ifdef CONFIG_BCACHE_DEBUG +#define EBUG_ON(cond) BUG_ON(cond) #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) #else /* DEBUG */ +#define EBUG_ON(cond) do { if (cond); } while (0) #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) @@ -391,6 +394,11 @@ struct time_stats { void bch_time_stats_update(struct time_stats *stats, uint64_t time); +static inline unsigned local_clock_us(void) +{ + return local_clock() >> 10; +} + #define NSEC_PER_ns 1L #define NSEC_PER_us NSEC_PER_USEC #define NSEC_PER_ms NSEC_PER_MSEC |
