Diffstat (limited to 'drivers/md/bcache')
29 files changed, 4991 insertions, 5168 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index f950c9d29f3..4d200883c50 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -13,15 +13,8 @@ config BCACHE_DEBUG ---help--- Don't select this option unless you're a developer - Enables extra debugging tools (primarily a fuzz tester) - -config BCACHE_EDEBUG - bool "Extended runtime checks" - depends on BCACHE - ---help--- - Don't select this option unless you're a developer - - Enables extra runtime checks which significantly affect performance + Enables extra debugging tools, allows expensive runtime checks to be + turned on. config BCACHE_CLOSURES_DEBUG bool "Debug closures" @@ -31,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG Keeps all active closures in a linked list and provides a debugfs interface to list them, which makes it possible to see asynchronous operations that get stuck. - -# cgroup code needs to be updated: -# -#config CGROUP_BCACHE -# bool "Cgroup controls for bcache" -# depends on BCACHE && BLK_CGROUP -# ---help--- -# TODO diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 0e9c82523be..c488b846f83 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_BCACHE) += bcache.o -bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ - movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o +bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e45f5575fd4..443d03fbac4 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -63,13 +63,12 @@ #include "bcache.h" #include "btree.h" +#include <linux/blkdev.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/random.h> #include <trace/events/bcache.h> -#define MAX_IN_FLIGHT_DISCARDS 8U - /* Bucket heap / gen */ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) @@ -79,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); - if (CACHE_SYNC(&ca->set->sb)) { - ca->need_save_prio = max(ca->need_save_prio, - bucket_disk_gen(b)); - WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); - } - return ret; } @@ -121,119 +114,63 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) mutex_unlock(&c->bucket_lock); } -/* Discard/TRIM */ - -struct discard { - struct list_head list; - struct work_struct work; - struct cache *ca; - long bucket; - - struct bio bio; - struct bio_vec bv; -}; - -static void discard_finish(struct work_struct *w) -{ - struct discard *d = container_of(w, struct discard, work); - struct cache *ca = d->ca; - char buf[BDEVNAME_SIZE]; - - if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { - pr_notice("discard error on %s, disabling", - bdevname(ca->bdev, buf)); - d->ca->discard = 0; - } - - mutex_lock(&ca->set->bucket_lock); - - fifo_push(&ca->free, d->bucket); - list_add(&d->list, &ca->discards); - atomic_dec(&ca->discards_in_flight); - - mutex_unlock(&ca->set->bucket_lock); - - closure_wake_up(&ca->set->bucket_wait); - wake_up_process(ca->alloc_thread); - - closure_put(&ca->set->cl); -} - -static void discard_endio(struct bio *bio, int error) -{ - struct discard *d = container_of(bio, struct discard, bio); - schedule_work(&d->work); -} - -static void 
do_discard(struct cache *ca, long bucket) -{ - struct discard *d = list_first_entry(&ca->discards, - struct discard, list); - - list_del(&d->list); - d->bucket = bucket; - - atomic_inc(&ca->discards_in_flight); - closure_get(&ca->set->cl); - - bio_init(&d->bio); - - d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); - d->bio.bi_bdev = ca->bdev; - d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; - d->bio.bi_max_vecs = 1; - d->bio.bi_io_vec = d->bio.bi_inline_vecs; - d->bio.bi_size = bucket_bytes(ca); - d->bio.bi_end_io = discard_endio; - bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - - submit_bio(0, &d->bio); -} - -/* Allocation */ +/* + * Background allocation thread: scans for buckets to be invalidated, + * invalidates them, rewrites prios/gens (marking them as invalidated on disk), + * then optionally issues discard commands to the newly free buckets, then puts + * them on the various freelists. + */ static inline bool can_inc_bucket_gen(struct bucket *b) { - return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && - bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; + return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX; } -bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) +bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { - BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); + BUG_ON(!ca->set->gc_mark_valid); - if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && - CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) - return false; - - b->prio = 0; - - if (can_inc_bucket_gen(b) && - fifo_push(&ca->unused, b - ca->buckets)) { - atomic_inc(&b->pin); - return true; - } - - return false; -} - -static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) -{ - return GC_MARK(b) == GC_MARK_RECLAIMABLE && + return (!GC_MARK(b) || + GC_MARK(b) == GC_MARK_RECLAIMABLE) && !atomic_read(&b->pin) && can_inc_bucket_gen(b); } -static void invalidate_one_bucket(struct cache *ca, struct bucket *b) +void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) { + lockdep_assert_held(&ca->set->bucket_lock); + BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE); + + if (GC_SECTORS_USED(b)) + trace_bcache_invalidate(ca, b - ca->buckets); + bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); +} + +static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + __bch_invalidate_one_bucket(ca, b); + fifo_push(&ca->free_inc, b - ca->buckets); } -#define bucket_prio(b) \ - (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) +/* + * Determines what order we're going to reuse buckets, smallest bucket_prio() + * first: we also take into account the number of sectors of live data in that + * bucket, and in order for that multiply to make sense we have to scale bucket + * + * Thus, we scale the bucket priorities so that the bucket with the smallest + * prio is worth 1/8th of what INITIAL_PRIO is worth. 
+ */ + +#define bucket_prio(b) \ +({ \ + unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) @@ -246,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca) ca->heap.used = 0; for_each_bucket(b, ca) { - /* - * If we fill up the unused list, if we then return before - * adding anything to the free_inc list we'll skip writing - * prios/gens and just go back to allocating from the unused - * list: - */ - if (fifo_full(&ca->unused)) - return; - - if (!can_invalidate_bucket(ca, b)) - continue; - - if (!GC_SECTORS_USED(b) && - bch_bucket_add_unused(ca, b)) + if (!bch_can_invalidate_bucket(ca, b)) continue; if (!heap_full(&ca->heap)) @@ -280,11 +204,11 @@ static void invalidate_buckets_lru(struct cache *ca) * multiple times when it can't do anything */ ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } - invalidate_one_bucket(ca, b); + bch_invalidate_one_bucket(ca, b); } } @@ -300,12 +224,12 @@ static void invalidate_buckets_fifo(struct cache *ca) b = ca->buckets + ca->fifo_last_bucket++; - if (can_invalidate_bucket(ca, b)) - invalidate_one_bucket(ca, b); + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); if (++checked >= ca->sb.nbuckets) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } @@ -325,12 +249,12 @@ static void invalidate_buckets_random(struct cache *ca) b = ca->buckets + n; - if (can_invalidate_bucket(ca, b)) - invalidate_one_bucket(ca, b); + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); if (++checked >= ca->sb.nbuckets / 2) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } @@ -338,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca) static void invalidate_buckets(struct cache *ca) { - if (ca->invalidate_needs_gc) - return; + BUG_ON(ca->invalidate_needs_gc); switch (CACHE_REPLACEMENT(&ca->sb)) { case CACHE_REPLACEMENT_LRU: @@ -352,8 +275,6 @@ static void invalidate_buckets(struct cache *ca) invalidate_buckets_random(ca); break; } - - trace_bcache_alloc_invalidate(ca); } #define allocator_wait(ca, cond) \ @@ -374,6 +295,21 @@ do { \ __set_current_state(TASK_RUNNING); \ } while (0) +static int bch_allocator_push(struct cache *ca, long bucket) +{ + unsigned i; + + /* Prios/gens are actually the most important reserve */ + if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) + return true; + + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) + return true; + + return false; +} + static int bch_allocator_thread(void *arg) { struct cache *ca = arg; @@ -386,28 +322,22 @@ static int bch_allocator_thread(void *arg) * possibly issue discards to them, then we add the bucket to * the free list: */ - while (1) { + while (!fifo_empty(&ca->free_inc)) { long bucket; - if ((!atomic_read(&ca->set->prio_blocked) || - !CACHE_SYNC(&ca->set->sb)) && - !fifo_empty(&ca->unused)) - fifo_pop(&ca->unused, bucket); - else if (!fifo_empty(&ca->free_inc)) - fifo_pop(&ca->free_inc, bucket); - else - break; - - allocator_wait(ca, (int) fifo_free(&ca->free) > - atomic_read(&ca->discards_in_flight)); + fifo_pop(&ca->free_inc, bucket); if (ca->discard) { - allocator_wait(ca, !list_empty(&ca->discards)); - do_discard(ca, bucket); - } else { - fifo_push(&ca->free, bucket); - 
closure_wake_up(&ca->set->bucket_wait); + mutex_unlock(&ca->set->bucket_lock); + blkdev_issue_discard(ca->bdev, + bucket_to_sector(ca->set, bucket), + ca->sb.block_size, GFP_KERNEL, 0); + mutex_lock(&ca->set->bucket_lock); } + + allocator_wait(ca, bch_allocator_push(ca, bucket)); + wake_up(&ca->set->btree_cache_wait); + wake_up(&ca->set->bucket_wait); } /* @@ -416,9 +346,9 @@ static int bch_allocator_thread(void *arg) * them to the free_inc list: */ +retry_invalidate: allocator_wait(ca, ca->set->gc_mark_valid && - (ca->need_save_prio > 64 || - !ca->invalidate_needs_gc)); + !ca->invalidate_needs_gc); invalidate_buckets(ca); /* @@ -426,82 +356,111 @@ static int bch_allocator_thread(void *arg) * new stuff to them: */ allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); - if (CACHE_SYNC(&ca->set->sb) && - (!fifo_empty(&ca->free_inc) || - ca->need_save_prio > 64)) + if (CACHE_SYNC(&ca->set->sb)) { + /* + * This could deadlock if an allocation with a btree + * node locked ever blocked - having the btree node + * locked would block garbage collection, but here we're + * waiting on garbage collection before we invalidate + * and free anything. + * + * But this should be safe since the btree code always + * uses btree_check_reserve() before allocating now, and + * if it fails it blocks without btree nodes locked. + */ + if (!fifo_full(&ca->free_inc)) + goto retry_invalidate; + bch_prio_write(ca); + } } } -long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) +/* Allocation */ + +long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) { - long r = -1; -again: + DEFINE_WAIT(w); + struct bucket *b; + long r; + + /* fastpath */ + if (fifo_pop(&ca->free[RESERVE_NONE], r) || + fifo_pop(&ca->free[reserve], r)) + goto out; + + if (!wait) { + trace_bcache_alloc_fail(ca, reserve); + return -1; + } + + do { + prepare_to_wait(&ca->set->bucket_wait, &w, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&ca->set->bucket_lock); + schedule(); + mutex_lock(&ca->set->bucket_lock); + } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && + !fifo_pop(&ca->free[reserve], r)); + + finish_wait(&ca->set->bucket_wait, &w); +out: wake_up_process(ca->alloc_thread); - if (fifo_used(&ca->free) > ca->watermark[watermark] && - fifo_pop(&ca->free, r)) { - struct bucket *b = ca->buckets + r; -#ifdef CONFIG_BCACHE_EDEBUG + trace_bcache_alloc(ca, reserve); + + if (expensive_debug_checks(ca->set)) { size_t iter; long i; + unsigned j; for (iter = 0; iter < prio_buckets(ca) * 2; iter++) BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); - fifo_for_each(i, &ca->free, iter) - BUG_ON(i == r); + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each(i, &ca->free[j], iter) + BUG_ON(i == r); fifo_for_each(i, &ca->free_inc, iter) BUG_ON(i == r); - fifo_for_each(i, &ca->unused, iter) - BUG_ON(i == r); -#endif - BUG_ON(atomic_read(&b->pin) != 1); - - SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - - if (watermark <= WATERMARK_METADATA) { - SET_GC_MARK(b, GC_MARK_METADATA); - b->prio = BTREE_PRIO; - } else { - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); - b->prio = INITIAL_PRIO; - } - - return r; } - trace_bcache_alloc_fail(ca); + b = ca->buckets + r; - if (cl) { - closure_wait(&ca->set->bucket_wait, cl); + BUG_ON(atomic_read(&b->pin) != 1); - if (closure_blocking(cl)) { - mutex_unlock(&ca->set->bucket_lock); - closure_sync(cl); - mutex_lock(&ca->set->bucket_lock); - goto again; - } + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); + + if (reserve <= RESERVE_PRIO) { + SET_GC_MARK(b, GC_MARK_METADATA); + SET_GC_MOVE(b, 0); + b->prio = 
BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MOVE(b, 0); + b->prio = INITIAL_PRIO; } - return -1; + return r; +} + +void __bch_bucket_free(struct cache *ca, struct bucket *b) +{ + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); } void bch_bucket_free(struct cache_set *c, struct bkey *k) { unsigned i; - for (i = 0; i < KEY_PTRS(k); i++) { - struct bucket *b = PTR_BUCKET(c, k, i); - - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); - SET_GC_SECTORS_USED(b, 0); - bch_bucket_add_unused(PTR_CACHE(c, k, i), b); - } + for (i = 0; i < KEY_PTRS(k); i++) + __bch_bucket_free(PTR_CACHE(c, k, i), + PTR_BUCKET(c, k, i)); } -int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) +int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, + struct bkey *k, int n, bool wait) { int i; @@ -514,7 +473,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, for (i = 0; i < n; i++) { struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, watermark, cl); + long b = bch_bucket_alloc(ca, reserve, wait); if (b == -1) goto err; @@ -529,75 +488,209 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, return 0; err: bch_bucket_free(c, k); - __bkey_put(c, k); + bkey_put(c, k); return -1; } -int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) +int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, + struct bkey *k, int n, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); + ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); mutex_unlock(&c->bucket_lock); return ret; } -/* Init */ +/* Sector allocator */ -int bch_cache_allocator_start(struct cache *ca) +struct open_bucket { + struct list_head list; + unsigned last_write_point; + unsigned sectors_free; + BKEY_PADDED(key); +}; + +/* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we look for a bucket where + * the last write to it was sequential with the current write, and failing that + * we look for a bucket that was last used by the same task. + * + * The ideas is if you've got multiple tasks pulling data into the cache at the + * same time, you'll get better cache utilization if you try to segregate their + * data and preserve locality. + * + * For example, say you've starting Firefox at the same time you're copying a + * bunch of files. Firefox will likely end up being fairly hot and stay in the + * cache awhile, but the data you copied might not be; if you wrote all that + * data to the same buckets it'd get invalidated at the same time. + * + * Both of those tasks will be doing fairly random IO so we can't rely on + * detecting sequential IO to segregate their data, but going off of the task + * should be a sane heuristic. 
+ */ +static struct open_bucket *pick_data_bucket(struct cache_set *c, + const struct bkey *search, + unsigned write_point, + struct bkey *alloc) { - struct task_struct *k = kthread_run(bch_allocator_thread, - ca, "bcache_allocator"); - if (IS_ERR(k)) - return PTR_ERR(k); + struct open_bucket *ret, *ret_task = NULL; + + list_for_each_entry_reverse(ret, &c->data_buckets, list) + if (!bkey_cmp(&ret->key, search)) + goto found; + else if (ret->last_write_point == write_point) + ret_task = ret; + + ret = ret_task ?: list_first_entry(&c->data_buckets, + struct open_bucket, list); +found: + if (!ret->sectors_free && KEY_PTRS(alloc)) { + ret->sectors_free = c->sb.bucket_size; + bkey_copy(&ret->key, alloc); + bkey_init(alloc); + } - ca->alloc_thread = k; - return 0; + if (!ret->sectors_free) + ret = NULL; + + return ret; } -void bch_cache_allocator_exit(struct cache *ca) +/* + * Allocates some space in the cache to write to, and k to point to the newly + * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many + * sectors were actually allocated. + * + * If s->writeback is true, will not fail. + */ +bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, + unsigned write_point, unsigned write_prio, bool wait) { - struct discard *d; + struct open_bucket *b; + BKEY_PADDED(key) alloc; + unsigned i; + + /* + * We might have to allocate a new bucket, which we can't do with a + * spinlock held. So if we have to allocate, we drop the lock, allocate + * and then retry. KEY_PTRS() indicates whether alloc points to + * allocated bucket(s). + */ - while (!list_empty(&ca->discards)) { - d = list_first_entry(&ca->discards, struct discard, list); - cancel_work_sync(&d->work); - list_del(&d->list); - kfree(d); + bkey_init(&alloc.key); + spin_lock(&c->data_bucket_lock); + + while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { + unsigned watermark = write_prio + ? RESERVE_MOVINGGC + : RESERVE_NONE; + + spin_unlock(&c->data_bucket_lock); + + if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + return false; + + spin_lock(&c->data_bucket_lock); } -} -int bch_cache_allocator_init(struct cache *ca) -{ - unsigned i; + /* + * If we had to allocate, we might race and not need to allocate the + * second time we call find_data_bucket(). 
If we allocated a bucket but + * didn't use it, drop the refcount bch_bucket_alloc_set() took: + */ + if (KEY_PTRS(&alloc.key)) + bkey_put(c, &alloc.key); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + EBUG_ON(ptr_stale(c, &b->key, i)); + + /* Set up the pointer to the space we're allocating: */ + + for (i = 0; i < KEY_PTRS(&b->key); i++) + k->ptr[i] = b->key.ptr[i]; + + sectors = min(sectors, b->sectors_free); + + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); + SET_KEY_SIZE(k, sectors); + SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + + /* + * Move b to the end of the lru, and keep track of what this bucket was + * last used for: + */ + list_move_tail(&b->list, &c->data_buckets); + bkey_copy_key(&b->key, k); + b->last_write_point = write_point; + + b->sectors_free -= sectors; + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + + atomic_long_add(sectors, + &PTR_CACHE(c, &b->key, i)->sectors_written); + } + + if (b->sectors_free < c->sb.block_size) + b->sectors_free = 0; /* - * Reserve: - * Prio/gen writes first - * Then 8 for btree allocations - * Then half for the moving garbage collector + * k takes refcounts on the buckets it points to until it's inserted + * into the btree, but if we're done with this bucket we just transfer + * get_data_bucket()'s refcount. */ + if (b->sectors_free) + for (i = 0; i < KEY_PTRS(&b->key); i++) + atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); - ca->watermark[WATERMARK_PRIO] = 0; + spin_unlock(&c->data_bucket_lock); + return true; +} - ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); +/* Init */ - ca->watermark[WATERMARK_MOVINGGC] = 8 + - ca->watermark[WATERMARK_METADATA]; +void bch_open_buckets_free(struct cache_set *c) +{ + struct open_bucket *b; - ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + - ca->watermark[WATERMARK_MOVINGGC]; + while (!list_empty(&c->data_buckets)) { + b = list_first_entry(&c->data_buckets, + struct open_bucket, list); + list_del(&b->list); + kfree(b); + } +} - for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { - struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); - if (!d) +int bch_open_buckets_alloc(struct cache_set *c) +{ + int i; + + spin_lock_init(&c->data_bucket_lock); + + for (i = 0; i < 6; i++) { + struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) return -ENOMEM; - d->ca = ca; - INIT_WORK(&d->work, discard_finish); - list_add(&d->list, &ca->discards); + list_add(&b->list, &c->data_buckets); } return 0; } + +int bch_cache_allocator_start(struct cache *ca) +{ + struct task_struct *k = kthread_run(bch_allocator_thread, + ca, "bcache_allocator"); + if (IS_ERR(k)) + return PTR_ERR(k); + + ca->alloc_thread = k; + return 0; +} diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b39f6f0b45f..d2ebcf32309 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -177,6 +177,7 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#include <linux/bcache.h> #include <linux/bio.h> #include <linux/kobject.h> #include <linux/list.h> @@ -186,6 +187,7 @@ #include <linux/types.h> #include <linux/workqueue.h> +#include "bset.h" #include "util.h" #include "closure.h" @@ -193,10 +195,8 @@ struct bucket { atomic_t pin; uint16_t prio; uint8_t gen; - uint8_t disk_gen; uint8_t last_gc; /* Most out of date gen in the btree */ - uint8_t gc_gen; - uint16_t gc_mark; + uint16_t gc_mark; /* Bitfield used by GC. 
See below for field */ }; /* @@ -205,172 +205,13 @@ struct bucket { */ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); -#define GC_MARK_RECLAIMABLE 0 -#define GC_MARK_DIRTY 1 -#define GC_MARK_METADATA 2 -BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); - -struct bkey { - uint64_t high; - uint64_t low; - uint64_t ptr[]; -}; - -/* Enough for a key with 6 pointers */ -#define BKEY_PAD 8 - -#define BKEY_PADDED(key) \ - union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } - -/* Version 0: Cache device - * Version 1: Backing device - * Version 2: Seed pointer into btree node checksum - * Version 3: Cache device with new UUID format - * Version 4: Backing device with data offset - */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 - -#define SB_SECTOR 8 -#define SB_SIZE 4096 -#define SB_LABEL_SIZE 32 -#define SB_JOURNAL_BUCKETS 256U -/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ -#define MAX_CACHES_PER_SET 8 - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ - -struct cache_sb { - uint64_t csum; - uint64_t offset; /* sector where this sb was written */ - uint64_t version; - - uint8_t magic[16]; - - uint8_t uuid[16]; - union { - uint8_t set_uuid[16]; - uint64_t set_magic; - }; - uint8_t label[SB_LABEL_SIZE]; - - uint64_t flags; - uint64_t seq; - uint64_t pad[8]; - - union { - struct { - /* Cache devices */ - uint64_t nbuckets; /* device size */ - - uint16_t block_size; /* sectors */ - uint16_t bucket_size; /* sectors */ - - uint16_t nr_in_set; - uint16_t nr_this_dev; - }; - struct { - /* Backing devices */ - uint64_t data_offset; - - /* - * block_size from the cache device section is still used by - * backing devices, so don't add anything here until we fix - * things to not need it for backing devices anymore - */ - }; - }; - - uint32_t last_mount; /* time_t */ - - uint16_t first_bucket; - union { - uint16_t njournal_buckets; - uint16_t keys; - }; - uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ -}; - -BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); -BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); -BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); -#define CACHE_REPLACEMENT_LRU 0U -#define CACHE_REPLACEMENT_FIFO 1U -#define CACHE_REPLACEMENT_RANDOM 2U - -BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); -#define CACHE_MODE_WRITETHROUGH 0U -#define CACHE_MODE_WRITEBACK 1U -#define CACHE_MODE_WRITEAROUND 2U -#define CACHE_MODE_NONE 3U -BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); -#define BDEV_STATE_NONE 0U -#define BDEV_STATE_CLEAN 1U -#define BDEV_STATE_DIRTY 2U -#define BDEV_STATE_STALE 3U - -/* Version 1: Seed pointer into btree node checksum - */ -#define BCACHE_BSET_VERSION 1 - -/* - * This is the on disk format for btree nodes - a btree node on disk is a list - * of these; within each set the keys are sorted - */ -struct bset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - -/* - * On disk format for priorities and gens - see super.c near prio_write() for - * more. 
- */ -struct prio_set { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t pad; - - uint64_t next_bucket; - - struct bucket_disk { - uint16_t prio; - uint8_t gen; - } __attribute((packed)) data[]; -}; - -struct uuid_entry { - union { - struct { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - - uint32_t flags; - /* Size of flash only volumes */ - uint64_t sectors; - }; - - uint8_t pad[128]; - }; -}; - -BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); +#define GC_MARK_RECLAIMABLE 1 +#define GC_MARK_DIRTY 2 +#define GC_MARK_METADATA 3 +#define GC_SECTORS_USED_SIZE 13 +#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); +BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); #include "journal.h" #include "stats.h" @@ -384,8 +225,6 @@ struct keybuf_key { void *private; }; -typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); - struct keybuf { struct bkey last_scanned; spinlock_t lock; @@ -400,7 +239,7 @@ struct keybuf { struct rb_root keys; -#define KEYBUF_NR 100 +#define KEYBUF_NR 500 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; @@ -429,21 +268,19 @@ struct bcache_device { struct gendisk *disk; - /* If nonzero, we're closing */ - atomic_t closing; - - /* If nonzero, we're detaching/unregistering from cache set */ - atomic_t detaching; - int flush_done; + unsigned long flags; +#define BCACHE_DEV_CLOSING 0 +#define BCACHE_DEV_DETACHING 1 +#define BCACHE_DEV_UNLINK_DONE 2 - uint64_t nr_stripes; - unsigned stripe_size_bits; + unsigned nr_stripes; + unsigned stripe_size; atomic_t *stripe_sectors_dirty; + unsigned long *full_dirty_stripes; unsigned long sectors_dirty_last; long sectors_dirty_derivative; - mempool_t *unaligned_bvec; struct bio_set *bio_split; unsigned data_csum:1; @@ -473,7 +310,8 @@ struct cached_dev { struct cache_sb sb; struct bio sb_bio; struct bio_vec sb_bv[1]; - struct closure_with_waitlist sb_write; + struct closure sb_write; + struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. 
*/ atomic_t count; @@ -498,7 +336,7 @@ struct cached_dev { */ atomic_t has_dirty; - struct ratelimit writeback_rate; + struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; /* @@ -507,10 +345,9 @@ struct cached_dev { */ sector_t last_read; - /* Number of writeback bios in flight */ - atomic_t in_flight; - struct closure_with_timer writeback; - struct closure_waitlist writeback_wait; + /* Limit number of writeback bios in flight */ + struct semaphore in_flight; + struct task_struct *writeback_thread; struct keybuf writeback_keys; @@ -528,8 +365,8 @@ struct cached_dev { unsigned sequential_cutoff; unsigned readahead; - unsigned sequential_merge:1; unsigned verify:1; + unsigned bypass_torture_test:1; unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; @@ -537,22 +374,22 @@ struct cached_dev { unsigned char writeback_percent; unsigned writeback_delay; - int writeback_rate_change; - int64_t writeback_rate_derivative; uint64_t writeback_rate_target; + int64_t writeback_rate_proportional; + int64_t writeback_rate_derivative; + int64_t writeback_rate_change; unsigned writeback_rate_update_seconds; unsigned writeback_rate_d_term; unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_d_smooth; }; -enum alloc_watermarks { - WATERMARK_PRIO, - WATERMARK_METADATA, - WATERMARK_MOVINGGC, - WATERMARK_NONE, - WATERMARK_MAX +enum alloc_reserve { + RESERVE_BTREE, + RESERVE_PRIO, + RESERVE_MOVINGGC, + RESERVE_NONE, + RESERVE_NR, }; struct cache { @@ -564,8 +401,6 @@ struct cache { struct kobject kobj; struct block_device *bdev; - unsigned watermark[WATERMARK_MAX]; - struct task_struct *alloc_thread; struct closure prio; @@ -589,14 +424,9 @@ struct cache { * their new gen to disk. After prio_write() finishes writing the new * gens/prios, they'll be moved to the free list (and possibly discarded * in the process) - * - * unused: GC found nothing pointing into these buckets (possibly - * because all the data they contained was overwritten), so we only - * need to discard them before they can be moved to the free list. */ - DECLARE_FIFO(long, free); + DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); - DECLARE_FIFO(long, unused); size_t fifo_last_bucket; @@ -606,13 +436,6 @@ struct cache { DECLARE_HEAP(struct bucket *, heap); /* - * max(gen - disk_gen) for all buckets. When it gets too big we have to - * call prio_write() to keep gens from wrapping. - */ - uint8_t need_save_prio; - unsigned gc_move_threshold; - - /* * If nonzero, we know we aren't going to find any buckets to invalidate * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu @@ -621,15 +444,6 @@ struct cache { bool discard; /* Get rid of? */ - /* - * We preallocate structs for issuing discards to buckets, and keep them - * on this list when they're not in use; do_discard() issues discards - * whenever there's work to do and is called by free_some_buckets() and - * when a discard finishes. 
- */ - atomic_t discards_in_flight; - struct list_head discards; - struct journal_device journal; /* The rest of this all shows up in sysfs */ @@ -650,7 +464,6 @@ struct gc_stat { size_t nkeys; uint64_t data; /* sectors */ - uint64_t dirty; /* sectors */ unsigned in_use; /* percent */ }; @@ -690,7 +503,8 @@ struct cache_set { uint64_t cached_dev_sectors; struct closure caching; - struct closure_with_waitlist sb_write; + struct closure sb_write; + struct semaphore sb_write_mutex; mempool_t *search; mempool_t *bio_meta; @@ -735,19 +549,16 @@ struct cache_set { struct list_head btree_cache_freed; /* Number of elements in btree_cache + btree_cache_freeable lists */ - unsigned bucket_cache_used; + unsigned btree_cache_used; /* * If we need to allocate memory for a new btree node and that * allocation fails, we can cannibalize another node in the btree cache - * to satisfy the allocation. However, only one thread can be doing this - * at a time, for obvious reasons - try_harder and try_wait are - * basically a lock for this that we can wait on asynchronously. The - * btree_root() macro releases the lock when it returns. + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: */ - struct closure *try_harder; - struct closure_waitlist try_wait; - uint64_t try_harder_start; + wait_queue_head_t btree_cache_wait; + struct task_struct *btree_cache_alloc_lock; /* * When we free a btree node, we increment the gen of the bucket the @@ -760,7 +571,7 @@ struct cache_set { * written. */ atomic_t prio_blocked; - struct closure_waitlist bucket_wait; + wait_queue_head_t bucket_wait; /* * For any bio we don't skip we subtract the number of sectors from @@ -776,14 +587,14 @@ struct cache_set { uint16_t min_prio; /* - * max(gen - gc_gen) for all buckets. When it gets too big we have to gc + * max(gen - last_gc) for all buckets. When it gets too big we have to gc * to keep gens from wrapping around. 
*/ uint8_t need_gc; struct gc_stat gc_stats; size_t nbuckets; - struct closure_with_waitlist gc; + struct task_struct *gc_thread; /* Where in the btree gc currently is */ struct bkey gc_done; @@ -796,23 +607,26 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; - struct closure moving_gc; - struct closure_waitlist moving_gc_wait; + wait_queue_head_t moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ - atomic_t in_flight; + struct semaphore moving_in_flight; + + struct workqueue_struct *moving_gc_wq; struct btree *root; #ifdef CONFIG_BCACHE_DEBUG struct btree *verify_data; + struct bset *verify_ondisk; struct mutex verify_lock; #endif unsigned nr_uuids; struct uuid_entry *uuids; BKEY_PADDED(uuid_bucket); - struct closure_with_waitlist uuid_write; + struct closure uuid_write; + struct semaphore uuid_write_mutex; /* * A btree node on disk could have too many bsets for an iterator to fit @@ -820,13 +634,7 @@ struct cache_set { */ mempool_t *fill_iter; - /* - * btree_sort() is a merge sort and requires temporary space - single - * element mempool - */ - struct mutex sort_lock; - struct bset *sort; - unsigned sort_crit_factor; + struct bset_sort_state sort; /* List of buckets we're currently writing data to */ struct list_head data_buckets; @@ -842,20 +650,23 @@ struct cache_set { unsigned congested_read_threshold_us; unsigned congested_write_threshold_us; - spinlock_t sort_time_lock; - struct time_stats sort_time; struct time_stats btree_gc_time; struct time_stats btree_split_time; - spinlock_t btree_read_time_lock; struct time_stats btree_read_time; - struct time_stats try_harder_time; atomic_long_t cache_read_races; atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_failed; + + enum { + ON_ERROR_UNREGISTER, + ON_ERROR_PANIC, + } on_error; unsigned error_limit; unsigned error_decay; + unsigned short journal_delay_ms; + bool expensive_debug_checks; unsigned verify:1; unsigned key_merging_disabled:1; unsigned gc_always_rewrite:1; @@ -866,21 +677,6 @@ struct cache_set { struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; }; -static inline bool key_merging_disabled(struct cache_set *c) -{ -#ifdef CONFIG_BCACHE_DEBUG - return c->key_merging_disabled; -#else - return 0; -#endif -} - -static inline bool SB_IS_BDEV(const struct cache_sb *sb) -{ - return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; -} - struct bbio { unsigned submit_time_us; union { @@ -894,13 +690,8 @@ struct bbio { struct bio bio; }; -static inline unsigned local_clock_us(void) -{ - return local_clock() >> 10; -} - #define BTREE_PRIO USHRT_MAX -#define INITIAL_PRIO 32768 +#define INITIAL_PRIO 32768U #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) #define btree_blocks(b) \ @@ -913,80 +704,12 @@ static inline unsigned local_clock_us(void) #define bucket_bytes(c) ((c)->sb.bucket_size << 9) #define block_bytes(c) ((c)->sb.block_size << 9) -#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) -#define set_bytes(i) __set_bytes(i, i->keys) - -#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) -#define set_blocks(i, c) __set_blocks(i, (i)->keys, c) - -#define node(i, j) ((struct bkey *) ((i)->d + (j))) -#define end(i) node(i, (i)->keys) - -#define index(i, b) \ - ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ - block_bytes(b->c))) - -#define btree_data_space(b) (PAGE_SIZE << (b)->page_order) - #define prios_per_bucket(c) \ 
((bucket_bytes(c) - sizeof(struct prio_set)) / \ sizeof(struct bucket_disk)) #define prio_buckets(c) \ DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) -#define JSET_MAGIC 0x245235c1a3625032ULL -#define PSET_MAGIC 0x6750e15f87337f91ULL -#define BSET_MAGIC 0x90135c78b99e07f5ULL - -#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) -#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) -#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) - -/* Bkey fields: all units are in sectors */ - -#define KEY_FIELD(name, field, offset, size) \ - BITMASK(name, struct bkey, field, offset, size) - -#define PTR_FIELD(name, offset, size) \ - static inline uint64_t name(const struct bkey *k, unsigned i) \ - { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ - \ - static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ - { \ - k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ - k->ptr[i] |= v << offset; \ - } - -KEY_FIELD(KEY_PTRS, high, 60, 3) -KEY_FIELD(HEADER_SIZE, high, 58, 2) -KEY_FIELD(KEY_CSUM, high, 56, 2) -KEY_FIELD(KEY_PINNED, high, 55, 1) -KEY_FIELD(KEY_DIRTY, high, 36, 1) - -KEY_FIELD(KEY_SIZE, high, 20, 16) -KEY_FIELD(KEY_INODE, high, 0, 20) - -/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ - -static inline uint64_t KEY_OFFSET(const struct bkey *k) -{ - return k->low; -} - -static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) -{ - k->low = v; -} - -PTR_FIELD(PTR_DEV, 51, 12) -PTR_FIELD(PTR_OFFSET, 8, 43) -PTR_FIELD(PTR_GEN, 0, 8) - -#define PTR_CHECK_DEV ((1 << 12) - 1) - -#define PTR(gen, offset, dev) \ - ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) - static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { return s >> c->bucket_bits; @@ -1023,28 +746,25 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); } -/* Btree key macros */ +static inline uint8_t gen_after(uint8_t a, uint8_t b) +{ + uint8_t r = a - b; + return r > 128U ? 0 : r; +} -/* - * The high bit being set is a relic from when we used it to do binary - * searches - it told you where a key started. It's not used anymore, - * and can probably be safely dropped. 
- */ -#define KEY(dev, sector, len) \ -((struct bkey) { \ - .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ - .low = (sector) \ -}) +static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, + unsigned i) +{ + return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); +} -static inline void bkey_init(struct bkey *k) +static inline bool ptr_available(struct cache_set *c, const struct bkey *k, + unsigned i) { - *k = KEY(0, 0, 0); + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); } -#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) -#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) -#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) -#define ZERO_KEY KEY(0, 0, 0) +/* Btree key macros */ /* * This is used for various on disk data structures - cache_sb, prio_set, bset, @@ -1052,7 +772,8 @@ static inline void bkey_init(struct bkey *k) */ #define csum_set(i) \ bch_crc64(((void *) (i)) + sizeof(uint64_t), \ - ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) + ((void *) bset_bkey_last(i)) - \ + (((void *) (i)) + sizeof(uint64_t))) /* Error handling macros */ @@ -1095,14 +816,6 @@ do { \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) -static inline void __bkey_put(struct cache_set *c, struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); -} - static inline void cached_dev_put(struct cached_dev *dc) { if (atomic_dec_and_test(&dc->count)) @@ -1115,16 +828,13 @@ static inline bool cached_dev_get(struct cached_dev *dc) return false; /* Paired with the mb in cached_dev_attach */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return true; } /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree (last_gc). - * - * bucket_disk_gen() returns the difference between the current gen and the gen - * on disk; they're both used to make sure gens don't wrap around. 
*/ static inline uint8_t bucket_gc_gen(struct bucket *b) @@ -1132,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) return b->gen - b->last_gc; } -static inline uint8_t bucket_disk_gen(struct bucket *b) -{ - return b->gen - b->disk_gen; -} - #define BUCKET_GC_GEN_MAX 96U -#define BUCKET_DISK_GEN_MAX 64U #define kobj_attribute_write(n, fn) \ static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) @@ -1165,22 +869,26 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); void bch_bbio_free(struct bio *, struct cache_set *); struct bio *bch_bbio_alloc(struct cache_set *); -struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *); void bch_generic_make_request(struct bio *, struct bio_split_pool *); void __bch_submit_bbio(struct bio *, struct cache_set *); void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); -bool bch_bucket_add_unused(struct cache *, struct bucket *); -long bch_bucket_alloc(struct cache *, unsigned, struct closure *); +bool bch_can_invalidate_bucket(struct cache *, struct bucket *); +void __bch_invalidate_one_bucket(struct cache *, struct bucket *); + +void __bch_bucket_free(struct cache *, struct bucket *); void bch_bucket_free(struct cache_set *, struct bkey *); +long bch_bucket_alloc(struct cache *, unsigned, bool); int __bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); +bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, + unsigned, unsigned, bool); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); @@ -1188,7 +896,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...); void bch_prio_write(struct cache *); void bch_write_bdev_super(struct cached_dev *, struct closure *); -extern struct workqueue_struct *bcache_wq, *bch_gc_wq; +extern struct workqueue_struct *bcache_wq; extern const char * const bch_cache_modes[]; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; @@ -1221,18 +929,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); void bch_moving_init_cache_set(struct cache_set *); +int bch_open_buckets_alloc(struct cache_set *); +void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); -void bch_cache_allocator_exit(struct cache *ca); -int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); int bch_debug_init(struct kobject *); -void bch_writeback_exit(void); -int bch_writeback_init(void); void bch_request_exit(void); int bch_request_init(void); -void bch_btree_exit(void); -int bch_btree_init(void); #endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 8010eed06a5..54541641530 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -5,177 +5,174 @@ * Copyright 2012 Google, Inc. 
*/ -#include "bcache.h" -#include "btree.h" -#include "debug.h" +#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#include "util.h" +#include "bset.h" + +#include <linux/console.h> #include <linux/random.h> #include <linux/prefetch.h> -/* Keylists */ +#ifdef CONFIG_BCACHE_DEBUG -void bch_keylist_copy(struct keylist *dest, struct keylist *src) +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) { - *dest = *src; + struct bkey *k, *next; + + for (k = i->start; k < bset_bkey_last(i); k = next) { + next = bkey_next(k); + + printk(KERN_ERR "block %u key %u/%u: ", set, + (unsigned) ((u64 *) k - i->d), i->keys); - if (src->list == src->d) { - size_t n = (uint64_t *) src->top - src->d; - dest->top = (struct bkey *) &dest->d[n]; - dest->list = dest->d; + if (b->ops->key_dump) + b->ops->key_dump(b, k); + else + printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); + + if (next < bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? + &START_KEY(next) : next) > 0) + printk(KERN_ERR "Key skipped backwards\n"); } } -int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) +void bch_dump_bucket(struct btree_keys *b) { - unsigned oldsize = (uint64_t *) l->top - l->list; - unsigned newsize = oldsize + 2 + nptrs; - uint64_t *new; - - /* The journalling code doesn't handle the case where the keys to insert - * is bigger than an empty write: If we just return -ENOMEM here, - * bio_insert() and bio_invalidate() will insert the keys created so far - * and finish the rest when the keylist is empty. - */ - if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) - return -ENOMEM; - - newsize = roundup_pow_of_two(newsize); + unsigned i; - if (newsize <= KEYLIST_INLINE || - roundup_pow_of_two(oldsize) == newsize) - return 0; + console_lock(); + for (i = 0; i <= b->nsets; i++) + bch_dump_bset(b, b->set[i].data, + bset_sector_offset(b, b->set[i].data)); + console_unlock(); +} - new = krealloc(l->list == l->d ? NULL : l->list, - sizeof(uint64_t) * newsize, GFP_NOIO); +int __bch_count_data(struct btree_keys *b) +{ + unsigned ret = 0; + struct btree_iter iter; + struct bkey *k; - if (!new) - return -ENOMEM; + if (b->ops->is_extents) + for_each_key(b, k, &iter) + ret += KEY_SIZE(k); + return ret; +} - if (l->list == l->d) - memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE); +void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) 
+{ + va_list args; + struct bkey *k, *p = NULL; + struct btree_iter iter; + const char *err; - l->list = new; - l->top = (struct bkey *) (&l->list[oldsize]); + for_each_key(b, k, &iter) { + if (b->ops->is_extents) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; - return 0; -} + if (bch_ptr_invalid(b, k)) + continue; -struct bkey *bch_keylist_pop(struct keylist *l) -{ - struct bkey *k = l->bottom; + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; - if (k == l->top) - return NULL; + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; + } + p = k; + } +#if 0 + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; +#endif + return; +bug: + bch_dump_bucket(b); - while (bkey_next(k) != l->top) - k = bkey_next(k); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); - return l->top = k; + panic("bch_check_keys error: %s:\n", err); } -/* Pointer validation */ - -bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) +static void bch_btree_iter_next_check(struct btree_iter *iter) { - unsigned i; - char buf[80]; + struct bkey *k = iter->data->k, *next = bkey_next(k); - if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) - goto bad; + if (next < iter->data->end && + bkey_cmp(k, iter->b->ops->is_extents ? + &START_KEY(next) : next) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } +} - if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) - goto bad; +#else - if (!KEY_SIZE(k)) - return true; +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); - size_t bucket = PTR_BUCKET_NR(c, k, i); - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - - if (KEY_SIZE(k) + r > c->sb.bucket_size || - bucket < ca->sb.first_bucket || - bucket >= ca->sb.nbuckets) - goto bad; - } +#endif - return false; -bad: - bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); - return true; -} +/* Keylists */ -bool bch_ptr_bad(struct btree *b, const struct bkey *k) +int __bch_keylist_realloc(struct keylist *l, unsigned u64s) { - struct bucket *g; - unsigned i, stale; + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + u64s; + uint64_t *old_keys = l->keys_p == l->inline_keys ? 
NULL : l->keys_p; + uint64_t *new_keys; - if (!bkey_cmp(k, &ZERO_KEY) || - !KEY_PTRS(k) || - bch_ptr_invalid(b, k)) - return true; + newsize = roundup_pow_of_two(newsize); - if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) - return true; + if (newsize <= KEYLIST_INLINE || + roundup_pow_of_two(oldsize) == newsize) + return 0; - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(b->c, k, i)) { - g = PTR_BUCKET(b->c, k, i); - stale = ptr_stale(b->c, k, i); + new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO); + + if (!new_keys) + return -ENOMEM; - btree_bug_on(stale > 96, b, - "key too stale: %i, need_gc %u", - stale, b->c->need_gc); + if (!old_keys) + memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize); - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), - b, "stale dirty pointer"); + l->keys_p = new_keys; + l->top_p = new_keys + oldsize; - if (stale) - return true; + return 0; +} -#ifdef CONFIG_BCACHE_EDEBUG - if (!mutex_trylock(&b->c->bucket_lock)) - continue; +struct bkey *bch_keylist_pop(struct keylist *l) +{ + struct bkey *k = l->keys; - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto bug; - - } else { - if (g->prio == BTREE_PRIO) - goto bug; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_DIRTY) - goto bug; - } - mutex_unlock(&b->c->bucket_lock); -#endif - } + if (k == l->top) + return NULL; - return false; -#ifdef CONFIG_BCACHE_EDEBUG -bug: - mutex_unlock(&b->c->bucket_lock); + while (bkey_next(k) != l->top) + k = bkey_next(k); - { - char buf[80]; + return l->top = k; +} - bch_bkey_to_text(buf, sizeof(buf), k); - btree_bug(b, -"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); - } - return true; -#endif +void bch_keylist_pop_front(struct keylist *l) +{ + l->top_p -= bkey_u64s(l->keys); + + memmove(l->keys, + bkey_next(l->keys), + bch_keylist_bytes(l)); } /* Key/pointer manipulation */ @@ -232,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k) return true; } -static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +/* Auxiliary search trees */ + +/* 32 bits total: */ +#define BKEY_MID_BITS 3 +#define BKEY_EXPONENT_BITS 7 +#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS) +#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) + +struct bkey_float { + unsigned exponent:BKEY_EXPONENT_BITS; + unsigned m:BKEY_MID_BITS; + unsigned mantissa:BKEY_MANTISSA_BITS; +} __packed; + +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. 
+ */ + +#define BSET_CACHELINE 128 + +/* Space required for the btree node keys */ +static inline size_t btree_keys_bytes(struct btree_keys *b) { - return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & - ~((uint64_t)1 << 63); + return PAGE_SIZE << b->page_order; } -/* Tries to merge l and r: l should be lower than r - * Returns true if we were able to merge. If we did merge, l will be the merged - * key, r will be untouched. - */ -bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) +static inline size_t btree_keys_cachelines(struct btree_keys *b) { - unsigned i; + return btree_keys_bytes(b) / BSET_CACHELINE; +} - if (key_merging_disabled(b->c)) - return false; +/* Space required for the auxiliary search trees */ +static inline size_t bset_tree_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(struct bkey_float); +} - if (KEY_PTRS(l) != KEY_PTRS(r) || - KEY_DIRTY(l) != KEY_DIRTY(r) || - bkey_cmp(l, &START_KEY(r))) - return false; +/* Space required for the prev pointers */ +static inline size_t bset_prev_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(uint8_t); +} - for (i = 0; i < KEY_PTRS(l); i++) - if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || - PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) - return false; +/* Memory allocation */ - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { - SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); - SET_KEY_SIZE(l, USHRT_MAX); +void bch_btree_keys_free(struct btree_keys *b) +{ + struct bset_tree *t = b->set; - bch_cut_front(l, r); - return false; - } + if (bset_prev_bytes(b) < PAGE_SIZE) + kfree(t->prev); + else + free_pages((unsigned long) t->prev, + get_order(bset_prev_bytes(b))); - if (KEY_CSUM(l)) { - if (KEY_CSUM(r)) - l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); - else - SET_KEY_CSUM(l, 0); - } + if (bset_tree_bytes(b) < PAGE_SIZE) + kfree(t->tree); + else + free_pages((unsigned long) t->tree, + get_order(bset_tree_bytes(b))); - SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); - SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + free_pages((unsigned long) t->data, b->page_order); - return true; + t->prev = NULL; + t->tree = NULL; + t->data = NULL; +} +EXPORT_SYMBOL(bch_btree_keys_free); + +int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) +{ + struct bset_tree *t = b->set; + + BUG_ON(t->data); + + b->page_order = page_order; + + t->data = (void *) __get_free_pages(gfp, b->page_order); + if (!t->data) + goto err; + + t->tree = bset_tree_bytes(b) < PAGE_SIZE + ? kmalloc(bset_tree_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); + if (!t->tree) + goto err; + + t->prev = bset_prev_bytes(b) < PAGE_SIZE + ? 
kmalloc(bset_prev_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); + if (!t->prev) + goto err; + + return 0; +err: + bch_btree_keys_free(b); + return -ENOMEM; } +EXPORT_SYMBOL(bch_btree_keys_alloc); + +void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks) +{ + unsigned i; + + b->ops = ops; + b->expensive_debug_checks = expensive_debug_checks; + b->nsets = 0; + b->last_set_unwritten = 0; + + /* XXX: shouldn't be needed */ + for (i = 0; i < MAX_BSETS; i++) + b->set[i].size = 0; + /* + * Second loop starts at 1 because b->keys[0]->data is the memory we + * allocated + */ + for (i = 1; i < MAX_BSETS; i++) + b->set[i].data = NULL; +} +EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ @@ -432,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) return ((void *) k - (void *) t->data) / BSET_CACHELINE; } -static unsigned bkey_to_cacheline_offset(struct bkey *k) +static unsigned bkey_to_cacheline_offset(struct bset_tree *t, + unsigned cacheline, + struct bkey *k) { - return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); + return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); } static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) @@ -458,16 +539,8 @@ static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) { -#ifdef CONFIG_X86_64 - asm("shrd %[shift],%[high],%[low]" - : [low] "+Rm" (low) - : [high] "R" (high), - [shift] "ci" (shift) - : "cc"); -#else low >>= shift; low |= (high << 1) << (63U - shift); -#endif return low; } @@ -489,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j) : tree_to_prev_bkey(t, j >> ffs(j)); struct bkey *r = is_power_of_2(j + 1) - ? node(t->data, t->data->keys - bkey_u64s(&t->end)) + ? 
bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end)) : tree_to_bkey(t, j >> (ffz(j) + 1)); BUG_ON(m < l || m > r); @@ -513,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j) f->exponent = 127; } -static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) { - if (t != b->sets) { + if (t != b->set) { unsigned j = roundup(t[-1].size, 64 / sizeof(struct bkey_float)); @@ -523,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t) t->prev = t[-1].prev + j; } - while (t < b->sets + MAX_BSETS) + while (t < b->set + MAX_BSETS) t++->size = 0; } -static void bset_build_unwritten_tree(struct btree *b) +static void bch_bset_build_unwritten_tree(struct btree_keys *b) { - struct bset_tree *t = b->sets + b->nsets; + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(b->last_set_unwritten); + b->last_set_unwritten = 1; bset_alloc_tree(b, t); - if (t->tree != b->sets->tree + bset_tree_space(b)) { - t->prev[0] = bkey_to_cacheline_offset(t->data->start); + if (t->tree != b->set->tree + btree_keys_cachelines(b)) { + t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start); t->size = 1; } } -static void bset_build_written_tree(struct btree *b) +void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) { - struct bset_tree *t = b->sets + b->nsets; - struct bkey *k = t->data->start; + if (i != b->set->data) { + b->set[++b->nsets].data = i; + i->seq = b->set->data->seq; + } else + get_random_bytes(&i->seq, sizeof(uint64_t)); + + i->magic = magic; + i->version = 0; + i->keys = 0; + + bch_bset_build_unwritten_tree(b); +} +EXPORT_SYMBOL(bch_bset_init_next); + +void bch_bset_build_written_tree(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + struct bkey *prev = NULL, *k = t->data->start; unsigned j, cacheline = 1; + b->last_set_unwritten = 0; + bset_alloc_tree(b, t); t->size = min_t(unsigned, - bkey_to_cacheline(t, end(t->data)), - b->sets->tree + bset_tree_space(b) - t->tree); + bkey_to_cacheline(t, bset_bkey_last(t->data)), + b->set->tree + btree_keys_cachelines(b) - t->tree); if (t->size < 2) { t->size = 0; @@ -562,16 +656,14 @@ static void bset_build_written_tree(struct btree *b) for (j = inorder_next(0, t->size); j; j = inorder_next(j, t->size)) { - while (bkey_to_cacheline(t, k) != cacheline) - k = bkey_next(k); + while (bkey_to_cacheline(t, k) < cacheline) + prev = k, k = bkey_next(k); - t->prev[j] = bkey_u64s(k); - k = bkey_next(k); - cacheline++; - t->tree[j].m = bkey_to_cacheline_offset(k); + t->prev[j] = bkey_u64s(prev); + t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); } - while (bkey_next(k) != end(t->data)) + while (bkey_next(k) != bset_bkey_last(t->data)) k = bkey_next(k); t->end = *k; @@ -582,14 +674,17 @@ static void bset_build_written_tree(struct btree *b) j = inorder_next(j, t->size)) make_bfloat(t, j); } +EXPORT_SYMBOL(bch_bset_build_written_tree); -void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) +/* Insert */ + +void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) { struct bset_tree *t; unsigned inorder, j = 1; - for (t = b->sets; t <= &b->sets[b->nsets]; t++) - if (k < end(t->data)) + for (t = b->set; t <= bset_tree_last(b); t++) + if (k < bset_bkey_last(t->data)) goto found_set; BUG(); @@ -602,7 +697,7 @@ found_set: if (k == t->data->start) goto fix_left; - if (bkey_next(k) == end(t->data)) { + if (bkey_next(k) == bset_bkey_last(t->data)) { t->end = *k; goto fix_right; } @@ 
-627,10 +722,12 @@ fix_right: do { j = j * 2 + 1; } while (j < t->size); } +EXPORT_SYMBOL(bch_bset_fix_invalidated_key); -void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) +static void bch_bset_fix_lookup_table(struct btree_keys *b, + struct bset_tree *t, + struct bkey *k) { - struct bset_tree *t = &b->sets[b->nsets]; unsigned shift = bkey_u64s(k); unsigned j = bkey_to_cacheline(t, k); @@ -642,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) * lookup table for the first key that is strictly greater than k: * it's either k's cacheline or the next one */ - if (j < t->size && - table_to_bkey(t, j) <= k) + while (j < t->size && + table_to_bkey(t, j) <= k) j++; /* Adjust all the lookup table entries, and find a new key for any that @@ -658,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) while (k < cacheline_to_bkey(t, j, 0)) k = bkey_next(k); - t->prev[j] = bkey_to_cacheline_offset(k); + t->prev[j] = bkey_to_cacheline_offset(t, j, k); } } - if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) + if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) return; /* Possibly add a new entry to the end of the lookup table */ for (k = table_to_bkey(t, t->size - 1); - k != end(t->data); + k != bset_bkey_last(t->data); k = bkey_next(k)) if (t->size == bkey_to_cacheline(t, k)) { - t->prev[t->size] = bkey_to_cacheline_offset(k); + t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); t->size++; } } -void bch_bset_init_next(struct btree *b) +/* + * Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. + */ +bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) { - struct bset *i = write_block(b); + if (!b->ops->key_merge) + return false; - if (i != b->sets[0].data) { - b->sets[++b->nsets].data = i; - i->seq = b->sets[0].data->seq; - } else - get_random_bytes(&i->seq, sizeof(uint64_t)); + /* + * Generic header checks + * Assumes left and right are in order + * Left and right must be exactly aligned + */ + if (!bch_bkey_equal_header(l, r) || + bkey_cmp(l, &START_KEY(r))) + return false; - i->magic = bset_magic(b->c); - i->version = 0; - i->keys = 0; + return b->ops->key_merge(b, l, r); +} +EXPORT_SYMBOL(bch_bkey_try_merge); + +void bch_bset_insert(struct btree_keys *b, struct bkey *where, + struct bkey *insert) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(!b->last_set_unwritten); + BUG_ON(bset_byte_offset(b, t->data) + + __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > + PAGE_SIZE << b->page_order); + + memmove((uint64_t *) where + bkey_u64s(insert), + where, + (void *) bset_bkey_last(t->data) - (void *) where); + + t->data->keys += bkey_u64s(insert); + bkey_copy(where, insert); + bch_bset_fix_lookup_table(b, t, where); +} +EXPORT_SYMBOL(bch_bset_insert); + +unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k, + struct bkey *replace_key) +{ + unsigned status = BTREE_INSERT_STATUS_NO_INSERT; + struct bset *i = bset_tree_last(b)->data; + struct bkey *m, *prev = NULL; + struct btree_iter iter; + + BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); + + m = bch_btree_iter_init(b, &iter, b->ops->is_extents + ? 
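bch_bset_insert() above opens a gap with memmove() and copies the new key into it before fixing the lookup table. A stand-alone sketch of the same move-then-copy step on plain uint64_t values (the array and names here are illustrative, not bcache API):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint64_t set[8] = { 10, 20, 40, 50 };	/* sorted "keys" with spare room */
	unsigned nr = 4, pos = 2;		/* insert before set[2] */
	uint64_t insert = 30;

	/* slide the tail up to make room, then drop the new key in the gap */
	memmove(&set[pos + 1], &set[pos], (nr - pos) * sizeof(set[0]));
	set[pos] = insert;
	nr++;

	for (unsigned i = 0; i < nr; i++)
		printf("%llu ", (unsigned long long) set[i]);
	printf("\n");	/* 10 20 30 40 50 */
	return 0;
}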
PRECEDING_KEY(&START_KEY(k)) + : PRECEDING_KEY(k)); + + if (b->ops->insert_fixup(b, k, &iter, replace_key)) + return status; - bset_build_unwritten_tree(b); + status = BTREE_INSERT_STATUS_INSERT; + + while (m != bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) + prev = m, m = bkey_next(m); + + /* prev is in the tree, if we merge we're done */ + status = BTREE_INSERT_STATUS_BACK_MERGE; + if (prev && + bch_bkey_try_merge(b, prev, k)) + goto merged; +#if 0 + status = BTREE_INSERT_STATUS_OVERWROTE; + if (m != bset_bkey_last(i) && + KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) + goto copy; +#endif + status = BTREE_INSERT_STATUS_FRONT_MERGE; + if (m != bset_bkey_last(i) && + bch_bkey_try_merge(b, k, m)) + goto copy; + + bch_bset_insert(b, m, k); +copy: bkey_copy(m, k); +merged: + return status; } +EXPORT_SYMBOL(bch_btree_insert_key); + +/* Lookup */ struct bset_search_iter { struct bkey *l, *r; }; -static struct bset_search_iter bset_search_write_set(struct btree *b, - struct bset_tree *t, +static struct bset_search_iter bset_search_write_set(struct bset_tree *t, const struct bkey *search) { unsigned li = 0, ri = t->size; - BUG_ON(!b->nsets && - t->size < bkey_to_cacheline(t, end(t->data))); - while (li + 1 != ri) { unsigned m = (li + ri) >> 1; @@ -717,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b, return (struct bset_search_iter) { table_to_bkey(t, li), - ri < t->size ? table_to_bkey(t, ri) : end(t->data) + ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data) }; } -static struct bset_search_iter bset_search_tree(struct btree *b, - struct bset_tree *t, +static struct bset_search_iter bset_search_tree(struct bset_tree *t, const struct bkey *search) { struct bkey *l, *r; @@ -769,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, f = &t->tree[inorder_next(j, t->size)]; r = cacheline_to_bkey(t, inorder, f->m); } else - r = end(t->data); + r = bset_bkey_last(t->data); } else { r = cacheline_to_bkey(t, inorder, f->m); @@ -783,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, return (struct bset_search_iter) {l, r}; } -struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, +struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search) { struct bset_search_iter i; @@ -805,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, if (unlikely(!t->size)) { i.l = t->data->start; - i.r = end(t->data); + i.r = bset_bkey_last(t->data); } else if (bset_written(b, t)) { /* * Each node in the auxiliary search tree covers a certain range @@ -815,25 +981,29 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, */ if (unlikely(bkey_cmp(search, &t->end) >= 0)) - return end(t->data); + return bset_bkey_last(t->data); if (unlikely(bkey_cmp(search, t->data->start) < 0)) return t->data->start; - i = bset_search_tree(b, t, search); - } else - i = bset_search_write_set(b, t, search); + i = bset_search_tree(t, search); + } else { + BUG_ON(!b->nsets && + t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); + + i = bset_search_write_set(t, search); + } -#ifdef CONFIG_BCACHE_EDEBUG - BUG_ON(bset_written(b, t) && - i.l != t->data->start && - bkey_cmp(tree_to_prev_bkey(t, - inorder_to_tree(bkey_to_cacheline(t, i.l), t)), - search) > 0); + if (btree_keys_expensive_checks(b)) { + BUG_ON(bset_written(b, t) && + i.l != t->data->start && + bkey_cmp(tree_to_prev_bkey(t, + inorder_to_tree(bkey_to_cacheline(t, 
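The BACK_MERGE/FRONT_MERGE paths above rely on the extent convention that a key is named by its end offset plus its size, with START_KEY() corresponding to offset - size; the actual merge rule is supplied by the extents code through ops->key_merge. A sketch of merging two adjacent extents under that convention (struct ext and ext_try_merge are illustrative names, not part of the patch):

#include <stdint.h>
#include <stdio.h>

struct ext { uint64_t offset, size; };	/* offset is the END of the extent */

static uint64_t ext_start(struct ext e)
{
	return e.offset - e.size;
}

/* merge r into l when r starts exactly where l ends */
static int ext_try_merge(struct ext *l, struct ext r)
{
	if (ext_start(r) != l->offset)
		return 0;
	l->offset = r.offset;
	l->size += r.size;
	return 1;
}

int main(void)
{
	struct ext l = { .offset = 16, .size = 8 };	/* sectors [8, 16) */
	struct ext r = { .offset = 24, .size = 8 };	/* sectors [16, 24) */

	if (ext_try_merge(&l, r))
		printf("merged: [%llu, %llu)\n",
		       (unsigned long long) ext_start(l),
		       (unsigned long long) l.offset);	/* [8, 24) */
	return 0;
}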
i.l), t)), + search) > 0); - BUG_ON(i.r != end(t->data) && - bkey_cmp(i.r, search) <= 0); -#endif + BUG_ON(i.r != bset_bkey_last(t->data) && + bkey_cmp(i.r, search) <= 0); + } while (likely(i.l != i.r) && bkey_cmp(i.l, search) <= 0) @@ -841,15 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, return i.l; } +EXPORT_SYMBOL(__bch_bset_search); /* Btree iterator */ +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); + static inline bool btree_iter_cmp(struct btree_iter_set l, struct btree_iter_set r) { - int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); - - return c ? c > 0 : l.k < r.k; + return bkey_cmp(l.k, r.k) > 0; } static inline bool btree_iter_end(struct btree_iter *iter) @@ -866,27 +1038,44 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, btree_iter_cmp)); } -struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, - struct bkey *search, struct bset_tree *start) +static struct bkey *__bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; iter->size = ARRAY_SIZE(iter->data); iter->used = 0; - for (; start <= &b->sets[b->nsets]; start++) { +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + + for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, end(start->data)); + bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_next(struct btree_iter *iter) +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search) +{ + return __bch_btree_iter_init(b, iter, search, b->set); +} +EXPORT_SYMBOL(bch_btree_iter_init); + +static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, + btree_iter_cmp_fn *cmp) { struct btree_iter_set unused; struct bkey *ret = NULL; if (!btree_iter_end(iter)) { + bch_btree_iter_next_check(iter); + ret = iter->data->k; iter->data->k = bkey_next(iter->data->k); @@ -896,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) } if (iter->data->k == iter->data->end) - heap_pop(iter, unused, btree_iter_cmp); + heap_pop(iter, unused, cmp); else - heap_sift(iter, 0, btree_iter_cmp); + heap_sift(iter, 0, cmp); } return ret; } +struct bkey *bch_btree_iter_next(struct btree_iter *iter) +{ + return __bch_btree_iter_next(iter, btree_iter_cmp); + +} +EXPORT_SYMBOL(bch_btree_iter_next); + struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, - struct btree *b, ptr_filter_fn fn) + struct btree_keys *b, ptr_filter_fn fn) { struct bkey *ret; @@ -916,63 +1112,60 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, return ret; } -struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) -{ - struct btree_iter iter; - - bch_btree_iter_init(b, &iter, search); - return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); -} - /* Mergesort */ -static void btree_sort_fixup(struct btree_iter *iter) +void bch_bset_sort_state_free(struct bset_sort_state *state) { - while (iter->used > 1) { - struct btree_iter_set *top = iter->data, *i = top + 1; - struct bkey *k; + if (state->pool) + mempool_destroy(state->pool); +} - if (iter->used > 2 && - btree_iter_cmp(i[0], i[1])) - i++; +int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) +{ + spin_lock_init(&state->time.lock); - for (k = i->k; - k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0; - k = 
bkey_next(k)) - if (top->k > i->k) - __bch_cut_front(top->k, k); - else if (KEY_SIZE(k)) - bch_cut_back(&START_KEY(k), top->k); + state->page_order = page_order; + state->crit_factor = int_sqrt(1 << page_order); - if (top->k < i->k || k == i->k) - break; + state->pool = mempool_create_page_pool(1, page_order); + if (!state->pool) + return -ENOMEM; - heap_sift(iter, i - top, btree_iter_cmp); - } + return 0; } +EXPORT_SYMBOL(bch_bset_sort_state_init); -static void btree_mergesort(struct btree *b, struct bset *out, +static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) { + int i; struct bkey *k, *last = NULL; - bool (*bad)(struct btree *, const struct bkey *) = remove_stale + BKEY_PADDED(k) tmp; + bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; + /* Heapify the iterator, using our comparison function */ + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, b->ops->sort_cmp); + while (!btree_iter_end(iter)) { - if (fixup && !b->level) - btree_sort_fixup(iter); + if (b->ops->sort_fixup && fixup) + k = b->ops->sort_fixup(iter, &tmp.k); + else + k = NULL; + + if (!k) + k = __bch_btree_iter_next(iter, b->ops->sort_cmp); - k = bch_btree_iter_next(iter); if (bad(b, k)) continue; if (!last) { last = out->start; bkey_copy(last, k); - } else if (b->level || - !bch_bkey_try_merge(b, last, k)) { + } else if (!bch_bkey_try_merge(b, last, k)) { last = bkey_next(last); bkey_copy(last, k); } @@ -981,30 +1174,32 @@ static void btree_mergesort(struct btree *b, struct bset *out, out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; pr_debug("sorted %i keys", out->keys); - bch_check_key_order(b, out); } -static void __btree_sort(struct btree *b, struct btree_iter *iter, - unsigned start, unsigned order, bool fixup) +static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, + unsigned start, unsigned order, bool fixup, + struct bset_sort_state *state) { uint64_t start_time; - bool remove_stale = !b->written; + bool used_mempool = false; struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, order); if (!out) { - mutex_lock(&b->c->sort_lock); - out = b->c->sort; - order = ilog2(bucket_pages(b->c)); + struct page *outp; + + BUG_ON(order > state->page_order); + + outp = mempool_alloc(state->pool, GFP_NOIO); + out = page_address(outp); + used_mempool = true; + order = state->page_order; } start_time = local_clock(); - btree_mergesort(b, out, iter, fixup, remove_stale); + btree_mergesort(b, out, iter, fixup, false); b->nsets = start; - if (!fixup && !start && b->written) - bch_btree_verify(b, out); - if (!start && order == b->page_order) { /* * Our temporary buffer is the same size as the btree node's @@ -1012,89 +1207,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, * memcpy() */ - out->magic = bset_magic(b->c); - out->seq = b->sets[0].data->seq; - out->version = b->sets[0].data->version; - swap(out, b->sets[0].data); - - if (b->c->sort == b->sets[0].data) - b->c->sort = out; + out->magic = b->set->data->magic; + out->seq = b->set->data->seq; + out->version = b->set->data->version; + swap(out, b->set->data); } else { - b->sets[start].data->keys = out->keys; - memcpy(b->sets[start].data->start, out->start, - (void *) end(out) - (void *) out->start); + b->set[start].data->keys = out->keys; + memcpy(b->set[start].data->start, out->start, + (void *) bset_bkey_last(out) - (void *) out->start); } - if (out == b->c->sort) - 
mutex_unlock(&b->c->sort_lock); + if (used_mempool) + mempool_free(virt_to_page(out), state->pool); else free_pages((unsigned long) out, order); - if (b->written) - bset_build_written_tree(b); + bch_bset_build_written_tree(b); - if (!start) { - spin_lock(&b->c->sort_time_lock); - bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); - } + if (!start) + bch_time_stats_update(&state->time, start_time); } -void bch_btree_sort_partial(struct btree *b, unsigned start) +void bch_btree_sort_partial(struct btree_keys *b, unsigned start, + struct bset_sort_state *state) { - size_t oldsize = 0, order = b->page_order, keys = 0; + size_t order = b->page_order, keys = 0; struct btree_iter iter; - __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); - - BUG_ON(b->sets[b->nsets].data == write_block(b) && - (b->sets[b->nsets].size || b->nsets)); + int oldsize = bch_count_data(b); - if (b->written) - oldsize = bch_count_data(b); + __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned i; for (i = start; i <= b->nsets; i++) - keys += b->sets[i].data->keys; + keys += b->set[i].data->keys; - order = roundup_pow_of_two(__set_bytes(b->sets->data, - keys)) / PAGE_SIZE; - if (order) - order = ilog2(order); + order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false); + __btree_sort(b, &iter, start, order, false, state); - EBUG_ON(b->written && bch_count_data(b) != oldsize); + EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } +EXPORT_SYMBOL(bch_btree_sort_partial); -void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) +void bch_btree_sort_and_fix_extents(struct btree_keys *b, + struct btree_iter *iter, + struct bset_sort_state *state) { - BUG_ON(!b->written); - __btree_sort(b, iter, 0, b->page_order, true); + __btree_sort(b, iter, 0, b->page_order, true, state); } -void bch_btree_sort_into(struct btree *b, struct btree *new) +void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, + struct bset_sort_state *state) { uint64_t start_time = local_clock(); struct btree_iter iter; bch_btree_iter_init(b, &iter, NULL); - btree_mergesort(b, new->sets->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter, false, true); - spin_lock(&b->c->sort_time_lock); - bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); + bch_time_stats_update(&state->time, start_time); - bkey_copy_key(&new->key, &b->key); - new->sets->size = 0; + new->set->size = 0; // XXX: why? 
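btree_mergesort() above merges the node's sorted sets by walking a btree_iter heap. This sketch performs the same k-way merge with a simple linear scan in place of heap_sift()/heap_pop(), which is enough to show the idea for a handful of MAX_BSETS-sized inputs; all names here are illustrative:

#include <stdint.h>
#include <stdio.h>

struct iter_set { const uint64_t *k, *end; };

int main(void)
{
	uint64_t a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 }, c[] = { 5 };
	struct iter_set sets[] = {
		{ a, a + 3 }, { b, b + 3 }, { c, c + 1 },
	};
	unsigned nr = 3;

	for (;;) {
		struct iter_set *best = NULL;

		/* pick the smallest head among the non-empty sets */
		for (unsigned i = 0; i < nr; i++)
			if (sets[i].k != sets[i].end &&
			    (!best || *sets[i].k < *best->k))
				best = &sets[i];

		if (!best)
			break;

		printf("%llu ", (unsigned long long) *best->k++);
	}
	printf("\n");	/* 1 2 3 4 5 9 10 */
	return 0;
}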
} #define SORT_CRIT (4096 / sizeof(uint64_t)) -void bch_btree_sort_lazy(struct btree *b) +void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) { unsigned crit = SORT_CRIT; int i; @@ -1103,50 +1285,32 @@ void bch_btree_sort_lazy(struct btree *b) if (!b->nsets) goto out; - /* If not a leaf node, always sort */ - if (b->level) { - bch_btree_sort(b); - return; - } - for (i = b->nsets - 1; i >= 0; --i) { - crit *= b->c->sort_crit_factor; + crit *= state->crit_factor; - if (b->sets[i].data->keys < crit) { - bch_btree_sort_partial(b, i); + if (b->set[i].data->keys < crit) { + bch_btree_sort_partial(b, i, state); return; } } /* Sort if we'd overflow */ if (b->nsets + 1 == MAX_BSETS) { - bch_btree_sort(b); + bch_btree_sort(b, state); return; } out: - bset_build_written_tree(b); + bch_bset_build_written_tree(b); } +EXPORT_SYMBOL(bch_btree_sort_lazy); -/* Sysfs stuff */ - -struct bset_stats { - size_t nodes; - size_t sets_written, sets_unwritten; - size_t bytes_written, bytes_unwritten; - size_t floats, failed; -}; - -static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, - struct bset_stats *stats) +void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { - struct bkey *k; unsigned i; - stats->nodes++; - for (i = 0; i <= b->nsets; i++) { - struct bset_tree *t = &b->sets[i]; + struct bset_tree *t = &b->set[i]; size_t bytes = t->data->keys * sizeof(uint64_t); size_t j; @@ -1164,43 +1328,4 @@ static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, stats->bytes_unwritten += bytes; } } - - if (b->level) { - struct btree_iter iter; - - for_each_key_filter(b, k, &iter, bch_ptr_bad) { - int ret = btree(bset_stats, k, b, op, stats); - if (ret) - return ret; - } - } - - return 0; -} - -int bch_bset_print_stats(struct cache_set *c, char *buf) -{ - struct btree_op op; - struct bset_stats t; - int ret; - - bch_btree_op_init_stack(&op); - memset(&t, 0, sizeof(struct bset_stats)); - - ret = btree_root(bset_stats, c, &op, &t); - if (ret) - return ret; - - return snprintf(buf, PAGE_SIZE, - "btree nodes: %zu\n" - "written sets: %zu\n" - "unwritten sets: %zu\n" - "written key bytes: %zu\n" - "unwritten key bytes: %zu\n" - "floats: %zu\n" - "failed: %zu\n", - t.nodes, - t.sets_written, t.sets_unwritten, - t.bytes_written, t.bytes_unwritten, - t.floats, t.failed); } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index ae115a253d7..5f6728d5d4d 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,7 +1,11 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H -#include <linux/slab.h> +#include <linux/bcache.h> +#include <linux/kernel.h> +#include <linux/types.h> + +#include "util.h" /* for time_stats */ /* * BKEYS: @@ -142,17 +146,13 @@ * first key in that range of bytes again. 
*/ -/* Btree key comparison/iteration */ +struct btree_keys; +struct btree_iter; +struct btree_iter_set; +struct bkey_float; #define MAX_BSETS 4U -struct btree_iter { - size_t size, used; - struct btree_iter_set { - struct bkey *k, *end; - } data[MAX_BSETS]; -}; - struct bset_tree { /* * We construct a binary tree in an array as if the array @@ -162,14 +162,14 @@ struct bset_tree { */ /* size of the binary tree and prev array */ - unsigned size; + unsigned size; /* function of size - precalculated for to_inorder() */ - unsigned extra; + unsigned extra; /* copy of the last key in the set */ - struct bkey end; - struct bkey_float *tree; + struct bkey end; + struct bkey_float *tree; /* * The nodes in the bset tree point to specific keys - this @@ -179,96 +179,227 @@ struct bset_tree { * to keep bkey_float to 4 bytes and prev isn't used in the fast * path. */ - uint8_t *prev; + uint8_t *prev; /* The actual btree node, with pointers to each sorted set */ - struct bset *data; + struct bset *data; }; -static __always_inline int64_t bkey_cmp(const struct bkey *l, - const struct bkey *r) +struct btree_keys_ops { + bool (*sort_cmp)(struct btree_iter_set, + struct btree_iter_set); + struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); + bool (*insert_fixup)(struct btree_keys *, struct bkey *, + struct btree_iter *, struct bkey *); + bool (*key_invalid)(struct btree_keys *, + const struct bkey *); + bool (*key_bad)(struct btree_keys *, const struct bkey *); + bool (*key_merge)(struct btree_keys *, + struct bkey *, struct bkey *); + void (*key_to_text)(char *, size_t, const struct bkey *); + void (*key_dump)(struct btree_keys *, const struct bkey *); + + /* + * Only used for deciding whether to use START_KEY(k) or just the key + * itself in a couple places + */ + bool is_extents; +}; + +struct btree_keys { + const struct btree_keys_ops *ops; + uint8_t page_order; + uint8_t nsets; + unsigned last_set_unwritten:1; + bool *expensive_debug_checks; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree set[MAX_BSETS]; +}; + +static inline struct bset_tree *bset_tree_last(struct btree_keys *b) { - return unlikely(KEY_INODE(l) != KEY_INODE(r)) - ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) - : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); + return b->set + b->nsets; } -static inline size_t bkey_u64s(const struct bkey *k) +static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) { - BUG_ON(KEY_CSUM(k) > 1); - return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 
1 : 0); + return t <= b->set + b->nsets - b->last_set_unwritten; } -static inline size_t bkey_bytes(const struct bkey *k) +static inline bool bkey_written(struct btree_keys *b, struct bkey *k) { - return bkey_u64s(k) * sizeof(uint64_t); + return !b->last_set_unwritten || k < b->set[b->nsets].data->start; } -static inline void bkey_copy(struct bkey *dest, const struct bkey *src) +static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) { - memcpy(dest, src, bkey_bytes(src)); + return ((size_t) i) - ((size_t) b->set->data); } -static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) +static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) { - if (!src) - src = &KEY(0, 0, 0); + return bset_byte_offset(b, i) >> 9; +} + +#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) +#define set_bytes(i) __set_bytes(i, i->keys) + +#define __set_blocks(i, k, block_bytes) \ + DIV_ROUND_UP(__set_bytes(i, k), block_bytes) +#define set_blocks(i, block_bytes) \ + __set_blocks(i, (i)->keys, block_bytes) - SET_KEY_INODE(dest, KEY_INODE(src)); - SET_KEY_OFFSET(dest, KEY_OFFSET(src)); +static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON((PAGE_SIZE << b->page_order) < + (bset_byte_offset(b, t->data) + set_bytes(t->data))); + + if (!b->last_set_unwritten) + return 0; + + return ((PAGE_SIZE << b->page_order) - + (bset_byte_offset(b, t->data) + set_bytes(t->data))) / + sizeof(u64); } -static inline struct bkey *bkey_next(const struct bkey *k) +static inline struct bset *bset_next_set(struct btree_keys *b, + unsigned block_bytes) { - uint64_t *d = (void *) k; - return (struct bkey *) (d + bkey_u64s(k)); + struct bset *i = bset_tree_last(b)->data; + + return ((void *) i) + roundup(set_bytes(i), block_bytes); } -/* Keylists */ +void bch_btree_keys_free(struct btree_keys *); +int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); +void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, + bool *); + +void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); +void bch_bset_build_written_tree(struct btree_keys *); +void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); +bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); +void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); +unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, + struct bkey *); + +enum { + BTREE_INSERT_STATUS_NO_INSERT = 0, + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; -struct keylist { - struct bkey *top; - union { - uint64_t *list; - struct bkey *bottom; - }; +/* Btree key iteration */ - /* Enough room for btree_split's keys without realloc */ -#define KEYLIST_INLINE 16 - uint64_t d[KEYLIST_INLINE]; +struct btree_iter { + size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree_keys *b; +#endif + struct btree_iter_set { + struct bkey *k, *end; + } data[MAX_BSETS]; }; -static inline void bch_keylist_init(struct keylist *l) +typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); + +struct bkey *bch_btree_iter_next(struct btree_iter *); +struct bkey *bch_btree_iter_next_filter(struct btree_iter *, + struct btree_keys *, ptr_filter_fn); + +void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); +struct bkey *bch_btree_iter_init(struct btree_keys *, struct 
btree_iter *, + struct bkey *); + +struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, + const struct bkey *); + +/* + * Returns the first key that is strictly greater than search + */ +static inline struct bkey *bch_bset_search(struct btree_keys *b, + struct bset_tree *t, + const struct bkey *search) { - l->top = (void *) (l->list = l->d); + return search ? __bch_bset_search(b, t, search) : t->data->start; } -static inline void bch_keylist_push(struct keylist *l) +#define for_each_key_filter(b, k, iter, filter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next_filter((iter), (b), filter));) + +#define for_each_key(b, k, iter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next(iter));) + +/* Sorting */ + +struct bset_sort_state { + mempool_t *pool; + + unsigned page_order; + unsigned crit_factor; + + struct time_stats time; +}; + +void bch_bset_sort_state_free(struct bset_sort_state *); +int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); +void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); +void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, + struct bset_sort_state *); +void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, + struct bset_sort_state *); +void bch_btree_sort_partial(struct btree_keys *, unsigned, + struct bset_sort_state *); + +static inline void bch_btree_sort(struct btree_keys *b, + struct bset_sort_state *state) { - l->top = bkey_next(l->top); + bch_btree_sort_partial(b, 0, state); } -static inline void bch_keylist_add(struct keylist *l, struct bkey *k) +struct bset_stats { + size_t sets_written, sets_unwritten; + size_t bytes_written, bytes_unwritten; + size_t floats, failed; +}; + +void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); + +/* Bkey utility code */ + +#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) + +static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) { - bkey_copy(l->top, k); - bch_keylist_push(l); + return bkey_idx(i->start, idx); } -static inline bool bch_keylist_empty(struct keylist *l) +static inline void bkey_init(struct bkey *k) { - return l->top == (void *) l->list; + *k = ZERO_KEY; } -static inline void bch_keylist_free(struct keylist *l) +static __always_inline int64_t bkey_cmp(const struct bkey *l, + const struct bkey *r) { - if (l->list != l->d) - kfree(l->list); + return unlikely(KEY_INODE(l) != KEY_INODE(r)) + ? 
(int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) + : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); } -void bch_keylist_copy(struct keylist *, struct keylist *); -struct bkey *bch_keylist_pop(struct keylist *); -int bch_keylist_realloc(struct keylist *, int, struct cache_set *); - void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, unsigned); bool __bch_cut_front(const struct bkey *, struct bkey *); @@ -286,98 +417,150 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) return __bch_cut_back(where, k); } -const char *bch_ptr_status(struct cache_set *, const struct bkey *); -bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); -bool bch_ptr_bad(struct btree *, const struct bkey *); - -static inline uint8_t gen_after(uint8_t a, uint8_t b) +#define PRECEDING_KEY(_k) \ +({ \ + struct bkey *_ret = NULL; \ + \ + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ + \ + if (!_ret->low) \ + _ret->high--; \ + _ret->low--; \ + } \ + \ + _ret; \ +}) + +static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) { - uint8_t r = a - b; - return r > 128U ? 0 : r; + return b->ops->key_invalid(b, k); } -static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, - unsigned i) +static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) { - return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); + return b->ops->key_bad(b, k); } -static inline bool ptr_available(struct cache_set *c, const struct bkey *k, - unsigned i) +static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, + size_t size, const struct bkey *k) { - return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); + return b->ops->key_to_text(buf, size, k); } +static inline bool bch_bkey_equal_header(const struct bkey *l, + const struct bkey *r) +{ + return (KEY_DIRTY(l) == KEY_DIRTY(r) && + KEY_PTRS(l) == KEY_PTRS(r) && + KEY_CSUM(l) == KEY_CSUM(r)); +} -typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); +/* Keylists */ -struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); -struct bkey *bch_btree_iter_next(struct btree_iter *); -struct bkey *bch_btree_iter_next_filter(struct btree_iter *, - struct btree *, ptr_filter_fn); +struct keylist { + union { + struct bkey *keys; + uint64_t *keys_p; + }; + union { + struct bkey *top; + uint64_t *top_p; + }; -void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); -struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, - struct bkey *, struct bset_tree *); + /* Enough room for btree_split's keys without realloc */ +#define KEYLIST_INLINE 16 + uint64_t inline_keys[KEYLIST_INLINE]; }; -/* 32 bits total: */ -#define BKEY_MID_BITS 3 -#define BKEY_EXPONENT_BITS 7 -#define BKEY_MANTISSA_BITS 22 -#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) +static inline void bch_keylist_init(struct keylist *l) +{ + l->top_p = l->keys_p = l->inline_keys; +} -struct bkey_float { - unsigned exponent:BKEY_EXPONENT_BITS; - unsigned m:BKEY_MID_BITS; - unsigned mantissa:BKEY_MANTISSA_BITS; -} __packed; +static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k) +{ + l->keys = k; + l->top = bkey_next(k); +} -/* - * BSET_CACHELINE was originally intended to match the hardware cacheline size - - * it used to be 64, but I realized the lookup code would touch slightly less - * memory if it was 128.
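PRECEDING_KEY() above backs a search key up by one position by decrementing the 128-bit (high, low) pair with a borrow; in the real struct bkey the high word also carries header bits such as the size field, which is zero at that point. A sketch of just the borrow step (names are illustrative):

#include <stdint.h>
#include <stdio.h>

static void preceding(uint64_t *hi, uint64_t *lo)
{
	if (!*lo)
		(*hi)--;	/* borrow from the high word */
	(*lo)--;
}

int main(void)
{
	uint64_t hi = 1, lo = 0;	/* e.g. inode 1, offset 0 */

	preceding(&hi, &lo);
	printf("hi=%llu lo=%llu\n",
	       (unsigned long long) hi, (unsigned long long) lo);
	/* hi=0 lo=18446744073709551615: the position just before (1, 0) */
	return 0;
}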
- * - * It definites the number of bytes (in struct bset) per struct bkey_float in - * the auxiliar search tree - when we're done searching the bset_float tree we - * have this many bytes left that we do a linear search over. - * - * Since (after level 5) every level of the bset_tree is on a new cacheline, - * we're touching one fewer cacheline in the bset tree in exchange for one more - * cacheline in the linear search - but the linear search might stop before it - * gets to the second cacheline. - */ +static inline void bch_keylist_push(struct keylist *l) +{ + l->top = bkey_next(l->top); +} -#define BSET_CACHELINE 128 -#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) +static inline void bch_keylist_add(struct keylist *l, struct bkey *k) +{ + bkey_copy(l->top, k); + bch_keylist_push(l); +} -#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) -#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) +static inline bool bch_keylist_empty(struct keylist *l) +{ + return l->top == l->keys; +} -void bch_bset_init_next(struct btree *); +static inline void bch_keylist_reset(struct keylist *l) +{ + l->top = l->keys; +} -void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); -void bch_bset_fix_lookup_table(struct btree *, struct bkey *); +static inline void bch_keylist_free(struct keylist *l) +{ + if (l->keys_p != l->inline_keys) + kfree(l->keys_p); +} -struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, - const struct bkey *); +static inline size_t bch_keylist_nkeys(struct keylist *l) +{ + return l->top_p - l->keys_p; +} -static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, - const struct bkey *search) +static inline size_t bch_keylist_bytes(struct keylist *l) { - return search ? __bch_bset_search(b, t, search) : t->data->start; + return bch_keylist_nkeys(l) * sizeof(uint64_t); } -bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); -void bch_btree_sort_lazy(struct btree *); -void bch_btree_sort_into(struct btree *, struct btree *); -void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); -void bch_btree_sort_partial(struct btree *, unsigned); +struct bkey *bch_keylist_pop(struct keylist *); +void bch_keylist_pop_front(struct keylist *); +int __bch_keylist_realloc(struct keylist *, unsigned); + +/* Debug stuff */ + +#ifdef CONFIG_BCACHE_DEBUG + +int __bch_count_data(struct btree_keys *); +void __bch_check_keys(struct btree_keys *, const char *, ...); +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); +void bch_dump_bucket(struct btree_keys *); + +#else + +static inline int __bch_count_data(struct btree_keys *b) { return -1; } +static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} +static inline void bch_dump_bucket(struct btree_keys *b) {} +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); + +#endif + +static inline bool btree_keys_expensive_checks(struct btree_keys *b) +{ +#ifdef CONFIG_BCACHE_DEBUG + return *b->expensive_debug_checks; +#else + return false; +#endif +} -static inline void bch_btree_sort(struct btree *b) +static inline int bch_count_data(struct btree_keys *b) { - bch_btree_sort_partial(b, 0); + return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; } -int bch_bset_print_stats(struct cache_set *, char *); +#define bch_check_keys(b, ...) 
\ +do { \ + if (btree_keys_expensive_checks(b)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) #endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f9764e61978..7347b610096 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -23,12 +23,13 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" -#include "writeback.h" +#include "extents.h" #include <linux/slab.h> #include <linux/bitops.h> +#include <linux/freezer.h> #include <linux/hash.h> +#include <linux/kthread.h> #include <linux/prefetch.h> #include <linux/random.h> #include <linux/rcupdate.h> @@ -67,15 +68,11 @@ * alloc_bucket() cannot fail. This should be true but is not completely * obvious. * - * Make sure all allocations get charged to the root cgroup - * * Plugging? * * If data write is less than hard sector size of ssd, round up offset in open * bucket to the next whole sector * - * Also lookup by cgroup in get_open_bucket() - * * Superblock needs to be fleshed out for multiple cache devices * * Add a sysfs tunable for the number of writeback IOs in flight @@ -88,15 +85,6 @@ * Test module load/unload */ -static const char * const op_types[] = { - "insert", "replace" -}; - -static const char *op_type(struct btree_op *op) -{ - return op_types[op->type]; -} - #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 @@ -105,23 +93,96 @@ static const char *op_type(struct btree_op *op) #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -struct workqueue_struct *bch_gc_wq; -static struct workqueue_struct *btree_io_wq; +#define insert_lock(s, b) ((b)->level <= (s)->lock) + +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ + if (!IS_ERR(_child)) { \ + _child->parent = (b); \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...) 
\ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _b->parent = NULL; \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -EINTR) \ + schedule(); \ + } while (_r == -EINTR); \ + \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ + _r; \ +}) + +static inline struct bset *write_block(struct btree *b) +{ + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); +} + +static void bch_btree_init_next(struct btree *b) +{ + /* If not a leaf node, always sort */ + if (b->level && b->keys.nsets) + bch_btree_sort(&b->keys, &b->c->sort); + else + bch_btree_sort_lazy(&b->keys, &b->c->sort); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); -void bch_btree_op_init_stack(struct btree_op *op) -{ - memset(op, 0, sizeof(struct btree_op)); - closure_init_stack(&op->cl); - op->lock = -1; - bch_keylist_init(&op->keys); } /* Btree key manipulation */ -static void bkey_put(struct cache_set *c, struct bkey *k, int level) +void bkey_put(struct cache_set *c, struct bkey *k) { - if ((level && KEY_OFFSET(k)) || !level) - __bkey_put(c, k); + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); } /* Btree IO */ @@ -129,38 +190,43 @@ static void bkey_put(struct cache_set *c, struct bkey *k, int level) static uint64_t btree_csum_set(struct btree *b, struct bset *i) { uint64_t crc = b->key.ptr[0]; - void *data = (void *) i + 8, *end = end(i); + void *data = (void *) i + 8, *end = bset_bkey_last(i); crc = bch_crc64_update(crc, data, end - data); return crc ^ 0xffffffffffffffffULL; } -static void bch_btree_node_read_done(struct btree *b) +void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; - struct bset *i = b->sets[0].data; + struct bset *i = btree_bset_first(b); struct btree_iter *iter; iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG + iter->b = &b->keys; +#endif + if (!i->seq) goto err; for (; - b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; + b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; i = write_block(b)) { err = "unsupported bset version"; if (i->version > BCACHE_BSET_VERSION) goto err; err = "bad btree header"; - if (b->written + set_blocks(i, b->c) > btree_blocks(b)) + if (b->written + set_blocks(i, block_bytes(b->c)) > + btree_blocks(b)) goto err; err = "bad magic"; - if (i->magic != bset_magic(b->c)) + if (i->magic != bset_magic(&b->c->sb)) goto err; err = "bad checksum"; @@ -176,39 +242,40 @@ static void bch_btree_node_read_done(struct btree *b) } err = "empty set"; - if (i != b->sets[0].data && !i->keys) + if (i != b->keys.set[0].data && !i->keys) goto err; - bch_btree_iter_push(iter, i->start, end(i)); + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); - b->written += set_blocks(i, b->c); + b->written += set_blocks(i, block_bytes(b->c)); } err = "corrupted btree"; for (i = write_block(b); - index(i, b) < btree_blocks(b); + bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); i = ((void *) i) + block_bytes(b->c)) - if (i->seq == b->sets[0].data->seq) + if (i->seq == b->keys.set[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(b, iter); + 
bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); - i = b->sets[0].data; + i = b->keys.set[0].data; err = "short btree key"; - if (b->sets[0].size && - bkey_cmp(&b->key, &b->sets[0].end) < 0) + if (b->keys.set[0].size && + bkey_cmp(&b->key, &b->keys.set[0].end) < 0) goto err; if (b->written < btree_blocks(b)) - bch_bset_init_next(b); + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); out: mempool_free(iter, b->c->fill_iter); return; err: set_btree_node_io_error(b); - bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", + bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", err, PTR_BUCKET_NR(b->c, &b->key, 0), - index(i, b), i->keys); + bset_block_offset(b, i), i->keys); goto out; } @@ -218,7 +285,7 @@ static void btree_node_read_endio(struct bio *bio, int error) closure_put(cl); } -void bch_btree_node_read(struct btree *b) +static void bch_btree_node_read(struct btree *b) { uint64_t start_time = local_clock(); struct closure cl; @@ -230,11 +297,11 @@ void bch_btree_node_read(struct btree *b) bio = bch_bbio_alloc(b->c); bio->bi_rw = REQ_META|READ_SYNC; - bio->bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; bio->bi_end_io = btree_node_read_endio; bio->bi_private = &cl; - bch_bio_map(bio, b->sets[0].data); + bch_bio_map(bio, b->keys.set[0].data); bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); @@ -248,14 +315,11 @@ void bch_btree_node_read(struct btree *b) goto err; bch_btree_node_read_done(b); - - spin_lock(&b->c->btree_read_time_lock); bch_time_stats_update(&b->c->btree_read_time, start_time); - spin_unlock(&b->c->btree_read_time_lock); return; err: - bch_cache_set_error(b->c, "io error reading bucket %lu", + bch_cache_set_error(b->c, "io error reading bucket %zu", PTR_BUCKET_NR(b->c, &b->key, 0)); } @@ -274,9 +338,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) w->journal = NULL; } +static void btree_node_write_unlock(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + + up(&b->io_mutex); +} + static void __btree_node_write_done(struct closure *cl) { - struct btree *b = container_of(cl, struct btree, io.cl); + struct btree *b = container_of(cl, struct btree, io); struct btree_write *w = btree_prev_write(b); bch_bbio_free(b->bio, b->c); @@ -284,19 +355,18 @@ static void __btree_node_write_done(struct closure *cl) btree_complete_write(b, w); if (btree_node_dirty(b)) - queue_delayed_work(btree_io_wq, &b->work, - msecs_to_jiffies(30000)); + schedule_delayed_work(&b->work, 30 * HZ); - closure_return(cl); + closure_return_with_destructor(cl, btree_node_write_unlock); } static void btree_node_write_done(struct closure *cl) { - struct btree *b = container_of(cl, struct btree, io.cl); + struct btree *b = container_of(cl, struct btree, io); struct bio_vec *bv; int n; - __bio_for_each_segment(bv, b->bio, n, 0) + bio_for_each_segment_all(bv, b->bio, n) __free_page(bv->bv_page); __btree_node_write_done(cl); @@ -305,7 +375,7 @@ static void btree_node_write_done(struct closure *cl) static void btree_node_write_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct btree *b = container_of(cl, struct btree, io.cl); + struct btree *b = container_of(cl, struct btree, io); if (error) set_btree_node_io_error(b); @@ -316,8 +386,8 @@ static void btree_node_write_endio(struct bio *bio, int error) static void do_btree_node_write(struct btree *b) { - struct closure *cl = &b->io.cl; - struct bset *i = b->sets[b->nsets].data; + 
struct closure *cl = &b->io; + struct bset *i = btree_bset_last(b); BKEY_PADDED(key) k; i->version = BCACHE_BSET_VERSION; @@ -327,9 +397,9 @@ static void do_btree_node_write(struct btree *b) b->bio = bch_bbio_alloc(b->c); b->bio->bi_end_io = btree_node_write_endio; - b->bio->bi_private = &b->io.cl; + b->bio->bi_private = cl; b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; - b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); bch_bio_map(b->bio, i); /* @@ -348,14 +418,15 @@ static void do_btree_node_write(struct btree *b) */ bkey_copy(&k.key, &b->key); - SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + + bset_sector_offset(&b->keys, i)); if (!bio_alloc_pages(b->bio, GFP_NOIO)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); - bio_for_each_segment(bv, b->bio, j) + bio_for_each_segment_all(bv, b->bio, j) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); @@ -369,75 +440,106 @@ static void do_btree_node_write(struct btree *b) bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); - __btree_node_write_done(cl); + continue_at_nobarrier(cl, __btree_node_write_done, NULL); } } -void bch_btree_node_write(struct btree *b, struct closure *parent) +void __bch_btree_node_write(struct btree *b, struct closure *parent) { - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); + + lockdep_assert_held(&b->write_lock); trace_bcache_btree_write(b); BUG_ON(current->bio_list); BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); - BUG_ON(b->sets->data->seq != i->seq); - bch_check_key_order(b, i); + BUG_ON(btree_bset_first(b)->seq != i->seq); + bch_check_keys(&b->keys, "writing"); cancel_delayed_work(&b->work); /* If caller isn't waiting for write, parent refcount is cache set */ - closure_lock(&b->io, parent ?: &b->c->cl); + down(&b->io_mutex); + closure_init(&b->io, parent ?: &b->c->cl); clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); do_btree_node_write(b); - b->written += set_blocks(i, b->c); - atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, + atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); - bch_btree_sort_lazy(b); + b->written += set_blocks(i, block_bytes(b->c)); +} - if (b->written < btree_blocks(b)) - bch_bset_init_next(b); +void bch_btree_node_write(struct btree *b, struct closure *parent) +{ + unsigned nsets = b->keys.nsets; + + lockdep_assert_held(&b->lock); + + __bch_btree_node_write(b, parent); + + /* + * do verify if there was more than one set initially (i.e. 
we did a + * sort) and we sorted down to a single set: + */ + if (nsets && !b->keys.nsets) + bch_btree_verify(b); + + bch_btree_init_next(b); +} + +static void bch_btree_node_write_sync(struct btree *b) +{ + struct closure cl; + + closure_init_stack(&cl); + + mutex_lock(&b->write_lock); + bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); } static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); - rw_lock(true, b, b->level); - + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - rw_unlock(true, b); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); } -static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); struct btree_write *w = btree_current_write(b); + lockdep_assert_held(&b->write_lock); + BUG_ON(!b->written); BUG_ON(!i->keys); if (!btree_node_dirty(b)) - queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); + schedule_delayed_work(&b->work, 30 * HZ); set_btree_node_dirty(b); - if (op && op->journal) { + if (journal_ref) { if (w->journal && - journal_pin_cmp(b->c, w, op)) { + journal_pin_cmp(b->c, w->journal, journal_ref)) { atomic_dec_bug(w->journal); w->journal = NULL; } if (!w->journal) { - w->journal = op->journal; + w->journal = journal_ref; atomic_inc(w->journal); } } @@ -453,53 +555,19 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) * mca -> memory cache */ -static void mca_reinit(struct btree *b) -{ - unsigned i; - - b->flags = 0; - b->written = 0; - b->nsets = 0; - - for (i = 0; i < MAX_BSETS; i++) - b->sets[i].size = 0; - /* - * Second loop starts at 1 because b->sets[0]->data is the memory we - * allocated - */ - for (i = 1; i < MAX_BSETS; i++) - b->sets[i].data = NULL; -} - #define mca_reserve(c) (((c->root && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ - max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) + max_t(int, 0, c->btree_cache_used - mca_reserve(c)) static void mca_data_free(struct btree *b) { - struct bset_tree *t = b->sets; - BUG_ON(!closure_is_unlocked(&b->io.cl)); - - if (bset_prev_bytes(b) < PAGE_SIZE) - kfree(t->prev); - else - free_pages((unsigned long) t->prev, - get_order(bset_prev_bytes(b))); - - if (bset_tree_bytes(b) < PAGE_SIZE) - kfree(t->tree); - else - free_pages((unsigned long) t->tree, - get_order(bset_tree_bytes(b))); + BUG_ON(b->io_mutex.count != 1); - free_pages((unsigned long) t->data, b->page_order); + bch_btree_keys_free(&b->keys); - t->prev = NULL; - t->tree = NULL; - t->data = NULL; + b->c->btree_cache_used--; list_move(&b->list, &b->c->btree_cache_freed); - b->c->bucket_cache_used--; } static void mca_bucket_free(struct btree *b) @@ -518,34 +586,16 @@ static unsigned btree_order(struct bkey *k) static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) { - struct bset_tree *t = b->sets; - BUG_ON(t->data); - - b->page_order = max_t(unsigned, - ilog2(b->c->btree_pages), - btree_order(k)); - - t->data = (void *) __get_free_pages(gfp, b->page_order); - if (!t->data) - goto err; - - t->tree = bset_tree_bytes(b) < PAGE_SIZE - ? kmalloc(bset_tree_bytes(b), gfp) - : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); - if (!t->tree) - goto err; - - t->prev = bset_prev_bytes(b) < PAGE_SIZE - ? 
kmalloc(bset_prev_bytes(b), gfp) - : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); - if (!t->prev) - goto err; - - list_move(&b->list, &b->c->btree_cache); - b->c->bucket_cache_used++; - return; -err: - mca_data_free(b); + if (!bch_btree_keys_alloc(&b->keys, + max_t(unsigned, + ilog2(b->c->btree_pages), + btree_order(k)), + gfp)) { + b->c->btree_cache_used++; + list_move(&b->list, &b->c->btree_cache); + } else { + list_move(&b->list, &b->c->btree_cache_freed); + } } static struct btree *mca_bucket_alloc(struct cache_set *c, @@ -557,44 +607,56 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); + mutex_init(&b->write_lock); + lockdep_set_novalidate_class(&b->write_lock); INIT_LIST_HEAD(&b->list); INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; - closure_init_unlocked(&b->io); + sema_init(&b->io_mutex, 1); mca_data_alloc(b, k, gfp); return b; } -static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +static int mca_reap(struct btree *b, unsigned min_order, bool flush) { + struct closure cl; + + closure_init_stack(&cl); lockdep_assert_held(&b->c->bucket_lock); if (!down_write_trylock(&b->lock)) return -ENOMEM; - if (b->page_order < min_order) { - rw_unlock(true, b); - return -ENOMEM; - } + BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + if (b->keys.page_order < min_order) + goto out_unlock; - if (cl && btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - - if (cl) - closure_wait_event_async(&b->io.wait, cl, - atomic_read(&b->io.cl.remaining) == -1); + if (!flush) { + if (btree_node_dirty(b)) + goto out_unlock; - if (btree_node_dirty(b) || - !closure_is_unlocked(&b->io.cl) || - work_pending(&b->work.work)) { - rw_unlock(true, b); - return -EAGAIN; + if (down_trylock(&b->io_mutex)) + goto out_unlock; + up(&b->io_mutex); } + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) + __bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); + + /* wait for any in flight btree write */ + down(&b->io_mutex); + up(&b->io_mutex); + return 0; +out_unlock: + rw_unlock(true, b); + return -ENOMEM; } static unsigned long bch_mca_scan(struct shrinker *shrink, @@ -608,11 +670,11 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, if (c->shrinker_disabled) return SHRINK_STOP; - if (c->try_harder) + if (c->btree_cache_alloc_lock) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_WAIT) + if (sc->gfp_mask & __GFP_IO) mutex_lock(&c->bucket_lock); else if (!mutex_trylock(&c->bucket_lock)) return -1; @@ -633,26 +695,22 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, break; if (++i > 3 && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; } } - /* - * Can happen right when we first start up, before we've read in any - * btree nodes - */ - if (list_empty(&c->btree_cache)) - goto out; + for (i = 0; (nr--) && i < c->btree_cache_used; i++) { + if (list_empty(&c->btree_cache)) + goto out; - for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { b = list_first_entry(&c->btree_cache, struct btree, list); list_rotate_left(&c->btree_cache); if (!b->accessed && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); @@ -673,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, if (c->shrinker_disabled) return 0; - if 
(c->try_harder) + if (c->btree_cache_alloc_lock) return 0; return mca_can_free(c) * c->btree_pages; @@ -693,6 +751,8 @@ void bch_btree_cache_free(struct cache_set *c) #ifdef CONFIG_BCACHE_DEBUG if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); + + free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); #endif list_splice(&c->btree_cache_freeable, @@ -723,12 +783,9 @@ int bch_btree_cache_alloc(struct cache_set *c) { unsigned i; - /* XXX: doesn't check for errors */ - - closure_init_unlocked(&c->gc); - for (i = 0; i < mca_reserve(c); i++) - mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) + return -ENOMEM; list_splice_init(&c->btree_cache, &c->btree_cache_freeable); @@ -736,10 +793,13 @@ int bch_btree_cache_alloc(struct cache_set *c) #ifdef CONFIG_BCACHE_DEBUG mutex_init(&c->verify_lock); + c->verify_ondisk = (void *) + __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); if (c->verify_data && - c->verify_data->sets[0].data) + c->verify_data->keys.set->data) list_del_init(&c->verify_data->list); else c->verify_data = NULL; @@ -775,52 +835,41 @@ out: return b; } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) { - int ret = -ENOMEM; - struct btree *i; + struct task_struct *old; - trace_bcache_btree_cache_cannibalize(c); + old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); + if (old && old != current) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + return -EINTR; + } + + return 0; +} - if (!cl) - return ERR_PTR(-ENOMEM); +static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, + struct bkey *k) +{ + struct btree *b; - /* - * Trying to free up some memory - i.e. reuse some btree nodes - may - * require initiating IO to flush the dirty part of the node. If we're - * running under generic_make_request(), that IO will never finish and - * we would deadlock. Returning -EAGAIN causes the cache lookup code to - * punt to workqueue and retry. - */ - if (current->bio_list) - return ERR_PTR(-EAGAIN); + trace_bcache_btree_cache_cannibalize(c); - if (c->try_harder && c->try_harder != cl) { - closure_wait_event_async(&c->try_wait, cl, !c->try_harder); - return ERR_PTR(-EAGAIN); - } + if (mca_cannibalize_lock(c, op)) + return ERR_PTR(-EINTR); - c->try_harder = cl; - c->try_harder_start = local_clock(); -retry: - list_for_each_entry_reverse(i, &c->btree_cache, list) { - int r = mca_reap(i, cl, btree_order(k)); - if (!r) - return i; - if (r != -ENOMEM) - ret = r; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), false)) + return b; - if (ret == -EAGAIN && - closure_blocking(cl)) { - mutex_unlock(&c->bucket_lock); - closure_sync(cl); - mutex_lock(&c->bucket_lock); - goto retry; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), true)) + return b; - return ERR_PTR(ret); + WARN(1, "btree cache cannibalize failed\n"); + return ERR_PTR(-ENOMEM); } /* @@ -829,20 +878,21 @@ retry: * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. 
*/ -void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) +static void bch_cannibalize_unlock(struct cache_set *c) { - if (c->try_harder == cl) { - bch_time_stats_update(&c->try_harder_time, c->try_harder_start); - c->try_harder = NULL; - __closure_wake_up(&c->try_wait); + if (c->btree_cache_alloc_lock == current) { + c->btree_cache_alloc_lock = NULL; + wake_up(&c->btree_cache_wait); } } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level) { struct btree *b; + BUG_ON(current->bio_list); + lockdep_assert_held(&c->bucket_lock); if (mca_find(c, k)) @@ -852,16 +902,16 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, * the list. Check if there's any freed nodes there: */ list_for_each_entry(b, &c->btree_cache_freeable, list) - if (!mca_reap(b, NULL, btree_order(k))) + if (!mca_reap(b, btree_order(k), false)) goto out; /* We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, &c->btree_cache_freed, list) - if (!mca_reap(b, NULL, 0)) { + if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); - if (!b->sets[0].data) + if (!b->keys.set[0].data) goto err; else goto out; @@ -872,10 +922,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, goto err; BUG_ON(!down_write_trylock(&b->lock)); - if (!b->sets->data) + if (!b->keys.set->data) goto err; out: - BUG_ON(!closure_is_unlocked(&b->io.cl)); + BUG_ON(b->io_mutex.count != 1); bkey_copy(&b->key, k); list_move(&b->list, &c->btree_cache); @@ -883,16 +933,24 @@ out: hlist_add_head_rcu(&b->hash, mca_hash(c, k)); lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); + b->parent = (void *) ~0UL; + b->flags = 0; + b->written = 0; b->level = level; - mca_reinit(b); + if (!b->level) + bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, + &b->c->expensive_debug_checks); + else + bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, + &b->c->expensive_debug_checks); return b; err: if (b) rw_unlock(true, b); - b = mca_cannibalize(c, k, level, cl); + b = mca_cannibalize(c, op, k); if (!IS_ERR(b)) goto out; @@ -903,17 +961,15 @@ err: * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * - * If IO is necessary, it uses the closure embedded in struct btree_op to wait; - * if that closure is in non blocking mode, will return -EAGAIN. + * If IO is necessary and running under generic_make_request, returns -EAGAIN. * * The btree node will have either a read or a write lock held, depending on * level and op->lock. 
*/ -struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, - int level, struct btree_op *op) +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write) { int i = 0; - bool write = level <= op->lock; struct btree *b; BUG_ON(level < 0); @@ -925,7 +981,7 @@ retry: return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, &op->cl); + b = mca_alloc(c, op, k, level); mutex_unlock(&c->bucket_lock); if (!b) @@ -948,13 +1004,13 @@ retry: b->accessed = 1; - for (; i <= b->nsets && b->sets[i].size; i++) { - prefetch(b->sets[i].tree); - prefetch(b->sets[i].data); + for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { + prefetch(b->keys.set[i].tree); + prefetch(b->keys.set[i].data); } - for (; i <= b->nsets; i++) - prefetch(b->sets[i].data); + for (; i <= b->keys.nsets; i++) + prefetch(b->keys.set[i].data); if (btree_node_io_error(b)) { rw_unlock(write, b); @@ -971,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) struct btree *b; mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, NULL); + b = mca_alloc(c, NULL, k, level); mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { @@ -982,65 +1038,54 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) /* Btree alloc */ -static void btree_node_free(struct btree *b, struct btree_op *op) +static void btree_node_free(struct btree *b) { - unsigned i; - trace_bcache_btree_node_free(b); - /* - * The BUG_ON() in btree_node_get() implies that we must have a write - * lock on parent to free or even invalidate a node - */ - BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); + mutex_unlock(&b->write_lock); + cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); - - bch_inc_gen(PTR_CACHE(b->c, &b->key, i), - PTR_BUCKET(b->c, &b->key, i)); - } - bch_bucket_free(b->c, &b->key); mca_bucket_free(b); mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level, - struct closure *cl) +struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL)) goto err; + bkey_put(c, &k.key); SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); - b = mca_alloc(c, &k.key, level, cl); + b = mca_alloc(c, op, &k.key, level); if (IS_ERR(b)) goto err_free; if (!b) { cache_bug(c, "Tried to allocate bucket that was in btree cache"); - __bkey_put(c, &k.key); goto retry; } b->accessed = 1; - bch_bset_init_next(b); + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); mutex_unlock(&c->bucket_lock); @@ -1048,7 +1093,6 @@ retry: return b; err_free: bch_bucket_free(c, &k.key); - __bkey_put(c, &k.key); err: mutex_unlock(&c->bucket_lock); @@ -1057,18 +1101,64 @@ err: } static struct btree *btree_node_alloc_replacement(struct btree *b, - struct closure *cl) + struct btree_op *op) { - struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); - if (!IS_ERR_OR_NULL(n)) - bch_btree_sort_into(b, n); + struct btree *n = bch_btree_node_alloc(b->c, op, 
b->level); + if (!IS_ERR_OR_NULL(n)) { + mutex_lock(&n->write_lock); + bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); + bkey_copy_key(&n->key, &b->key); + mutex_unlock(&n->write_lock); + } return n; } +static void make_btree_freeing_key(struct btree *b, struct bkey *k) +{ + unsigned i; + + mutex_lock(&b->c->bucket_lock); + + atomic_inc(&b->c->prio_blocked); + + bkey_copy(k, &b->key); + bkey_copy_key(k, &ZERO_KEY); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_GEN(k, i, + bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + PTR_BUCKET(b->c, &b->key, i))); + + mutex_unlock(&b->c->bucket_lock); +} + +static int btree_check_reserve(struct btree *b, struct btree_op *op) +{ + struct cache_set *c = b->c; + struct cache *ca; + unsigned i, reserve = (c->root->level - b->level) * 2 + 1; + + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&c->bucket_lock); + return -EINTR; + } + + mutex_unlock(&c->bucket_lock); + + return mca_cannibalize_lock(b->c, op); +} + /* Garbage collection */ -uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) +static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, + struct bkey *k) { uint8_t stale = 0; unsigned i; @@ -1088,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) g = PTR_BUCKET(c, k, i); - if (gen_after(g->gc_gen, PTR_GEN(k, i))) - g->gc_gen = PTR_GEN(k, i); + if (gen_after(g->last_gc, PTR_GEN(k, i))) + g->last_gc = PTR_GEN(k, i); if (ptr_stale(c, k, i)) { stale = max(stale, ptr_stale(c, k, i)); @@ -1105,11 +1195,13 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) SET_GC_MARK(g, GC_MARK_METADATA); else if (KEY_DIRTY(k)) SET_GC_MARK(g, GC_MARK_DIRTY); + else if (!GC_MARK(g)) + SET_GC_MARK(g, GC_MARK_RECLAIMABLE); /* guard against overflow */ SET_GC_SECTORS_USED(g, min_t(unsigned, GC_SECTORS_USED(g) + KEY_SIZE(k), - (1 << 14) - 1)); + MAX_GC_SECTORS_USED)); BUG_ON(!GC_SECTORS_USED(g)); } @@ -1119,120 +1211,143 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) -static int btree_gc_mark_node(struct btree *b, unsigned *keys, - struct gc_stat *gc) +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + !ptr_stale(c, k, i)) { + struct bucket *b = PTR_BUCKET(c, k, i); + + b->gen = PTR_GEN(k, i); + + if (level && bkey_cmp(k, &ZERO_KEY)) + b->prio = BTREE_PRIO; + else if (!level && b->prio == BTREE_PRIO) + b->prio = INITIAL_PRIO; + } + + __bch_btree_mark_key(c, level, k); +} + +static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; - unsigned last_dev = -1; - struct bcache_device *d = NULL; + unsigned keys = 0, good_keys = 0; struct bkey *k; struct btree_iter iter; struct bset_tree *t; gc->nodes++; - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { - if (last_dev != KEY_INODE(k)) { - last_dev = KEY_INODE(k); - - d = KEY_INODE(k) < b->c->nr_uuids - ? 
b->c->devices[last_dev] - : NULL; - } - + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { stale = max(stale, btree_mark_key(b, k)); + keys++; - if (bch_ptr_bad(b, k)) + if (bch_ptr_bad(&b->keys, k)) continue; - *keys += bkey_u64s(k); - gc->key_bytes += bkey_u64s(k); gc->nkeys++; + good_keys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) - gc->dirty += KEY_SIZE(k); } - for (t = b->sets; t <= &b->sets[b->nsets]; t++) + for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) btree_bug_on(t->size && - bset_written(b, t) && + bset_written(&b->keys, t) && bkey_cmp(&b->key, &t->end) < 0, b, "found short btree key in gc"); - return stale; -} - -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, - struct btree_op *op) -{ - /* - * We block priorities from being written for the duration of garbage - * collection, so we can't sleep in btree_alloc() -> - * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it - * our closure. - */ - struct btree *n = btree_node_alloc_replacement(b, NULL); - - if (!IS_ERR_OR_NULL(n)) { - swap(b, n); - __bkey_put(b->c, &b->key); + if (b->c->gc_always_rewrite) + return true; - memcpy(k->ptr, b->key.ptr, - sizeof(uint64_t) * KEY_PTRS(&b->key)); + if (stale > 10) + return true; - btree_node_free(n, op); - up_write(&n->lock); - } + if ((keys - good_keys) * 2 > keys) + return true; - return b; + return false; } -/* - * Leaving this at 2 until we've got incremental garbage collection done; it - * could be higher (and has been tested with 4) except that garbage collection - * could take much longer, adversely affecting latency. - */ -#define GC_MERGE_NODES 2U +#define GC_MERGE_NODES 4U struct gc_merge_info { struct btree *b; - struct bkey *k; unsigned keys; }; -static void btree_gc_coalesce(struct btree *b, struct btree_op *op, - struct gc_stat *gc, struct gc_merge_info *r) +static int bch_btree_insert_node(struct btree *, struct btree_op *, + struct keylist *, atomic_t *, struct bkey *); + +static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct gc_stat *gc, struct gc_merge_info *r) { - unsigned nodes = 0, keys = 0, blocks; - int i; + unsigned i, nodes = 0, keys = 0, blocks; + struct btree *new_nodes[GC_MERGE_NODES]; + struct keylist keylist; + struct closure cl; + struct bkey *k; + + bch_keylist_init(&keylist); + + if (btree_check_reserve(b, NULL)) + return 0; + + memset(new_nodes, 0, sizeof(new_nodes)); + closure_init_stack(&cl); - while (nodes < GC_MERGE_NODES && r[nodes].b) + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || - __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) - return; - - for (i = nodes - 1; i >= 0; --i) { - if (r[i].b->written) - r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); + __set_blocks(b->keys.set[0].data, keys, + block_bytes(b->c)) > blocks * (nodes - 1)) + return 0; - if (r[i].b->written) - return; + for (i = 0; i < nodes; i++) { + new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); + if (IS_ERR_OR_NULL(new_nodes[i])) + goto out_nocoalesce; } + /* + * We have to check the reserve here, after we've allocated our new + * nodes, to make sure the insert below will succeed - we also check + * before as an optimization to potentially avoid a bunch of expensive + * allocs/sorts + */ + if (btree_check_reserve(b, NULL)) + goto out_nocoalesce; + + for (i = 0; i < nodes; i++) + mutex_lock(&new_nodes[i]->write_lock); + for (i = nodes - 1; i > 0; --i) { - struct bset *n1 = 
r[i].b->sets->data; - struct bset *n2 = r[i - 1].b->sets->data; + struct bset *n1 = btree_bset_first(new_nodes[i]); + struct bset *n2 = btree_bset_first(new_nodes[i - 1]); struct bkey *k, *last = NULL; keys = 0; - if (i == 1) { + if (i > 1) { + for (k = n2->start; + k < bset_bkey_last(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), + block_bytes(b->c)) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + } else { /* * Last node we're not getting rid of - we're getting * rid of the node at r[0]. Have to try and fit all of @@ -1241,133 +1356,226 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, * length keys (shouldn't be possible in practice, * though) */ - if (__set_blocks(n1, n1->keys + r->keys, - b->c) > btree_blocks(r[i].b)) - return; + if (__set_blocks(n1, n1->keys + n2->keys, + block_bytes(b->c)) > + btree_blocks(new_nodes[i])) + goto out_nocoalesce; keys = n2->keys; + /* Take the key of the node we're getting rid of */ last = &r->b->key; - } else - for (k = n2->start; - k < end(n2); - k = bkey_next(k)) { - if (__set_blocks(n1, n1->keys + keys + - bkey_u64s(k), b->c) > blocks) - break; - - last = k; - keys += bkey_u64s(k); - } + } - BUG_ON(__set_blocks(n1, n1->keys + keys, - b->c) > btree_blocks(r[i].b)); + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > + btree_blocks(new_nodes[i])); - if (last) { - bkey_copy_key(&r[i].b->key, last); - bkey_copy_key(r[i].k, last); - } + if (last) + bkey_copy_key(&new_nodes[i]->key, last); - memcpy(end(n1), + memcpy(bset_bkey_last(n1), n2->start, - (void *) node(n2, keys) - (void *) n2->start); + (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); n1->keys += keys; + r[i].keys = n1->keys; memmove(n2->start, - node(n2, keys), - (void *) end(n2) - (void *) node(n2, keys)); + bset_bkey_idx(n2, keys), + (void *) bset_bkey_last(n2) - + (void *) bset_bkey_idx(n2, keys)); n2->keys -= keys; - r[i].keys = n1->keys; - r[i - 1].keys = n2->keys; + if (__bch_keylist_realloc(&keylist, + bkey_u64s(&new_nodes[i]->key))) + goto out_nocoalesce; + + bch_btree_node_write(new_nodes[i], &cl); + bch_keylist_add(&keylist, &new_nodes[i]->key); } - btree_node_free(r->b, op); - up_write(&r->b->lock); + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); - trace_bcache_btree_gc_coalesce(nodes); + closure_sync(&cl); + + /* We emptied out this node */ + BUG_ON(btree_bset_first(new_nodes[0])->keys); + btree_node_free(new_nodes[0]); + rw_unlock(true, new_nodes[0]); + for (i = 0; i < nodes; i++) { + if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) + goto out_nocoalesce; + + make_btree_freeing_key(r[i].b, keylist.top); + bch_keylist_push(&keylist); + } + + bch_btree_insert_node(b, op, &keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keylist)); + + for (i = 0; i < nodes; i++) { + btree_node_free(r[i].b); + rw_unlock(true, r[i].b); + + r[i].b = new_nodes[i]; + } + + memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); + r[nodes - 1].b = ERR_PTR(-EINTR); + + trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; - nodes--; - memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); - memset(&r[nodes], 0, sizeof(struct gc_merge_info)); + bch_keylist_free(&keylist); + + /* Invalidated our iterator */ + return -EINTR; + +out_nocoalesce: + closure_sync(&cl); + bch_keylist_free(&keylist); + + while ((k = bch_keylist_pop(&keylist))) + if (!bkey_cmp(k, &ZERO_KEY)) + atomic_dec(&b->c->prio_blocked); + + for (i = 0; i < nodes; i++) + if (!IS_ERR_OR_NULL(new_nodes[i])) { + 
btree_node_free(new_nodes[i]); + rw_unlock(true, new_nodes[i]); + } + return 0; } -static int btree_gc_recurse(struct btree *b, struct btree_op *op, - struct closure *writes, struct gc_stat *gc) +static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, + struct btree *replace) { - void write(struct btree *r) - { - if (!r->written) - bch_btree_node_write(r, &op->cl); - else if (btree_node_dirty(r)) - bch_btree_node_write(r, writes); + struct keylist keys; + struct btree *n; + + if (btree_check_reserve(b, NULL)) + return 0; - up_write(&r->lock); + n = btree_node_alloc_replacement(replace, NULL); + + /* recheck reserve after allocating replacement node */ + if (btree_check_reserve(b, NULL)) { + btree_node_free(n); + rw_unlock(true, n); + return 0; } - int ret = 0, stale; - unsigned i; - struct gc_merge_info r[GC_MERGE_NODES]; + bch_btree_node_write_sync(n); - memset(r, 0, sizeof(r)); + bch_keylist_init(&keys); + bch_keylist_add(&keys, &n->key); - while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); + make_btree_freeing_key(replace, keys.top); + bch_keylist_push(&keys); - if (IS_ERR(r->b)) { - ret = PTR_ERR(r->b); - break; - } + bch_btree_insert_node(b, op, &keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); - r->keys = 0; - stale = btree_gc_mark_node(r->b, &r->keys, gc); + btree_node_free(replace); + rw_unlock(true, n); - if (!b->written && - (r->b->level || stale > 10 || - b->c->gc_always_rewrite)) - r->b = btree_gc_alloc(r->b, r->k, op); + /* Invalidated our iterator */ + return -EINTR; +} + +static unsigned btree_gc_count_keys(struct btree *b) +{ + struct bkey *k; + struct btree_iter iter; + unsigned ret = 0; - if (r->b->level) - ret = btree_gc_recurse(r->b, op, writes, gc); + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) + ret += bkey_u64s(k); - if (ret) { - write(r->b); - break; + return ret; +} + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + int ret = 0; + bool should_rewrite; + struct bkey *k; + struct btree_iter iter; + struct gc_merge_info r[GC_MERGE_NODES]; + struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; + + bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + + for (i = r; i < r + ARRAY_SIZE(r); i++) + i->b = ERR_PTR(-EINTR); + + while (1) { + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + if (k) { + r->b = bch_btree_node_get(b->c, op, k, b->level - 1, + true); + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = btree_gc_count_keys(r->b); + + ret = btree_gc_coalesce(b, op, gc, r); + if (ret) + break; } - bkey_copy_key(&b->c->gc_done, r->k); + if (!last->b) + break; - if (!b->written) - btree_gc_coalesce(b, op, gc, r); + if (!IS_ERR(last->b)) { + should_rewrite = btree_gc_mark_node(last->b, gc); + if (should_rewrite) { + ret = btree_gc_rewrite_node(b, op, last->b); + if (ret) + break; + } - if (r[GC_MERGE_NODES - 1].b) - write(r[GC_MERGE_NODES - 1].b); + if (last->b->level) { + ret = btree_gc_recurse(last->b, op, writes, gc); + if (ret) + break; + } - memmove(&r[1], &r[0], - sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); + bkey_copy_key(&b->c->gc_done, &last->b->key); + + /* + * Must flush leaf nodes before gc ends, since replace + * operations aren't journalled + */ + mutex_lock(&last->b->write_lock); + if (btree_node_dirty(last->b)) + bch_btree_node_write(last->b, writes); + mutex_unlock(&last->b->write_lock); + rw_unlock(true, last->b); + } + + memmove(r + 1, r, 
sizeof(r[0]) * (GC_MERGE_NODES - 1)); + r->b = NULL; - /* When we've got incremental GC working, we'll want to do - * if (should_resched()) - * return -EAGAIN; - */ - cond_resched(); -#if 0 if (need_resched()) { ret = -EAGAIN; break; } -#endif } - for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) - write(r[i].b); - - /* Might have freed some children, must remove their keys */ - if (!b->written) - bch_btree_sort(b); + for (i = r; i < r + ARRAY_SIZE(r); i++) + if (!IS_ERR_OR_NULL(i->b)) { + mutex_lock(&i->b->write_lock); + if (btree_node_dirty(i->b)) + bch_btree_node_write(i->b, writes); + mutex_unlock(&i->b->write_lock); + rw_unlock(true, i->b); + } return ret; } @@ -1376,29 +1584,34 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { struct btree *n = NULL; - unsigned keys = 0; - int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); + int ret = 0; + bool should_rewrite; - if (b->level || stale > 10) + should_rewrite = btree_gc_mark_node(b, gc); + if (should_rewrite) { n = btree_node_alloc_replacement(b, NULL); - if (!IS_ERR_OR_NULL(n)) - swap(b, n); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); - if (b->level) - ret = btree_gc_recurse(b, op, writes, gc); + bch_btree_set_root(n); + btree_node_free(b); + rw_unlock(true, n); - if (!b->written || btree_node_dirty(b)) { - bch_btree_node_write(b, n ? &op->cl : NULL); + return -EINTR; + } } - if (!IS_ERR_OR_NULL(n)) { - closure_sync(&op->cl); - bch_btree_set_root(b); - btree_node_free(n, op); - rw_unlock(true, b); + __bch_btree_mark_key(b->c, b->level + 1, &b->key); + + if (b->level) { + ret = btree_gc_recurse(b, op, writes, gc); + if (ret) + return ret; } + bkey_copy_key(&b->c->gc_done, &b->key); + return ret; } @@ -1418,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(b, ca) { - b->gc_gen = b->gen; + b->last_gc = b->gen; if (!atomic_read(&b->pin)) { - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); } } @@ -1428,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c) mutex_unlock(&c->bucket_lock); } -size_t bch_btree_gc_finish(struct cache_set *c) +static size_t bch_btree_gc_finish(struct cache_set *c) { size_t available = 0; struct bucket *b; @@ -1441,15 +1654,32 @@ size_t bch_btree_gc_finish(struct cache_set *c) c->gc_mark_valid = 1; c->need_gc = 0; - if (c->root) - for (i = 0; i < KEY_PTRS(&c->root->key); i++) - SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), - GC_MARK_METADATA); - for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), GC_MARK_METADATA); + /* don't reclaim buckets to which writeback keys point */ + rcu_read_lock(); + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + struct cached_dev *dc; + struct keybuf_key *w, *n; + unsigned j; + + if (!d || UUID_FLASH_ONLY(&c->uuids[i])) + continue; + dc = container_of(d, struct cached_dev, disk); + + spin_lock(&dc->writeback_keys.lock); + rbtree_postorder_for_each_entry_safe(w, n, + &dc->writeback_keys.keys, node) + for (j = 0; j < KEY_PTRS(&w->key); j++) + SET_GC_MARK(PTR_BUCKET(c, &w->key, j), + GC_MARK_DIRTY); + spin_unlock(&dc->writeback_keys.lock); + } + rcu_read_unlock(); + for_each_cache(ca, c, i) { uint64_t *i; @@ -1463,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c) SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); for_each_bucket(b, ca) { - b->last_gc = b->gc_gen; c->need_gc = max(c->need_gc, bucket_gc_gen(b)); - if 
(!atomic_read(&b->pin) && - GC_MARK(b) == GC_MARK_RECLAIMABLE) { + if (atomic_read(&b->pin)) + continue; + + BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + + if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) available++; - if (!GC_SECTORS_USED(b)) - bch_bucket_add_unused(ca, b); - } } } @@ -1479,9 +1709,8 @@ size_t bch_btree_gc_finish(struct cache_set *c) return available; } -static void bch_btree_gc(struct closure *cl) +static void bch_btree_gc(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); int ret; unsigned long available; struct gc_stat stats; @@ -1493,632 +1722,505 @@ static void bch_btree_gc(struct closure *cl) memset(&stats, 0, sizeof(struct gc_stat)); closure_init_stack(&writes); - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + bch_btree_op_init(&op, SHRT_MAX); btree_gc_start(c); - atomic_inc(&c->prio_blocked); - - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&op.cl); - closure_sync(&writes); - - if (ret) { - pr_warn("gc failed!"); - continue_at(cl, bch_btree_gc, bch_gc_wq); - } + do { + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); - /* Possibly wait for new UUIDs or whatever to hit disk */ - bch_journal_meta(c, &op.cl); - closure_sync(&op.cl); + if (ret && ret != -EAGAIN) + pr_warn("gc failed!"); + } while (ret); available = bch_btree_gc_finish(c); - - atomic_dec(&c->prio_blocked); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); - stats.dirty <<= 9; stats.data <<= 9; stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); trace_bcache_gc_end(c); - continue_at(cl, bch_moving_gc, bch_gc_wq); + bch_moving_gc(c); } -void bch_queue_gc(struct cache_set *c) -{ - closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); -} - -/* Initial partial gc */ - -static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, - unsigned long **seen) +static int bch_gc_thread(void *arg) { - int ret; + struct cache_set *c = arg; + struct cache *ca; unsigned i; - struct bkey *k; - struct bucket *g; - struct btree_iter iter; - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { - for (i = 0; i < KEY_PTRS(k); i++) { - if (!ptr_available(b->c, k, i)) - continue; + while (1) { +again: + bch_btree_gc(c); - g = PTR_BUCKET(b->c, k, i); + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; - if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), - seen[PTR_DEV(k, i)]) || - !ptr_stale(b->c, k, i)) { - g->gen = PTR_GEN(k, i); + mutex_lock(&c->bucket_lock); - if (b->level) - g->prio = BTREE_PRIO; - else if (g->prio == BTREE_PRIO) - g->prio = INITIAL_PRIO; + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) { + mutex_unlock(&c->bucket_lock); + set_current_state(TASK_RUNNING); + goto again; } - } - - btree_mark_key(b, k); - } - - if (b->level) { - k = bch_next_recurse_key(b, &ZERO_KEY); - while (k) { - struct bkey *p = bch_next_recurse_key(b, k); - if (p) - btree_node_prefetch(b->c, p, b->level - 1); - - ret = btree(check_recurse, k, b, op, seen); - if (ret) - return ret; + mutex_unlock(&c->bucket_lock); - k = p; - } + try_to_freeze(); + schedule(); } return 0; } -int bch_btree_check(struct cache_set *c, struct btree_op *op) +int bch_gc_thread_start(struct cache_set *c) { - int ret = -ENOMEM; - unsigned i; - unsigned long *seen[MAX_CACHES_PER_SET]; + c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); + if (IS_ERR(c->gc_thread)) + 
return PTR_ERR(c->gc_thread); - memset(seen, 0, sizeof(seen)); - - for (i = 0; c->cache[i]; i++) { - size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); - seen[i] = kmalloc(n, GFP_KERNEL); - if (!seen[i]) - goto err; - - /* Disables the seen array until prio_read() uses it too */ - memset(seen[i], 0xFF, n); - } - - ret = btree_root(check_recurse, c, op, seen); -err: - for (i = 0; i < MAX_CACHES_PER_SET; i++) - kfree(seen[i]); - return ret; + set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); + return 0; } -/* Btree insertion */ - -static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) -{ - struct bset *i = b->sets[b->nsets].data; - - memmove((uint64_t *) where + bkey_u64s(insert), - where, - (void *) end(i) - (void *) where); - - i->keys += bkey_u64s(insert); - bkey_copy(where, insert); - bch_bset_fix_lookup_table(b, where); -} +/* Initial partial gc */ -static bool fix_overlapping_extents(struct btree *b, - struct bkey *insert, - struct btree_iter *iter, - struct btree_op *op) +static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { - void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) - { - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - offset, -sectors); - } - - uint64_t old_offset; - unsigned old_size, sectors_found = 0; - - while (1) { - struct bkey *k = bch_btree_iter_next(iter); - if (!k || - bkey_cmp(&START_KEY(k), insert) >= 0) - break; + int ret = 0; + struct bkey *k, *p = NULL; + struct btree_iter iter; - if (bkey_cmp(k, &START_KEY(insert)) <= 0) - continue; + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(b->c, b->level, k); - old_offset = KEY_START(k); - old_size = KEY_SIZE(k); + bch_initial_mark_key(b->c, b->level + 1, &b->key); - /* - * We might overlap with 0 size extents; we can't skip these - * because if they're in the set we're inserting to we have to - * adjust them so they don't overlap with the key we're - * inserting. But we don't want to check them for BTREE_REPLACE - * operations. 
- */ + if (b->level) { + bch_btree_iter_init(&b->keys, &iter, NULL); - if (op->type == BTREE_REPLACE && - KEY_SIZE(k)) { - /* - * k might have been split since we inserted/found the - * key we're replacing - */ - unsigned i; - uint64_t offset = KEY_START(k) - - KEY_START(&op->replace); + do { + k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad); + if (k) + btree_node_prefetch(b->c, k, b->level - 1); - /* But it must be a subset of the replace key */ - if (KEY_START(k) < KEY_START(&op->replace) || - KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) - goto check_failed; + if (p) + ret = btree(check_recurse, p, b, op); - /* We didn't find a key that we were supposed to */ - if (KEY_START(k) > KEY_START(insert) + sectors_found) - goto check_failed; + p = k; + } while (p && !ret); + } - if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) - goto check_failed; + return ret; +} - /* skip past gen */ - offset <<= 8; +int bch_btree_check(struct cache_set *c) +{ + struct btree_op op; - BUG_ON(!KEY_PTRS(&op->replace)); + bch_btree_op_init(&op, SHRT_MAX); - for (i = 0; i < KEY_PTRS(&op->replace); i++) - if (k->ptr[i] != op->replace.ptr[i] + offset) - goto check_failed; + return btree_root(check_recurse, c, &op); +} - sectors_found = KEY_OFFSET(k) - KEY_START(insert); - } +void bch_initial_gc_finish(struct cache_set *c) +{ + struct cache *ca; + struct bucket *b; + unsigned i; - if (bkey_cmp(insert, k) < 0 && - bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { - /* - * We overlapped in the middle of an existing key: that - * means we have to split the old key. But we have to do - * slightly different things depending on whether the - * old key has been written out yet. - */ + bch_btree_gc_finish(c); - struct bkey *top; - - subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); - - if (bkey_written(b, k)) { - /* - * We insert a new key to cover the top of the - * old key, and the old key is modified in place - * to represent the bottom split. - * - * It's completely arbitrary whether the new key - * is the top or the bottom, but it has to match - * up with what btree_sort_fixup() does - it - * doesn't check for this kind of overlap, it - * depends on us inserting a new key for the top - * here. - */ - top = bch_bset_search(b, &b->sets[b->nsets], - insert); - shift_keys(b, top, k); - } else { - BKEY_PADDED(key) temp; - bkey_copy(&temp.key, k); - shift_keys(b, k, &temp.key); - top = bkey_next(k); - } + mutex_lock(&c->bucket_lock); - bch_cut_front(insert, top); - bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(b, k); - return false; - } + /* + * We need to put some unused buckets directly on the prio freelist in + * order to get the allocator thread started - it needs freed buckets in + * order to rewrite the prios and gens, and it needs to rewrite prios + * and gens in order to free buckets. + * + * This is only safe for buckets that have no live data in them, which + * there should always be some of. 
+ */ + for_each_cache(ca, c, i) { + for_each_bucket(b, ca) { + if (fifo_full(&ca->free[RESERVE_PRIO])) + break; - if (bkey_cmp(insert, k) < 0) { - bch_cut_front(insert, k); - } else { - if (bkey_written(b, k) && - bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { - /* - * Completely overwrote, so we don't have to - * invalidate the binary search tree - */ - bch_cut_front(k, k); - } else { - __bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(b, k); + if (bch_can_invalidate_bucket(ca, b) && + !GC_MARK(b)) { + __bch_invalidate_one_bucket(ca, b); + fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets); } } - - subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); } -check_failed: - if (op->type == BTREE_REPLACE) { - if (!sectors_found) { - op->insert_collision = true; - return true; - } else if (sectors_found < KEY_SIZE(insert)) { - SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - - (KEY_SIZE(insert) - sectors_found)); - SET_KEY_SIZE(insert, sectors_found); - } - } - - return false; + mutex_unlock(&c->bucket_lock); } -static bool btree_insert_key(struct btree *b, struct btree_op *op, - struct bkey *k) +/* Btree insertion */ + +static bool btree_insert_key(struct btree *b, struct bkey *k, + struct bkey *replace_key) { - struct bset *i = b->sets[b->nsets].data; - struct bkey *m, *prev; - unsigned status = BTREE_INSERT_STATUS_INSERT; + unsigned status; BUG_ON(bkey_cmp(k, &b->key) > 0); - BUG_ON(b->level && !KEY_PTRS(k)); - BUG_ON(!b->level && !KEY_OFFSET(k)); - if (!b->level) { - struct btree_iter iter; - struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); + status = bch_btree_insert_key(&b->keys, k, replace_key); + if (status != BTREE_INSERT_STATUS_NO_INSERT) { + bch_check_keys(&b->keys, "%u for %s", status, + replace_key ? "replace" : "insert"); - /* - * bset_search() returns the first key that is strictly greater - * than the search key - but for back merging, we want to find - * the first key that is greater than or equal to KEY_START(k) - - * unless KEY_START(k) is 0. 
- */ - if (KEY_OFFSET(&search)) - SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); - - prev = NULL; - m = bch_btree_iter_init(b, &iter, &search); - - if (fix_overlapping_extents(b, k, &iter, op)) - return false; - - while (m != end(i) && - bkey_cmp(k, &START_KEY(m)) > 0) - prev = m, m = bkey_next(m); - - if (key_merging_disabled(b->c)) - goto insert; - - /* prev is in the tree, if we merge we're done */ - status = BTREE_INSERT_STATUS_BACK_MERGE; - if (prev && - bch_bkey_try_merge(b, prev, k)) - goto merged; - - status = BTREE_INSERT_STATUS_OVERWROTE; - if (m != end(i) && - KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) - goto copy; - - status = BTREE_INSERT_STATUS_FRONT_MERGE; - if (m != end(i) && - bch_bkey_try_merge(b, k, m)) - goto copy; + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); + return true; } else - m = bch_bset_search(b, &b->sets[b->nsets], k); - -insert: shift_keys(b, m, k); -copy: bkey_copy(m, k); -merged: - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - KEY_START(k), KEY_SIZE(k)); - - bch_check_keys(b, "%u for %s", status, op_type(op)); - - if (b->level && !KEY_OFFSET(k)) - btree_current_write(b)->prio_blocked++; - - trace_bcache_btree_insert_key(b, k, op->type, status); - - return true; + return false; } -static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) +static size_t insert_u64s_remaining(struct btree *b) { - bool ret = false; - struct bkey *k; - unsigned oldsize = bch_count_data(b); + long ret = bch_btree_keys_u64s_remaining(&b->keys); - while ((k = bch_keylist_pop(&op->keys))) { - bkey_put(b->c, k, b->level); - ret |= btree_insert_key(b, op, k); - } + /* + * Might land in the middle of an existing extent and have to split it + */ + if (b->keys.ops->is_extents) + ret -= KEY_MAX_U64S; - BUG_ON(bch_count_data(b) < oldsize); - return ret; + return max(ret, 0L); } -bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, - struct bio *bio) +static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) { bool ret = false; - uint64_t btree_ptr = b->key.ptr[0]; - unsigned long seq = b->seq; - BKEY_PADDED(k) tmp; + int oldsize = bch_count_data(&b->keys); - rw_unlock(false, b); - rw_lock(true, b, b->level); + while (!bch_keylist_empty(insert_keys)) { + struct bkey *k = insert_keys->keys; - if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1 || - should_split(b)) - goto out; + if (bkey_u64s(k) > insert_u64s_remaining(b)) + break; + + if (bkey_cmp(k, &b->key) <= 0) { + if (!b->level) + bkey_put(b->c, k); + + ret |= btree_insert_key(b, k, replace_key); + bch_keylist_pop_front(insert_keys); + } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, insert_keys->keys); - op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); + bch_cut_back(&b->key, &temp.key); + bch_cut_front(&b->key, insert_keys->keys); - SET_KEY_PTRS(&op->replace, 1); - get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); + ret |= btree_insert_key(b, &temp.key, replace_key); + break; + } else { + break; + } + } - SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); + if (!ret) + op->insert_collision = true; - bkey_copy(&tmp.k, &op->replace); + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); - BUG_ON(op->type != BTREE_INSERT); - BUG_ON(!btree_insert_key(b, op, &tmp.k)); - ret = true; -out: - downgrade_write(&b->lock); + BUG_ON(bch_count_data(&b->keys) < oldsize); return ret; } -static int btree_split(struct btree 
*b, struct btree_op *op) +static int btree_split(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) { - bool split, root = b == b->c->root; + bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); + struct closure cl; + struct keylist parent_keys; - if (b->level) - set_closure_blocking(&op->cl); + closure_init_stack(&cl); + bch_keylist_init(&parent_keys); - n1 = btree_node_alloc_replacement(b, &op->cl); + if (btree_check_reserve(b, op)) { + if (!b->level) + return -EINTR; + else + WARN(1, "insufficient reserve for split\n"); + } + + n1 = btree_node_alloc_replacement(b, op); if (IS_ERR(n1)) goto err; - split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; + split = set_blocks(btree_bset_first(n1), + block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; if (split) { unsigned keys = 0; - trace_bcache_btree_node_split(b, n1->sets[0].data->keys); + trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); - n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); + n2 = bch_btree_node_alloc(b->c, op, b->level); if (IS_ERR(n2)) goto err_free1; - if (root) { - n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); + if (!b->parent) { + n3 = bch_btree_node_alloc(b->c, op, b->level + 1); if (IS_ERR(n3)) goto err_free2; } - bch_btree_insert_keys(n1, op); + mutex_lock(&n1->write_lock); + mutex_lock(&n2->write_lock); + + bch_btree_insert_keys(n1, op, insert_keys, replace_key); - /* Has to be a linear search because we don't have an auxiliary + /* + * Has to be a linear search because we don't have an auxiliary * search tree yet */ - while (keys < (n1->sets[0].data->keys * 3) / 5) - keys += bkey_u64s(node(n1->sets[0].data, keys)); + while (keys < (btree_bset_first(n1)->keys * 3) / 5) + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), + keys)); - bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); - keys += bkey_u64s(node(n1->sets[0].data, keys)); + bkey_copy_key(&n1->key, + bset_bkey_idx(btree_bset_first(n1), keys)); + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); - n2->sets[0].data->keys = n1->sets[0].data->keys - keys; - n1->sets[0].data->keys = keys; + btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; + btree_bset_first(n1)->keys = keys; - memcpy(n2->sets[0].data->start, - end(n1->sets[0].data), - n2->sets[0].data->keys * sizeof(uint64_t)); + memcpy(btree_bset_first(n2)->start, + bset_bkey_last(btree_bset_first(n1)), + btree_bset_first(n2)->keys * sizeof(uint64_t)); bkey_copy_key(&n2->key, &b->key); - bch_keylist_add(&op->keys, &n2->key); - bch_btree_node_write(n2, &op->cl); + bch_keylist_add(&parent_keys, &n2->key); + bch_btree_node_write(n2, &cl); + mutex_unlock(&n2->write_lock); rw_unlock(true, n2); } else { - trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); + trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); - bch_btree_insert_keys(n1, op); + mutex_lock(&n1->write_lock); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); } - bch_keylist_add(&op->keys, &n1->key); - bch_btree_node_write(n1, &op->cl); + bch_keylist_add(&parent_keys, &n1->key); + bch_btree_node_write(n1, &cl); + mutex_unlock(&n1->write_lock); if (n3) { + /* Depth increases, make a new root */ + mutex_lock(&n3->write_lock); bkey_copy_key(&n3->key, &MAX_KEY); - bch_btree_insert_keys(n3, op); - bch_btree_node_write(n3, &op->cl); + bch_btree_insert_keys(n3, op, &parent_keys, NULL); + bch_btree_node_write(n3, &cl); + mutex_unlock(&n3->write_lock); - closure_sync(&op->cl); + 
closure_sync(&cl); bch_btree_set_root(n3); rw_unlock(true, n3); - } else if (root) { - op->keys.top = op->keys.bottom; - closure_sync(&op->cl); + } else if (!b->parent) { + /* Root filled up but didn't need to be split */ + closure_sync(&cl); bch_btree_set_root(n1); } else { - unsigned i; - - bkey_copy(op->keys.top, &b->key); - bkey_copy_key(op->keys.top, &ZERO_KEY); - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; + /* Split a non root node */ + closure_sync(&cl); + make_btree_freeing_key(b, parent_keys.top); + bch_keylist_push(&parent_keys); - SET_PTR_GEN(op->keys.top, i, g); - } - - bch_keylist_push(&op->keys); - closure_sync(&op->cl); - atomic_inc(&b->c->prio_blocked); + bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&parent_keys)); } + btree_node_free(b); rw_unlock(true, n1); - btree_node_free(b, op); bch_time_stats_update(&b->c->btree_split_time, start_time); return 0; err_free2: - __bkey_put(n2->c, &n2->key); - btree_node_free(n2, op); + bkey_put(b->c, &n2->key); + btree_node_free(n2); rw_unlock(true, n2); err_free1: - __bkey_put(n1->c, &n1->key); - btree_node_free(n1, op); + bkey_put(b->c, &n1->key); + btree_node_free(n1); rw_unlock(true, n1); err: + WARN(1, "bcache: btree split failed (level %u)", b->level); + if (n3 == ERR_PTR(-EAGAIN) || n2 == ERR_PTR(-EAGAIN) || n1 == ERR_PTR(-EAGAIN)) return -EAGAIN; - pr_warn("couldn't split"); return -ENOMEM; } -static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, - struct keylist *stack_keys) +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + atomic_t *journal_ref, + struct bkey *replace_key) { - if (b->level) { - int ret; - struct bkey *insert = op->keys.bottom; - struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); + struct closure cl; - if (!k) { - btree_bug(b, "no key to recurse on at level %i/%i", - b->level, b->c->root->level); + BUG_ON(b->level && replace_key); - op->keys.top = op->keys.bottom; - return -EIO; - } + closure_init_stack(&cl); - if (bkey_cmp(insert, k) > 0) { - unsigned i; + mutex_lock(&b->write_lock); - if (op->type == BTREE_REPLACE) { - __bkey_put(b->c, insert); - op->keys.top = op->keys.bottom; - op->insert_collision = true; - return 0; - } + if (write_block(b) != btree_bset_last(b) && + b->keys.last_set_unwritten) + bch_btree_init_next(b); /* just wrote a set */ - for (i = 0; i < KEY_PTRS(insert); i++) - atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); + if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { + mutex_unlock(&b->write_lock); + goto split; + } - bkey_copy(stack_keys->top, insert); + BUG_ON(write_block(b) != btree_bset_last(b)); - bch_cut_back(k, insert); - bch_cut_front(k, stack_keys->top); + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { + if (!b->level) + bch_btree_leaf_dirty(b, journal_ref); + else + bch_btree_node_write(b, &cl); + } - bch_keylist_push(stack_keys); - } + mutex_unlock(&b->write_lock); - ret = btree(insert_recurse, k, b, op, stack_keys); - if (ret) - return ret; + /* wait for btree node write if necessary, after unlock */ + closure_sync(&cl); + + return 0; +split: + if (current->bio_list) { + op->lock = b->c->root->level + 1; + return -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + return -EINTR; + } else { + /* Invalidated all iterators */ + int ret = btree_split(b, op, insert_keys, replace_key); + + if (bch_keylist_empty(insert_keys)) + 
return 0; + else if (!ret) + return -EINTR; + return ret; } +} - if (!bch_keylist_empty(&op->keys)) { - if (should_split(b)) { - if (op->lock <= b->c->root->level) { - BUG_ON(b->level); - op->lock = b->c->root->level + 1; - return -EINTR; - } - return btree_split(b, op); - } +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key) +{ + int ret = -EINTR; + uint64_t btree_ptr = b->key.ptr[0]; + unsigned long seq = b->seq; + struct keylist insert; + bool upgrade = op->lock == -1; - BUG_ON(write_block(b) != b->sets[b->nsets].data); + bch_keylist_init(&insert); - if (bch_btree_insert_keys(b, op)) { - if (!b->level) - bch_btree_leaf_dirty(b, op); - else - bch_btree_node_write(b, &op->cl); - } + if (upgrade) { + rw_unlock(false, b); + rw_lock(true, b, b->level); + + if (b->key.ptr[0] != btree_ptr || + b->seq != seq + 1) + goto out; } - return 0; + SET_KEY_PTRS(check_key, 1); + get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); + + SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); + + bch_keylist_add(&insert, check_key); + + ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); + + BUG_ON(!ret && !bch_keylist_empty(&insert)); +out: + if (upgrade) + downgrade_write(&b->lock); + return ret; } -int bch_btree_insert(struct btree_op *op, struct cache_set *c) +struct btree_insert_op { + struct btree_op op; + struct keylist *keys; + atomic_t *journal_ref; + struct bkey *replace_key; +}; + +static int btree_insert_fn(struct btree_op *b_op, struct btree *b) { - int ret = 0; - struct keylist stack_keys; + struct btree_insert_op *op = container_of(b_op, + struct btree_insert_op, op); - /* - * Don't want to block with the btree locked unless we have to, - * otherwise we get deadlocks with try_harder and between split/gc - */ - clear_closure_blocking(&op->cl); - - BUG_ON(bch_keylist_empty(&op->keys)); - bch_keylist_copy(&stack_keys, &op->keys); - bch_keylist_init(&op->keys); - - while (!bch_keylist_empty(&stack_keys) || - !bch_keylist_empty(&op->keys)) { - if (bch_keylist_empty(&op->keys)) { - bch_keylist_add(&op->keys, - bch_keylist_pop(&stack_keys)); - op->lock = 0; - } + int ret = bch_btree_insert_node(b, &op->op, op->keys, + op->journal_ref, op->replace_key); + if (ret && !bch_keylist_empty(op->keys)) + return ret; + else + return MAP_DONE; +} - ret = btree_root(insert_recurse, c, op, &stack_keys); +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key) +{ + struct btree_insert_op op; + int ret = 0; - if (ret == -EAGAIN) { - ret = 0; - closure_sync(&op->cl); - } else if (ret) { - struct bkey *k; + BUG_ON(current->bio_list); + BUG_ON(bch_keylist_empty(keys)); + + bch_btree_op_init(&op.op, 0); + op.keys = keys; + op.journal_ref = journal_ref; + op.replace_key = replace_key; + + while (!ret && !bch_keylist_empty(keys)) { + op.op.lock = 0; + ret = bch_btree_map_leaf_nodes(&op.op, c, + &START_KEY(keys->keys), + btree_insert_fn); + } - pr_err("error %i trying to insert key for %s", - ret, op_type(op)); + if (ret) { + struct bkey *k; - while ((k = bch_keylist_pop(&stack_keys) ?: - bch_keylist_pop(&op->keys))) - bkey_put(c, k, 0); - } - } + pr_err("error %i", ret); - bch_keylist_free(&stack_keys); + while ((k = bch_keylist_pop(keys))) + bkey_put(c, k); + } else if (op.op.insert_collision) + ret = -ESRCH; - if (op->journal) - atomic_dec_bug(op->journal); - op->journal = NULL; return ret; } @@ -2141,132 +2243,81 @@ void bch_btree_set_root(struct btree *b) mutex_unlock(&b->c->bucket_lock); b->c->root = b; - 
__bkey_put(b->c, &b->key); bch_journal_meta(b->c, &cl); closure_sync(&cl); } -/* Cache lookup */ +/* Map across nodes or keys */ -static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, - struct bkey *k) +static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, + btree_map_nodes_fn *fn, int flags) { - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - int ret = 0; + int ret = MAP_CONTINUE; + + if (b->level) { + struct bkey *k; + struct btree_iter iter; - while (!ret && - !op->lookup_done) { - unsigned sectors = INT_MAX; + bch_btree_iter_init(&b->keys, &iter, from); - if (KEY_INODE(k) == op->inode) { - if (KEY_START(k) <= bio->bi_sector) - break; + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad))) { + ret = btree(map_nodes_recurse, k, b, + op, from, fn, flags); + from = NULL; - sectors = min_t(uint64_t, sectors, - KEY_START(k) - bio->bi_sector); + if (ret != MAP_CONTINUE) + return ret; } - - ret = s->d->cache_miss(b, s, bio, sectors); } + if (!b->level || flags == MAP_ALL_NODES) + ret = fn(op, b); + return ret; } -/* - * Read from a single key, handling the initial cache miss if the key starts in - * the middle of the bio - */ -static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, - struct bkey *k) +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags) { - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - unsigned ptr; - struct bio *n; - - int ret = submit_partial_cache_miss(b, op, k); - if (ret || op->lookup_done) - return ret; - - /* XXX: figure out best pointer - for multiple cache devices */ - ptr = 0; - - PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; - - while (!op->lookup_done && - KEY_INODE(k) == op->inode && - bio->bi_sector < KEY_OFFSET(k)) { - struct bkey *bio_key; - sector_t sector = PTR_OFFSET(k, ptr) + - (bio->bi_sector - KEY_START(k)); - unsigned sectors = min_t(uint64_t, INT_MAX, - KEY_OFFSET(k) - bio->bi_sector); - - n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (n == bio) - op->lookup_done = true; - - bio_key = &container_of(n, struct bbio, bio)->key; - - /* - * The bucket we're reading from might be reused while our bio - * is in flight, and we could then end up reading the wrong - * data. - * - * We guard against this by checking (in cache_read_endio()) if - * the pointer is stale again; if so, we treat it as an error - * and reread from the backing device (but we don't pass that - * error up anywhere). 
- */ - - bch_bkey_copy_single_ptr(bio_key, k, ptr); - SET_PTR_OFFSET(bio_key, 0, sector); - - n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; - - __bch_submit_bbio(n, b->c); - } - - return 0; + return btree_root(map_nodes_recurse, c, op, from, fn, flags); } -int bch_btree_search_recurse(struct btree *b, struct btree_op *op) +static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags) { - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - - int ret = 0; + int ret = MAP_CONTINUE; struct bkey *k; struct btree_iter iter; - bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); - do { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); - if (!k) { - /* - * b->key would be exactly what we want, except that - * pointers to btree nodes have nonzero size - we - * wouldn't go far enough - */ + bch_btree_iter_init(&b->keys, &iter, from); - ret = submit_partial_cache_miss(b, op, - &KEY(KEY_INODE(&b->key), - KEY_OFFSET(&b->key), 0)); - break; - } + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + ret = !b->level + ? fn(op, b, k) + : btree(map_keys_recurse, k, b, op, from, fn, flags); + from = NULL; - ret = b->level - ? btree(search_recurse, k, b, op) - : submit_partial_cache_hit(b, op, k); - } while (!ret && - !op->lookup_done); + if (ret != MAP_CONTINUE) + return ret; + } + + if (!b->level && (flags & MAP_END_KEY)) + ret = fn(op, b, &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); return ret; } +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags) +{ + return btree_root(map_keys_recurse, c, op, from, fn, flags); +} + /* Keybuf code */ static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) @@ -2285,80 +2336,79 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); } -static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end, - keybuf_pred_fn *pred) -{ - struct btree_iter iter; - bch_btree_iter_init(b, &iter, &buf->last_scanned); - - while (!array_freelist_empty(&buf->freelist)) { - struct bkey *k = bch_btree_iter_next_filter(&iter, b, - bch_ptr_bad); - - if (!b->level) { - if (!k) { - buf->last_scanned = b->key; - break; - } +struct refill { + struct btree_op op; + unsigned nr_found; + struct keybuf *buf; + struct bkey *end; + keybuf_pred_fn *pred; +}; - buf->last_scanned = *k; - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; +static int refill_keybuf_fn(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct refill *refill = container_of(op, struct refill, op); + struct keybuf *buf = refill->buf; + int ret = MAP_CONTINUE; - if (pred(buf, k)) { - struct keybuf_key *w; + if (bkey_cmp(k, refill->end) >= 0) { + ret = MAP_DONE; + goto out; + } - spin_lock(&buf->lock); + if (!KEY_SIZE(k)) /* end key */ + goto out; - w = array_alloc(&buf->freelist); + if (refill->pred(buf, k)) { + struct keybuf_key *w; - w->private = NULL; - bkey_copy(&w->key, k); + spin_lock(&buf->lock); - if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) - array_free(&buf->freelist, w); + w = array_alloc(&buf->freelist); + if (!w) { + spin_unlock(&buf->lock); + return MAP_DONE; + } - spin_unlock(&buf->lock); - } - } else { - if (!k) - break; + w->private = NULL; + bkey_copy(&w->key, k); - btree(refill_keybuf, k, b, op, buf, end, pred); - /* - 
* Might get an error here, but can't really do anything - * and it'll get logged elsewhere. Just read what we - * can. - */ + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); + else + refill->nr_found++; - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; + if (array_freelist_empty(&buf->freelist)) + ret = MAP_DONE; - cond_resched(); - } + spin_unlock(&buf->lock); } - - return 0; +out: + buf->last_scanned = *k; + return ret; } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; - struct btree_op op; - bch_btree_op_init_stack(&op); + struct refill refill; cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end, pred); - closure_sync(&op.cl); + bch_btree_op_init(&refill.op, -1); + refill.nr_found = 0; + refill.buf = buf; + refill.end = end; + refill.pred = pred; + + bch_btree_map_keys(&refill.op, c, &buf->last_scanned, + refill_keybuf_fn, MAP_END_KEY); - pr_debug("found %s keys from %llu:%llu to %llu:%llu", - RB_EMPTY_ROOT(&buf->keys) ? "no" : - array_freelist_empty(&buf->freelist) ? "some" : "a few", - KEY_INODE(&start), KEY_OFFSET(&start), - KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); + trace_bcache_keyscan(refill.nr_found, + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), + KEY_OFFSET(&buf->last_scanned)); spin_lock(&buf->lock); @@ -2436,9 +2486,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) } struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, - struct keybuf *buf, - struct bkey *end, - keybuf_pred_fn *pred) + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; @@ -2466,20 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf) spin_lock_init(&buf->lock); array_allocator_init(&buf->freelist); } - -void bch_btree_exit(void) -{ - if (btree_io_wq) - destroy_workqueue(btree_io_wq); - if (bch_gc_wq) - destroy_workqueue(bch_gc_wq); -} - -int __init bch_btree_init(void) -{ - if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || - !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) - return -ENOMEM; - - return 0; -} diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 3333d372363..91dfa5e6968 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -125,24 +125,19 @@ struct btree { unsigned long seq; struct rw_semaphore lock; struct cache_set *c; + struct btree *parent; + + struct mutex write_lock; unsigned long flags; uint16_t written; /* would be nice to kill */ uint8_t level; - uint8_t nsets; - uint8_t page_order; - - /* - * Set of sorted keys - the real btree node - plus a binary search tree - * - * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point - * to the memory we have allocated for this btree node. Additionally, - * set[0]->data points to the entire btree node as it exists on disk. 
- */ - struct bset_tree sets[MAX_BSETS]; + + struct btree_keys keys; /* For outstanding btree writes, used as a lock - protects write_idx */ - struct closure_with_waitlist io; + struct closure io; + struct semaphore io_mutex; struct list_head list; struct delayed_work work; @@ -178,42 +173,27 @@ static inline struct btree_write *btree_prev_write(struct btree *b) return b->writes + (btree_node_write_idx(b) ^ 1); } -static inline unsigned bset_offset(struct btree *b, struct bset *i) +static inline struct bset *btree_bset_first(struct btree *b) { - return (((size_t) i) - ((size_t) b->sets->data)) >> 9; + return b->keys.set->data; } -static inline struct bset *write_block(struct btree *b) +static inline struct bset *btree_bset_last(struct btree *b) { - return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); + return bset_tree_last(&b->keys)->data; } -static inline bool bset_written(struct btree *b, struct bset_tree *t) +static inline unsigned bset_block_offset(struct btree *b, struct bset *i) { - return t->data < write_block(b); -} - -static inline bool bkey_written(struct btree *b, struct bkey *k) -{ - return k < write_block(b)->start; + return bset_sector_offset(&b->keys, i) >> b->c->block_bits; } static inline void set_gc_sectors(struct cache_set *c) { - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); + atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); } -static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) -{ - return __bch_ptr_invalid(b->c, b->level, k); -} - -static inline struct bkey *bch_btree_iter_init(struct btree *b, - struct btree_iter *iter, - struct bkey *search) -{ - return __bch_btree_iter_init(b, iter, search, b->sets); -} +void bkey_put(struct cache_set *c, struct bkey *k); /* Looping macros */ @@ -223,62 +203,24 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, iter++) \ hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), b, filter));) - -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) - /* Recursing down the btree */ struct btree_op { - struct closure cl; - struct cache_set *c; - - /* Journal entry we have a refcount on */ - atomic_t *journal; - - /* Bio to be inserted into the cache */ - struct bio *cache_bio; - - unsigned inode; - - uint16_t write_prio; + /* for waiting on btree reserve in btree_split() */ + wait_queue_t wait; /* Btree level at which we start taking write locks */ short lock; - /* Btree insertion type */ - enum { - BTREE_INSERT, - BTREE_REPLACE - } type:8; - - unsigned csum:1; - unsigned skip:1; - unsigned flush_journal:1; - - unsigned insert_data_done:1; - unsigned lookup_done:1; unsigned insert_collision:1; - - /* Anything after this point won't get zeroed in do_bio_hook() */ - - /* Keys to be inserted */ - struct keylist keys; - BKEY_PADDED(replace); -}; - -enum { - BTREE_INSERT_STATUS_INSERT, - BTREE_INSERT_STATUS_BACK_MERGE, - BTREE_INSERT_STATUS_OVERWROTE, - BTREE_INSERT_STATUS_FRONT_MERGE, }; -void bch_btree_op_init_stack(struct btree_op *); +static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) +{ + memset(op, 0, sizeof(struct btree_op)); + init_wait(&op->wait); + op->lock = write_lock_level; +} static inline void rw_lock(bool w, struct btree *b, int level) { @@ -290,108 +232,73 @@ static inline void 
rw_lock(bool w, struct btree *b, int level) static inline void rw_unlock(bool w, struct btree *b) { -#ifdef CONFIG_BCACHE_EDEBUG - unsigned i; - - if (w && b->key.ptr[0]) - for (i = 0; i <= b->nsets; i++) - bch_check_key_order(b, b->sets[i].data); -#endif - if (w) b->seq++; (w ? up_write : up_read)(&b->lock); } -#define insert_lock(s, b) ((b)->level <= (s)->lock) - -/* - * These macros are for recursing down the btree - they handle the details of - * locking and looking up nodes in the cache for you. They're best treated as - * mere syntax when reading code that uses them. - * - * op->lock determines whether we take a read or a write lock at a given depth. - * If you've got a read lock and find that you need a write lock (i.e. you're - * going to have to split), set op->lock and return -EINTR; btree_root() will - * call you again and you'll have the correct lock. - */ - -/** - * btree - recurse down the btree on a specified key - * @fn: function to call, which will be passed the child node - * @key: key to recurse on - * @b: parent btree node - * @op: pointer to struct btree_op - */ -#define btree(fn, key, b, op, ...) \ -({ \ - int _r, l = (b)->level - 1; \ - bool _w = l <= (op)->lock; \ - struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \ - if (!IS_ERR(_b)) { \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - rw_unlock(_w, _b); \ - } else \ - _r = PTR_ERR(_b); \ - _r; \ -}) - -/** - * btree_root - call a function on the root of the btree - * @fn: function to call, which will be passed the child node - * @c: cache set - * @op: pointer to struct btree_op - */ -#define btree_root(fn, c, op, ...) \ -({ \ - int _r = -EINTR; \ - do { \ - struct btree *_b = (c)->root; \ - bool _w = insert_lock(op, _b); \ - rw_lock(_w, _b, _b->level); \ - if (_b == (c)->root && \ - _w == insert_lock(op, _b)) \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c, &(op)->cl); \ - } while (_r == -EINTR); \ - \ - _r; \ -}) +void bch_btree_node_read_done(struct btree *); +void __bch_btree_node_write(struct btree *, struct closure *); +void bch_btree_node_write(struct btree *, struct closure *); -static inline bool should_split(struct btree *b) +void bch_btree_set_root(struct btree *); +struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int); +struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, + struct bkey *, int, bool); + +int bch_btree_insert_check_key(struct btree *, struct btree_op *, + struct bkey *); +int bch_btree_insert(struct cache_set *, struct keylist *, + atomic_t *, struct bkey *); + +int bch_gc_thread_start(struct cache_set *); +void bch_initial_gc_finish(struct cache_set *); +void bch_moving_gc(struct cache_set *); +int bch_btree_check(struct cache_set *); +void bch_initial_mark_key(struct cache_set *, int, struct bkey *); + +static inline void wake_up_gc(struct cache_set *c) { - struct bset *i = write_block(b); - return b->written >= btree_blocks(b) || - (i->seq == b->sets[0].data->seq && - b->written + __set_blocks(i, i->keys + 15, b->c) - > btree_blocks(b)); + if (c->gc_thread) + wake_up_process(c->gc_thread); } -void bch_btree_node_read(struct btree *); -void bch_btree_node_write(struct btree *, struct closure *); +#define MAP_DONE 0 +#define MAP_CONTINUE 1 -void bch_cannibalize_unlock(struct cache_set *, struct closure *); -void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); -struct btree *bch_btree_node_get(struct cache_set *, 
struct bkey *, - int, struct btree_op *); +#define MAP_ALL_NODES 0 +#define MAP_LEAF_NODES 1 + +#define MAP_END_KEY 1 -bool bch_btree_insert_check_key(struct btree *, struct btree_op *, - struct bio *); -int bch_btree_insert(struct btree_op *, struct cache_set *); +typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); +int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_nodes_fn *, int); + +static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); +} + +static inline int bch_btree_map_leaf_nodes(struct btree_op *op, + struct cache_set *c, + struct bkey *from, + btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); +} -int bch_btree_search_recurse(struct btree *, struct btree_op *); +typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, + struct bkey *); +int bch_btree_map_keys(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_keys_fn *, int); -void bch_queue_gc(struct cache_set *); -size_t bch_btree_gc_finish(struct cache_set *); -void bch_moving_gc(struct closure *); -int bch_btree_check(struct cache_set *, struct btree_op *); -uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); +typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); void bch_keybuf_init(struct keybuf *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, - keybuf_pred_fn *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, struct bkey *); void bch_keybuf_del(struct keybuf *, struct keybuf_key *); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 9aba2017f0d..7a228de95fd 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -11,47 +11,12 @@ #include "closure.h" -void closure_queue(struct closure *cl) -{ - struct workqueue_struct *wq = cl->wq; - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - BUG_ON(!queue_work(wq, &cl->work)); - } else - cl->fn(cl); -} -EXPORT_SYMBOL_GPL(closure_queue); - -#define CL_FIELD(type, field) \ - case TYPE_ ## type: \ - return &container_of(cl, struct type, cl)->field - -static struct closure_waitlist *closure_waitlist(struct closure *cl) -{ - switch (cl->type) { - CL_FIELD(closure_with_waitlist, wait); - CL_FIELD(closure_with_waitlist_and_timer, wait); - default: - return NULL; - } -} - -static struct timer_list *closure_timer(struct closure *cl) -{ - switch (cl->type) { - CL_FIELD(closure_with_timer, timer); - CL_FIELD(closure_with_waitlist_and_timer, timer); - default: - return NULL; - } -} - static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); /* Must deliver precisely one wakeup */ if (r == 1 && (flags & CLOSURE_SLEEPING)) @@ -59,23 +24,15 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) if (!r) { if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { - /* CLOSURE_BLOCKING might be set - clear it */ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); closure_queue(cl); } else { struct closure *parent = cl->parent; - struct closure_waitlist *wait = closure_waitlist(cl); closure_fn *destructor = 
cl->fn; closure_debug_destroy(cl); - smp_mb(); - atomic_set(&cl->remaining, -1); - - if (wait) - closure_wake_up(wait); - if (destructor) destructor(cl); @@ -90,21 +47,20 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } -EXPORT_SYMBOL_GPL(closure_sub); +EXPORT_SYMBOL(closure_sub); +/** + * closure_put - decrement a closure's refcount + */ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } -EXPORT_SYMBOL_GPL(closure_put); - -static void set_waiting(struct closure *cl, unsigned long f) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->waiting_on = f; -#endif -} +EXPORT_SYMBOL(closure_put); +/** + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ void __closure_wake_up(struct closure_waitlist *wait_list) { struct llist_node *list; @@ -129,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list) cl = container_of(reverse, struct closure, list); reverse = llist_next(reverse); - set_waiting(cl, 0); + closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } } -EXPORT_SYMBOL_GPL(__closure_wake_up); +EXPORT_SYMBOL(__closure_wake_up); -bool closure_wait(struct closure_waitlist *list, struct closure *cl) +/** + * closure_wait - add a closure to a waitlist + * + * @waitlist will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * + */ +bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) { if (atomic_read(&cl->remaining) & CLOSURE_WAITING) return false; - set_waiting(cl, _RET_IP_); + closure_set_waiting(cl, _RET_IP_); atomic_add(CLOSURE_WAITING + 1, &cl->remaining); - llist_add(&cl->list, &list->list); + llist_add(&cl->list, &waitlist->list); return true; } -EXPORT_SYMBOL_GPL(closure_wait); +EXPORT_SYMBOL(closure_wait); /** - * closure_sync() - sleep until a closure a closure has nothing left to wait on + * closure_sync - sleep until a closure a closure has nothing left to wait on * * Sleeps until the refcount hits 1 - the thread that's running the closure owns * the last refcount. @@ -169,93 +132,7 @@ void closure_sync(struct closure *cl) __closure_end_sleep(cl); } -EXPORT_SYMBOL_GPL(closure_sync); - -/** - * closure_trylock() - try to acquire the closure, without waiting - * @cl: closure to lock - * - * Returns true if the closure was succesfully locked. 
- */ -bool closure_trylock(struct closure *cl, struct closure *parent) -{ - if (atomic_cmpxchg(&cl->remaining, -1, - CLOSURE_REMAINING_INITIALIZER) != -1) - return false; - - closure_set_ret_ip(cl); - - smp_mb(); - cl->parent = parent; - if (parent) - closure_get(parent); - - closure_debug_create(cl); - return true; -} -EXPORT_SYMBOL_GPL(closure_trylock); - -void __closure_lock(struct closure *cl, struct closure *parent, - struct closure_waitlist *wait_list) -{ - struct closure wait; - closure_init_stack(&wait); - - while (1) { - if (closure_trylock(cl, parent)) - return; - - closure_wait_event_sync(wait_list, &wait, - atomic_read(&cl->remaining) == -1); - } -} -EXPORT_SYMBOL_GPL(__closure_lock); - -static void closure_delay_timer_fn(unsigned long data) -{ - struct closure *cl = (struct closure *) data; - closure_sub(cl, CLOSURE_TIMER + 1); -} - -void do_closure_timer_init(struct closure *cl) -{ - struct timer_list *timer = closure_timer(cl); - - init_timer(timer); - timer->data = (unsigned long) cl; - timer->function = closure_delay_timer_fn; -} -EXPORT_SYMBOL_GPL(do_closure_timer_init); - -bool __closure_delay(struct closure *cl, unsigned long delay, - struct timer_list *timer) -{ - if (atomic_read(&cl->remaining) & CLOSURE_TIMER) - return false; - - BUG_ON(timer_pending(timer)); - - timer->expires = jiffies + delay; - - atomic_add(CLOSURE_TIMER + 1, &cl->remaining); - add_timer(timer); - return true; -} -EXPORT_SYMBOL_GPL(__closure_delay); - -void __closure_flush(struct closure *cl, struct timer_list *timer) -{ - if (del_timer(timer)) - closure_sub(cl, CLOSURE_TIMER + 1); -} -EXPORT_SYMBOL_GPL(__closure_flush); - -void __closure_flush_sync(struct closure *cl, struct timer_list *timer) -{ - if (del_timer_sync(timer)) - closure_sub(cl, CLOSURE_TIMER + 1); -} -EXPORT_SYMBOL_GPL(__closure_flush_sync); +EXPORT_SYMBOL(closure_sync); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -273,7 +150,7 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL_GPL(closure_debug_create); +EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { @@ -286,7 +163,7 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL_GPL(closure_debug_destroy); +EXPORT_SYMBOL(closure_debug_destroy); static struct dentry *debug; @@ -304,14 +181,12 @@ static int debug_seq_show(struct seq_file *f, void *data) cl, (void *) cl->ip, cl->fn, cl->parent, r & CLOSURE_REMAINING_MASK); - seq_printf(f, "%s%s%s%s%s%s\n", + seq_printf(f, "%s%s%s%s\n", test_bit(WORK_STRUCT_PENDING, work_data_bits(&cl->work)) ? "Q" : "", r & CLOSURE_RUNNING ? "R" : "", - r & CLOSURE_BLOCKING ? "B" : "", r & CLOSURE_STACK ? "S" : "", - r & CLOSURE_SLEEPING ? "Sl" : "", - r & CLOSURE_TIMER ? "T" : ""); + r & CLOSURE_SLEEPING ? "Sl" : ""); if (r & CLOSURE_WAITING) seq_printf(f, " W %pF\n", diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 00039924ea9..a08e3eeac3c 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -72,30 +72,6 @@ * closure - _always_ use continue_at(). Doing so consistently will help * eliminate an entire class of particularly pernicious races. 
* - * For a closure to wait on an arbitrary event, we need to introduce waitlists: - * - * struct closure_waitlist list; - * closure_wait_event(list, cl, condition); - * closure_wake_up(wait_list); - * - * These work analagously to wait_event() and wake_up() - except that instead of - * operating on the current thread (for wait_event()) and lists of threads, they - * operate on an explicit closure and lists of closures. - * - * Because it's a closure we can now wait either synchronously or - * asynchronously. closure_wait_event() returns the current value of the - * condition, and if it returned false continue_at() or closure_sync() can be - * used to wait for it to become true. - * - * It's useful for waiting on things when you can't sleep in the context in - * which you must check the condition (perhaps a spinlock held, or you might be - * beneath generic_make_request() - in which case you can't sleep on IO). - * - * closure_wait_event() will wait either synchronously or asynchronously, - * depending on whether the closure is in blocking mode or not. You can pick a - * mode explicitly with closure_wait_event_sync() and - * closure_wait_event_async(), which do just what you might expect. - * * Lastly, you might have a wait list dedicated to a specific event, and have no * need for specifying the condition - you just want to wait until someone runs * closure_wake_up() on the appropriate wait list. In that case, just use @@ -121,55 +97,6 @@ * All this implies that a closure should typically be embedded in a particular * struct (which its refcount will normally control the lifetime of), and that * struct can very much be thought of as a stack frame. - * - * Locking: - * - * Closures are based on work items but they can be thought of as more like - * threads - in that like threads and unlike work items they have a well - * defined lifetime; they are created (with closure_init()) and eventually - * complete after a continue_at(cl, NULL, NULL). - * - * Suppose you've got some larger structure with a closure embedded in it that's - * used for periodically doing garbage collection. You only want one garbage - * collection happening at a time, so the natural thing to do is protect it with - * a lock. However, it's difficult to use a lock protecting a closure correctly - * because the unlock should come after the last continue_to() (additionally, if - * you're using the closure asynchronously a mutex won't work since a mutex has - * to be unlocked by the same process that locked it). - * - * So to make it less error prone and more efficient, we also have the ability - * to use closures as locks: - * - * closure_init_unlocked(); - * closure_trylock(); - * - * That's all we need for trylock() - the last closure_put() implicitly unlocks - * it for you. But for closure_lock(), we also need a wait list: - * - * struct closure_with_waitlist frobnicator_cl; - * - * closure_init_unlocked(&frobnicator_cl); - * closure_lock(&frobnicator_cl); - * - * A closure_with_waitlist embeds a closure and a wait list - much like struct - * delayed_work embeds a work item and a timer_list. The important thing is, use - * it exactly like you would a regular closure and closure_put() will magically - * handle everything for you. - * - * We've got closures that embed timers, too. They're called, appropriately - * enough: - * struct closure_with_timer; - * - * This gives you access to closure_delay(). 
It takes a refcount for a specified - * number of jiffies - you could then call closure_sync() (for a slightly - * convoluted version of msleep()) or continue_at() - which gives you the same - * effect as using a delayed work item, except you can reuse the work_struct - * already embedded in struct closure. - * - * Lastly, there's struct closure_with_waitlist_and_timer. It does what you - * probably expect, if you happen to need the features of both. (You don't - * really want to know how all this is implemented, but if I've done my job - * right you shouldn't have to care). */ struct closure; @@ -179,19 +106,8 @@ struct closure_waitlist { struct llist_head list; }; -enum closure_type { - TYPE_closure = 0, - TYPE_closure_with_waitlist = 1, - TYPE_closure_with_timer = 2, - TYPE_closure_with_waitlist_and_timer = 3, - MAX_CLOSURE_TYPE = 3, -}; - enum closure_state { /* - * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of - * waiting asynchronously - * * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by * the thread that owns the closure, and cleared by the thread that's * waking up the closure. @@ -200,10 +116,6 @@ enum closure_state { * - indicates that cl->task is valid and closure_put() may wake it up. * Only set or cleared by the thread that owns the closure. * - * CLOSURE_TIMER: Analagous to CLOSURE_WAITING, indicates that a closure - * has an outstanding timer. Must be set by the thread that owns the - * closure, and cleared by the timer function when the timer goes off. - * * The rest are for debugging and don't affect behaviour: * * CLOSURE_RUNNING: Set when a closure is running (i.e. by @@ -218,19 +130,17 @@ enum closure_state { * closure with this flag set */ - CLOSURE_BITS_START = (1 << 19), - CLOSURE_DESTRUCTOR = (1 << 19), - CLOSURE_BLOCKING = (1 << 21), - CLOSURE_WAITING = (1 << 23), - CLOSURE_SLEEPING = (1 << 25), - CLOSURE_TIMER = (1 << 27), + CLOSURE_BITS_START = (1 << 23), + CLOSURE_DESTRUCTOR = (1 << 23), + CLOSURE_WAITING = (1 << 25), + CLOSURE_SLEEPING = (1 << 27), CLOSURE_RUNNING = (1 << 29), CLOSURE_STACK = (1 << 31), }; #define CLOSURE_GUARD_MASK \ - ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ - CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ + CLOSURE_RUNNING|CLOSURE_STACK) << 1) #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) @@ -250,8 +160,6 @@ struct closure { atomic_t remaining; - enum closure_type type; - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG #define CLOSURE_MAGIC_DEAD 0xc054dead #define CLOSURE_MAGIC_ALIVE 0xc054a11e @@ -263,54 +171,12 @@ struct closure { #endif }; -struct closure_with_waitlist { - struct closure cl; - struct closure_waitlist wait; -}; - -struct closure_with_timer { - struct closure cl; - struct timer_list timer; -}; - -struct closure_with_waitlist_and_timer { - struct closure cl; - struct closure_waitlist wait; - struct timer_list timer; -}; - -extern unsigned invalid_closure_type(void); - -#define __CLOSURE_TYPE(cl, _t) \ - __builtin_types_compatible_p(typeof(cl), struct _t) \ - ? 
TYPE_ ## _t : \ - -#define __closure_type(cl) \ -( \ - __CLOSURE_TYPE(cl, closure) \ - __CLOSURE_TYPE(cl, closure_with_waitlist) \ - __CLOSURE_TYPE(cl, closure_with_timer) \ - __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \ - invalid_closure_type() \ -) - void closure_sub(struct closure *cl, int v); void closure_put(struct closure *cl); -void closure_queue(struct closure *cl); void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); void closure_sync(struct closure *cl); -bool closure_trylock(struct closure *cl, struct closure *parent); -void __closure_lock(struct closure *cl, struct closure *parent, - struct closure_waitlist *wait_list); - -void do_closure_timer_init(struct closure *cl); -bool __closure_delay(struct closure *cl, unsigned long delay, - struct timer_list *timer); -void __closure_flush(struct closure *cl, struct timer_list *timer); -void __closure_flush_sync(struct closure *cl, struct timer_list *timer); - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG void closure_debug_init(void); @@ -339,200 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl) #endif } -static inline void closure_get(struct closure *cl) +static inline void closure_set_waiting(struct closure *cl, unsigned long f) { #ifdef CONFIG_BCACHE_CLOSURES_DEBUG - BUG_ON((atomic_inc_return(&cl->remaining) & - CLOSURE_REMAINING_MASK) <= 1); -#else - atomic_inc(&cl->remaining); + cl->waiting_on = f; #endif } -static inline void closure_set_stopped(struct closure *cl) +static inline void __closure_end_sleep(struct closure *cl) { - atomic_sub(CLOSURE_RUNNING, &cl->remaining); + __set_current_state(TASK_RUNNING); + + if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) + atomic_sub(CLOSURE_SLEEPING, &cl->remaining); } -static inline bool closure_is_stopped(struct closure *cl) +static inline void __closure_start_sleep(struct closure *cl) { - return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); + closure_set_ip(cl); + cl->task = current; + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + atomic_add(CLOSURE_SLEEPING, &cl->remaining); } -static inline bool closure_is_unlocked(struct closure *cl) +static inline void closure_set_stopped(struct closure *cl) { - return atomic_read(&cl->remaining) == -1; + atomic_sub(CLOSURE_RUNNING, &cl->remaining); } -static inline void do_closure_init(struct closure *cl, struct closure *parent, - bool running) +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) { - switch (cl->type) { - case TYPE_closure_with_timer: - case TYPE_closure_with_waitlist_and_timer: - do_closure_timer_init(cl); - default: - break; - } - - cl->parent = parent; - if (parent) - closure_get(parent); - - if (running) { - closure_debug_create(cl); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); - } else - atomic_set(&cl->remaining, -1); - + BUG_ON(object_is_on_stack(cl)); closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; + /* between atomic_dec() in closure_put() */ + smp_mb__before_atomic(); } -/* - * Hack to get at the embedded closure if there is one, by doing an unsafe cast: - * the result of __closure_type() is thrown away, it's used merely for type - * checking. 
- */ -#define __to_internal_closure(cl) \ -({ \ - BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ - (struct closure *) cl; \ -}) - -#define closure_init_type(cl, parent, running) \ -do { \ - struct closure *_cl = __to_internal_closure(cl); \ - _cl->type = __closure_type(*(cl)); \ - do_closure_init(_cl, parent, running); \ -} while (0) +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(cl); +} /** - * __closure_init() - Initialize a closure, skipping the memset() - * - * May be used instead of closure_init() when memory has already been zeroed. + * closure_get - increment a closure's refcount */ -#define __closure_init(cl, parent) \ - closure_init_type(cl, parent, true) +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} /** - * closure_init() - Initialize a closure, setting the refcount to 1 + * closure_init - Initialize a closure, setting the refcount to 1 * @cl: closure to initialize * @parent: parent of the new closure. cl will take a refcount on it for its * lifetime; may be NULL. */ -#define closure_init(cl, parent) \ -do { \ - memset((cl), 0, sizeof(*(cl))); \ - __closure_init(cl, parent); \ -} while (0) - -static inline void closure_init_stack(struct closure *cl) +static inline void closure_init(struct closure *cl, struct closure *parent) { memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| - CLOSURE_BLOCKING|CLOSURE_STACK); -} - -/** - * closure_init_unlocked() - Initialize a closure but leave it unlocked. - * @cl: closure to initialize - * - * For when the closure will be used as a lock. The closure may not be used - * until after a closure_lock() or closure_trylock(). - */ -#define closure_init_unlocked(cl) \ -do { \ - memset((cl), 0, sizeof(*(cl))); \ - closure_init_type(cl, NULL, false); \ -} while (0) - -/** - * closure_lock() - lock and initialize a closure. - * @cl: the closure to lock - * @parent: the new parent for this closure - * - * The closure must be of one of the types that has a waitlist (otherwise we - * wouldn't be able to sleep on contention). - * - * @parent has exactly the same meaning as in closure_init(); if non null, the - * closure will take a reference on @parent which will be released when it is - * unlocked. - */ -#define closure_lock(cl, parent) \ - __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) - -/** - * closure_delay() - delay some number of jiffies - * @cl: the closure that will sleep - * @delay: the delay in jiffies - * - * Takes a refcount on @cl which will be released after @delay jiffies; this may - * be used to have a function run after a delay with continue_at(), or - * closure_sync() may be used for a convoluted version of msleep(). 
- */ -#define closure_delay(cl, delay) \ - __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) - -#define closure_flush(cl) \ - __closure_flush(__to_internal_closure(cl), &(cl)->timer) - -#define closure_flush_sync(cl) \ - __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) - -static inline void __closure_end_sleep(struct closure *cl) -{ - __set_current_state(TASK_RUNNING); + cl->parent = parent; + if (parent) + closure_get(parent); - if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) - atomic_sub(CLOSURE_SLEEPING, &cl->remaining); -} + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -static inline void __closure_start_sleep(struct closure *cl) -{ + closure_debug_create(cl); closure_set_ip(cl); - cl->task = current; - set_current_state(TASK_UNINTERRUPTIBLE); - - if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) - atomic_add(CLOSURE_SLEEPING, &cl->remaining); -} - -/** - * closure_blocking() - returns true if the closure is in blocking mode. - * - * If a closure is in blocking mode, closure_wait_event() will sleep until the - * condition is true instead of waiting asynchronously. - */ -static inline bool closure_blocking(struct closure *cl) -{ - return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; -} - -/** - * set_closure_blocking() - put a closure in blocking mode. - * - * If a closure is in blocking mode, closure_wait_event() will sleep until the - * condition is true instead of waiting asynchronously. - * - * Not thread safe - can only be called by the thread running the closure. - */ -static inline void set_closure_blocking(struct closure *cl) -{ - if (!closure_blocking(cl)) - atomic_add(CLOSURE_BLOCKING, &cl->remaining); } -/* - * Not thread safe - can only be called by the thread running the closure. - */ -static inline void clear_closure_blocking(struct closure *cl) +static inline void closure_init_stack(struct closure *cl) { - if (closure_blocking(cl)) - atomic_sub(CLOSURE_BLOCKING, &cl->remaining); + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); } /** - * closure_wake_up() - wake up all closures on a wait list. + * closure_wake_up - wake up all closures on a wait list. */ static inline void closure_wake_up(struct closure_waitlist *list) { @@ -540,96 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list) __closure_wake_up(list); } -/* - * Wait on an event, synchronously or asynchronously - analogous to wait_event() - * but for closures. - * - * The loop is oddly structured so as to avoid a race; we must check the - * condition again after we've added ourself to the waitlist. We know if we were - * already on the waitlist because closure_wait() returns false; thus, we only - * schedule or break if closure_wait() returns false. If it returns true, we - * just loop again - rechecking the condition. - * - * The __closure_wake_up() is necessary because we may race with the event - * becoming true; i.e. we see event false -> wait -> recheck condition, but the - * thread that made the event true may have called closure_wake_up() before we - * added ourself to the wait list. - * - * We have to call closure_sync() at the end instead of just - * __closure_end_sleep() because a different thread might've called - * closure_wake_up() before us and gotten preempted before they dropped the - * refcount on our closure. If this was a stack allocated closure, that would be - * bad. 
- */ -#define __closure_wait_event(list, cl, condition, _block) \ -({ \ - bool block = _block; \ - typeof(condition) ret; \ - \ - while (1) { \ - ret = (condition); \ - if (ret) { \ - __closure_wake_up(list); \ - if (block) \ - closure_sync(cl); \ - \ - break; \ - } \ - \ - if (block) \ - __closure_start_sleep(cl); \ - \ - if (!closure_wait(list, cl)) { \ - if (!block) \ - break; \ - \ - schedule(); \ - } \ - } \ - \ - ret; \ -}) - /** - * closure_wait_event() - wait on a condition, synchronously or asynchronously. - * @list: the wait list to wait on - * @cl: the closure that is doing the waiting - * @condition: a C expression for the event to wait for - * - * If the closure is in blocking mode, sleeps until the @condition evaluates to - * true - exactly like wait_event(). + * continue_at - jump to another function with barrier * - * If the closure is not in blocking mode, waits asynchronously; if the - * condition is currently false the @cl is put onto @list and returns. @list - * owns a refcount on @cl; closure_sync() or continue_at() may be used later to - * wait for another thread to wake up @list, which drops the refcount on @cl. + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). * - * Returns the value of @condition; @cl will be on @list iff @condition was - * false. + * NOTE: This macro expands to a return in the calling function! * - * closure_wake_up(@list) must be called after changing any variable that could - * cause @condition to become true. + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. */ -#define closure_wait_event(list, cl, condition) \ - __closure_wait_event(list, cl, condition, closure_blocking(cl)) - -#define closure_wait_event_async(list, cl, condition) \ - __closure_wait_event(list, cl, condition, false) - -#define closure_wait_event_sync(list, cl, condition) \ - __closure_wait_event(list, cl, condition, true) - -static inline void set_closure_fn(struct closure *cl, closure_fn *fn, - struct workqueue_struct *wq) -{ - BUG_ON(object_is_on_stack(cl)); - closure_set_ip(cl); - cl->fn = fn; - cl->wq = wq; - /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic_dec(); -} - #define continue_at(_cl, _fn, _wq) \ do { \ set_closure_fn(_cl, _fn, _wq); \ @@ -637,15 +323,44 @@ do { \ return; \ } while (0) +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ #define closure_return(_cl) continue_at((_cl), NULL, NULL) +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * NOTE: like continue_at(), this macro expands to a return in the caller! + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). 
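A minimal usage sketch of the continuation API documented above (continue_at(), closure_return(), closure_return_with_destructor()). The names example_op, example_op_start, example_op_done and example_op_free are hypothetical, introduced only for illustration; the sketch assumes nothing beyond the closure primitives declared in this header plus the kernel's system_wq, and assumes the op was allocated with kmalloc() (set_closure_fn() BUGs if the closure lives on the stack).

	struct example_op {
		struct closure	cl;
		/* ... whatever state the async operation needs ... */
	};

	static void example_op_free(struct closure *cl)
	{
		/* destructor: runs once every ref on cl has been dropped */
		kfree(container_of(cl, struct example_op, cl));
	}

	static void example_op_done(struct closure *cl)
	{
		/* finish: free the op, then drop the ref on the parent closure */
		closure_return_with_destructor(cl, example_op_free);
	}

	static void example_op_start(struct example_op *op, struct closure *parent)
	{
		closure_init(&op->cl, parent);	/* refcount starts at 1 */

		/*
		 * Kick off async work here; each in-flight piece of work holds
		 * a ref taken with closure_get() and drops it with
		 * closure_put() when it completes.
		 */

		/* drops our ref; example_op_done() runs out of system_wq once
		 * the last ref is put */
		continue_at(&op->cl, example_op_done, system_wq);
	}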
+ */ #define continue_at_nobarrier(_cl, _fn, _wq) \ do { \ set_closure_fn(_cl, _fn, _wq); \ - closure_queue(cl); \ + closure_queue(_cl); \ return; \ } while (0) +/** + * closure_return - finish execution of a closure, with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. + */ #define closure_return_with_destructor(_cl, _destructor) \ do { \ set_closure_fn(_cl, _destructor, NULL); \ @@ -653,6 +368,13 @@ do { \ return; \ } while (0) +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. + */ static inline void closure_call(struct closure *cl, closure_fn fn, struct workqueue_struct *wq, struct closure *parent) @@ -661,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn, continue_at_nobarrier(cl, fn, wq); } -static inline void closure_trylock_call(struct closure *cl, closure_fn fn, - struct workqueue_struct *wq, - struct closure *parent) -{ - if (closure_trylock(cl, parent)) - continue_at_nobarrier(cl, fn, wq); -} - #endif /* _LINUX_CLOSURE_H */ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 88e6411eab4..8b1f1d5c181 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -8,7 +8,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" +#include "extents.h" #include <linux/console.h> #include <linux/debugfs.h> @@ -18,305 +18,130 @@ static struct dentry *debug; -const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); - size_t bucket = PTR_BUCKET_NR(c, k, i); - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - - if (KEY_SIZE(k) + r > c->sb.bucket_size) - return "bad, length too big"; - if (bucket < ca->sb.first_bucket) - return "bad, short offset"; - if (bucket >= ca->sb.nbuckets) - return "bad, offset past end of device"; - if (ptr_stale(c, k, i)) - return "stale"; - } - - if (!bkey_cmp(k, &ZERO_KEY)) - return "bad, null key"; - if (!KEY_PTRS(k)) - return "bad, no pointers"; - if (!KEY_SIZE(k)) - return "zeroed key"; - return ""; -} - -int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) -{ - unsigned i = 0; - char *out = buf, *end = buf + size; - -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - - p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); - - if (KEY_PTRS(k)) - while (1) { - p("%llu:%llu gen %llu", - PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); - - if (++i == KEY_PTRS(k)) - break; - - p(", "); - } - - p("]"); - - if (KEY_DIRTY(k)) - p(" dirty"); - if (KEY_CSUM(k)) - p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); -#undef p - return out - buf; -} - -int bch_btree_to_text(char *buf, size_t size, const struct btree *b) -{ - return scnprintf(buf, size, "%zu level %i/%i", - PTR_BUCKET_NR(b->c, &b->key, 0), - b->level, b->c->root ? 
b->c->root->level : -1); -} - -#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) - -static bool skipped_backwards(struct btree *b, struct bkey *k) -{ - return bkey_cmp(k, (!b->level) - ? &START_KEY(bkey_next(k)) - : bkey_next(k)) > 0; -} - -static void dump_bset(struct btree *b, struct bset *i) -{ - struct bkey *k; - unsigned j; - char buf[80]; - - for (k = i->start; k < end(i); k = bkey_next(k)) { - bch_bkey_to_text(buf, sizeof(buf), k); - printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), - (uint64_t *) k - i->d, i->keys, buf); - - for (j = 0; j < KEY_PTRS(k); j++) { - size_t n = PTR_BUCKET_NR(b->c, k, j); - printk(" bucket %zu", n); - - if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) - printk(" prio %i", - PTR_BUCKET(b->c, k, j)->prio); - } - - printk(" %s\n", bch_ptr_status(b->c, k)); - - if (bkey_next(k) < end(i) && - skipped_backwards(b, k)) - printk(KERN_ERR "Key skipped backwards\n"); - } -} - -#endif - #ifdef CONFIG_BCACHE_DEBUG -void bch_btree_verify(struct btree *b, struct bset *new) +#define for_each_written_bset(b, start, i) \ + for (i = (start); \ + (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ + i->seq == (start)->seq; \ + i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ + block_bytes(b->c)) + +void bch_btree_verify(struct btree *b) { struct btree *v = b->c->verify_data; - struct closure cl; - closure_init_stack(&cl); + struct bset *ondisk, *sorted, *inmemory; + struct bio *bio; - if (!b->c->verify) + if (!b->c->verify || !b->c->verify_ondisk) return; - closure_wait_event(&b->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); - + down(&b->io_mutex); mutex_lock(&b->c->verify_lock); + ondisk = b->c->verify_ondisk; + sorted = b->c->verify_data->keys.set->data; + inmemory = b->keys.set->data; + bkey_copy(&v->key, &b->key); v->written = 0; v->level = b->level; + v->keys.ops = b->keys.ops; + + bio = bch_bbio_alloc(b->c); + bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; + bch_bio_map(bio, sorted); - bch_btree_node_read(v); - closure_wait_event(&v->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); + submit_bio_wait(REQ_META|READ_SYNC, bio); + bch_bbio_free(bio, b->c); - if (new->keys != v->sets[0].data->keys || - memcmp(new->start, - v->sets[0].data->start, - (void *) end(new) - (void *) new->start)) { - unsigned i, j; + memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); + + bch_btree_node_read_done(v); + sorted = v->keys.set->data; + + if (inmemory->keys != sorted->keys || + memcmp(inmemory->start, + sorted->start, + (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { + struct bset *i; + unsigned j; console_lock(); - printk(KERN_ERR "*** original memory node:\n"); - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); + printk(KERN_ERR "*** in memory:\n"); + bch_dump_bset(&b->keys, inmemory, 0); + + printk(KERN_ERR "*** read back in:\n"); + bch_dump_bset(&v->keys, sorted, 0); - printk(KERN_ERR "*** sorted memory node:\n"); - dump_bset(b, new); + for_each_written_bset(b, ondisk, i) { + unsigned block = ((void *) i - (void *) ondisk) / + block_bytes(b->c); + + printk(KERN_ERR "*** on disk block %u:\n", block); + bch_dump_bset(&b->keys, i, block); + } - printk(KERN_ERR "*** on disk node:\n"); - dump_bset(v, v->sets[0].data); + printk(KERN_ERR "*** block %zu not written\n", + ((void *) i - (void *) ondisk) / block_bytes(b->c)); - for (j = 0; j < new->keys; j++) - if (new->d[j] != 
v->sets[0].data->d[j]) + for (j = 0; j < inmemory->keys; j++) + if (inmemory->d[j] != sorted->d[j]) break; + printk(KERN_ERR "b->written %u\n", b->written); + console_unlock(); panic("verify failed at %u\n", j); } mutex_unlock(&b->c->verify_lock); + up(&b->io_mutex); } -static void data_verify_endio(struct bio *bio, int error) -{ - struct closure *cl = bio->bi_private; - closure_put(cl); -} - -void bch_data_verify(struct search *s) +void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - struct closure *cl = &s->cl; struct bio *check; - struct bio_vec *bv; + struct bio_vec bv, *bv2; + struct bvec_iter iter; int i; - if (!s->unaligned_bvec) - bio_for_each_segment(bv, s->orig_bio, i) - bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; - - check = bio_clone(s->orig_bio, GFP_NOIO); + check = bio_clone(bio, GFP_NOIO); if (!check) return; if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; - check->bi_rw = READ_SYNC; - check->bi_private = cl; - check->bi_end_io = data_verify_endio; - - closure_bio_submit(check, cl, &dc->disk); - closure_sync(cl); + submit_bio_wait(READ_SYNC, check); - bio_for_each_segment(bv, s->orig_bio, i) { - void *p1 = kmap(bv->bv_page); - void *p2 = kmap(check->bi_io_vec[i].bv_page); + bio_for_each_segment(bv, bio, iter) { + void *p1 = kmap_atomic(bv.bv_page); + void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page); - if (memcmp(p1 + bv->bv_offset, - p2 + bv->bv_offset, - bv->bv_len)) - printk(KERN_ERR - "bcache (%s): verify failed at sector %llu\n", - bdevname(dc->bdev, name), - (uint64_t) s->orig_bio->bi_sector); + cache_set_err_on(memcmp(p1 + bv.bv_offset, + p2 + bv.bv_offset, + bv.bv_len), + dc->disk.c, + "verify failed at dev %s sector %llu", + bdevname(dc->bdev, name), + (uint64_t) bio->bi_iter.bi_sector); - kunmap(bv->bv_page); - kunmap(check->bi_io_vec[i].bv_page); + kunmap_atomic(p1); } - __bio_for_each_segment(bv, check, i, 0) - __free_page(bv->bv_page); + bio_for_each_segment_all(bv2, check, i) + __free_page(bv2->bv_page); out_put: bio_put(check); } #endif -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *b) -{ - unsigned ret = 0; - struct btree_iter iter; - struct bkey *k; - - if (!b->level) - for_each_key(b, k, &iter) - ret += KEY_SIZE(k); - return ret; -} - -static void vdump_bucket_and_panic(struct btree *b, const char *fmt, - va_list args) -{ - unsigned i; - char buf[80]; - - console_lock(); - - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); - - vprintk(fmt, args); - - console_unlock(); - - bch_btree_to_text(buf, sizeof(buf), b); - panic("at %s\n", buf); -} - -void bch_check_key_order_msg(struct btree *b, struct bset *i, - const char *fmt, ...) -{ - struct bkey *k; - - if (!i->keys) - return; - - for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) - if (skipped_backwards(b, k)) { - va_list args; - va_start(args, fmt); - - vdump_bucket_and_panic(b, fmt, args); - va_end(args); - } -} - -void bch_check_keys(struct btree *b, const char *fmt, ...) 
-{ - va_list args; - struct bkey *k, *p = NULL; - struct btree_iter iter; - - if (b->level) - return; - - for_each_key(b, k, &iter) { - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { - printk(KERN_ERR "Keys out of order:\n"); - goto bug; - } - - if (bch_ptr_invalid(b, k)) - continue; - - if (p && bkey_cmp(p, &START_KEY(k)) > 0) { - printk(KERN_ERR "Overlapping keys:\n"); - goto bug; - } - p = k; - } - return; -bug: - va_start(args, fmt); - vdump_bucket_and_panic(b, fmt, args); - va_end(args); -} - -#endif - #ifdef CONFIG_DEBUG_FS /* XXX: cache set refcounting */ @@ -361,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (!w) break; - bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); + bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); bch_keybuf_del(&i->keys, w); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 1c39b5a2489..1f63c195d24 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -1,40 +1,27 @@ #ifndef _BCACHE_DEBUG_H #define _BCACHE_DEBUG_H -/* Btree/bkey debug printing */ - -int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); -int bch_btree_to_text(char *buf, size_t size, const struct btree *b); - -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *); -void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); -void bch_check_keys(struct btree *, const char *, ...); - -#define bch_check_key_order(b, i) \ - bch_check_key_order_msg(b, i, "keys out of order") -#define EBUG_ON(cond) BUG_ON(cond) - -#else /* EDEBUG */ - -#define bch_count_data(b) 0 -#define bch_check_key_order(b, i) do {} while (0) -#define bch_check_key_order_msg(b, i, ...) do {} while (0) -#define bch_check_keys(b, ...) do {} while (0) -#define EBUG_ON(cond) do {} while (0) - -#endif +struct bio; +struct cached_dev; +struct cache_set; #ifdef CONFIG_BCACHE_DEBUG -void bch_btree_verify(struct btree *, struct bset *); -void bch_data_verify(struct search *); +void bch_btree_verify(struct btree *); +void bch_data_verify(struct cached_dev *, struct bio *); + +#define expensive_debug_checks(c) ((c)->expensive_debug_checks) +#define key_merging_disabled(c) ((c)->key_merging_disabled) +#define bypass_torture_test(d) ((d)->bypass_torture_test) #else /* DEBUG */ -static inline void bch_btree_verify(struct btree *b, struct bset *i) {} -static inline void bch_data_verify(struct search *s) {}; +static inline void bch_btree_verify(struct btree *b) {} +static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} + +#define expensive_debug_checks(c) 0 +#define key_merging_disabled(c) 0 +#define bypass_torture_test(d) 0 #endif diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c new file mode 100644 index 00000000000..3a0de4cf977 --- /dev/null +++ b/drivers/md/bcache/extents.c @@ -0,0 +1,620 @@ +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. 
+ * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/bcache.txt. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" +#include "writeback.h" + +static void sort_key_next(struct btree_iter *iter, + struct btree_iter_set *i) +{ + i->k = bkey_next(i->k); + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(l.k, r.k); + + return c ? c > 0 : l.k < r.k; +} + +static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size || + bucket < ca->sb.first_bucket || + bucket >= ca->sb.nbuckets) + return true; + } + + return false; +} + +/* Common among btree and extent ptrs */ + +static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size) + return "bad, length too big"; + if (bucket < ca->sb.first_bucket) + return "bad, short offset"; + if (bucket >= ca->sb.nbuckets) + return "bad, offset past end of device"; + if (ptr_stale(c, k, i)) + return "stale"; + } + + if (!bkey_cmp(k, &ZERO_KEY)) + return "bad, null key"; + if (!KEY_PTRS(k)) + return "bad, no pointers"; + if (!KEY_SIZE(k)) + return "zeroed key"; + return ""; +} + +void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) +{ + unsigned i = 0; + char *out = buf, *end = buf + size; + +#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) + + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + + for (i = 0; i < KEY_PTRS(k); i++) { + if (i) + p(", "); + + if (PTR_DEV(k, i) == PTR_CHECK_DEV) + p("check dev"); + else + p("%llu:%llu gen %llu", PTR_DEV(k, i), + PTR_OFFSET(k, i), PTR_GEN(k, i)); + } + + p("]"); + + if (KEY_DIRTY(k)) + p(" dirty"); + if (KEY_CSUM(k)) + p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); +#undef p +} + +static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) +{ + struct btree *b = container_of(keys, struct btree, keys); + unsigned j; + char buf[80]; + + bch_extent_to_text(buf, sizeof(buf), k); + printk(" %s", buf); + + for (j = 0; j < KEY_PTRS(k); j++) { + size_t n = PTR_BUCKET_NR(b->c, k, j); + printk(" bucket %zu", n); + + if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) + printk(" prio %i", + PTR_BUCKET(b->c, k, j)->prio); + } + + printk(" %s\n", bch_ptr_status(b->c, k)); +} + +/* Btree ptrs */ + +bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + return __bch_btree_ptr_invalid(b->c, k); +} + +static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) +{ + unsigned i; + char buf[80]; + struct bucket *g; + + if (mutex_trylock(&b->c->bucket_lock)) { + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(b->c, k, i)) { + g = PTR_BUCKET(b->c, k, i); + + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + } + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu", + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g)); + return true; +} + +static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned i; + + if (!bkey_cmp(k, &ZERO_KEY) || + !KEY_PTRS(k) || + bch_ptr_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i) || + ptr_stale(b->c, k, i)) + return true; + + if (expensive_debug_checks(b->c) && + btree_ptr_bad_expensive(b, k)) + return true; + + return false; +} + +static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct btree *b = container_of(bk, struct btree, keys); + + if (!KEY_OFFSET(insert)) + btree_current_write(b)->prio_blocked++; + + return false; +} + +const struct btree_keys_ops bch_btree_keys_ops = { + .sort_cmp = bch_key_sort_cmp, + .insert_fixup = bch_btree_ptr_insert_fixup, + .key_invalid = bch_btree_ptr_invalid, + .key_bad = bch_btree_ptr_bad, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, +}; + +/* Extents */ + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. 
+ * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + + return c ? c > 0 : l.k < r.k; +} + +static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, + struct bkey *tmp) +{ + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) + i++; + + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) + break; + + if (!KEY_SIZE(i->k)) { + sort_key_next(iter, i); + heap_sift(iter, i - top, bch_extent_sort_cmp); + continue; + } + + if (top->k > i->k) { + if (bkey_cmp(top->k, i->k) >= 0) + sort_key_next(iter, i); + else + bch_cut_front(top->k, i->k); + + heap_sift(iter, i - top, bch_extent_sort_cmp); + } else { + /* can't happen because of comparison func */ + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); + + if (bkey_cmp(i->k, top->k) < 0) { + bkey_copy(tmp, top->k); + + bch_cut_back(&START_KEY(i->k), tmp); + bch_cut_front(i->k, top->k); + heap_sift(iter, 0, bch_extent_sort_cmp); + + return tmp; + } else { + bch_cut_back(&START_KEY(i->k), top->k); + } + } + } + + return NULL; +} + +static void bch_subtract_dirty(struct bkey *k, + struct cache_set *c, + uint64_t offset, + int sectors) +{ + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(k), + offset, -sectors); +} + +static bool bch_extent_insert_fixup(struct btree_keys *b, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct cache_set *c = container_of(b, struct btree, keys)->c; + + uint64_t old_offset; + unsigned old_size, sectors_found = 0; + + BUG_ON(!KEY_OFFSET(insert)); + BUG_ON(!KEY_SIZE(insert)); + + while (1) { + struct bkey *k = bch_btree_iter_next(iter); + if (!k) + break; + + if (bkey_cmp(&START_KEY(k), insert) >= 0) { + if (KEY_SIZE(k)) + break; + else + continue; + } + + if (bkey_cmp(k, &START_KEY(insert)) <= 0) + continue; + + old_offset = KEY_START(k); + old_size = KEY_SIZE(k); + + /* + * We might overlap with 0 size extents; we can't skip these + * because if they're in the set we're inserting to we have to + * adjust them so they don't overlap with the key we're + * inserting. But we don't want to check them for replace + * operations. + */ + + if (replace_key && KEY_SIZE(k)) { + /* + * k might have been split since we inserted/found the + * key we're replacing + */ + unsigned i; + uint64_t offset = KEY_START(k) - + KEY_START(replace_key); + + /* But it must be a subset of the replace key */ + if (KEY_START(k) < KEY_START(replace_key) || + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) + goto check_failed; + + /* We didn't find a key that we were supposed to */ + if (KEY_START(k) > KEY_START(insert) + sectors_found) + goto check_failed; + + if (!bch_bkey_equal_header(k, replace_key)) + goto check_failed; + + /* skip past gen */ + offset <<= 8; + + BUG_ON(!KEY_PTRS(replace_key)); + + for (i = 0; i < KEY_PTRS(replace_key); i++) + if (k->ptr[i] != replace_key->ptr[i] + offset) + goto check_failed; + + sectors_found = KEY_OFFSET(k) - KEY_START(insert); + } + + if (bkey_cmp(insert, k) < 0 && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { + /* + * We overlapped in the middle of an existing key: that + * means we have to split the old key. 
But we have to do + * slightly different things depending on whether the + * old key has been written out yet. + */ + + struct bkey *top; + + bch_subtract_dirty(k, c, KEY_START(insert), + KEY_SIZE(insert)); + + if (bkey_written(b, k)) { + /* + * We insert a new key to cover the top of the + * old key, and the old key is modified in place + * to represent the bottom split. + * + * It's completely arbitrary whether the new key + * is the top or the bottom, but it has to match + * up with what btree_sort_fixup() does - it + * doesn't check for this kind of overlap, it + * depends on us inserting a new key for the top + * here. + */ + top = bch_bset_search(b, bset_tree_last(b), + insert); + bch_bset_insert(b, top, k); + } else { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, k); + bch_bset_insert(b, k, &temp.key); + top = bkey_next(k); + } + + bch_cut_front(insert, top); + bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + goto out; + } + + if (bkey_cmp(insert, k) < 0) { + bch_cut_front(insert, k); + } else { + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) + old_offset = KEY_START(insert); + + if (bkey_written(b, k) && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { + /* + * Completely overwrote, so we don't have to + * invalidate the binary search tree + */ + bch_cut_front(k, k); + } else { + __bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + } + } + + bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k)); + } + +check_failed: + if (replace_key) { + if (!sectors_found) { + return true; + } else if (sectors_found < KEY_SIZE(insert)) { + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - + (KEY_SIZE(insert) - sectors_found)); + SET_KEY_SIZE(insert, sectors_found); + } + } +out: + if (KEY_DIRTY(insert)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), + KEY_START(insert), + KEY_SIZE(insert)); + + return false; +} + +static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + char buf[80]; + + if (!KEY_SIZE(k)) + return true; + + if (KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (__ptr_invalid(b->c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); + return true; +} + +static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, + unsigned ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->c->gc_mark_valid && + (!GC_MARK(g) || + GC_MARK(g) == GC_MARK_METADATA || + (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k)))) + goto err; + + if (g->prio == BTREE_PRIO) + goto err; + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g)); + return true; +} + +static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + struct bucket *g; + unsigned i, stale; + + if (!KEY_PTRS(k) || + bch_extent_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i)) + return true; + + if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) + return false; + + for (i = 0; i < 
KEY_PTRS(k); i++) { + g = PTR_BUCKET(b->c, k, i); + stale = ptr_stale(b->c, k, i); + + btree_bug_on(stale > 96, b, + "key too stale: %i, need_gc %u", + stale, b->c->need_gc); + + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), + b, "stale dirty pointer"); + + if (stale) + return true; + + if (expensive_debug_checks(b->c) && + bch_extent_bad_expensive(b, k, i)) + return true; + } + + return false; +} + +static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +{ + return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & + ~((uint64_t)1 << 63); +} + +static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned i; + + if (key_merging_disabled(b->c)) + return false; + + for (i = 0; i < KEY_PTRS(l); i++) + if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) + return false; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { + SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); + SET_KEY_SIZE(l, USHRT_MAX); + + bch_cut_front(l, r); + return false; + } + + if (KEY_CSUM(l)) { + if (KEY_CSUM(r)) + l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); + else + SET_KEY_CSUM(l, 0); + } + + SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); + SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + + return true; +} + +const struct btree_keys_ops bch_extent_keys_ops = { + .sort_cmp = bch_extent_sort_cmp, + .sort_fixup = bch_extent_sort_fixup, + .insert_fixup = bch_extent_insert_fixup, + .key_invalid = bch_extent_invalid, + .key_bad = bch_extent_bad, + .key_merge = bch_extent_merge, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, + .is_extents = true, +}; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h new file mode 100644 index 00000000000..e4e23409782 --- /dev/null +++ b/drivers/md/bcache/extents.h @@ -0,0 +1,13 @@ +#ifndef _BCACHE_EXTENTS_H +#define _BCACHE_EXTENTS_H + +extern const struct btree_keys_ops bch_btree_keys_ops; +extern const struct btree_keys_ops bch_extent_keys_ops; + +struct bkey; +struct cache_set; + +void bch_extent_to_text(char *, size_t, const struct bkey *); +bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); + +#endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 9056632995b..fa028fa82df 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -11,178 +11,40 @@ #include <linux/blkdev.h> -static void bch_bi_idx_hack_endio(struct bio *bio, int error) -{ - struct bio *p = bio->bi_private; - - bio_endio(p, error); - bio_put(bio); -} - -static void bch_generic_make_request_hack(struct bio *bio) -{ - if (bio->bi_idx) { - struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); - - memcpy(clone->bi_io_vec, - bio_iovec(bio), - bio_segments(bio) * sizeof(struct bio_vec)); - - clone->bi_sector = bio->bi_sector; - clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw; - clone->bi_vcnt = bio_segments(bio); - clone->bi_size = bio->bi_size; - - clone->bi_private = bio; - clone->bi_end_io = bch_bi_idx_hack_endio; - - bio = clone; - } - - /* - * Hack, since drivers that clone bios clone up to bi_max_vecs, but our - * bios might have had more than that (before we split them per device - * limitations). - * - * To be taken out once immutable bvec stuff is in. 
- */ - bio->bi_max_vecs = bio->bi_vcnt; - - generic_make_request(bio); -} - -/** - * bch_bio_split - split a bio - * @bio: bio to split - * @sectors: number of sectors to split from the front of @bio - * @gfp: gfp mask - * @bs: bio set to allocate from - * - * Allocates and returns a new bio which represents @sectors from the start of - * @bio, and updates @bio to represent the remaining sectors. - * - * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio - * unchanged. - * - * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a - * bvec boundry; it is the caller's responsibility to ensure that @bio is not - * freed before the split. - */ -struct bio *bch_bio_split(struct bio *bio, int sectors, - gfp_t gfp, struct bio_set *bs) -{ - unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; - struct bio_vec *bv; - struct bio *ret = NULL; - - BUG_ON(sectors <= 0); - - if (sectors >= bio_sectors(bio)) - return bio; - - if (bio->bi_rw & REQ_DISCARD) { - ret = bio_alloc_bioset(gfp, 1, bs); - if (!ret) - return NULL; - idx = 0; - goto out; - } - - bio_for_each_segment(bv, bio, idx) { - vcnt = idx - bio->bi_idx; - - if (!nbytes) { - ret = bio_alloc_bioset(gfp, vcnt, bs); - if (!ret) - return NULL; - - memcpy(ret->bi_io_vec, bio_iovec(bio), - sizeof(struct bio_vec) * vcnt); - - break; - } else if (nbytes < bv->bv_len) { - ret = bio_alloc_bioset(gfp, ++vcnt, bs); - if (!ret) - return NULL; - - memcpy(ret->bi_io_vec, bio_iovec(bio), - sizeof(struct bio_vec) * vcnt); - - ret->bi_io_vec[vcnt - 1].bv_len = nbytes; - bv->bv_offset += nbytes; - bv->bv_len -= nbytes; - break; - } - - nbytes -= bv->bv_len; - } -out: - ret->bi_bdev = bio->bi_bdev; - ret->bi_sector = bio->bi_sector; - ret->bi_size = sectors << 9; - ret->bi_rw = bio->bi_rw; - ret->bi_vcnt = vcnt; - ret->bi_max_vecs = vcnt; - - bio->bi_sector += sectors; - bio->bi_size -= sectors << 9; - bio->bi_idx = idx; - - if (bio_integrity(bio)) { - if (bio_integrity_clone(ret, bio, gfp)) { - bio_put(ret); - return NULL; - } - - bio_integrity_trim(ret, 0, bio_sectors(ret)); - bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); - } - - return ret; -} - static unsigned bch_bio_max_sectors(struct bio *bio) { - unsigned ret = bio_sectors(bio); struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, - queue_max_segments(q)); + struct bio_vec bv; + struct bvec_iter iter; + unsigned ret = 0, seg = 0; if (bio->bi_rw & REQ_DISCARD) - return min(ret, q->limits.max_discard_sectors); - - if (bio_segments(bio) > max_segments || - q->merge_bvec_fn) { - struct bio_vec *bv; - int i, seg = 0; - - ret = 0; - - bio_for_each_segment(bv, bio, i) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = ret << 9, - .bi_rw = bio->bi_rw, - }; - - if (seg == max_segments) - break; + return min(bio_sectors(bio), q->limits.max_discard_sectors); + + bio_for_each_segment(bv, bio, iter) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = ret << 9, + .bi_rw = bio->bi_rw, + }; + + if (seg == min_t(unsigned, BIO_MAX_PAGES, + queue_max_segments(q))) + break; - if (q->merge_bvec_fn && - q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) - break; + if (q->merge_bvec_fn && + q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len) + break; - seg++; - ret += bv->bv_len >> 9; - } + seg++; + ret += bv.bv_len >> 9; } ret = min(ret, queue_max_sectors(q)); WARN_ON(!ret); - ret = 
max_t(int, ret, bio_iovec(bio)->bv_len >> 9); + ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9); return ret; } @@ -193,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl) s->bio->bi_end_io = s->bi_end_io; s->bio->bi_private = s->bi_private; - bio_endio(s->bio, 0); + bio_endio_nodec(s->bio, 0); closure_debug_destroy(&s->cl); mempool_free(s, s->p->bio_split_hook); @@ -232,19 +94,19 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) bio_get(bio); do { - n = bch_bio_split(bio, bch_bio_max_sectors(bio), - GFP_NOIO, s->p->bio_split); + n = bio_next_split(bio, bch_bio_max_sectors(bio), + GFP_NOIO, s->p->bio_split); n->bi_end_io = bch_bio_submit_split_endio; n->bi_private = &s->cl; closure_get(&s->cl); - bch_generic_make_request_hack(n); + generic_make_request(n); } while (n != bio); continue_at(&s->cl, bch_bio_submit_split_done, NULL); submit: - bch_generic_make_request_hack(bio); + generic_make_request(bio); } /* Bios with headers */ @@ -272,8 +134,8 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) { struct bbio *b = container_of(bio, struct bbio, bio); - bio->bi_sector = PTR_OFFSET(&b->key, 0); - bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; b->submit_time_us = local_clock_us(); closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index ba95ab84b2b..59e82021b5b 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -7,7 +7,6 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include <trace/events/bcache.h> @@ -31,35 +30,38 @@ static void journal_read_endio(struct bio *bio, int error) } static int journal_read_bucket(struct cache *ca, struct list_head *list, - struct btree_op *op, unsigned bucket_index) + unsigned bucket_index) { struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio; struct journal_replay *i; struct jset *j, *data = ca->set->journal.w[0].data; + struct closure cl; unsigned len, left, offset = 0; int ret = 0; sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); - pr_debug("reading %llu", (uint64_t) bucket); + closure_init_stack(&cl); + + pr_debug("reading %u", bucket_index); while (offset < ca->sb.bucket_size) { reread: left = ca->sb.bucket_size - offset; - len = min_t(unsigned, left, PAGE_SECTORS * 8); + len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); bio_reset(bio); - bio->bi_sector = bucket + offset; + bio->bi_iter.bi_sector = bucket + offset; bio->bi_bdev = ca->bdev; bio->bi_rw = READ; - bio->bi_size = len << 9; + bio->bi_iter.bi_size = len << 9; bio->bi_end_io = journal_read_endio; - bio->bi_private = &op->cl; + bio->bi_private = &cl; bch_bio_map(bio, data); - closure_bio_submit(bio, &op->cl, ca); - closure_sync(&op->cl); + closure_bio_submit(bio, &cl, ca); + closure_sync(&cl); /* This function could be simpler now since we no longer write * journal entries that overlap bucket boundaries; this means @@ -72,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset; struct list_head *where; size_t blocks, bytes = set_bytes(j); - if (j->magic != jset_magic(ca->set)) + if (j->magic != jset_magic(&ca->sb)) { + pr_debug("%u: bad magic", bucket_index); return ret; + } - if (bytes > left << 9) + if (bytes > left << 9 || + bytes > PAGE_SIZE << JSET_BITS) { + pr_info("%u: too big, %zu bytes, offset %u", + bucket_index, bytes, offset); return ret; + } 
if (bytes > len << 9) goto reread; - if (j->csum != csum_set(j)) + if (j->csum != csum_set(j)) { + pr_info("%u: bad csum, %zu bytes, offset %u", + bucket_index, bytes, offset); return ret; + } - blocks = set_blocks(j, ca->set); + blocks = set_blocks(j, block_bytes(ca->set)); while (!list_empty(list)) { i = list_first_entry(list, @@ -129,12 +140,11 @@ next_set: return ret; } -int bch_journal_read(struct cache_set *c, struct list_head *list, - struct btree_op *op) +int bch_journal_read(struct cache_set *c, struct list_head *list) { #define read_bucket(b) \ ({ \ - int ret = journal_read_bucket(ca, list, op, b); \ + int ret = journal_read_bucket(ca, list, b); \ __set_bit(b, bitmap); \ if (ret < 0) \ return ret; \ @@ -153,7 +163,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); pr_debug("%u journal buckets", ca->sb.njournal_buckets); - /* Read journal buckets ordered by golden ratio hash to quickly + /* + * Read journal buckets ordered by golden ratio hash to quickly * find a sequence of buckets with valid journal entries */ for (i = 0; i < ca->sb.njournal_buckets; i++) { @@ -166,18 +177,20 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, goto bsearch; } - /* If that fails, check all the buckets we haven't checked + /* + * If that fails, check all the buckets we haven't checked * already */ pr_debug("falling back to linear search"); - for (l = 0; l < ca->sb.njournal_buckets; l++) { - if (test_bit(l, bitmap)) - continue; - + for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); + l < ca->sb.njournal_buckets; + l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1)) if (read_bucket(l)) goto bsearch; - } + + if (list_empty(list)) + continue; bsearch: /* Binary search */ m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); @@ -197,10 +210,12 @@ bsearch: r = m; } - /* Read buckets in reverse order until we stop finding more + /* + * Read buckets in reverse order until we stop finding more * journal entries */ - pr_debug("finishing up"); + pr_debug("finishing up: m %u njournal_buckets %u", + m, ca->sb.njournal_buckets); l = m; while (1) { @@ -222,15 +237,22 @@ bsearch: for (i = 0; i < ca->sb.njournal_buckets; i++) if (ja->seq[i] > seq) { seq = ja->seq[i]; - ja->cur_idx = ja->discard_idx = - ja->last_idx = i; + /* + * When journal_reclaim() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + ja->last_idx = ja->discard_idx = (i + 1) % + ca->sb.njournal_buckets; } } - c->journal.seq = list_entry(list->prev, - struct journal_replay, - list)->j.seq; + if (!list_empty(list)) + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; return 0; #undef read_bucket @@ -268,26 +290,20 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } for (k = i->j.start; - k < end(&i->j); + k < bset_bkey_last(&i->j); k = bkey_next(k)) { unsigned j; - for (j = 0; j < KEY_PTRS(k); j++) { - struct bucket *g = PTR_BUCKET(c, k, j); - atomic_inc(&g->pin); + for (j = 0; j < KEY_PTRS(k); j++) + if (ptr_available(c, k, j)) + atomic_inc(&PTR_BUCKET(c, k, j)->pin); - if (g->prio == BTREE_PRIO && - !ptr_stale(c, k, j)) - g->prio = INITIAL_PRIO; - } - - __bch_btree_mark_key(c, 0, k); + bch_initial_mark_key(c, 0, k); } } } -int bch_journal_replay(struct cache_set *s, struct list_head *list, - struct btree_op *op) +int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; struct bkey 
*k; @@ -295,31 +311,27 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, list_entry(list->prev, struct journal_replay, list); uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + struct keylist keylist; list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); - if (n != i->j.seq) - pr_err( - "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); + cache_set_err_on(n != i->j.seq, s, +"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", + n, i->j.seq - 1, start, end); for (k = i->j.start; - k < end(&i->j); + k < bset_bkey_last(&i->j); k = bkey_next(k)) { trace_bcache_journal_replay_key(k); - bkey_copy(op->keys.top, k); - bch_keylist_push(&op->keys); - - op->journal = i->pin; - atomic_inc(op->journal); + bch_keylist_init_single(&keylist, k); - ret = bch_btree_insert(op, s); + ret = bch_btree_insert(s, &keylist, i->pin, NULL); if (ret) goto err; - BUG_ON(!bch_keylist_empty(&op->keys)); + BUG_ON(!bch_keylist_empty(&keylist)); keys++; cond_resched(); @@ -333,14 +345,13 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, pr_info("journal replay done, %i keys in %i entries, seq %llu", keys, entries, end); - +err: while (!list_empty(list)) { i = list_first_entry(list, struct journal_replay, list); list_del(&i->list); kfree(i); } -err: - closure_sync(&op->cl); + return ret; } @@ -352,48 +363,34 @@ static void btree_flush_write(struct cache_set *c) * Try to find the btree node with that references the oldest journal * entry, best is our current candidate and is locked if non NULL: */ - struct btree *b, *best = NULL; - unsigned iter; - - for_each_cached_btree(b, c, iter) { - if (!down_write_trylock(&b->lock)) - continue; - - if (!btree_node_dirty(b) || - !btree_current_write(b)->journal) { - rw_unlock(true, b); - continue; + struct btree *b, *best; + unsigned i; +retry: + best = NULL; + + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!best) + best = b; + else if (journal_pin_cmp(c, + btree_current_write(best)->journal, + btree_current_write(b)->journal)) { + best = b; + } } - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best), - btree_current_write(b))) { - rw_unlock(true, best); - best = b; - } else - rw_unlock(true, b); - } - - if (best) - goto out; - - /* We can't find the best btree node, just pick the first */ - list_for_each_entry(b, &c->btree_cache, list) - if (!b->level && btree_node_dirty(b)) { - best = b; - rw_lock(true, best, best->level); - goto found; + b = best; + if (b) { + mutex_lock(&b->write_lock); + if (!btree_current_write(b)->journal) { + mutex_unlock(&b->write_lock); + /* We raced */ + goto retry; } -out: - if (!best) - return; -found: - if (btree_node_dirty(best)) - bch_btree_node_write(best, NULL); - rw_unlock(true, best); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); + } } #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) @@ -428,7 +425,7 @@ static void do_journal_discard(struct cache *ca) return; } - switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { + switch (atomic_read(&ja->discard_in_flight)) { case DISCARD_IN_FLIGHT: return; @@ -446,13 +443,13 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); bio_init(bio); - bio->bi_sector = bucket_to_sector(ca->set, + bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); bio->bi_bdev = ca->bdev; bio->bi_rw = 
REQ_WRITE|REQ_DISCARD; bio->bi_max_vecs = 1; bio->bi_io_vec = bio->bi_inline_vecs; - bio->bi_size = bucket_bytes(ca); + bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = journal_discard_endio; closure_get(&ca->set->cl); @@ -489,7 +486,7 @@ static void journal_reclaim(struct cache_set *c) do_journal_discard(ca); if (c->journal.blocks_free) - return; + goto out; /* * Allocate: @@ -515,7 +512,7 @@ static void journal_reclaim(struct cache_set *c) if (n) c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; - +out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); } @@ -536,6 +533,7 @@ void bch_journal_next(struct journal *j) atomic_set(&fifo_back(&j->pin), 1); j->cur->data->seq = ++j->seq; + j->cur->dirty = false; j->cur->need_write = false; j->cur->data->keys = 0; @@ -548,51 +546,46 @@ static void journal_write_endio(struct bio *bio, int error) struct journal_write *w = bio->bi_private; cache_set_err_on(error, w->c, "journal io error"); - closure_put(&w->c->journal.io.cl); + closure_put(&w->c->journal.io); } static void journal_write(struct closure *); static void journal_write_done(struct closure *cl) { - struct journal *j = container_of(cl, struct journal, io.cl); - struct cache_set *c = container_of(j, struct cache_set, journal); - + struct journal *j = container_of(cl, struct journal, io); struct journal_write *w = (j->cur == j->w) ? &j->w[1] : &j->w[0]; __closure_wake_up(&w->wait); + continue_at_nobarrier(cl, journal_write, system_wq); +} - if (c->journal_delay_ms) - closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); +static void journal_write_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io); - continue_at(cl, journal_write, system_wq); + c->journal.io_in_flight = 0; + spin_unlock(&c->journal.lock); } static void journal_write_unlocked(struct closure *cl) __releases(c->journal.lock) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + struct cache_set *c = container_of(cl, struct cache_set, journal.io); struct cache *ca; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; + unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * + c->sb.block_size; struct bio *bio; struct bio_list list; bio_list_init(&list); if (!w->need_write) { - /* - * XXX: have to unlock closure before we unlock journal lock, - * else we race with bch_journal(). But this way we race - * against cache set unregister. Doh. 
- */ - set_closure_fn(cl, NULL, NULL); - closure_sub(cl, CLOSURE_RUNNING + 1); - spin_unlock(&c->journal.lock); - return; + closure_return_with_destructor(cl, journal_write_unlock); } else if (journal_full(&c->journal)) { journal_reclaim(c); spin_unlock(&c->journal.lock); @@ -601,7 +594,7 @@ static void journal_write_unlocked(struct closure *cl) continue_at(cl, journal_write, system_wq); } - c->journal.blocks_free -= set_blocks(w->data, c); + c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); w->data->btree_level = c->root->level; @@ -611,7 +604,7 @@ static void journal_write_unlocked(struct closure *cl) for_each_cache(ca, c, i) w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; - w->data->magic = jset_magic(c); + w->data->magic = jset_magic(&c->sb); w->data->version = BCACHE_JSET_VERSION; w->data->last_seq = last_seq(&c->journal); w->data->csum = csum_set(w->data); @@ -623,10 +616,10 @@ static void journal_write_unlocked(struct closure *cl) atomic_long_add(sectors, &ca->meta_sectors_written); bio_reset(bio); - bio->bi_sector = PTR_OFFSET(k, i); + bio->bi_iter.bi_sector = PTR_OFFSET(k, i); bio->bi_bdev = ca->bdev; bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; - bio->bi_size = sectors << 9; + bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; bio->bi_private = w; @@ -654,120 +647,144 @@ static void journal_write_unlocked(struct closure *cl) static void journal_write(struct closure *cl) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + struct cache_set *c = container_of(cl, struct cache_set, journal.io); spin_lock(&c->journal.lock); journal_write_unlocked(cl); } -static void __journal_try_write(struct cache_set *c, bool noflush) +static void journal_try_write(struct cache_set *c) __releases(c->journal.lock) { - struct closure *cl = &c->journal.io.cl; + struct closure *cl = &c->journal.io; + struct journal_write *w = c->journal.cur; - if (!closure_trylock(cl, &c->cl)) - spin_unlock(&c->journal.lock); - else if (noflush && journal_full(&c->journal)) { + w->need_write = true; + + if (!c->journal.io_in_flight) { + c->journal.io_in_flight = 1; + closure_call(cl, journal_write_unlocked, NULL, &c->cl); + } else { spin_unlock(&c->journal.lock); - continue_at(cl, journal_write, system_wq); - } else - journal_write_unlocked(cl); + } } -#define journal_try_write(c) __journal_try_write(c, false) - -void bch_journal_meta(struct cache_set *c, struct closure *cl) +static struct journal_write *journal_wait_for_write(struct cache_set *c, + unsigned nkeys) { - struct journal_write *w; + size_t sectors; + struct closure cl; + bool wait = false; - if (CACHE_SYNC(&c->sb)) { - spin_lock(&c->journal.lock); + closure_init_stack(&cl); + + spin_lock(&c->journal.lock); + + while (1) { + struct journal_write *w = c->journal.cur; + + sectors = __set_blocks(w->data, w->data->keys + nkeys, + block_bytes(c)) * c->sb.block_size; + + if (sectors <= min_t(size_t, + c->journal.blocks_free * c->sb.block_size, + PAGE_SECTORS << JSET_BITS)) + return w; + + if (wait) + closure_wait(&c->journal.wait, &cl); + + if (!journal_full(&c->journal)) { + if (wait) + trace_bcache_journal_entry_full(c); - w = c->journal.cur; - w->need_write = true; + /* + * XXX: If we were inserting so many keys that they + * won't fit in an _empty_ journal write, we'll + * deadlock. For now, handle this in + * bch_keylist_realloc() - but something to think about. 
+ */ + BUG_ON(!w->data->keys); - if (cl) - BUG_ON(!closure_wait(&w->wait, cl)); + journal_try_write(c); /* unlocks */ + } else { + if (wait) + trace_bcache_journal_full(c); - __journal_try_write(c, true); + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + } + + closure_sync(&cl); + spin_lock(&c->journal.lock); + wait = true; } } +static void journal_write_work(struct work_struct *work) +{ + struct cache_set *c = container_of(to_delayed_work(work), + struct cache_set, + journal.work); + spin_lock(&c->journal.lock); + if (c->journal.cur->dirty) + journal_try_write(c); + else + spin_unlock(&c->journal.lock); +} + /* * Entry point to the journalling code - bio_insert() and btree_invalidate() * pass bch_journal() a list of keys to be journalled, and then * bch_journal() hands those same keys off to btree_insert_async() */ -void bch_journal(struct closure *cl) +atomic_t *bch_journal(struct cache_set *c, + struct keylist *keys, + struct closure *parent) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct cache_set *c = op->c; struct journal_write *w; - size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; + atomic_t *ret; - if (op->type != BTREE_INSERT || - !CACHE_SYNC(&c->sb)) - goto out; + if (!CACHE_SYNC(&c->sb)) + return NULL; - /* - * If we're looping because we errored, might already be waiting on - * another journal write: - */ - while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) - closure_sync(cl->parent); - - spin_lock(&c->journal.lock); + w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); - if (journal_full(&c->journal)) { - trace_bcache_journal_full(c); + memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); + w->data->keys += bch_keylist_nkeys(keys); - closure_wait(&c->journal.wait, cl); + ret = &fifo_back(&c->journal.pin); + atomic_inc(ret); - journal_reclaim(c); + if (parent) { + closure_wait(&w->wait, parent); + journal_try_write(c); + } else if (!w->dirty) { + w->dirty = true; + schedule_delayed_work(&c->journal.work, + msecs_to_jiffies(c->journal_delay_ms)); + spin_unlock(&c->journal.lock); + } else { spin_unlock(&c->journal.lock); - - btree_flush_write(c); - continue_at(cl, bch_journal, bcache_wq); } - w = c->journal.cur; - w->need_write = true; - b = __set_blocks(w->data, w->data->keys + n, c); - - if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || - b > c->journal.blocks_free) { - trace_bcache_journal_entry_full(c); - - /* - * XXX: If we were inserting so many keys that they won't fit in - * an _empty_ journal write, we'll deadlock. For now, handle - * this in bch_keylist_realloc() - but something to think about. 
- */ - BUG_ON(!w->data->keys); - - BUG_ON(!closure_wait(&w->wait, cl)); - - closure_flush(&c->journal.io); - - journal_try_write(c); - continue_at(cl, bch_journal, bcache_wq); - } - memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); - w->data->keys += n; + return ret; +} - op->journal = &fifo_back(&c->journal.pin); - atomic_inc(op->journal); +void bch_journal_meta(struct cache_set *c, struct closure *cl) +{ + struct keylist keys; + atomic_t *ref; - if (op->flush_journal) { - closure_flush(&c->journal.io); - closure_wait(&w->wait, cl->parent); - } + bch_keylist_init(&keys); - journal_try_write(c); -out: - bch_btree_insert_async(cl); + ref = bch_journal(c, &keys, cl); + if (ref) + atomic_dec_bug(ref); } void bch_journal_free(struct cache_set *c) @@ -781,8 +798,8 @@ int bch_journal_alloc(struct cache_set *c) { struct journal *j = &c->journal; - closure_init_unlocked(&j->io); spin_lock_init(&j->lock); + INIT_DELAYED_WORK(&j->work, journal_write_work); c->journal_delay_ms = 100; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 3d7851274b0..e3c39457afb 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -75,43 +75,6 @@ * nodes that are pinning the oldest journal entries first. */ -#define BCACHE_JSET_VERSION_UUIDv1 1 -/* Always latest UUID format */ -#define BCACHE_JSET_VERSION_UUID 1 -#define BCACHE_JSET_VERSION 1 - -/* - * On disk format for a journal entry: - * seq is monotonically increasing; every journal entry has its own unique - * sequence number. - * - * last_seq is the oldest journal entry that still has keys the btree hasn't - * flushed to disk yet. - * - * version is for on disk format changes. - */ -struct jset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - uint64_t last_seq; - - BKEY_PADDED(uuid_bucket); - BKEY_PADDED(btree_root); - uint16_t btree_level; - uint16_t pad[3]; - - uint64_t prio_bucket[MAX_CACHES_PER_SET]; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration @@ -132,6 +95,7 @@ struct journal_write { struct cache_set *c; struct closure_waitlist wait; + bool dirty; bool need_write; }; @@ -140,7 +104,9 @@ struct journal { spinlock_t lock; /* used when waiting because the journal was full */ struct closure_waitlist wait; - struct closure_with_timer io; + struct closure io; + int io_in_flight; + struct delayed_work work; /* Number of blocks free in the bucket(s) we're currently writing to */ unsigned blocks_free; @@ -188,8 +154,7 @@ struct journal_device { }; #define journal_pin_cmp(c, l, r) \ - (fifo_idx(&(c)->journal.pin, (l)->journal) > \ - fifo_idx(&(c)->journal.pin, (r)->journal)) + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) #define JOURNAL_PIN 20000 @@ -199,15 +164,14 @@ struct journal_device { struct closure; struct cache_set; struct btree_op; +struct keylist; -void bch_journal(struct closure *); +atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); void bch_journal_next(struct journal *); void bch_journal_mark(struct cache_set *, struct list_head *); void bch_journal_meta(struct cache_set *, struct closure *); -int bch_journal_read(struct cache_set *, struct list_head *, - struct btree_op *); -int bch_journal_replay(struct cache_set *, struct list_head *, - struct btree_op *); +int bch_journal_read(struct cache_set *, struct list_head *); +int bch_journal_replay(struct cache_set *, struct 
list_head *); void bch_journal_free(struct cache_set *); int bch_journal_alloc(struct cache_set *); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 1a3b4f4786c..cd7490311e5 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -12,8 +12,9 @@ #include <trace/events/bcache.h> struct moving_io { + struct closure cl; struct keybuf_key *w; - struct search s; + struct data_insert_op op; struct bbio bio; }; @@ -23,13 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) moving_gc_keys); unsigned i; - for (i = 0; i < KEY_PTRS(k); i++) { - struct cache *ca = PTR_CACHE(c, k, i); - struct bucket *g = PTR_BUCKET(c, k, i); - - if (GC_SECTORS_USED(g) < ca->gc_move_threshold) + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + GC_MOVE(PTR_BUCKET(c, k, i))) return true; - } return false; } @@ -38,13 +36,13 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) static void moving_io_destructor(struct closure *cl) { - struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct moving_io *io = container_of(cl, struct moving_io, cl); kfree(io); } static void write_moving_finish(struct closure *cl) { - struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; struct bio_vec *bv; int i; @@ -52,26 +50,30 @@ static void write_moving_finish(struct closure *cl) bio_for_each_segment_all(bv, bio, i) __free_page(bv->bv_page); - if (io->s.op.insert_collision) + if (io->op.replace_collision) trace_bcache_gc_copy_collision(&io->w->key); - bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); + bch_keybuf_del(&io->op.c->moving_gc_keys, io->w); - atomic_dec_bug(&io->s.op.c->in_flight); - closure_wake_up(&io->s.op.c->moving_gc_wait); + up(&io->op.c->moving_in_flight); closure_return_with_destructor(cl, moving_io_destructor); } static void read_moving_endio(struct bio *bio, int error) { + struct bbio *b = container_of(bio, struct bbio, bio); struct moving_io *io = container_of(bio->bi_private, - struct moving_io, s.cl); + struct moving_io, cl); if (error) - io->s.error = error; + io->op.error = error; + else if (!KEY_DIRTY(&b->key) && + ptr_stale(io->op.c, &b->key, 0)) { + io->op.error = -EINTR; + } - bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); + bch_bbio_endio(io->op.c, bio, error, "reading data to move"); } static void moving_init(struct moving_io *io) @@ -82,57 +84,56 @@ static void moving_init(struct moving_io *io) bio_get(bio); bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - bio->bi_size = KEY_SIZE(&io->w->key) << 9; + bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS); - bio->bi_private = &io->s.cl; + bio->bi_private = &io->cl; bio->bi_io_vec = bio->bi_inline_vecs; bch_bio_map(bio, NULL); } static void write_moving(struct closure *cl) { - struct search *s = container_of(cl, struct search, cl); - struct moving_io *io = container_of(s, struct moving_io, s); + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct data_insert_op *op = &io->op; - if (!s->error) { + if (!op->error) { moving_init(io); - io->bio.bio.bi_sector = KEY_START(&io->w->key); - s->op.lock = -1; - s->op.write_prio = 1; - s->op.cache_bio = &io->bio.bio; + io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key); + op->write_prio = 1; + op->bio = &io->bio.bio; - s->writeback = KEY_DIRTY(&io->w->key); - s->op.csum = 
KEY_CSUM(&io->w->key); + op->writeback = KEY_DIRTY(&io->w->key); + op->csum = KEY_CSUM(&io->w->key); - s->op.type = BTREE_REPLACE; - bkey_copy(&s->op.replace, &io->w->key); + bkey_copy(&op->replace_key, &io->w->key); + op->replace = true; - closure_init(&s->op.cl, cl); - bch_insert_data(&s->op.cl); + closure_call(&op->cl, bch_data_insert, NULL, cl); } - continue_at(cl, write_moving_finish, NULL); + continue_at(cl, write_moving_finish, op->wq); } static void read_moving_submit(struct closure *cl) { - struct search *s = container_of(cl, struct search, cl); - struct moving_io *io = container_of(s, struct moving_io, s); + struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; - bch_submit_bbio(bio, s->op.c, &io->w->key, 0); + bch_submit_bbio(bio, io->op.c, &io->w->key, 0); - continue_at(cl, write_moving, bch_gc_wq); + continue_at(cl, write_moving, io->op.wq); } -static void read_moving(struct closure *cl) +static void read_moving(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, moving_gc); struct keybuf_key *w; struct moving_io *io; struct bio *bio; + struct closure cl; + + closure_init_stack(&cl); /* XXX: if we error, background writeback could stall indefinitely */ @@ -142,6 +143,11 @@ static void read_moving(struct closure *cl) if (!w) break; + if (ptr_stale(c, &w->key, 0)) { + bch_keybuf_del(&c->moving_gc_keys, w); + continue; + } + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), GFP_KERNEL); @@ -150,8 +156,9 @@ static void read_moving(struct closure *cl) w->private = io; io->w = w; - io->s.op.inode = KEY_INODE(&w->key); - io->s.op.c = c; + io->op.inode = KEY_INODE(&w->key); + io->op.c = c; + io->op.wq = c->moving_gc_wq; moving_init(io); bio = &io->bio.bio; @@ -164,13 +171,8 @@ static void read_moving(struct closure *cl) trace_bcache_gc_copy(&w->key); - closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); - - if (atomic_inc_return(&c->in_flight) >= 64) { - closure_wait_event(&c->moving_gc_wait, cl, - atomic_read(&c->in_flight) < 64); - continue_at(cl, read_moving, bch_gc_wq); - } + down(&c->moving_in_flight); + closure_call(&io->cl, read_moving_submit, NULL, &cl); } if (0) { @@ -180,7 +182,7 @@ err: if (!IS_ERR_OR_NULL(w->private)) bch_keybuf_del(&c->moving_gc_keys, w); } - closure_return(cl); + closure_sync(&cl); } static bool bucket_cmp(struct bucket *l, struct bucket *r) @@ -190,30 +192,33 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r) static unsigned bucket_heap_top(struct cache *ca) { - return GC_SECTORS_USED(heap_peek(&ca->heap)); + struct bucket *b; + return (b = heap_peek(&ca->heap)) ? 
GC_SECTORS_USED(b) : 0; } -void bch_moving_gc(struct closure *cl) +void bch_moving_gc(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); struct cache *ca; struct bucket *b; unsigned i; if (!c->copy_gc_enabled) - closure_return(cl); + return; mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) { unsigned sectors_to_move = 0; unsigned reserve_sectors = ca->sb.bucket_size * - min(fifo_used(&ca->free), ca->free.size / 2); + fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; for_each_bucket(b, ca) { - if (!GC_SECTORS_USED(b)) + if (GC_MARK(b) == GC_MARK_METADATA || + !GC_SECTORS_USED(b) || + GC_SECTORS_USED(b) == ca->sb.bucket_size || + atomic_read(&b->pin)) continue; if (!heap_full(&ca->heap)) { @@ -233,22 +238,19 @@ void bch_moving_gc(struct closure *cl) sectors_to_move -= GC_SECTORS_USED(b); } - ca->gc_move_threshold = bucket_heap_top(ca); - - pr_debug("threshold %u", ca->gc_move_threshold); + while (heap_pop(&ca->heap, b, bucket_cmp)) + SET_GC_MOVE(b, 1); } mutex_unlock(&c->bucket_lock); c->moving_gc_keys.last_scanned = ZERO_KEY; - closure_init(&c->moving_gc, cl); - read_moving(&c->moving_gc); - - closure_return(cl); + read_moving(c); } void bch_moving_init_cache_set(struct cache_set *c) { bch_keybuf_init(&c->moving_gc_keys); + sema_init(&c->moving_in_flight, 64); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 786a1a4f74d..15fff4f68a7 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -12,11 +12,9 @@ #include "request.h" #include "writeback.h" -#include <linux/cgroup.h> #include <linux/module.h> #include <linux/hash.h> #include <linux/random.h> -#include "blk-cgroup.h" #include <trace/events/bcache.h> @@ -25,187 +23,28 @@ struct kmem_cache *bch_search_cache; -static void check_should_skip(struct cached_dev *, struct search *); - -/* Cgroup interface */ - -#ifdef CONFIG_CGROUP_BCACHE -static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; - -static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) -{ - struct cgroup_subsys_state *css; - return cgroup && - (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) - ? container_of(css, struct bch_cgroup, css) - : &bcache_default_cgroup; -} - -struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) -{ - struct cgroup_subsys_state *css = bio->bi_css - ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) - : task_subsys_state(current, bcache_subsys_id); - - return css - ? 
container_of(css, struct bch_cgroup, css) - : &bcache_default_cgroup; -} - -static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) -{ - char tmp[1024]; - int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, - cgroup_to_bcache(cgrp)->cache_mode + 1); - - if (len < 0) - return len; - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} - -static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, - const char *buf) -{ - int v = bch_read_string_list(buf, bch_cache_modes); - if (v < 0) - return v; - - cgroup_to_bcache(cgrp)->cache_mode = v - 1; - return 0; -} - -static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) -{ - return cgroup_to_bcache(cgrp)->verify; -} - -static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) -{ - cgroup_to_bcache(cgrp)->verify = val; - return 0; -} - -static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_hits); -} - -static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_misses); -} - -static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, - struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_bypass_hits); -} - -static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, - struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_bypass_misses); -} - -static struct cftype bch_files[] = { - { - .name = "cache_mode", - .read = cache_mode_read, - .write_string = cache_mode_write, - }, - { - .name = "verify", - .read_u64 = bch_verify_read, - .write_u64 = bch_verify_write, - }, - { - .name = "cache_hits", - .read_u64 = bch_cache_hits_read, - }, - { - .name = "cache_misses", - .read_u64 = bch_cache_misses_read, - }, - { - .name = "cache_bypass_hits", - .read_u64 = bch_cache_bypass_hits_read, - }, - { - .name = "cache_bypass_misses", - .read_u64 = bch_cache_bypass_misses_read, - }, - { } /* terminate */ -}; - -static void init_bch_cgroup(struct bch_cgroup *cg) -{ - cg->cache_mode = -1; -} - -static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) -{ - struct bch_cgroup *cg; - - cg = kzalloc(sizeof(*cg), GFP_KERNEL); - if (!cg) - return ERR_PTR(-ENOMEM); - init_bch_cgroup(cg); - return &cg->css; -} - -static void bcachecg_destroy(struct cgroup *cgroup) -{ - struct bch_cgroup *cg = cgroup_to_bcache(cgroup); - free_css_id(&bcache_subsys, &cg->css); - kfree(cg); -} - -struct cgroup_subsys bcache_subsys = { - .create = bcachecg_create, - .destroy = bcachecg_destroy, - .subsys_id = bcache_subsys_id, - .name = "bcache", - .module = THIS_MODULE, -}; -EXPORT_SYMBOL_GPL(bcache_subsys); -#endif +static void bch_data_insert_start(struct closure *); static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) { -#ifdef CONFIG_CGROUP_BCACHE - int r = bch_bio_to_cgroup(bio)->cache_mode; - if (r >= 0) - return r; -#endif return BDEV_CACHE_MODE(&dc->sb); } static bool verify(struct cached_dev *dc, struct bio *bio) { -#ifdef CONFIG_CGROUP_BCACHE - if (bch_bio_to_cgroup(bio)->verify) - return true; -#endif return dc->verify; } static void bio_csum(struct bio *bio, struct bkey *k) { - struct bio_vec *bv; + struct bio_vec bv; + struct 
bvec_iter iter; uint64_t csum = 0; - int i; - bio_for_each_segment(bv, bio, i) { - void *d = kmap(bv->bv_page) + bv->bv_offset; - csum = bch_crc64_update(csum, d, bv->bv_len); - kunmap(bv->bv_page); + bio_for_each_segment(bv, bio, iter) { + void *d = kmap(bv.bv_page) + bv.bv_offset; + csum = bch_crc64_update(csum, d, bv.bv_len); + kunmap(bv.bv_page); } k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); @@ -213,221 +52,97 @@ static void bio_csum(struct bio *bio, struct bkey *k) /* Insert data into cache */ -static void bio_invalidate(struct closure *cl) -{ - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct bio *bio = op->cache_bio; - - pr_debug("invalidating %i sectors from %llu", - bio_sectors(bio), (uint64_t) bio->bi_sector); - - while (bio_sectors(bio)) { - unsigned len = min(bio_sectors(bio), 1U << 14); - - if (bch_keylist_realloc(&op->keys, 0, op->c)) - goto out; - - bio->bi_sector += len; - bio->bi_size -= len << 9; - - bch_keylist_add(&op->keys, - &KEY(op->inode, bio->bi_sector, len)); - } - - op->insert_data_done = true; - bio_put(bio); -out: - continue_at(cl, bch_journal, bcache_wq); -} - -struct open_bucket { - struct list_head list; - struct task_struct *last; - unsigned sectors_free; - BKEY_PADDED(key); -}; - -void bch_open_buckets_free(struct cache_set *c) +static void bch_data_insert_keys(struct closure *cl) { - struct open_bucket *b; + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + atomic_t *journal_ref = NULL; + struct bkey *replace_key = op->replace ? &op->replace_key : NULL; + int ret; - while (!list_empty(&c->data_buckets)) { - b = list_first_entry(&c->data_buckets, - struct open_bucket, list); - list_del(&b->list); - kfree(b); - } -} - -int bch_open_buckets_alloc(struct cache_set *c) -{ - int i; - - spin_lock_init(&c->data_bucket_lock); + /* + * If we're looping, might already be waiting on + * another journal write - can't wait on more than one journal write at + * a time + * + * XXX: this looks wrong + */ +#if 0 + while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) + closure_sync(&s->cl); +#endif - for (i = 0; i < 6; i++) { - struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); - if (!b) - return -ENOMEM; + if (!op->replace) + journal_ref = bch_journal(op->c, &op->insert_keys, + op->flush_journal ? cl : NULL); - list_add(&b->list, &c->data_buckets); + ret = bch_btree_insert(op->c, &op->insert_keys, + journal_ref, replace_key); + if (ret == -ESRCH) { + op->replace_collision = true; + } else if (ret) { + op->error = -ENOMEM; + op->insert_data_done = true; } - return 0; -} + if (journal_ref) + atomic_dec_bug(journal_ref); -/* - * We keep multiple buckets open for writes, and try to segregate different - * write streams for better cache utilization: first we look for a bucket where - * the last write to it was sequential with the current write, and failing that - * we look for a bucket that was last used by the same task. - * - * The ideas is if you've got multiple tasks pulling data into the cache at the - * same time, you'll get better cache utilization if you try to segregate their - * data and preserve locality. - * - * For example, say you've starting Firefox at the same time you're copying a - * bunch of files. Firefox will likely end up being fairly hot and stay in the - * cache awhile, but the data you copied might not be; if you wrote all that - * data to the same buckets it'd get invalidated at the same time. 
- * - * Both of those tasks will be doing fairly random IO so we can't rely on - * detecting sequential IO to segregate their data, but going off of the task - * should be a sane heuristic. - */ -static struct open_bucket *pick_data_bucket(struct cache_set *c, - const struct bkey *search, - struct task_struct *task, - struct bkey *alloc) -{ - struct open_bucket *ret, *ret_task = NULL; + if (!op->insert_data_done) + continue_at(cl, bch_data_insert_start, op->wq); - list_for_each_entry_reverse(ret, &c->data_buckets, list) - if (!bkey_cmp(&ret->key, search)) - goto found; - else if (ret->last == task) - ret_task = ret; - - ret = ret_task ?: list_first_entry(&c->data_buckets, - struct open_bucket, list); -found: - if (!ret->sectors_free && KEY_PTRS(alloc)) { - ret->sectors_free = c->sb.bucket_size; - bkey_copy(&ret->key, alloc); - bkey_init(alloc); - } - - if (!ret->sectors_free) - ret = NULL; - - return ret; + bch_keylist_free(&op->insert_keys); + closure_return(cl); } -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many - * sectors were actually allocated. - * - * If s->writeback is true, will not fail. - */ -static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, - struct search *s) +static int bch_keylist_realloc(struct keylist *l, unsigned u64s, + struct cache_set *c) { - struct cache_set *c = s->op.c; - struct open_bucket *b; - BKEY_PADDED(key) alloc; - struct closure cl, *w = NULL; - unsigned i; - - if (s->writeback) { - closure_init_stack(&cl); - w = &cl; - } - - /* - * We might have to allocate a new bucket, which we can't do with a - * spinlock held. So if we have to allocate, we drop the lock, allocate - * and then retry. KEY_PTRS() indicates whether alloc points to - * allocated bucket(s). - */ - - bkey_init(&alloc.key); - spin_lock(&c->data_bucket_lock); - - while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { - unsigned watermark = s->op.write_prio - ? WATERMARK_MOVINGGC - : WATERMARK_NONE; - - spin_unlock(&c->data_bucket_lock); - - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) - return false; - - spin_lock(&c->data_bucket_lock); - } + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + u64s; /* - * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). If we allocated a bucket but - * didn't use it, drop the refcount bch_bucket_alloc_set() took: + * The journalling code doesn't handle the case where the keys to insert + * is bigger than an empty write: If we just return -ENOMEM here, + * bio_insert() and bio_invalidate() will insert the keys created so far + * and finish the rest when the keylist is empty. 
*/ - if (KEY_PTRS(&alloc.key)) - __bkey_put(c, &alloc.key); - - for (i = 0; i < KEY_PTRS(&b->key); i++) - EBUG_ON(ptr_stale(c, &b->key, i)); - - /* Set up the pointer to the space we're allocating: */ + if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) + return -ENOMEM; - for (i = 0; i < KEY_PTRS(&b->key); i++) - k->ptr[i] = b->key.ptr[i]; + return __bch_keylist_realloc(l, u64s); +} - sectors = min(sectors, b->sectors_free); +static void bch_data_invalidate(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio; - SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); - SET_KEY_SIZE(k, sectors); - SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + pr_debug("invalidating %i sectors from %llu", + bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector); - /* - * Move b to the end of the lru, and keep track of what this bucket was - * last used for: - */ - list_move_tail(&b->list, &c->data_buckets); - bkey_copy_key(&b->key, k); - b->last = s->task; + while (bio_sectors(bio)) { + unsigned sectors = min(bio_sectors(bio), + 1U << (KEY_SIZE_BITS - 1)); - b->sectors_free -= sectors; + if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) + goto out; - for (i = 0; i < KEY_PTRS(&b->key); i++) { - SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + bio->bi_iter.bi_sector += sectors; + bio->bi_iter.bi_size -= sectors << 9; - atomic_long_add(sectors, - &PTR_CACHE(c, &b->key, i)->sectors_written); + bch_keylist_add(&op->insert_keys, + &KEY(op->inode, bio->bi_iter.bi_sector, sectors)); } - if (b->sectors_free < c->sb.block_size) - b->sectors_free = 0; - - /* - * k takes refcounts on the buckets it points to until it's inserted - * into the btree, but if we're done with this bucket we just transfer - * get_data_bucket()'s refcount. - */ - if (b->sectors_free) - for (i = 0; i < KEY_PTRS(&b->key); i++) - atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); - - spin_unlock(&c->data_bucket_lock); - return true; + op->insert_data_done = true; + bio_put(bio); +out: + continue_at(cl, bch_data_insert_keys, op->wq); } -static void bch_insert_data_error(struct closure *cl) +static void bch_data_insert_error(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); /* * Our data write just errored, which means we've got a bunch of keys to @@ -438,35 +153,34 @@ static void bch_insert_data_error(struct closure *cl) * from the keys we'll accomplish just that. */ - struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; + struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys; - while (src != op->keys.top) { + while (src != op->insert_keys.top) { struct bkey *n = bkey_next(src); SET_KEY_PTRS(src, 0); - bkey_copy(dst, src); + memmove(dst, src, bkey_bytes(src)); dst = bkey_next(dst); src = n; } - op->keys.top = dst; + op->insert_keys.top = dst; - bch_journal(cl); + bch_data_insert_keys(cl); } -static void bch_insert_data_endio(struct bio *bio, int error) +static void bch_data_insert_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); if (error) { /* TODO: We could try to recover from this. 
*/ - if (s->writeback) - s->error = error; - else if (s->write) - set_closure_fn(cl, bch_insert_data_error, bcache_wq); + if (op->writeback) + op->error = error; + else if (!op->replace) + set_closure_fn(cl, bch_data_insert_error, op->wq); else set_closure_fn(cl, NULL, NULL); } @@ -474,20 +188,19 @@ static void bch_insert_data_endio(struct bio *bio, int error) bch_bbio_endio(op->c, bio, error, "writing data to cache"); } -static void bch_insert_data_loop(struct closure *cl) +static void bch_data_insert_start(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); - struct bio *bio = op->cache_bio, *n; - - if (op->skip) - return bio_invalidate(cl); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio, *n; if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { set_gc_sectors(op->c); - bch_queue_gc(op->c); + wake_up_gc(op->c); } + if (op->bypass) + return bch_data_invalidate(cl); + /* * Journal writes are marked REQ_FLUSH; if the original write was a * flush, it'll wait on the journal write. @@ -497,29 +210,30 @@ static void bch_insert_data_loop(struct closure *cl) do { unsigned i; struct bkey *k; - struct bio_set *split = s->d - ? s->d->bio_split : op->c->bio_split; + struct bio_set *split = op->c->bio_split; /* 1 for the device pointer and 1 for the chksum */ - if (bch_keylist_realloc(&op->keys, - 1 + (op->csum ? 1 : 0), + if (bch_keylist_realloc(&op->insert_keys, + 3 + (op->csum ? 1 : 0), op->c)) - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); - k = op->keys.top; + k = op->insert_keys.top; bkey_init(k); SET_KEY_INODE(k, op->inode); - SET_KEY_OFFSET(k, bio->bi_sector); + SET_KEY_OFFSET(k, bio->bi_iter.bi_sector); - if (!bch_alloc_sectors(k, bio_sectors(bio), s)) + if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), + op->write_point, op->write_prio, + op->writeback)) goto err; - n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); + n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split); - n->bi_end_io = bch_insert_data_endio; + n->bi_end_io = bch_data_insert_endio; n->bi_private = cl; - if (s->writeback) { + if (op->writeback) { SET_KEY_DIRTY(k, true); for (i = 0; i < KEY_PTRS(k); i++) @@ -532,17 +246,17 @@ static void bch_insert_data_loop(struct closure *cl) bio_csum(n, k); trace_bcache_cache_insert(k); - bch_keylist_push(&op->keys); + bch_keylist_push(&op->insert_keys); n->bi_rw |= REQ_WRITE; bch_submit_bbio(n, op->c, k, 0); } while (n != bio); op->insert_data_done = true; - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ - BUG_ON(s->writeback); + BUG_ON(op->writeback); /* * But if it's not a writeback write we'd rather just bail out if @@ -550,15 +264,15 @@ err: * we might be starving btree writes for gc or something. */ - if (s->write) { + if (!op->replace) { /* * Writethrough write: We can't complete the write until we've * updated the index. But we don't want to delay the write while * we wait for buckets to be freed up, so just invalidate the * rest of the write. 
*/ - op->skip = true; - return bio_invalidate(cl); + op->bypass = true; + return bch_data_invalidate(cl); } else { /* * From a cache miss, we can just insert the keys for the data @@ -567,15 +281,15 @@ err: op->insert_data_done = true; bio_put(bio); - if (!bch_keylist_empty(&op->keys)) - continue_at(cl, bch_journal, bcache_wq); + if (!bch_keylist_empty(&op->insert_keys)) + continue_at(cl, bch_data_insert_keys, op->wq); else closure_return(cl); } } /** - * bch_insert_data - stick some data in the cache + * bch_data_insert - stick some data in the cache * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only @@ -587,56 +301,176 @@ err: * data is written it calls bch_journal, and after the keys have been added to * the next journal write they're inserted into the btree. * - * It inserts the data in op->cache_bio; bi_sector is used for the key offset, + * It inserts the data in s->cache_bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * - * If op->skip is true, instead of inserting the data it invalidates the region - * of the cache represented by op->cache_bio and op->inode. + * If s->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by s->cache_bio and op->inode. */ -void bch_insert_data(struct closure *cl) +void bch_data_insert(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - bch_keylist_init(&op->keys); - bio_get(op->cache_bio); - bch_insert_data_loop(cl); + trace_bcache_write(op->bio, op->writeback, op->bypass); + + bch_keylist_init(&op->insert_keys); + bio_get(op->bio); + bch_data_insert_start(cl); } -void bch_btree_insert_async(struct closure *cl) +/* Congested? */ + +unsigned bch_get_congested(struct cache_set *c) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + int i; + long rand; - if (bch_btree_insert(op, op->c)) { - s->error = -ENOMEM; - op->insert_data_done = true; - } + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + if (i > 0) + i = fract_exp_two(i, 6); - if (op->insert_data_done) { - bch_keylist_free(&op->keys); - closure_return(cl); - } else - continue_at(cl, bch_insert_data_loop, bcache_wq); + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? 
i : 1; } -/* Common code for the make_request functions */ +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); -static void request_endio(struct bio *bio, int error) + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) { - struct closure *cl = bio->bi_private; + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} - if (error) { - struct search *s = container_of(cl, struct search, cl); - s->error = error; - /* Only cache read errors are recoverable */ - s->recoverable = false; +static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) +{ + struct cache_set *c = dc->disk.c; + unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); + struct task_struct *task = current; + struct io *i; + + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio->bi_rw & REQ_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + (bio->bi_rw & REQ_WRITE))) + goto skip; + + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; } - bio_put(bio); - closure_put(cl); + if (bypass_torture_test(dc)) { + if ((get_random_int() & 3) == 3) + goto skip; + else + goto rescale; + } + + if (!congested && !dc->sequential_cutoff) + goto rescale; + + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) + if (i->last == bio->bi_iter.bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_iter.bi_size > i->sequential) + i->sequential += bio->bi_iter.bi_size; + + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + + sectors = max(task->sequential_io, + task->sequential_io_avg) >> 9; + + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(bio); + goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(bio); + goto skip; + } + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return false; +skip: + bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); + return true; } -void bch_cache_read_endio(struct bio *bio, int error) +/* Cache lookup */ + +struct search { + /* Stack frame for bio_complete */ + struct closure cl; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + struct bcache_device *d; + + unsigned insert_bio_sectors; + unsigned recoverable:1; + unsigned write:1; + unsigned read_dirty_data:1; + + unsigned long start_time; + + struct btree_op op; + struct data_insert_op iop; +}; + +static void bch_cache_read_endio(struct bio *bio, int error) { struct bbio *b = container_of(bio, struct bbio, bio); struct closure *cl = bio->bi_private; @@ -650,13 +484,117 @@ void bch_cache_read_endio(struct bio *bio, int error) */ if (error) - s->error = error; - else if (ptr_stale(s->op.c, &b->key, 0)) { - atomic_long_inc(&s->op.c->cache_read_races); - s->error = -EINTR; 
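The check_should_bypass()/add_sequential() pair above decides when a request should skip the cache entirely: each task's bytes of contiguous I/O are accumulated, folded into an exponentially weighted moving average when a new run starts, and the request bypasses the cache once max(current run, average) reaches dc->sequential_cutoff (or the congestion value from bch_get_congested()). A simplified userspace sketch of that heuristic follows; the helper names and the sample workload are illustrative only, and the EWMA weight of 8 matches the ewma_add() call above.

/*
 * Simplified model of the sequential-I/O bypass heuristic in
 * check_should_bypass()/add_sequential().  ewma_add() mirrors the
 * kernel macro with weight 8: avg = (avg * 7 + val) / 8.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct task_io {
	uint64_t sequential_io;     /* bytes in the current sequential run */
	uint64_t sequential_io_avg; /* moving average over past runs */
};

static void ewma_add(uint64_t *avg, uint64_t val, unsigned shift)
{
	*avg = (*avg * ((1u << shift) - 1) + val) >> shift;
}

/* A new, non-contiguous request ends the current run. */
static void add_sequential(struct task_io *t)
{
	ewma_add(&t->sequential_io_avg, t->sequential_io, 3); /* divide by 8 */
	t->sequential_io = 0;
}

static bool should_bypass(struct task_io *t, uint64_t bio_bytes,
			  uint64_t cutoff_bytes)
{
	uint64_t sectors;

	t->sequential_io += bio_bytes;
	sectors = (t->sequential_io > t->sequential_io_avg
		   ? t->sequential_io : t->sequential_io_avg) >> 9;

	/* The real code also compares sectors against bch_get_congested(). */
	return cutoff_bytes && sectors >= cutoff_bytes >> 9;
}

int main(void)
{
	struct task_io t = { 0, 0 };
	uint64_t cutoff = 4 << 20; /* default sequential_cutoff: 4 MiB */
	int i;

	for (i = 0; i < 40; i++)
		printf("128k io %2d: bypass=%d\n", i,
		       should_bypass(&t, 128 << 10, cutoff));

	add_sequential(&t); /* run ended; fold it into the average */
	return 0;
}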
+ s->iop.error = error; + else if (!KEY_DIRTY(&b->key) && + ptr_stale(s->iop.c, &b->key, 0)) { + atomic_long_inc(&s->iop.c->cache_read_races); + s->iop.error = -EINTR; + } + + bch_bbio_endio(s->iop.c, bio, error, "reading from cache"); +} + +/* + * Read from a single key, handling the initial cache miss if the key starts in + * the middle of the bio + */ +static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *n, *bio = &s->bio.bio; + struct bkey *bio_key; + unsigned ptr; + + if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) + return MAP_CONTINUE; + + if (KEY_INODE(k) != s->iop.inode || + KEY_START(k) > bio->bi_iter.bi_sector) { + unsigned bio_sectors = bio_sectors(bio); + unsigned sectors = KEY_INODE(k) == s->iop.inode + ? min_t(uint64_t, INT_MAX, + KEY_START(k) - bio->bi_iter.bi_sector) + : INT_MAX; + + int ret = s->d->cache_miss(b, s, bio, sectors); + if (ret != MAP_CONTINUE) + return ret; + + /* if this was a complete miss we shouldn't get here */ + BUG_ON(bio_sectors <= sectors); } - bch_bbio_endio(s->op.c, bio, error, "reading from cache"); + if (!KEY_SIZE(k)) + return MAP_CONTINUE; + + /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; + + PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + + if (KEY_DIRTY(k)) + s->read_dirty_data = true; + + n = bio_next_split(bio, min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_iter.bi_sector), + GFP_NOIO, s->d->bio_split); + + bio_key = &container_of(n, struct bbio, bio)->key; + bch_bkey_copy_single_ptr(bio_key, k, ptr); + + bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); + + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; + + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ + + __bch_submit_bbio(n, b->c); + return n == bio ? 
MAP_DONE : MAP_CONTINUE; +} + +static void cache_lookup(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, iop.cl); + struct bio *bio = &s->bio.bio; + int ret; + + bch_btree_op_init(&s->op, -1); + + ret = bch_btree_map_keys(&s->op, s->iop.c, + &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), + cache_lookup_fn, MAP_END_KEY); + if (ret == -EAGAIN) + continue_at(cl, cache_lookup, bcache_wq); + + closure_return(cl); +} + +/* Common code for the make_request functions */ + +static void request_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + + if (error) { + struct search *s = container_of(cl, struct search, cl); + s->iop.error = error; + /* Only cache read errors are recoverable */ + s->recoverable = false; + } + + bio_put(bio); + closure_put(cl); } static void bio_complete(struct search *s) @@ -670,19 +608,21 @@ static void bio_complete(struct search *s) part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); part_stat_unlock(); - trace_bcache_request_end(s, s->orig_bio); - bio_endio(s->orig_bio, s->error); + trace_bcache_request_end(s->d, s->orig_bio); + bio_endio(s->orig_bio, s->iop.error); s->orig_bio = NULL; } } -static void do_bio_hook(struct search *s) +static void do_bio_hook(struct search *s, struct bio *orig_bio) { struct bio *bio = &s->bio.bio; - memcpy(bio, s->orig_bio, sizeof(struct bio)); + bio_init(bio); + __bio_clone_fast(bio, orig_bio); bio->bi_end_io = request_endio; bio->bi_private = &s->cl; + atomic_set(&bio->bi_cnt, 3); } @@ -691,61 +631,44 @@ static void search_free(struct closure *cl) struct search *s = container_of(cl, struct search, cl); bio_complete(s); - if (s->op.cache_bio) - bio_put(s->op.cache_bio); - - if (s->unaligned_bvec) - mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); + if (s->iop.bio) + bio_put(s->iop.bio); closure_debug_destroy(cl); mempool_free(s, s->d->c->search); } -static struct search *search_alloc(struct bio *bio, struct bcache_device *d) +static inline struct search *search_alloc(struct bio *bio, + struct bcache_device *d) { - struct bio_vec *bv; - struct search *s = mempool_alloc(d->c->search, GFP_NOIO); - memset(s, 0, offsetof(struct search, op.keys)); + struct search *s; - __closure_init(&s->cl, NULL); + s = mempool_alloc(d->c->search, GFP_NOIO); + + closure_init(&s->cl, NULL); + do_bio_hook(s, bio); - s->op.inode = d->id; - s->op.c = d->c; - s->d = d; - s->op.lock = -1; - s->task = current; s->orig_bio = bio; - s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; - s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; + s->cache_miss = NULL; + s->d = d; s->recoverable = 1; + s->write = (bio->bi_rw & REQ_WRITE) != 0; + s->read_dirty_data = 0; s->start_time = jiffies; - do_bio_hook(s); - if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { - bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); - memcpy(bv, bio_iovec(bio), - sizeof(struct bio_vec) * bio_segments(bio)); - - s->bio.bio.bi_io_vec = bv; - s->unaligned_bvec = 1; - } + s->iop.c = d->c; + s->iop.bio = NULL; + s->iop.inode = d->id; + s->iop.write_point = hash_long((unsigned long) current, 16); + s->iop.write_prio = 0; + s->iop.error = 0; + s->iop.flags = 0; + s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->iop.wq = bcache_wq; return s; } -static void btree_read_async(struct closure *cl) -{ - struct btree_op *op = container_of(cl, struct btree_op, cl); - - int ret = btree_root(search_recurse, op->c, op); - - if (ret == -EAGAIN) - continue_at(cl, 
btree_read_async, bcache_wq); - - closure_return(cl); -} - /* Cached devices */ static void cached_dev_bio_complete(struct closure *cl) @@ -759,190 +682,179 @@ static void cached_dev_bio_complete(struct closure *cl) /* Process reads */ -static void cached_dev_read_complete(struct closure *cl) +static void cached_dev_cache_miss_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->op.insert_collision) - bch_mark_cache_miss_collision(s); + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); - if (s->op.cache_bio) { + if (s->iop.bio) { int i; struct bio_vec *bv; - __bio_for_each_segment(bv, s->op.cache_bio, i, 0) + bio_for_each_segment_all(bv, s->iop.bio, i) __free_page(bv->bv_page); } cached_dev_bio_complete(cl); } -static void request_read_error(struct closure *cl) +static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - struct bio_vec *bv; - int i; + struct bio *bio = &s->bio.bio; if (s->recoverable) { /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); - s->error = 0; - bv = s->bio.bio.bi_io_vec; - do_bio_hook(s); - s->bio.bio.bi_io_vec = bv; - - if (!s->unaligned_bvec) - bio_for_each_segment(bv, s->orig_bio, i) - bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; - else - memcpy(s->bio.bio.bi_io_vec, - bio_iovec(s->orig_bio), - sizeof(struct bio_vec) * - bio_segments(s->orig_bio)); + s->iop.error = 0; + do_bio_hook(s, s->orig_bio); /* XXX: invalidate cache */ - closure_bio_submit(&s->bio.bio, &s->cl, s->d); + closure_bio_submit(bio, cl, s->d); } - continue_at(cl, cached_dev_read_complete, NULL); + continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void request_read_done(struct closure *cl) +static void cached_dev_read_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); /* - * s->cache_bio != NULL implies that we had a cache miss; cache_bio now - * contains data ready to be inserted into the cache. + * We had a cache miss; cache_bio now contains data ready to be inserted + * into the cache. 
* * First, we copy the data we just read from cache_bio's bounce buffers * to the buffers the original bio pointed to: */ - if (s->op.cache_bio) { - bio_reset(s->op.cache_bio); - s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; - s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; - bch_bio_map(s->op.cache_bio, NULL); + if (s->iop.bio) { + bio_reset(s->iop.bio); + s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; + s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; + s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + bch_bio_map(s->iop.bio, NULL); - bio_copy_data(s->cache_miss, s->op.cache_bio); + bio_copy_data(s->cache_miss, s->iop.bio); bio_put(s->cache_miss); s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable) - bch_data_verify(s); + if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) + bch_data_verify(dc, s->orig_bio); bio_complete(s); - if (s->op.cache_bio && - !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { - s->op.type = BTREE_REPLACE; - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + if (s->iop.bio && + !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { + BUG_ON(!s->iop.replace); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); } - continue_at(cl, cached_dev_read_complete, NULL); + continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void request_read_done_bh(struct closure *cl) +static void cached_dev_read_done_bh(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); - trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); + bch_mark_cache_accounting(s->iop.c, s->d, + !s->cache_miss, s->iop.bypass); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); - if (s->error) - continue_at_nobarrier(cl, request_read_error, bcache_wq); - else if (s->op.cache_bio || verify(dc, &s->bio.bio)) - continue_at_nobarrier(cl, request_read_done, bcache_wq); + if (s->iop.error) + continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); + else if (s->iop.bio || verify(dc, &s->bio.bio)) + continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else - continue_at_nobarrier(cl, cached_dev_read_complete, NULL); + continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); } static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { - int ret = 0; - unsigned reada; + int ret = MAP_CONTINUE; + unsigned reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - struct bio *miss; - - miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (miss == bio) - s->op.lookup_done = true; + struct bio *miss, *cache_bio; - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; - - if (s->cache_miss || s->op.skip) + if (s->cache_miss || s->iop.bypass) { + miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); + ret = miss == bio ? 
MAP_DONE : MAP_CONTINUE; goto out_submit; - - if (miss != bio || - (bio->bi_rw & REQ_RAHEAD) || - (bio->bi_rw & REQ_META) || - s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) - reada = 0; - else { - reada = min(dc->readahead >> 9, - sectors - bio_sectors(miss)); - - if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) - reada = bdev_sectors(miss->bi_bdev) - - bio_end_sector(miss); } - s->cache_bio_sectors = bio_sectors(miss) + reada; - s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, - DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), - dc->disk.bio_split); + if (!(bio->bi_rw & REQ_RAHEAD) && + !(bio->bi_rw & REQ_META) && + s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) + reada = min_t(sector_t, dc->readahead >> 9, + bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); - if (!s->op.cache_bio) - goto out_submit; + s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); + + s->iop.replace_key = KEY(s->iop.inode, + bio->bi_iter.bi_sector + s->insert_bio_sectors, + s->insert_bio_sectors); + + ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); + if (ret) + return ret; - s->op.cache_bio->bi_sector = miss->bi_sector; - s->op.cache_bio->bi_bdev = miss->bi_bdev; - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; + s->iop.replace = true; - s->op.cache_bio->bi_end_io = request_endio; - s->op.cache_bio->bi_private = &s->cl; + miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); /* btree_search_recurse()'s btree iterator is no good anymore */ - ret = -EINTR; - if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) - goto out_put; + ret = miss == bio ? MAP_DONE : -EINTR; - bch_bio_map(s->op.cache_bio, NULL); - if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) + cache_bio = bio_alloc_bioset(GFP_NOWAIT, + DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), + dc->disk.bio_split); + if (!cache_bio) + goto out_submit; + + cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; + cache_bio->bi_bdev = miss->bi_bdev; + cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + + cache_bio->bi_end_io = request_endio; + cache_bio->bi_private = &s->cl; + + bch_bio_map(cache_bio, NULL); + if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; - s->cache_miss = miss; - bio_get(s->op.cache_bio); + if (reada) + bch_mark_cache_readahead(s->iop.c, s->d); - closure_bio_submit(s->op.cache_bio, &s->cl, s->d); + s->cache_miss = miss; + s->iop.bio = cache_bio; + bio_get(cache_bio); + closure_bio_submit(cache_bio, &s->cl, s->d); return ret; out_put: - bio_put(s->op.cache_bio); - s->op.cache_bio = NULL; + bio_put(cache_bio); out_submit: + miss->bi_end_io = request_endio; + miss->bi_private = &s->cl; closure_bio_submit(miss, &s->cl, s->d); return ret; } -static void request_read(struct cached_dev *dc, struct search *s) +static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - check_should_skip(dc, s); - closure_call(&s->op.cl, btree_read_async, NULL, cl); - - continue_at(cl, request_read_done_bh, NULL); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); + continue_at(cl, cached_dev_read_done_bh, NULL); } /* Process writes */ @@ -956,88 +868,84 @@ static void cached_dev_write_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static void request_write(struct cached_dev *dc, struct search *s) +static void cached_dev_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; - struct bkey start, end; - start = KEY(dc->disk.id, bio->bi_sector, 0); - 
end = KEY(dc->disk.id, bio_end_sector(bio), 0); + struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0); + struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); - check_should_skip(dc, s); down_read_non_owner(&dc->writeback_lock); - if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { - s->op.skip = false; - s->writeback = true; + /* + * We overlap with some dirty data undergoing background + * writeback, force this write to writeback + */ + s->iop.bypass = false; + s->iop.writeback = true; } + /* + * Discards aren't _required_ to do anything, so skipping if + * check_overlapping returned true is ok + * + * But check_overlapping drops dirty keys for which io hasn't started, + * so we still want to call it. + */ if (bio->bi_rw & REQ_DISCARD) - goto skip; + s->iop.bypass = true; if (should_writeback(dc, s->orig_bio, cache_mode(dc, bio), - s->op.skip)) { - s->op.skip = false; - s->writeback = true; + s->iop.bypass)) { + s->iop.bypass = false; + s->iop.writeback = true; } - if (s->op.skip) - goto skip; - - trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); + if (s->iop.bypass) { + s->iop.bio = s->orig_bio; + bio_get(s->iop.bio); - if (!s->writeback) { - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); - - closure_bio_submit(bio, cl, s->d); - } else { + if (!(bio->bi_rw & REQ_DISCARD) || + blk_queue_discard(bdev_get_queue(dc->bdev))) + closure_bio_submit(bio, cl, s->d); + } else if (s->iop.writeback) { bch_writeback_add(dc); + s->iop.bio = bio; - if (s->op.flush_journal) { + if (bio->bi_rw & REQ_FLUSH) { /* Also need to send a flush to the backing device */ - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, + dc->disk.bio_split); - bio->bi_size = 0; - bio->bi_vcnt = 0; - closure_bio_submit(bio, cl, s->d); - } else { - s->op.cache_bio = bio; + flush->bi_rw = WRITE_FLUSH; + flush->bi_bdev = bio->bi_bdev; + flush->bi_end_io = request_endio; + flush->bi_private = cl; + + closure_bio_submit(flush, cl, s->d); } - } -out: - closure_call(&s->op.cl, bch_insert_data, NULL, cl); - continue_at(cl, cached_dev_write_complete, NULL); -skip: - s->op.skip = true; - s->op.cache_bio = s->orig_bio; - bio_get(s->op.cache_bio); + } else { + s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - if ((bio->bi_rw & REQ_DISCARD) && - !blk_queue_discard(bdev_get_queue(dc->bdev))) - goto out; + closure_bio_submit(bio, cl, s->d); + } - closure_bio_submit(bio, cl, s->d); - goto out; + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + continue_at(cl, cached_dev_write_complete, NULL); } -static void request_nodata(struct cached_dev *dc, struct search *s) +static void cached_dev_nodata(struct closure *cl) { - struct closure *cl = &s->cl; + struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; - if (bio->bi_rw & REQ_DISCARD) { - request_write(dc, s); - return; - } - - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); + /* If it's a flush, we send the flush to the backing device too */ closure_bio_submit(bio, cl, s->d); continue_at(cl, cached_dev_bio_complete, NULL); @@ -1045,134 +953,6 @@ static void request_nodata(struct cached_dev *dc, struct search *s) /* Cached devices - read & write stuff */ -unsigned 
bch_get_congested(struct cache_set *c) -{ - int i; - long rand; - - if (!c->congested_read_threshold_us && - !c->congested_write_threshold_us) - return 0; - - i = (local_clock_us() - c->congested_last_us) / 1024; - if (i < 0) - return 0; - - i += atomic_read(&c->congested); - if (i >= 0) - return 0; - - i += CONGESTED_MAX; - - if (i > 0) - i = fract_exp_two(i, 6); - - rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); - - return i > 0 ? i : 1; -} - -static void add_sequential(struct task_struct *t) -{ - ewma_add(t->sequential_io_avg, - t->sequential_io, 8, 0); - - t->sequential_io = 0; -} - -static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) -{ - return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; -} - -static void check_should_skip(struct cached_dev *dc, struct search *s) -{ - struct cache_set *c = s->op.c; - struct bio *bio = &s->bio.bio; - unsigned mode = cache_mode(dc, bio); - unsigned sectors, congested = bch_get_congested(c); - - if (atomic_read(&dc->disk.detaching) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || - (bio->bi_rw & REQ_DISCARD)) - goto skip; - - if (mode == CACHE_MODE_NONE || - (mode == CACHE_MODE_WRITEAROUND && - (bio->bi_rw & REQ_WRITE))) - goto skip; - - if (bio->bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { - pr_debug("skipping unaligned io"); - goto skip; - } - - if (!congested && !dc->sequential_cutoff) - goto rescale; - - if (!congested && - mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - - if (dc->sequential_merge) { - struct io *i; - - spin_lock(&dc->io_lock); - - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && - time_before(jiffies, i->jiffies)) - goto found; - - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(s->task); - i->sequential = 0; -found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - s->task->sequential_io = i->sequential; - - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); - - spin_unlock(&dc->io_lock); - } else { - s->task->sequential_io = bio->bi_size; - - add_sequential(s->task); - } - - sectors = max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9; - - if (dc->sequential_cutoff && - sectors >= dc->sequential_cutoff >> 9) { - trace_bcache_bypass_sequential(s->orig_bio); - goto skip; - } - - if (congested && sectors >= congested) { - trace_bcache_bypass_congested(s->orig_bio); - goto skip; - } - -rescale: - bch_rescale_priorities(c, bio_sectors(bio)); - return; -skip: - bch_mark_sectors_bypassed(s, bio_sectors(bio)); - s->op.skip = true; -} - static void cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; @@ -1186,18 +966,28 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) part_stat_unlock(); bio->bi_bdev = dc->bdev; - bio->bi_sector += dc->sb.data_offset; + bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { s = search_alloc(bio, d); - trace_bcache_request_start(s, bio); + trace_bcache_request_start(s->d, bio); + + if (!bio->bi_iter.bi_size) { + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + cached_dev_nodata, + bcache_wq); + } else { + s->iop.bypass = check_should_bypass(dc, bio); - if (!bio_has_data(bio)) - 
request_nodata(dc, s); - else if (rw) - request_write(dc, s); - else - request_read(dc, s); + if (rw) + cached_dev_write(dc, s); + else + cached_dev_read(dc, s); + } } else { if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(dc->bdev))) @@ -1254,27 +1044,28 @@ void bch_cached_dev_request_init(struct cached_dev *dc) static int flash_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { - struct bio_vec *bv; - int i; + unsigned bytes = min(sectors, bio_sectors(bio)) << 9; - /* Zero fill bio */ + swap(bio->bi_iter.bi_size, bytes); + zero_fill_bio(bio); + swap(bio->bi_iter.bi_size, bytes); - bio_for_each_segment(bv, bio, i) { - unsigned j = min(bv->bv_len >> 9, sectors); + bio_advance(bio, bytes); - void *p = kmap(bv->bv_page); - memset(p + bv->bv_offset, 0, j << 9); - kunmap(bv->bv_page); + if (!bio->bi_iter.bi_size) + return MAP_DONE; - sectors -= j; - } + return MAP_CONTINUE; +} - bio_advance(bio, min(sectors << 9, bio->bi_size)); +static void flash_dev_nodata(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); - if (!bio->bi_size) - s->op.lookup_done = true; + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); - return 0; + continue_at(cl, search_free, NULL); } static void flash_dev_make_request(struct request_queue *q, struct bio *bio) @@ -1293,23 +1084,28 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) cl = &s->cl; bio = &s->bio.bio; - trace_bcache_request_start(s, bio); + trace_bcache_request_start(s->d, bio); - if (bio_has_data(bio) && !rw) { - closure_call(&s->op.cl, btree_read_async, NULL, cl); - } else if (bio_has_data(bio) || s->op.skip) { - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, - &KEY(d->id, bio->bi_sector, 0), + if (!bio->bi_iter.bi_size) { + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + flash_dev_nodata, + bcache_wq); + } else if (rw) { + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, + &KEY(d->id, bio->bi_iter.bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); - s->writeback = true; - s->op.cache_bio = bio; + s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; + s->iop.writeback = true; + s->iop.bio = bio; - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); } else { - /* No data - probably a cache flush */ - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); } continue_at(cl, search_free, NULL); @@ -1349,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d) void bch_request_exit(void) { -#ifdef CONFIG_CGROUP_BCACHE - cgroup_unload_subsys(&bcache_subsys); -#endif if (bch_search_cache) kmem_cache_destroy(bch_search_cache); } @@ -1362,11 +1155,5 @@ int __init bch_request_init(void) if (!bch_search_cache) return -ENOMEM; -#ifdef CONFIG_CGROUP_BCACHE - cgroup_load_subsys(&bcache_subsys); - init_bch_cgroup(&bcache_default_cgroup); - - cgroup_add_cftypes(&bcache_subsys, bch_files); -#endif return 0; } diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 57dc4784f4f..1ff36875c2b 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -1,62 +1,43 @@ #ifndef _BCACHE_REQUEST_H_ #define _BCACHE_REQUEST_H_ -#include <linux/cgroup.h> - -struct search { - /* Stack frame for bio_complete */ +struct data_insert_op { struct closure cl; + struct cache_set *c; + struct bio *bio; + struct workqueue_struct *wq; - 
struct bcache_device *d; - struct task_struct *task; + unsigned inode; + uint16_t write_point; + uint16_t write_prio; + short error; - struct bbio bio; - struct bio *orig_bio; - struct bio *cache_miss; - unsigned cache_bio_sectors; + union { + uint16_t flags; - unsigned recoverable:1; - unsigned unaligned_bvec:1; + struct { + unsigned bypass:1; + unsigned writeback:1; + unsigned flush_journal:1; + unsigned csum:1; - unsigned write:1; - unsigned writeback:1; + unsigned replace:1; + unsigned replace_collision:1; - /* IO error returned to s->bio */ - short error; - unsigned long start_time; + unsigned insert_data_done:1; + }; + }; - /* Anything past op->keys won't get zeroed in do_bio_hook */ - struct btree_op op; + struct keylist insert_keys; + BKEY_PADDED(replace_key); }; -void bch_cache_read_endio(struct bio *, int); unsigned bch_get_congested(struct cache_set *); -void bch_insert_data(struct closure *cl); -void bch_btree_insert_async(struct closure *); -void bch_cache_read_endio(struct bio *, int); - -void bch_open_buckets_free(struct cache_set *); -int bch_open_buckets_alloc(struct cache_set *); +void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); void bch_flash_dev_request_init(struct bcache_device *d); extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; -struct bch_cgroup { -#ifdef CONFIG_CGROUP_BCACHE - struct cgroup_subsys_state css; -#endif - /* - * We subtract one from the index into bch_cache_modes[], so that - * default == -1; this makes it so the rest match up with d->cache_mode, - * and we use d->cache_mode if cgrp->cache_mode < 0 - */ - short cache_mode; - bool verify; - struct cache_stat_collector stats; -}; - -struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio); - #endif /* _BCACHE_REQUEST_H_ */ diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index b8730e714d6..0ca072c20d0 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -7,7 +7,6 @@ #include "bcache.h" #include "stats.h" #include "btree.h" -#include "request.h" #include "sysfs.h" /* @@ -196,35 +195,33 @@ static void mark_cache_stats(struct cache_stat_collector *stats, atomic_inc(&stats->cache_bypass_misses); } -void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) +void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, + bool hit, bool bypass) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); mark_cache_stats(&dc->accounting.collector, hit, bypass); - mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); -#ifdef CONFIG_CGROUP_BCACHE - mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); -#endif + mark_cache_stats(&c->accounting.collector, hit, bypass); } -void bch_mark_cache_readahead(struct search *s) +void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_readaheads); - atomic_inc(&s->op.c->accounting.collector.cache_readaheads); + atomic_inc(&c->accounting.collector.cache_readaheads); } -void bch_mark_cache_miss_collision(struct search *s) +void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, 
disk); atomic_inc(&dc->accounting.collector.cache_miss_collisions); - atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); + atomic_inc(&c->accounting.collector.cache_miss_collisions); } -void bch_mark_sectors_bypassed(struct search *s, int sectors) +void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, + int sectors) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); - atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); + atomic_add(sectors, &c->accounting.collector.sectors_bypassed); } void bch_cache_accounting_init(struct cache_accounting *acc, diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index c7c7a8fd29f..adbff141c88 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -38,7 +38,9 @@ struct cache_accounting { struct cache_stats day; }; -struct search; +struct cache_set; +struct cached_dev; +struct bcache_device; void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent); @@ -50,9 +52,10 @@ void bch_cache_accounting_clear(struct cache_accounting *acc); void bch_cache_accounting_destroy(struct cache_accounting *acc); -void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); -void bch_mark_cache_readahead(struct search *s); -void bch_mark_cache_miss_collision(struct search *s); -void bch_mark_sectors_bypassed(struct search *s, int sectors); +void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *, + bool, bool); +void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *); +void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *); +void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int); #endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 547c4c57b05..926ded8ccbf 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -9,6 +9,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include "request.h" #include "writeback.h" @@ -16,6 +17,7 @@ #include <linux/buffer_head.h> #include <linux/debugfs.h> #include <linux/genhd.h> +#include <linux/idr.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> @@ -45,21 +47,13 @@ const char * const bch_cache_modes[] = { NULL }; -struct uuid_entry_v0 { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - uint32_t pad; -}; - static struct kobject *bcache_kobj; struct mutex bch_register_lock; LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); -static int bcache_major, bcache_minor; +static int bcache_major; +static DEFINE_IDA(bcache_minor); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; @@ -232,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error) struct cached_dev *dc = bio->bi_private; /* XXX: error checking */ - closure_put(&dc->sb_write.cl); + closure_put(&dc->sb_write); } static void __write_super(struct cache_sb *sb, struct bio *bio) @@ -240,9 +234,9 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); unsigned i; - bio->bi_sector = SB_SECTOR; - bio->bi_rw = REQ_SYNC|REQ_META; - bio->bi_size = SB_SIZE; + bio->bi_iter.bi_sector = SB_SECTOR; + bio->bi_rw = REQ_SYNC|REQ_META; + bio->bi_iter.bi_size = SB_SIZE; bch_bio_map(bio, NULL); 
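The struct data_insert_op added to request.h earlier in this patch overlays its per-request state bits on a single uint16_t, which is what lets search_alloc() reset every flag with the single assignment s->iop.flags = 0. Below is a stand-alone sketch of that layout; the field list is trimmed and, as with any bitfield, the exact bit positions are implementation-defined.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Bitfields sharing storage with one integer (C11 anonymous members). */
struct insert_flags {
	union {
		uint16_t flags;
		struct {
			unsigned bypass:1;
			unsigned writeback:1;
			unsigned flush_journal:1;
			unsigned csum:1;
			unsigned replace:1;
			unsigned replace_collision:1;
			unsigned insert_data_done:1;
		};
	};
};

int main(void)
{
	struct insert_flags f = { .flags = 0 };

	f.writeback = 1;
	f.replace = 1;
	printf("raw flags = 0x%04x\n", f.flags);

	f.flags = 0; /* one store clears every flag at once */
	assert(!f.writeback && !f.replace);
	printf("cleared   = 0x%04x\n", f.flags);
	return 0;
}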
out->offset = cpu_to_le64(sb->offset); @@ -270,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) submit_bio(REQ_WRITE, bio); } +static void bch_write_bdev_super_unlock(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); + + up(&dc->sb_write_mutex); +} + void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) { - struct closure *cl = &dc->sb_write.cl; + struct closure *cl = &dc->sb_write; struct bio *bio = &dc->sb_bio; - closure_lock(&dc->sb_write, parent); + down(&dc->sb_write_mutex); + closure_init(cl, parent); bio_reset(bio); bio->bi_bdev = dc->bdev; @@ -285,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_get(cl); __write_super(&dc->sb, bio); - closure_return(cl); + closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } static void write_super_endio(struct bio *bio, int error) @@ -293,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error) struct cache *ca = bio->bi_private; bch_count_io_errors(ca, error, "writing superblock"); - closure_put(&ca->set->sb_write.cl); + closure_put(&ca->set->sb_write); +} + +static void bcache_write_super_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, sb_write); + + up(&c->sb_write_mutex); } void bcache_write_super(struct cache_set *c) { - struct closure *cl = &c->sb_write.cl; + struct closure *cl = &c->sb_write; struct cache *ca; unsigned i; - closure_lock(&c->sb_write, &c->cl); + down(&c->sb_write_mutex); + closure_init(cl, &c->cl); c->sb.seq++; @@ -324,7 +334,7 @@ void bcache_write_super(struct cache_set *c) __write_super(&ca->sb, bio); } - closure_return(cl); + closure_return_with_destructor(cl, bcache_write_super_unlock); } /* UUID io */ @@ -332,29 +342,37 @@ void bcache_write_super(struct cache_set *c) static void uuid_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); cache_set_err_on(error, c, "accessing uuids"); bch_bbio_free(bio, c); closure_put(cl); } +static void uuid_io_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); + + up(&c->uuid_write_mutex); +} + static void uuid_io(struct cache_set *c, unsigned long rw, struct bkey *k, struct closure *parent) { - struct closure *cl = &c->uuid_write.cl; + struct closure *cl = &c->uuid_write; struct uuid_entry *u; unsigned i; char buf[80]; BUG_ON(!parent); - closure_lock(&c->uuid_write, parent); + down(&c->uuid_write_mutex); + closure_init(cl, parent); for (i = 0; i < KEY_PTRS(k); i++) { struct bio *bio = bch_bbio_alloc(c); bio->bi_rw = REQ_SYNC|REQ_META|rw; - bio->bi_size = KEY_SIZE(k) << 9; + bio->bi_iter.bi_size = KEY_SIZE(k) << 9; bio->bi_end_io = uuid_endio; bio->bi_private = cl; @@ -366,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, break; } - bch_bkey_to_text(buf, sizeof(buf), k); + bch_extent_to_text(buf, sizeof(buf), k); pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? 
"wrote" : "read", buf); for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) @@ -375,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw, u - c->uuids, u->uuid, u->label, u->first_reg, u->last_reg, u->invalidated); - closure_return(cl); + closure_return_with_destructor(cl, uuid_io_unlock); } static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) { struct bkey *k = &j->uuid_bucket; - if (__bch_ptr_invalid(c, 1, k)) + if (__bch_btree_ptr_invalid(c, k)) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); @@ -427,7 +445,7 @@ static int __uuid_write(struct cache_set *c) lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) return 1; SET_KEY_SIZE(&k.key, c->sb.bucket_size); @@ -435,7 +453,7 @@ static int __uuid_write(struct cache_set *c) closure_sync(&cl); bkey_copy(&c->uuid_bucket, &k.key); - __bkey_put(c, &k.key); + bkey_put(c, &k.key); return 0; } @@ -510,10 +528,10 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) closure_init_stack(cl); - bio->bi_sector = bucket * ca->sb.bucket_size; - bio->bi_bdev = ca->bdev; - bio->bi_rw = REQ_SYNC|REQ_META|rw; - bio->bi_size = bucket_bytes(ca); + bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_SYNC|REQ_META|rw; + bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = prio_endio; bio->bi_private = ca; @@ -523,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) closure_sync(cl); } -#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \ - fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused) - void bch_prio_write(struct cache *ca) { int i; @@ -536,17 +551,13 @@ void bch_prio_write(struct cache *ca) lockdep_assert_held(&ca->set->bucket_lock); - for (b = ca->buckets; - b < ca->buckets + ca->sb.nbuckets; b++) - b->disk_gen = b->gen; - ca->disk_buckets->seq++; atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written); - pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), + // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; @@ -562,10 +573,10 @@ void bch_prio_write(struct cache *ca) } p->next_bucket = ca->prio_buckets[i + 1]; - p->magic = pset_magic(ca); + p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -583,14 +594,17 @@ void bch_prio_write(struct cache *ca) mutex_lock(&ca->set->bucket_lock); - ca->need_save_prio = 0; - /* * Don't want the old priorities to get garbage collected until after we * finish writing the new ones, and they're journalled */ - for (i = 0; i < prio_buckets(ca); i++) + for (i = 0; i < prio_buckets(ca); i++) { + if (ca->prio_last_buckets[i]) + __bch_bucket_free(ca, + &ca->buckets[ca->prio_last_buckets[i]]); + ca->prio_last_buckets[i] = ca->prio_buckets[i]; + } } static void prio_read(struct cache *ca, uint64_t bucket) @@ -613,7 +627,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); - if (p->magic != 
pset_magic(ca)) + if (p->magic != pset_magic(&ca->sb)) pr_warn("bad magic reading priorities"); bucket = p->next_bucket; @@ -621,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) } b->prio = le16_to_cpu(d->prio); - b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen; + b->gen = b->last_gc = d->gen; } } @@ -630,7 +644,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) static int open_dev(struct block_device *b, fmode_t mode) { struct bcache_device *d = b->bd_disk->private_data; - if (atomic_read(&d->closing)) + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; closure_get(&d->cl); @@ -659,20 +673,24 @@ static const struct block_device_operations bcache_ops = { void bcache_device_stop(struct bcache_device *d) { - if (!atomic_xchg(&d->closing, 1)) + if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) closure_queue(&d->cl); } static void bcache_device_unlink(struct bcache_device *d) { - unsigned i; - struct cache *ca; + lockdep_assert_held(&bch_register_lock); - sysfs_remove_link(&d->c->kobj, d->name); - sysfs_remove_link(&d->kobj, "cache"); + if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { + unsigned i; + struct cache *ca; - for_each_cache(ca, d->c, i) - bd_unlink_disk_holder(ca->bdev, d->disk); + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + + for_each_cache(ca, d->c, i) + bd_unlink_disk_holder(ca->bdev, d->disk); + } } static void bcache_device_link(struct bcache_device *d, struct cache_set *c, @@ -696,19 +714,16 @@ static void bcache_device_detach(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&d->detaching)) { + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { struct uuid_entry *u = d->c->uuids + d->id; SET_UUID_FLASH_ONLY(u, 0); memcpy(u->uuid, invalid_uuid, 16); u->invalidated = cpu_to_le32(get_seconds()); bch_uuid_write(d->c); - - atomic_set(&d->detaching, 0); } - if (!d->flush_done) - bcache_device_unlink(d); + bcache_device_unlink(d); d->c->devices[d->id] = NULL; closure_put(&d->c->caching); @@ -739,14 +754,18 @@ static void bcache_device_free(struct bcache_device *d) del_gendisk(d->disk); if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); - if (d->disk) + if (d->disk) { + ida_simple_remove(&bcache_minor, d->disk->first_minor); put_disk(d->disk); + } bio_split_pool_free(&d->bio_split_hook); - if (d->unaligned_bvec) - mempool_destroy(d->unaligned_bvec); if (d->bio_split) bioset_free(d->bio_split); + if (is_vmalloc_addr(d->full_dirty_stripes)) + vfree(d->full_dirty_stripes); + else + kfree(d->full_dirty_stripes); if (is_vmalloc_addr(d->stripe_sectors_dirty)) vfree(d->stripe_sectors_dirty); else @@ -760,15 +779,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, { struct request_queue *q; size_t n; + int minor; - if (!d->stripe_size_bits) - d->stripe_size_bits = 31; + if (!d->stripe_size) + d->stripe_size = 1 << 31; - d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> - d->stripe_size_bits; + d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) + if (!d->nr_stripes || + d->nr_stripes > INT_MAX || + d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { + pr_err("nr_stripes too large"); return -ENOMEM; + } n = d->nr_stripes * sizeof(atomic_t); d->stripe_sectors_dirty = n < PAGE_SIZE << 6 @@ -777,22 +800,36 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->stripe_sectors_dirty) return 
-ENOMEM; + n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); + d->full_dirty_stripes = n < PAGE_SIZE << 6 + ? kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->full_dirty_stripes) + return -ENOMEM; + + minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); + if (minor < 0) + return minor; + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, - sizeof(struct bio_vec) * BIO_MAX_PAGES)) || bio_split_pool_init(&d->bio_split_hook) || - !(d->disk = alloc_disk(1)) || - !(q = blk_alloc_queue(GFP_KERNEL))) + !(d->disk = alloc_disk(1))) { + ida_simple_remove(&bcache_minor, minor); return -ENOMEM; + } set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); d->disk->major = bcache_major; - d->disk->first_minor = bcache_minor++; + d->disk->first_minor = minor; d->disk->fops = &bcache_ops; d->disk->private_data = d; + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + return -ENOMEM; + blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; @@ -802,6 +839,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, q->limits.max_segment_size = UINT_MAX; q->limits.max_segments = BIO_MAX_PAGES; q->limits.max_discard_sectors = UINT_MAX; + q->limits.discard_granularity = 512; q->limits.io_min = block_size; q->limits.logical_block_size = block_size; q->limits.physical_block_size = block_size; @@ -874,7 +912,7 @@ static void cached_dev_detach_finish(struct work_struct *w) struct closure cl; closure_init_stack(&cl); - BUG_ON(!atomic_read(&dc->disk.detaching)); + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(atomic_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -888,6 +926,8 @@ static void cached_dev_detach_finish(struct work_struct *w) bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); + clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); + mutex_unlock(&bch_register_lock); pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); @@ -900,10 +940,10 @@ void bch_cached_dev_detach(struct cached_dev *dc) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&dc->disk.closing)) + if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return; - if (atomic_xchg(&dc->disk.detaching, 1)) + if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) return; /* @@ -1030,6 +1070,7 @@ static void cached_dev_free(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); cancel_delayed_work_sync(&dc->writeback_rate_update); + kthread_stop(dc->writeback_thread); mutex_lock(&bch_register_lock); @@ -1058,11 +1099,7 @@ static void cached_dev_flush(struct closure *cl) struct bcache_device *d = &dc->disk; mutex_lock(&bch_register_lock); - d->flush_done = 1; - - if (d->c) - bcache_device_unlink(d); - + bcache_device_unlink(d); mutex_unlock(&bch_register_lock); bch_cache_accounting_destroy(&dc->accounting); @@ -1083,12 +1120,11 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); INIT_WORK(&dc->detach, cached_dev_detach_finish); - closure_init_unlocked(&dc->sb_write); + sema_init(&dc->sb_write_mutex, 1); INIT_LIST_HEAD(&dc->io_lru); spin_lock_init(&dc->io_lock); bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); - dc->sequential_merge = true; dc->sequential_cutoff = 4 << 20; 
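bcache_device_init() above drops the old stripe_size_bits shift in favour of a stripe_size in sectors, defaulting it to 1 << 31 and rejecting stripe counts that would not fit in an int or in an array of per-stripe counters. Roughly the same sizing logic in stand-alone form, with sizeof(int) standing in for atomic_t and an example device size rather than anything taken from the driver:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP_ULL(n, d) (((n) + (d) - 1) / (d))

/*
 * Mirror of the nr_stripes checks: the count must be non-zero, fit in an
 * int, and leave room for one dirty-sector counter per stripe.
 */
static int stripe_count(uint64_t sectors, uint64_t stripe_size,
			uint64_t *nr_stripes)
{
	uint64_t n;

	if (!stripe_size)
		stripe_size = 1ULL << 31;	/* default from the patch */

	n = DIV_ROUND_UP_ULL(sectors, stripe_size);

	if (!n || n > INT_MAX || n > SIZE_MAX / sizeof(int))
		return -1;			/* "nr_stripes too large" */

	*nr_stripes = n;
	return 0;
}

int main(void)
{
	uint64_t n;

	/* e.g. a 1 TiB backing device whose queue reports 512 KiB io_opt */
	if (!stripe_count((1ULL << 40) >> 9, (512 << 10) >> 9, &n))
		printf("nr_stripes = %llu\n", (unsigned long long) n);
	return 0;
}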
for (io = dc->io; io < dc->io + RECENT_IO; io++) { @@ -1096,6 +1132,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } + dc->disk.stripe_size = q->limits.io_opt >> 9; + + if (dc->disk.stripe_size) + dc->partial_stripes_expensive = + q->limits.raid_partial_stripes_expensive; + ret = bcache_device_init(&dc->disk, block_size, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); if (ret) @@ -1260,7 +1302,8 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) { va_list args; - if (test_bit(CACHE_SET_STOPPING, &c->flags)) + if (c->on_error != ON_ERROR_PANIC && + test_bit(CACHE_SET_STOPPING, &c->flags)) return false; /* XXX: we can be called from atomic context @@ -1275,6 +1318,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) printk(", disabling caching\n"); + if (c->on_error == ON_ERROR_PANIC) + panic("panic forced after error\n"); + bch_cache_set_unregister(c); return true; } @@ -1303,9 +1349,11 @@ static void cache_set_free(struct closure *cl) if (ca) kobject_put(&ca->kobj); + bch_bset_sort_state_free(&c->sort); free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); - free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); + if (c->moving_gc_wq) + destroy_workqueue(c->moving_gc_wq); if (c->bio_split) bioset_free(c->bio_split); if (c->fill_iter) @@ -1339,18 +1387,28 @@ static void cache_set_flush(struct closure *cl) kobject_put(&c->internal); kobject_del(&c->kobj); + if (c->gc_thread) + kthread_stop(c->gc_thread); + if (!IS_ERR_OR_NULL(c->root)) list_add(&c->root->list, &c->btree_cache); /* Should skip this if we're unregistering because of an error */ - list_for_each_entry(b, &c->btree_cache, list) + list_for_each_entry(b, &c->btree_cache, list) { + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write(b, NULL); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); + } for_each_cache(ca, c, i) if (ca->alloc_thread) kthread_stop(ca->alloc_thread); + cancel_delayed_work_sync(&c->journal.work); + /* flush last journal entry if needed */ + c->journal.work.work.func(&c->journal.work.work); + closure_return(cl); } @@ -1426,19 +1484,21 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); - c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; + c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); - c->sort_crit_factor = int_sqrt(c->btree_pages); - + sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); - mutex_init(&c->sort_lock); - spin_lock_init(&c->sort_time_lock); - closure_init_unlocked(&c->sb_write); - closure_init_unlocked(&c->uuid_write); - spin_lock_init(&c->btree_read_time_lock); + init_waitqueue_head(&c->btree_cache_wait); + init_waitqueue_head(&c->bucket_wait); + sema_init(&c->uuid_write_mutex, 1); + + spin_lock_init(&c->btree_gc_time.lock); + spin_lock_init(&c->btree_split_time.lock); + spin_lock_init(&c->btree_read_time.lock); + bch_moving_init_cache_set(c); INIT_LIST_HEAD(&c->list); @@ -1461,11 +1521,12 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bucket_pages(c))) || !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || + 
!(c->moving_gc_wq = create_workqueue("bcache_gc")) || bch_journal_alloc(c) || bch_btree_cache_alloc(c) || - bch_open_buckets_alloc(c)) + bch_open_buckets_alloc(c) || + bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) goto err; c->congested_read_threshold_us = 2000; @@ -1483,11 +1544,10 @@ static void run_cache_set(struct cache_set *c) const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; struct cache *ca; + struct closure cl; unsigned i; - struct btree_op op; - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + closure_init_stack(&cl); for_each_cache(ca, c, i) c->nbuckets += ca->sb.nbuckets; @@ -1498,7 +1558,7 @@ static void run_cache_set(struct cache_set *c) struct jset *j; err = "cannot allocate memory for journal"; - if (bch_journal_read(c, &journal, &op)) + if (bch_journal_read(c, &journal)) goto err; pr_debug("btree_journal_read() done"); @@ -1522,27 +1582,27 @@ static void run_cache_set(struct cache_set *c) k = &j->btree_root; err = "bad btree root"; - if (__bch_ptr_invalid(c, j->btree_level + 1, k)) + if (__bch_btree_ptr_invalid(c, k)) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, k, j->btree_level, &op); + c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true); if (IS_ERR_OR_NULL(c->root)) goto err; list_del_init(&c->root->list); rw_unlock(true, c->root); - err = uuid_read(c, j, &op.cl); + err = uuid_read(c, j, &cl); if (err) goto err; err = "error in recovery"; - if (bch_btree_check(c, &op)) + if (bch_btree_check(c)) goto err; bch_journal_mark(c, &journal); - bch_btree_gc_finish(c); + bch_initial_gc_finish(c); pr_debug("btree_check() done"); /* @@ -1570,11 +1630,9 @@ static void run_cache_set(struct cache_set *c) if (j->version < BCACHE_JSET_VERSION_UUID) __uuid_write(c); - bch_journal_replay(c, &journal, &op); + bch_journal_replay(c, &journal); } else { pr_notice("invalidating existing data"); - /* Don't want invalidate_buckets() to queue a gc yet */ - closure_lock(&c->gc, NULL); for_each_cache(ca, c, i) { unsigned j; @@ -1586,7 +1644,7 @@ static void run_cache_set(struct cache_set *c) ca->sb.d[j] = ca->sb.first_bucket + j; } - bch_btree_gc_finish(c); + bch_initial_gc_finish(c); err = "error starting allocator thread"; for_each_cache(ca, c, i) @@ -1600,15 +1658,17 @@ static void run_cache_set(struct cache_set *c) err = "cannot allocate new UUID bucket"; if (__uuid_write(c)) - goto err_unlock_gc; + goto err; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, 0, &op.cl); + c->root = bch_btree_node_alloc(c, NULL, 0); if (IS_ERR_OR_NULL(c->root)) - goto err_unlock_gc; + goto err; + mutex_lock(&c->root->write_lock); bkey_copy_key(&c->root->key, &MAX_KEY); - bch_btree_node_write(c->root, &op.cl); + bch_btree_node_write(c->root, &cl); + mutex_unlock(&c->root->write_lock); bch_btree_set_root(c->root); rw_unlock(true, c->root); @@ -1621,14 +1681,14 @@ static void run_cache_set(struct cache_set *c) SET_CACHE_SYNC(&c->sb, true); bch_journal_next(&c->journal); - bch_journal_meta(c, &op.cl); - - /* Unlock */ - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); + bch_journal_meta(c, &cl); } - closure_sync(&op.cl); + err = "error starting gc thread"; + if (bch_gc_thread_start(c)) + goto err; + + closure_sync(&cl); c->sb.last_mount = get_seconds(); bcache_write_super(c); @@ -1638,19 +1698,16 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); return; -err_unlock_gc: - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); err: - closure_sync(&op.cl); + closure_sync(&cl); /* 
XXX: test this, it's broken */ - bch_cache_set_error(c, err); + bch_cache_set_error(c, "%s", err); } static bool can_attach_cache(struct cache *ca, struct cache_set *c) { return ca->sb.block_size == c->sb.block_size && - ca->sb.bucket_size == c->sb.block_size && + ca->sb.bucket_size == c->sb.bucket_size && ca->sb.nr_in_set == c->sb.nr_in_set; } @@ -1721,12 +1778,11 @@ err: void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); + unsigned i; if (ca->set) ca->set->cache[ca->sb.nr_this_dev] = NULL; - bch_cache_allocator_exit(ca); - bio_split_pool_free(&ca->bio_split_hook); free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); @@ -1734,9 +1790,10 @@ void bch_cache_release(struct kobject *kobj) vfree(ca->buckets); free_heap(&ca->heap); - free_fifo(&ca->unused); free_fifo(&ca->free_inc); - free_fifo(&ca->free); + + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&ca->free[i]); if (ca->sb_bio.bi_inline_vecs[0].bv_page) put_page(ca->sb_bio.bi_io_vec[0].bv_page); @@ -1758,18 +1815,17 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - INIT_LIST_HEAD(&ca->discards); - bio_init(&ca->journal.bio); ca->journal.bio.bi_max_vecs = 8; ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; - free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; - free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); + free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; - if (!init_fifo(&ca->free, free, GFP_KERNEL) || + if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || - !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || !init_heap(&ca->heap, free << 3, GFP_KERNEL) || !(ca->buckets = vzalloc(sizeof(struct bucket) * ca->sb.nbuckets)) || @@ -1784,13 +1840,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) for_each_bucket(b, ca) atomic_set(&b->pin, 0); - if (bch_cache_allocator_init(ca)) - goto err; - return 0; -err: - kobject_put(&ca->kobj); - return -ENOMEM; } static void register_cache(struct cache_sb *sb, struct page *sb_page, @@ -1819,7 +1869,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page, if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) goto err; + mutex_lock(&bch_register_lock); err = register_cache_set(ca); + mutex_unlock(&bch_register_lock); + if (err) goto err; @@ -1881,8 +1934,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY; - mutex_lock(&bch_register_lock); - if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) goto err; @@ -1915,7 +1966,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!dc) goto err_close; + mutex_lock(&bch_register_lock); register_bdev(sb, sb_page, bdev, dc); + mutex_unlock(&bch_register_lock); } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) @@ -1928,7 +1981,6 @@ out: put_page(sb_page); kfree(sb); kfree(path); - mutex_unlock(&bch_register_lock); module_put(THIS_MODULE); return ret; @@ -2006,14 +2058,13 @@ static struct notifier_block reboot = { static void bcache_exit(void) { bch_debug_exit(); - bch_writeback_exit(); bch_request_exit(); - 
bch_btree_exit(); if (bcache_kobj) kobject_put(bcache_kobj); if (bcache_wq) destroy_workqueue(bcache_wq); - unregister_blkdev(bcache_major, "bcache"); + if (bcache_major) + unregister_blkdev(bcache_major, "bcache"); unregister_reboot_notifier(&reboot); } @@ -2037,9 +2088,7 @@ static int __init bcache_init(void) if (!(bcache_wq = create_workqueue("bcache")) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || sysfs_create_files(bcache_kobj, files) || - bch_btree_init() || bch_request_init() || - bch_writeback_init() || bch_debug_init(bcache_kobj)) goto err; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4fe6ab2fbe2..b3ff57d61dd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = { NULL }; +static const char * const error_actions[] = { + "unregister", + "panic", + NULL +}; + write_attribute(attach); write_attribute(detach); write_attribute(unregister); @@ -48,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms); sysfs_time_stats_attribute(btree_split, sec, us); sysfs_time_stats_attribute(btree_sort, ms, us); sysfs_time_stats_attribute(btree_read, ms, us); -sysfs_time_stats_attribute(try_harder, ms, us); read_attribute(btree_nodes); read_attribute(btree_used_percent); @@ -66,7 +71,6 @@ rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); rw_attribute(sequential_cutoff); -rw_attribute(sequential_merge); rw_attribute(data_csum); rw_attribute(cache_mode); rw_attribute(writeback_metadata); @@ -78,7 +82,6 @@ rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); rw_attribute(writeback_rate_d_term); rw_attribute(writeback_rate_p_term_inverse); -rw_attribute(writeback_rate_d_smooth); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -90,12 +93,14 @@ rw_attribute(discard); rw_attribute(running); rw_attribute(label); rw_attribute(readahead); +rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); rw_attribute(verify); +rw_attribute(bypass_torture_test); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); -rw_attribute(freelist_percent); +rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); @@ -116,44 +121,54 @@ SHOW(__bch_cached_dev) sysfs_printf(data_csum, "%i", dc->disk.data_csum); var_printf(verify, "%i"); + var_printf(bypass_torture_test, "%i"); var_printf(writeback_metadata, "%i"); var_printf(writeback_running, "%i"); var_print(writeback_delay); var_print(writeback_percent); - sysfs_print(writeback_rate, dc->writeback_rate.rate); + sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); var_print(writeback_rate_update_seconds); var_print(writeback_rate_d_term); var_print(writeback_rate_p_term_inverse); - var_print(writeback_rate_d_smooth); if (attr == &sysfs_writeback_rate_debug) { + char rate[20]; char dirty[20]; - char derivative[20]; char target[20]; - bch_hprint(dirty, - bcache_dev_sectors_dirty(&dc->disk) << 9); - bch_hprint(derivative, dc->writeback_rate_derivative << 9); + char proportional[20]; + char derivative[20]; + char change[20]; + s64 next_io; + + bch_hprint(rate, dc->writeback_rate.rate << 9); + bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); + bch_hprint(proportional,dc->writeback_rate_proportional << 9); + bch_hprint(derivative, dc->writeback_rate_derivative << 9); 
+ bch_hprint(change, dc->writeback_rate_change << 9); + + next_io = div64_s64(dc->writeback_rate.next - local_clock(), + NSEC_PER_MSEC); return sprintf(buf, - "rate:\t\t%u\n" - "change:\t\t%i\n" + "rate:\t\t%s/sec\n" "dirty:\t\t%s\n" + "target:\t\t%s\n" + "proportional:\t%s\n" "derivative:\t%s\n" - "target:\t\t%s\n", - dc->writeback_rate.rate, - dc->writeback_rate_change, - dirty, derivative, target); + "change:\t\t%s/sec\n" + "next io:\t%llims\n", + rate, dirty, target, proportional, + derivative, change, next_io); } sysfs_hprint(dirty_data, bcache_dev_sectors_dirty(&dc->disk) << 9); - sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); + sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); var_printf(partial_stripes_expensive, "%u"); - var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); var_hprint(readahead); @@ -181,25 +196,25 @@ STORE(__cached_dev) struct kobj_uevent_env *env; #define d_strtoul(var) sysfs_strtoul(var, dc->var) +#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) sysfs_strtoul(data_csum, dc->disk.data_csum); d_strtoul(verify); + d_strtoul(bypass_torture_test); d_strtoul(writeback_metadata); d_strtoul(writeback_running); d_strtoul(writeback_delay); - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, 1000000); + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); - d_strtoul(writeback_rate_update_seconds); + sysfs_strtoul_clamp(writeback_rate, + dc->writeback_rate.rate, 1, INT_MAX); + + d_strtoul_nonzero(writeback_rate_update_seconds); d_strtoul(writeback_rate_d_term); - d_strtoul(writeback_rate_p_term_inverse); - sysfs_strtoul_clamp(writeback_rate_p_term_inverse, - dc->writeback_rate_p_term_inverse, 1, INT_MAX); - d_strtoul(writeback_rate_d_smooth); + d_strtoul_nonzero(writeback_rate_p_term_inverse); - d_strtoul(sequential_merge); d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); @@ -223,8 +238,13 @@ STORE(__cached_dev) } if (attr == &sysfs_label) { - /* note: endlines are preserved */ - memcpy(dc->sb.label, buf, SB_LABEL_SIZE); + if (size > SB_LABEL_SIZE) + return -EINVAL; + memcpy(dc->sb.label, buf, size); + if (size < SB_LABEL_SIZE) + dc->sb.label[size] = '\0'; + if (size && dc->sb.label[size - 1] == '\n') + dc->sb.label[size - 1] = '\0'; bch_write_bdev_super(dc, NULL); if (dc->disk.c) { memcpy(dc->disk.c->uuids[dc->disk.id].label, @@ -300,13 +320,11 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_update_seconds, &sysfs_writeback_rate_d_term, &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_d_smooth, &sysfs_writeback_rate_debug, &sysfs_dirty_data, &sysfs_stripe_size, &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, - &sysfs_sequential_merge, &sysfs_clear_stats, &sysfs_running, &sysfs_state, @@ -314,6 +332,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_readahead, #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, + &sysfs_bypass_torture_test, #endif NULL }; @@ -361,7 +380,7 @@ STORE(__bch_flash_dev) } if (attr == &sysfs_unregister) { - atomic_set(&d->detaching, 1); + set_bit(BCACHE_DEV_DETACHING, &d->flags); bcache_device_stop(d); } @@ -380,81 +399,123 @@ static struct attribute *bch_flash_dev_files[] = { }; KTYPE(bch_flash_dev); -SHOW(__bch_cache_set) +struct bset_stats_op { + struct btree_op op; + size_t nodes; + struct bset_stats stats; +}; + +static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b) { - unsigned root_usage(struct cache_set *c) - { - 
unsigned bytes = 0; - struct bkey *k; - struct btree *b; - struct btree_iter iter; + struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); - goto lock_root; + op->nodes++; + bch_btree_keys_stats(&b->keys, &op->stats); - do { - rw_unlock(false, b); -lock_root: - b = c->root; - rw_lock(false, b, b->level); - } while (b != c->root); + return MAP_CONTINUE; +} - for_each_key_filter(b, k, &iter, bch_ptr_bad) - bytes += bkey_bytes(k); +static int bch_bset_print_stats(struct cache_set *c, char *buf) +{ + struct bset_stats_op op; + int ret; + + memset(&op, 0, sizeof(op)); + bch_btree_op_init(&op.op, -1); + + ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats); + if (ret < 0) + return ret; + + return snprintf(buf, PAGE_SIZE, + "btree nodes: %zu\n" + "written sets: %zu\n" + "unwritten sets: %zu\n" + "written key bytes: %zu\n" + "unwritten key bytes: %zu\n" + "floats: %zu\n" + "failed: %zu\n", + op.nodes, + op.stats.sets_written, op.stats.sets_unwritten, + op.stats.bytes_written, op.stats.bytes_unwritten, + op.stats.floats, op.stats.failed); +} + +static unsigned bch_root_usage(struct cache_set *c) +{ + unsigned bytes = 0; + struct bkey *k; + struct btree *b; + struct btree_iter iter; + + goto lock_root; + do { rw_unlock(false, b); +lock_root: + b = c->root; + rw_lock(false, b, b->level); + } while (b != c->root); - return (bytes * 100) / btree_bytes(c); - } + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) + bytes += bkey_bytes(k); - size_t cache_size(struct cache_set *c) - { - size_t ret = 0; - struct btree *b; + rw_unlock(false, b); - mutex_lock(&c->bucket_lock); - list_for_each_entry(b, &c->btree_cache, list) - ret += 1 << (b->page_order + PAGE_SHIFT); + return (bytes * 100) / btree_bytes(c); +} - mutex_unlock(&c->bucket_lock); - return ret; - } +static size_t bch_cache_size(struct cache_set *c) +{ + size_t ret = 0; + struct btree *b; - unsigned cache_max_chain(struct cache_set *c) - { - unsigned ret = 0; - struct hlist_head *h; + mutex_lock(&c->bucket_lock); + list_for_each_entry(b, &c->btree_cache, list) + ret += 1 << (b->keys.page_order + PAGE_SHIFT); - mutex_lock(&c->bucket_lock); + mutex_unlock(&c->bucket_lock); + return ret; +} - for (h = c->bucket_hash; - h < c->bucket_hash + (1 << BUCKET_HASH_BITS); - h++) { - unsigned i = 0; - struct hlist_node *p; +static unsigned bch_cache_max_chain(struct cache_set *c) +{ + unsigned ret = 0; + struct hlist_head *h; - hlist_for_each(p, h) - i++; + mutex_lock(&c->bucket_lock); - ret = max(ret, i); - } + for (h = c->bucket_hash; + h < c->bucket_hash + (1 << BUCKET_HASH_BITS); + h++) { + unsigned i = 0; + struct hlist_node *p; - mutex_unlock(&c->bucket_lock); - return ret; - } + hlist_for_each(p, h) + i++; - unsigned btree_used(struct cache_set *c) - { - return div64_u64(c->gc_stats.key_bytes * 100, - (c->gc_stats.nodes ?: 1) * btree_bytes(c)); + ret = max(ret, i); } - unsigned average_key_size(struct cache_set *c) - { - return c->gc_stats.nkeys - ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) - : 0; - } + mutex_unlock(&c->bucket_lock); + return ret; +} + +static unsigned bch_btree_used(struct cache_set *c) +{ + return div64_u64(c->gc_stats.key_bytes * 100, + (c->gc_stats.nodes ?: 1) * btree_bytes(c)); +} + +static unsigned bch_average_key_size(struct cache_set *c) +{ + return c->gc_stats.nkeys + ? 
div64_u64(c->gc_stats.data, c->gc_stats.nkeys) + : 0; +} +SHOW(__bch_cache_set) +{ struct cache_set *c = container_of(kobj, struct cache_set, kobj); sysfs_print(synchronous, CACHE_SYNC(&c->sb)); @@ -462,22 +523,20 @@ lock_root: sysfs_hprint(bucket_size, bucket_bytes(c)); sysfs_hprint(block_size, block_bytes(c)); sysfs_print(tree_depth, c->root->level); - sysfs_print(root_usage_percent, root_usage(c)); + sysfs_print(root_usage_percent, bch_root_usage(c)); - sysfs_hprint(btree_cache_size, cache_size(c)); - sysfs_print(btree_cache_max_chain, cache_max_chain(c)); + sysfs_hprint(btree_cache_size, bch_cache_size(c)); + sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c)); sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); - sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); + sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); - sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); - sysfs_print(btree_used_percent, btree_used(c)); + sysfs_print(btree_used_percent, bch_btree_used(c)); sysfs_print(btree_nodes, c->gc_stats.nodes); - sysfs_hprint(dirty_data, c->gc_stats.dirty); - sysfs_hprint(average_key_size, average_key_size(c)); + sysfs_hprint(average_key_size, bch_average_key_size(c)); sysfs_print(cache_read_races, atomic_long_read(&c->cache_read_races)); @@ -487,6 +546,10 @@ lock_root: sysfs_print(writeback_keys_failed, atomic_long_read(&c->writeback_keys_failed)); + if (attr == &sysfs_errors) + return bch_snprint_string_list(buf, PAGE_SIZE, error_actions, + c->on_error); + /* See count_io_errors for why 88 */ sysfs_print(io_error_halflife, c->error_decay * 88); sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); @@ -501,6 +564,8 @@ lock_root: sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); + sysfs_printf(expensive_debug_checks, + "%i", c->expensive_debug_checks); sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); @@ -550,7 +615,7 @@ STORE(__bch_cache_set) } if (attr == &sysfs_trigger_gc) - bch_queue_gc(c); + wake_up_gc(c); if (attr == &sysfs_prune_cache) { struct shrink_control sc; @@ -564,6 +629,15 @@ STORE(__bch_cache_set) sysfs_strtoul(congested_write_threshold_us, c->congested_write_threshold_us); + if (attr == &sysfs_errors) { + ssize_t v = bch_read_string_list(buf, error_actions); + + if (v < 0) + return v; + + c->on_error = v; + } + if (attr == &sysfs_io_error_limit) c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; @@ -574,6 +648,7 @@ STORE(__bch_cache_set) sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); sysfs_strtoul(verify, c->verify); sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); @@ -613,8 +688,8 @@ static struct attribute *bch_cache_set_files[] = { &sysfs_cache_available_percent, &sysfs_average_key_size, - &sysfs_dirty_data, + &sysfs_errors, &sysfs_io_error_limit, &sysfs_io_error_halflife, &sysfs_congested, @@ -632,7 
+707,6 @@ static struct attribute *bch_cache_set_internal_files[] = { sysfs_time_stats_attribute_list(btree_split, sec, us) sysfs_time_stats_attribute_list(btree_sort, ms, us) sysfs_time_stats_attribute_list(btree_read, ms, us) - sysfs_time_stats_attribute_list(try_harder, ms, us) &sysfs_btree_nodes, &sysfs_btree_used_percent, @@ -648,6 +722,7 @@ static struct attribute *bch_cache_set_internal_files[] = { #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, &sysfs_key_merging_disabled, + &sysfs_expensive_debug_checks, #endif &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, @@ -674,9 +749,6 @@ SHOW(__bch_cache) sysfs_print(io_errors, atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); - sysfs_print(freelist_percent, ca->free.size * 100 / - ((size_t) ca->sb.nbuckets)); - if (attr == &sysfs_cache_replacement_policy) return bch_snprint_string_list(buf, PAGE_SIZE, cache_replacement_policies, @@ -686,7 +758,9 @@ SHOW(__bch_cache) int cmp(const void *l, const void *r) { return *((uint16_t *) r) - *((uint16_t *) l); } - size_t n = ca->sb.nbuckets, i, unused, btree; + struct bucket *b; + size_t n = ca->sb.nbuckets, i; + size_t unused = 0, available = 0, dirty = 0, meta = 0; uint64_t sum = 0; /* Compute 31 quantiles */ uint16_t q[31], *p, *cached; @@ -697,6 +771,17 @@ SHOW(__bch_cache) return -ENOMEM; mutex_lock(&ca->set->bucket_lock); + for_each_bucket(b, ca) { + if (!GC_SECTORS_USED(b)) + unused++; + if (GC_MARK(b) == GC_MARK_RECLAIMABLE) + available++; + if (GC_MARK(b) == GC_MARK_DIRTY) + dirty++; + if (GC_MARK(b) == GC_MARK_METADATA) + meta++; + } + for (i = ca->sb.first_bucket; i < n; i++) p[i] = ca->buckets[i].prio; mutex_unlock(&ca->set->bucket_lock); @@ -711,10 +796,7 @@ SHOW(__bch_cache) while (cached < p + n && *cached == BTREE_PRIO) - cached++; - - btree = cached - p; - n -= btree; + cached++, n--; for (i = 0; i < n; i++) sum += INITIAL_PRIO - cached[i]; @@ -730,12 +812,16 @@ SHOW(__bch_cache) ret = scnprintf(buf, PAGE_SIZE, "Unused: %zu%%\n" + "Clean: %zu%%\n" + "Dirty: %zu%%\n" "Metadata: %zu%%\n" "Average: %llu\n" "Sectors per Q: %zu\n" "Quantiles: [", unused * 100 / (size_t) ca->sb.nbuckets, - btree * 100 / (size_t) ca->sb.nbuckets, sum, + available * 100 / (size_t) ca->sb.nbuckets, + dirty * 100 / (size_t) ca->sb.nbuckets, + meta * 100 / (size_t) ca->sb.nbuckets, sum, n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); for (i = 0; i < ARRAY_SIZE(q); i++) @@ -783,32 +869,6 @@ STORE(__bch_cache) } } - if (attr == &sysfs_freelist_percent) { - DECLARE_FIFO(long, free); - long i; - size_t p = strtoul_or_return(buf); - - p = clamp_t(size_t, - ((size_t) ca->sb.nbuckets * p) / 100, - roundup_pow_of_two(ca->sb.nbuckets) >> 9, - ca->sb.nbuckets / 2); - - if (!init_fifo_exact(&free, p, GFP_KERNEL)) - return -ENOMEM; - - mutex_lock(&ca->set->bucket_lock); - - fifo_move(&free, &ca->free); - fifo_swap(&free, &ca->free); - - mutex_unlock(&ca->set->bucket_lock); - - while (fifo_pop(&free, i)) - atomic_dec(&ca->buckets[i].pin); - - free_fifo(&free); - } - if (attr == &sysfs_clear_stats) { atomic_long_set(&ca->sectors_written, 0); atomic_long_set(&ca->btree_sectors_written, 0); @@ -832,7 +892,6 @@ static struct attribute *bch_cache_files[] = { &sysfs_metadata_written, &sysfs_io_errors, &sysfs_clear_stats, - &sysfs_freelist_percent, &sysfs_cache_replacement_policy, NULL }; diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index f7b6c197f90..b7820b0d262 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -1,6 +1,5 @@ #include "bcache.h" #include "btree.h" -#include 
"request.h" #include <linux/blktrace_api.h> #include <linux/module.h> @@ -46,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 98eb81159a2..db3ae4c2b22 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -168,10 +168,14 @@ int bch_parse_uuid(const char *s, char *uuid) void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) { - uint64_t now = local_clock(); - uint64_t duration = time_after64(now, start_time) + uint64_t now, duration, last; + + spin_lock(&stats->lock); + + now = local_clock(); + duration = time_after64(now, start_time) ? now - start_time : 0; - uint64_t last = time_after64(now, stats->last) + last = time_after64(now, stats->last) ? now - stats->last : 0; stats->max_duration = max(stats->max_duration, duration); @@ -188,13 +192,30 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) } stats->last = now ?: 1; + + spin_unlock(&stats->lock); } -unsigned bch_next_delay(struct ratelimit *d, uint64_t done) +/** + * bch_next_delay() - increment @d by the amount of work done, and return how + * long to delay until the next time to do some work. + * + * @d - the struct bch_ratelimit to update + * @done - the amount of work done, in arbitrary units + * + * Returns the amount of time to delay by, in jiffies + */ +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); - d->next += div_u64(done, d->rate); + d->next += div_u64(done * NSEC_PER_SEC, d->rate); + + if (time_before64(now + NSEC_PER_SEC, d->next)) + d->next = now + NSEC_PER_SEC; + + if (time_after64(now - NSEC_PER_SEC * 2, d->next)) + d->next = now - NSEC_PER_SEC * 2; return time_after64(d->next, now) ? div_u64(d->next - now, NSEC_PER_SEC / HZ) @@ -203,10 +224,10 @@ unsigned bch_next_delay(struct ratelimit *d, uint64_t done) void bch_bio_map(struct bio *bio, void *base) { - size_t size = bio->bi_size; + size_t size = bio->bi_iter.bi_size; struct bio_vec *bv = bio->bi_io_vec; - BUG_ON(!bio->bi_size); + BUG_ON(!bio->bi_iter.bi_size); BUG_ON(bio->bi_vcnt); bv->bv_offset = base ? 
((unsigned long) base) % PAGE_SIZE : 0; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1ae2a73ad85..ac7d0d1f70d 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -2,6 +2,7 @@ #ifndef _BCACHE_UTIL_H #define _BCACHE_UTIL_H +#include <linux/blkdev.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/llist.h> @@ -15,28 +16,20 @@ struct closure; -#ifdef CONFIG_BCACHE_EDEBUG +#ifdef CONFIG_BCACHE_DEBUG +#define EBUG_ON(cond) BUG_ON(cond) #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -#else /* EDEBUG */ +#else /* DEBUG */ +#define EBUG_ON(cond) do { if (cond); } while (0) #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) #endif -#define BITMASK(name, type, field, offset, size) \ -static inline uint64_t name(const type *k) \ -{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ - \ -static inline void SET_##name(type *k, uint64_t v) \ -{ \ - k->field &= ~(~((uint64_t) ~0 << size) << offset); \ - k->field |= v << offset; \ -} - #define DECLARE_HEAP(type, name) \ struct { \ size_t size, used; \ @@ -120,7 +113,7 @@ do { \ _r; \ }) -#define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) +#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL) #define heap_full(h) ((h)->used == (h)->size) @@ -388,6 +381,7 @@ ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[ ssize_t bch_read_string_list(const char *buf, const char * const list[]); struct time_stats { + spinlock_t lock; /* * all fields are in nanoseconds, averages are ewmas stored left shifted * by 8 @@ -400,6 +394,11 @@ struct time_stats { void bch_time_stats_update(struct time_stats *stats, uint64_t time); +static inline unsigned local_clock_us(void) +{ + return local_clock() >> 10; +} + #define NSEC_PER_ns 1L #define NSEC_PER_us NSEC_PER_USEC #define NSEC_PER_ms NSEC_PER_MSEC @@ -450,17 +449,23 @@ read_attribute(name ## _last_ ## frequency_units) (ewma) >> factor; \ }) -struct ratelimit { +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ uint64_t next; + + /* + * Rate at which we want to do work, in units per nanosecond + * The units here correspond to the units passed to bch_next_delay() + */ unsigned rate; }; -static inline void ratelimit_reset(struct ratelimit *d) +static inline void bch_ratelimit_reset(struct bch_ratelimit *d) { d->next = local_clock(); } -unsigned bch_next_delay(struct ratelimit *d, uint64_t done); +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done); #define __DIV_SAFE(n, d, zero) \ ({ \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 22cbff55162..f4300e4c011 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -11,18 +11,11 @@ #include "debug.h" #include "writeback.h" +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/kthread.h> #include <trace/events/bcache.h> -static struct workqueue_struct *dirty_wq; - -static void read_dirty(struct closure *); - -struct dirty_io { - struct closure cl; - struct cached_dev *dc; - struct bio bio; -}; - /* Rate limiting */ static void __update_writeback_rate(struct cached_dev *dc) @@ -37,44 +30,43 @@ static void __update_writeback_rate(struct cached_dev *dc) /* PD controller */ - int change = 0; - int64_t error; int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t derivative = dirty - dc->disk.sectors_dirty_last; + int64_t proportional = dirty - target; 
+ int64_t change; dc->disk.sectors_dirty_last = dirty; - derivative *= dc->writeback_rate_d_term; - derivative = clamp(derivative, -dirty, dirty); + /* Scale to sectors per second */ - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - dc->writeback_rate_d_smooth, 0); + proportional *= dc->writeback_rate_update_seconds; + proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - /* Avoid divide by zero */ - if (!target) - goto out; + derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - error = div64_s64((dirty + derivative - target) << 8, target); + derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, + (dc->writeback_rate_d_term / + dc->writeback_rate_update_seconds) ?: 1, 0); - change = div_s64((dc->writeback_rate.rate * error) >> 8, - dc->writeback_rate_p_term_inverse); + derivative *= dc->writeback_rate_d_term; + derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); + + change = proportional + derivative; /* Don't increase writeback rate if the device isn't keeping up */ if (change > 0 && time_after64(local_clock(), - dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) + dc->writeback_rate.next + NSEC_PER_MSEC)) change = 0; dc->writeback_rate.rate = - clamp_t(int64_t, dc->writeback_rate.rate + change, + clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, 1, NSEC_PER_MSEC); -out: + + dc->writeback_rate_proportional = proportional; dc->writeback_rate_derivative = derivative; dc->writeback_rate_change = change; dc->writeback_rate_target = target; - - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); } static void update_writeback_rate(struct work_struct *work) @@ -90,48 +82,25 @@ static void update_writeback_rate(struct work_struct *work) __update_writeback_rate(dc); up_read(&dc->writeback_lock); + + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); } static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { - if (atomic_read(&dc->disk.detaching) || + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) return 0; - return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); + return bch_next_delay(&dc->writeback_rate, sectors); } -/* Background writeback */ - -static bool dirty_pred(struct keybuf *buf, struct bkey *k) -{ - return KEY_DIRTY(k); -} - -static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) -{ - uint64_t stripe; - unsigned nr_sectors = KEY_SIZE(k); - struct cached_dev *dc = container_of(buf, struct cached_dev, - writeback_keys); - unsigned stripe_size = 1 << dc->disk.stripe_size_bits; - - if (!KEY_DIRTY(k)) - return false; - - stripe = KEY_START(k) >> dc->disk.stripe_size_bits; - while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != - stripe_size) - return false; - - if (nr_sectors <= stripe_size) - return true; - - nr_sectors -= stripe_size; - stripe++; - } -} +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + struct bio bio; +}; static void dirty_init(struct keybuf_key *w) { @@ -142,138 +111,13 @@ static void dirty_init(struct keybuf_key *w) if (!io->dc->writeback_percent) bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - bio->bi_size = KEY_SIZE(&w->key) << 9; + bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); bio->bi_private = w; bio->bi_io_vec = bio->bi_inline_vecs; bch_bio_map(bio, NULL); } -static void refill_dirty(struct 
closure *cl) -{ - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - struct keybuf *buf = &dc->writeback_keys; - bool searched_from_start = false; - struct bkey end = MAX_KEY; - SET_KEY_INODE(&end, dc->disk.id); - - if (!atomic_read(&dc->disk.detaching) && - !dc->writeback_running) - closure_return(cl); - - down_write(&dc->writeback_lock); - - if (!atomic_read(&dc->has_dirty)) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - - up_write(&dc->writeback_lock); - closure_return(cl); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - - if (dc->partial_stripes_expensive) { - uint64_t i; - - for (i = 0; i < dc->disk.nr_stripes; i++) - if (atomic_read(dc->disk.stripe_sectors_dirty + i) == - 1 << dc->disk.stripe_size_bits) - goto full_stripes; - - goto normal_refill; -full_stripes: - bch_refill_keybuf(dc->disk.c, buf, &end, - dirty_full_stripe_pred); - } else { -normal_refill: - bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { - /* Searched the entire btree - delay awhile */ - - if (RB_EMPTY_ROOT(&buf->keys)) { - atomic_set(&dc->has_dirty, 0); - cached_dev_put(dc); - } - - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - } - - up_write(&dc->writeback_lock); - - ratelimit_reset(&dc->writeback_rate); - - /* Punt to workqueue only so we don't recurse and blow the stack */ - continue_at(cl, read_dirty, dirty_wq); -} - -void bch_writeback_queue(struct cached_dev *dc) -{ - if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - - continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); - } -} - -void bch_writeback_add(struct cached_dev *dc) -{ - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); - - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ - bch_write_bdev_super(dc, NULL); - } - - bch_writeback_queue(dc); - - if (dc->writeback_percent) - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -} - -void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, - uint64_t offset, int nr_sectors) -{ - struct bcache_device *d = c->devices[inode]; - unsigned stripe_size, stripe_offset; - uint64_t stripe; - - if (!d) - return; - - stripe_size = 1 << d->stripe_size_bits; - stripe = offset >> d->stripe_size_bits; - stripe_offset = offset & (stripe_size - 1); - - while (nr_sectors) { - int s = min_t(unsigned, abs(nr_sectors), - stripe_size - stripe_offset); - - if (nr_sectors < 0) - s = -s; - - atomic_add(s, d->stripe_sectors_dirty + stripe); - nr_sectors -= s; - stripe_offset = 0; - stripe++; - } -} - -/* Background writeback - IO loop */ - static void dirty_io_destructor(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); @@ -293,34 +137,31 @@ static void write_dirty_finish(struct closure *cl) /* This is kind of a dumb way of signalling errors. 
*/ if (KEY_DIRTY(&w->key)) { + int ret; unsigned i; - struct btree_op op; - bch_btree_op_init_stack(&op); + struct keylist keys; - op.type = BTREE_REPLACE; - bkey_copy(&op.replace, &w->key); + bch_keylist_init(&keys); - SET_KEY_DIRTY(&w->key, false); - bch_keylist_add(&op.keys, &w->key); + bkey_copy(keys.top, &w->key); + SET_KEY_DIRTY(keys.top, false); + bch_keylist_push(&keys); for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c); - closure_sync(&op.cl); + ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); - if (op.insert_collision) + if (ret) trace_bcache_writeback_collision(&w->key); - atomic_long_inc(op.insert_collision + atomic_long_inc(ret ? &dc->disk.c->writeback_keys_failed : &dc->disk.c->writeback_keys_done); } bch_keybuf_del(&dc->writeback_keys, w); - atomic_dec_bug(&dc->in_flight); - - closure_wake_up(&dc->writeback_wait); + up(&dc->in_flight); closure_return_with_destructor(cl, dirty_io_destructor); } @@ -343,13 +184,13 @@ static void write_dirty(struct closure *cl) dirty_init(w); io->bio.bi_rw = WRITE; - io->bio.bi_sector = KEY_START(&w->key); + io->bio.bi_iter.bi_sector = KEY_START(&w->key); io->bio.bi_bdev = io->dc->bdev; io->bio.bi_end_io = dirty_endio; closure_bio_submit(&io->bio, cl, &io->dc->disk); - continue_at(cl, write_dirty_finish, dirty_wq); + continue_at(cl, write_dirty_finish, system_wq); } static void read_dirty_endio(struct bio *bio, int error) @@ -369,37 +210,36 @@ static void read_dirty_submit(struct closure *cl) closure_bio_submit(&io->bio, cl, &io->dc->disk); - continue_at(cl, write_dirty, dirty_wq); + continue_at(cl, write_dirty, system_wq); } -static void read_dirty(struct closure *cl) +static void read_dirty(struct cached_dev *dc) { - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - unsigned delay = writeback_delay(dc, 0); + unsigned delay = 0; struct keybuf_key *w; struct dirty_io *io; + struct closure cl; + + closure_init_stack(&cl); /* * XXX: if we error, background writeback just spins. Should use some * mempools. 
*/ - while (1) { + while (!kthread_should_stop()) { + try_to_freeze(); + w = bch_keybuf_next(&dc->writeback_keys); if (!w) break; BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); - if (delay > 0 && - (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50)) { - w->private = NULL; - - closure_delay(&dc->writeback, delay); - continue_at(cl, read_dirty, dirty_wq); - } + if (KEY_START(&w->key) != dc->last_read || + jiffies_to_msecs(delay) > 50) + while (!kthread_should_stop() && delay) + delay = schedule_timeout_uninterruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -413,7 +253,7 @@ static void read_dirty(struct closure *cl) io->dc = dc; dirty_init(w); - io->bio.bi_sector = PTR_OFFSET(&w->key, 0); + io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); io->bio.bi_bdev = PTR_CACHE(dc->disk.c, &w->key, 0)->bdev; io->bio.bi_rw = READ; @@ -424,15 +264,10 @@ static void read_dirty(struct closure *cl) trace_bcache_writeback(&w->key); - closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); + down(&dc->in_flight); + closure_call(&io->cl, read_dirty_submit, NULL, &cl); delay = writeback_delay(dc, KEY_SIZE(&w->key)); - - atomic_inc(&dc->in_flight); - - if (!closure_wait_event(&dc->writeback_wait, cl, - atomic_read(&dc->in_flight) < 64)) - continue_at(cl, read_dirty, dirty_wq); } if (0) { @@ -442,51 +277,211 @@ err: bch_keybuf_del(&dc->writeback_keys, w); } - refill_dirty(cl); + /* + * Wait for outstanding writeback IOs to finish (and keybuf slots to be + * freed) before refilling again + */ + closure_sync(&cl); } -/* Init */ +/* Scan for dirty data */ + +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned stripe_offset, stripe, sectors_dirty; + + if (!d) + return; + + stripe = offset_to_stripe(d, offset); + stripe_offset = offset & (d->stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned, abs(nr_sectors), + d->stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + if (stripe >= d->nr_stripes) + return; + + sectors_dirty = atomic_add_return(s, + d->stripe_sectors_dirty + stripe); + if (sectors_dirty == d->stripe_size) + set_bit(stripe, d->full_dirty_stripes); + else + clear_bit(stripe, d->full_dirty_stripes); + + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} + +static bool dirty_pred(struct keybuf *buf, struct bkey *k) +{ + return KEY_DIRTY(k); +} + +static void refill_full_stripes(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + unsigned start_stripe, stripe, next_stripe; + bool wrapped = false; + + stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); + + if (stripe >= dc->disk.nr_stripes) + stripe = 0; + + start_stripe = stripe; + + while (1) { + stripe = find_next_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + if (stripe == dc->disk.nr_stripes) + goto next; + + next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + buf->last_scanned = KEY(dc->disk.id, + stripe * dc->disk.stripe_size, 0); + + bch_refill_keybuf(dc->disk.c, buf, + &KEY(dc->disk.id, + next_stripe * dc->disk.stripe_size, 0), + dirty_pred); + + if (array_freelist_empty(&buf->freelist)) + return; -static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, - struct cached_dev *dc) + stripe = next_stripe; +next: + if (wrapped && stripe > start_stripe) + return; + + if (stripe == dc->disk.nr_stripes) { + stripe = 0; + wrapped = true; + } + } +} + 
+static bool refill_dirty(struct cached_dev *dc) { - struct bkey *k; - struct btree_iter iter; - - bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) - if (!b->level) { - if (KEY_INODE(k) > dc->disk.id) - break; - - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, dc->disk.id, - KEY_START(k), - KEY_SIZE(k)); - } else { - btree(sectors_dirty_init, k, b, op, dc); - if (KEY_INODE(k) > dc->disk.id) - break; - - cond_resched(); + struct keybuf *buf = &dc->writeback_keys; + struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + bool searched_from_start = false; + + if (dc->partial_stripes_expensive) { + refill_full_stripes(dc); + if (array_freelist_empty(&buf->freelist)) + return false; + } + + if (bkey_cmp(&buf->last_scanned, &end) >= 0) { + buf->last_scanned = KEY(dc->disk.id, 0, 0); + searched_from_start = true; + } + + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + + return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; +} + +static int bch_writeback_thread(void *arg) +{ + struct cached_dev *dc = arg; + bool searched_full_index; + + while (!kthread_should_stop()) { + down_write(&dc->writeback_lock); + if (!atomic_read(&dc->has_dirty) || + (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && + !dc->writeback_running)) { + up_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + try_to_freeze(); + schedule(); + continue; + } + + searched_full_index = refill_dirty(dc); + + if (searched_full_index && + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { + atomic_set(&dc->has_dirty, 0); + cached_dev_put(dc); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); } + up_write(&dc->writeback_lock); + + bch_ratelimit_reset(&dc->writeback_rate); + read_dirty(dc); + + if (searched_full_index) { + unsigned delay = dc->writeback_delay * HZ; + + while (delay && + !kthread_should_stop() && + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + delay = schedule_timeout_uninterruptible(delay); + } + } + return 0; } +/* Init */ + +struct sectors_dirty_init { + struct btree_op op; + unsigned inode; +}; + +static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, + struct bkey *k) +{ + struct sectors_dirty_init *op = container_of(_op, + struct sectors_dirty_init, op); + if (KEY_INODE(k) > op->inode) + return MAP_DONE; + + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + + return MAP_CONTINUE; +} + void bch_sectors_dirty_init(struct cached_dev *dc) { - struct btree_op op; + struct sectors_dirty_init op; + + bch_btree_op_init(&op.op, -1); + op.inode = dc->disk.id; - bch_btree_op_init_stack(&op); - btree_root(sectors_dirty_init, dc->disk.c, &op, dc); + bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, 0); + + dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); } -void bch_cached_dev_writeback_init(struct cached_dev *dc) +int bch_cached_dev_writeback_init(struct cached_dev *dc) { - closure_init_unlocked(&dc->writeback); + sema_init(&dc->in_flight, 64); init_rwsem(&dc->writeback_lock); - bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; @@ -495,27 +490,18 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; - dc->writeback_rate_update_seconds = 30; - dc->writeback_rate_d_term = 16; - dc->writeback_rate_p_term_inverse = 64; - 
dc->writeback_rate_d_smooth = 8; + dc->writeback_rate_update_seconds = 5; + dc->writeback_rate_d_term = 30; + dc->writeback_rate_p_term_inverse = 6000; + + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, + "bcache_writeback"); + if (IS_ERR(dc->writeback_thread)) + return PTR_ERR(dc->writeback_thread); INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); -} - -void bch_writeback_exit(void) -{ - if (dirty_wq) - destroy_workqueue(dirty_wq); -} - -int __init bch_writeback_init(void) -{ - dirty_wq = create_singlethread_workqueue("bcache_writeback"); - if (!dirty_wq) - return -ENOMEM; return 0; } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index c91f61bb95b..e2f8598937a 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,20 +14,27 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, +static inline unsigned offset_to_stripe(struct bcache_device *d, + uint64_t offset) +{ + do_div(offset, d->stripe_size); + return offset; +} + +static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, unsigned nr_sectors) { - uint64_t stripe = offset >> d->stripe_size_bits; + unsigned stripe = offset_to_stripe(&dc->disk, offset); while (1) { - if (atomic_read(d->stripe_sectors_dirty + stripe)) + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) return true; - if (nr_sectors <= 1 << d->stripe_size_bits) + if (nr_sectors <= dc->disk.stripe_size) return false; - nr_sectors -= 1 << d->stripe_size_bits; + nr_sectors -= dc->disk.stripe_size; stripe++; } } @@ -38,12 +45,12 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned in_use = dc->disk.c->gc_stats.in_use; if (cache_mode != CACHE_MODE_WRITEBACK || - atomic_read(&dc->disk.detaching) || + test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || in_use > CUTOFF_WRITEBACK_SYNC) return false; if (dc->partial_stripes_expensive && - bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, + bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector, bio_sectors(bio))) return true; @@ -54,11 +61,30 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, in_use <= CUTOFF_WRITEBACK; } +static inline void bch_writeback_queue(struct cached_dev *dc) +{ + wake_up_process(dc->writeback_thread); +} + +static inline void bch_writeback_add(struct cached_dev *dc) +{ + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { + atomic_inc(&dc->count); + + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + bch_write_bdev_super(dc, NULL); + } + + bch_writeback_queue(dc); + } +} + void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_writeback_queue(struct cached_dev *); -void bch_writeback_add(struct cached_dev *); void bch_sectors_dirty_init(struct cached_dev *dc); -void bch_cached_dev_writeback_init(struct cached_dev *); +int bch_cached_dev_writeback_init(struct cached_dev *); #endif |
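Note on the bch_ratelimit rework in util.c above: the next-work timestamp is now kept in nanoseconds and clamped so it can drift at most 1s ahead of or 2s behind the current clock. A rough userspace sketch of that logic, with local_clock() replaced by an explicit "now" argument and the delay returned in nanoseconds rather than jiffies; the struct/function names and the example rate here are illustrative, not part of the patch:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct ratelimit {
	uint64_t next;   /* next time we want to do work, in ns */
	unsigned rate;   /* units of work per second */
};

/* Credit "done" units of work and return how long to sleep, in ns */
static uint64_t next_delay(struct ratelimit *d, uint64_t done, uint64_t now)
{
	d->next += done * NSEC_PER_SEC / d->rate;

	/* don't let the schedule drift more than 1s into the future ... */
	if (d->next > now + NSEC_PER_SEC)
		d->next = now + NSEC_PER_SEC;

	/* ... or more than 2s into the past, so idle time can't build up
	   unbounded credit */
	if (d->next + 2 * NSEC_PER_SEC < now)
		d->next = now - 2 * NSEC_PER_SEC;

	return d->next > now ? d->next - now : 0;
}

int main(void)
{
	struct ratelimit rl = { .next = 0, .rate = 1024 }; /* 1024 sectors/sec */

	/* writing 512 sectors at 1024 sectors/sec asks for ~0.5s of delay */
	printf("delay: %llu ns\n",
	       (unsigned long long)next_delay(&rl, 512, 0));
	return 0;
}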
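Note on __update_writeback_rate() in writeback.c above: the rate is now driven by an explicit proportional term (distance of the dirty-sector count from its target) plus a derivative term (rate of change of dirty data), both scaled to sectors per second. A minimal standalone sketch of that arithmetic using the new defaults from bch_cached_dev_writeback_init (update interval 5s, p_term_inverse 6000, d_term 30); the dirty/target numbers are made up, and the kernel code additionally smooths the derivative with an EWMA and clamps the final rate:

#include <stdint.h>
#include <stdio.h>

#define UPDATE_SECONDS   5      /* dc->writeback_rate_update_seconds */
#define P_TERM_INVERSE   6000   /* dc->writeback_rate_p_term_inverse */
#define D_TERM           30     /* dc->writeback_rate_d_term */

int main(void)
{
	int64_t rate       = 1024;      /* current writeback rate, sectors/sec */
	int64_t dirty_last = 900000;    /* dirty sectors at the previous update */
	int64_t dirty      = 1000000;   /* dirty sectors now */
	int64_t target     = 800000;    /* dirty sectors we want to converge to */

	/* proportional: how far above target we are, scaled to sectors/sec */
	int64_t proportional = (dirty - target) * UPDATE_SECONDS / P_TERM_INVERSE;

	/* derivative: how fast the dirty count is growing, in sectors/sec */
	int64_t derivative = (dirty - dirty_last) / UPDATE_SECONDS;
	derivative = derivative * D_TERM / P_TERM_INVERSE;

	int64_t change = proportional + derivative;

	rate += change;
	if (rate < 1)
		rate = 1;

	printf("proportional=%lld derivative=%lld new rate=%lld sectors/sec\n",
	       (long long)proportional, (long long)derivative, (long long)rate);
	return 0;
}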
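Note on bcache_dev_sectors_dirty_add() in writeback.c above: it now keeps a per-stripe dirty-sector count plus a full_dirty_stripes bitmap, so refill_full_stripes() can find stripes that are entirely dirty and write them back whole. A single-threaded sketch of that bookkeeping, covering only the positive (newly dirtied) direction; the kernel version also handles negative deltas, uses atomic_add_return() and a real bitmap, and the stripe geometry below is hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define STRIPE_SIZE  1024u   /* sectors per stripe (hypothetical) */
#define NR_STRIPES   16u

static unsigned stripe_sectors_dirty[NR_STRIPES];
static bool full_dirty_stripes[NR_STRIPES];

static void sectors_dirty_add(uint64_t offset, unsigned nr_sectors)
{
	unsigned stripe = offset / STRIPE_SIZE;
	unsigned stripe_offset = offset % STRIPE_SIZE;

	while (nr_sectors && stripe < NR_STRIPES) {
		unsigned s = STRIPE_SIZE - stripe_offset;
		if (s > nr_sectors)
			s = nr_sectors;

		stripe_sectors_dirty[stripe] += s;

		/* a stripe whose every sector is dirty can be written back whole */
		full_dirty_stripes[stripe] =
			(stripe_sectors_dirty[stripe] == STRIPE_SIZE);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

int main(void)
{
	/* dirty sectors 512..2047: the tail half of stripe 0, all of stripe 1 */
	sectors_dirty_add(512, 1536);

	printf("stripe 0: %u sectors dirty, full=%d\n",
	       stripe_sectors_dirty[0], (int)full_dirty_stripes[0]);
	printf("stripe 1: %u sectors dirty, full=%d\n",
	       stripe_sectors_dirty[1], (int)full_dirty_stripes[1]);
	return 0;
}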
