about | summary | refs | log | tree | commit | diff
path: root/drivers/md/bcache
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r-- drivers/md/bcache/Kconfig 8
-rw-r--r-- drivers/md/bcache/Makefile 5
-rw-r--r-- drivers/md/bcache/alloc.c 190
-rw-r--r-- drivers/md/bcache/bcache.h 115
-rw-r--r-- drivers/md/bcache/bset.c 919
-rw-r--r-- drivers/md/bcache/bset.h 446
-rw-r--r-- drivers/md/bcache/btree.c 1125
-rw-r--r-- drivers/md/bcache/btree.h 83
-rw-r--r-- drivers/md/bcache/closure.h 2
-rw-r--r-- drivers/md/bcache/debug.c 240
-rw-r--r-- drivers/md/bcache/debug.h 27
-rw-r--r-- drivers/md/bcache/extents.c 620
-rw-r--r-- drivers/md/bcache/extents.h 13
-rw-r--r-- drivers/md/bcache/journal.c 61
-rw-r--r-- drivers/md/bcache/journal.h 1
-rw-r--r-- drivers/md/bcache/movinggc.c 18
-rw-r--r-- drivers/md/bcache/request.c 230
-rw-r--r-- drivers/md/bcache/request.h 19
-rw-r--r-- drivers/md/bcache/stats.c 3
-rw-r--r-- drivers/md/bcache/super.c 83
-rw-r--r-- drivers/md/bcache/sysfs.c 193
-rw-r--r-- drivers/md/bcache/trace.c 2
-rw-r--r-- drivers/md/bcache/util.h 8
23 files changed, 2400 insertions, 2011 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 2638417b19a..4d200883c50 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG
Keeps all active closures in a linked list and provides a debugfs
interface to list them, which makes it possible to see asynchronous
operations that get stuck.
-
-# cgroup code needs to be updated:
-#
-#config CGROUP_BCACHE
-# bool "Cgroup controls for bcache"
-# depends on BCACHE && BLK_CGROUP
-# ---help---
-# TODO
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 0e9c82523be..c488b846f83 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -1,7 +1,8 @@
obj-$(CONFIG_BCACHE) += bcache.o
-bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\
- movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o
+bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
+ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+ util.o writeback.o
CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index bcfd96e2121..443d03fbac4 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
- if (CACHE_SYNC(&ca->set->sb)) {
- ca->need_save_prio = max(ca->need_save_prio,
- bucket_disk_gen(b));
- WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
- }
-
return ret;
}
@@ -120,56 +114,63 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
mutex_unlock(&c->bucket_lock);
}
-/* Allocation */
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
static inline bool can_inc_bucket_gen(struct bucket *b)
{
- return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
- bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
+ return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
}
-bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
- BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
-
- if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
- unsigned i;
-
- for (i = 0; i < RESERVE_NONE; i++)
- if (!fifo_full(&ca->free[i]))
- goto add;
-
- return false;
- }
-add:
- b->prio = 0;
-
- if (can_inc_bucket_gen(b) &&
- fifo_push(&ca->unused, b - ca->buckets)) {
- atomic_inc(&b->pin);
- return true;
- }
-
- return false;
-}
+ BUG_ON(!ca->set->gc_mark_valid);
-static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
-{
- return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
+ return (!GC_MARK(b) ||
+ GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
!atomic_read(&b->pin) &&
can_inc_bucket_gen(b);
}
-static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
+ lockdep_assert_held(&ca->set->bucket_lock);
+ BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
+
+ if (GC_SECTORS_USED(b))
+ trace_bcache_invalidate(ca, b - ca->buckets);
+
bch_inc_gen(ca, b);
b->prio = INITIAL_PRIO;
atomic_inc(&b->pin);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+ __bch_invalidate_one_bucket(ca, b);
+
fifo_push(&ca->free_inc, b - ca->buckets);
}
-#define bucket_prio(b) \
- (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))
+/*
+ * Determines what order we're going to reuse buckets, smallest bucket_prio()
+ * first: we also take into account the number of sectors of live data in that
+ * bucket, and for that multiply to make sense we have to scale bucket prios.
+ *
+ * Thus, we scale the bucket priorities so that the bucket with the smallest
+ * prio is worth 1/8th of what INITIAL_PRIO is worth.
+ */
+
+#define bucket_prio(b) \
+({ \
+ unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
+ \
+ (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
+})
#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
@@ -182,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca)
ca->heap.used = 0;
for_each_bucket(b, ca) {
- /*
- * If we fill up the unused list, if we then return before
- * adding anything to the free_inc list we'll skip writing
- * prios/gens and just go back to allocating from the unused
- * list:
- */
- if (fifo_full(&ca->unused))
- return;
-
- if (!can_invalidate_bucket(ca, b))
- continue;
-
- if (!GC_SECTORS_USED(b) &&
- bch_bucket_add_unused(ca, b))
+ if (!bch_can_invalidate_bucket(ca, b))
continue;
if (!heap_full(&ca->heap))
@@ -220,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca)
return;
}
- invalidate_one_bucket(ca, b);
+ bch_invalidate_one_bucket(ca, b);
}
}
@@ -236,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca)
b = ca->buckets + ca->fifo_last_bucket++;
- if (can_invalidate_bucket(ca, b))
- invalidate_one_bucket(ca, b);
+ if (bch_can_invalidate_bucket(ca, b))
+ bch_invalidate_one_bucket(ca, b);
if (++checked >= ca->sb.nbuckets) {
ca->invalidate_needs_gc = 1;
@@ -261,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca)
b = ca->buckets + n;
- if (can_invalidate_bucket(ca, b))
- invalidate_one_bucket(ca, b);
+ if (bch_can_invalidate_bucket(ca, b))
+ bch_invalidate_one_bucket(ca, b);
if (++checked >= ca->sb.nbuckets / 2) {
ca->invalidate_needs_gc = 1;
@@ -274,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca)
static void invalidate_buckets(struct cache *ca)
{
- if (ca->invalidate_needs_gc)
- return;
+ BUG_ON(ca->invalidate_needs_gc);
switch (CACHE_REPLACEMENT(&ca->sb)) {
case CACHE_REPLACEMENT_LRU:
@@ -288,8 +275,6 @@ static void invalidate_buckets(struct cache *ca)
invalidate_buckets_random(ca);
break;
}
-
- trace_bcache_alloc_invalidate(ca);
}
#define allocator_wait(ca, cond) \
@@ -337,17 +322,10 @@ static int bch_allocator_thread(void *arg)
* possibly issue discards to them, then we add the bucket to
* the free list:
*/
- while (1) {
+ while (!fifo_empty(&ca->free_inc)) {
long bucket;
- if ((!atomic_read(&ca->set->prio_blocked) ||
- !CACHE_SYNC(&ca->set->sb)) &&
- !fifo_empty(&ca->unused))
- fifo_pop(&ca->unused, bucket);
- else if (!fifo_empty(&ca->free_inc))
- fifo_pop(&ca->free_inc, bucket);
- else
- break;
+ fifo_pop(&ca->free_inc, bucket);
if (ca->discard) {
mutex_unlock(&ca->set->bucket_lock);
@@ -358,6 +336,7 @@ static int bch_allocator_thread(void *arg)
}
allocator_wait(ca, bch_allocator_push(ca, bucket));
+ wake_up(&ca->set->btree_cache_wait);
wake_up(&ca->set->bucket_wait);
}
@@ -367,9 +346,9 @@ static int bch_allocator_thread(void *arg)
* them to the free_inc list:
*/
+retry_invalidate:
allocator_wait(ca, ca->set->gc_mark_valid &&
- (ca->need_save_prio > 64 ||
- !ca->invalidate_needs_gc));
+ !ca->invalidate_needs_gc);
invalidate_buckets(ca);
/*
@@ -377,13 +356,28 @@ static int bch_allocator_thread(void *arg)
* new stuff to them:
*/
allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
- if (CACHE_SYNC(&ca->set->sb) &&
- (!fifo_empty(&ca->free_inc) ||
- ca->need_save_prio > 64))
+ if (CACHE_SYNC(&ca->set->sb)) {
+ /*
+ * This could deadlock if an allocation with a btree
+ * node locked ever blocked - having the btree node
+ * locked would block garbage collection, but here we're
+ * waiting on garbage collection before we invalidate
+ * and free anything.
+ *
+ * But this should be safe since the btree code always
+ * uses btree_check_reserve() before allocating now, and
+ * if it fails it blocks without btree nodes locked.
+ */
+ if (!fifo_full(&ca->free_inc))
+ goto retry_invalidate;
+
bch_prio_write(ca);
+ }
}
}
+/* Allocation */
+
long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
{
DEFINE_WAIT(w);
@@ -395,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
fifo_pop(&ca->free[reserve], r))
goto out;
- if (!wait)
+ if (!wait) {
+ trace_bcache_alloc_fail(ca, reserve);
return -1;
+ }
do {
prepare_to_wait(&ca->set->bucket_wait, &w,
@@ -412,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
out:
wake_up_process(ca->alloc_thread);
+ trace_bcache_alloc(ca, reserve);
+
if (expensive_debug_checks(ca->set)) {
size_t iter;
long i;
@@ -425,8 +423,6 @@ out:
BUG_ON(i == r);
fifo_for_each(i, &ca->free_inc, iter)
BUG_ON(i == r);
- fifo_for_each(i, &ca->unused, iter)
- BUG_ON(i == r);
}
b = ca->buckets + r;
@@ -448,17 +444,19 @@ out:
return r;
}
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+ SET_GC_MARK(b, 0);
+ SET_GC_SECTORS_USED(b, 0);
+}
+
void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
unsigned i;
- for (i = 0; i < KEY_PTRS(k); i++) {
- struct bucket *b = PTR_BUCKET(c, k, i);
-
- SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
- SET_GC_SECTORS_USED(b, 0);
- bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
- }
+ for (i = 0; i < KEY_PTRS(k); i++)
+ __bch_bucket_free(PTR_CACHE(c, k, i),
+ PTR_BUCKET(c, k, i));
}
int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
@@ -696,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca)
ca->alloc_thread = k;
return 0;
}
-
-int bch_cache_allocator_init(struct cache *ca)
-{
- /*
- * Reserve:
- * Prio/gen writes first
- * Then 8 for btree allocations
- * Then half for the moving garbage collector
- */
-#if 0
- ca->watermark[WATERMARK_PRIO] = 0;
-
- ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
-
- ca->watermark[WATERMARK_MOVINGGC] = 8 +
- ca->watermark[WATERMARK_METADATA];
-
- ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
- ca->watermark[WATERMARK_MOVINGGC];
-#endif
- return 0;
-}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d955a493461..d2ebcf32309 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -187,6 +187,7 @@
#include <linux/types.h>
#include <linux/workqueue.h>
+#include "bset.h"
#include "util.h"
#include "closure.h"
@@ -194,9 +195,7 @@ struct bucket {
atomic_t pin;
uint16_t prio;
uint8_t gen;
- uint8_t disk_gen;
uint8_t last_gc; /* Most out of date gen in the btree */
- uint8_t gc_gen;
uint16_t gc_mark; /* Bitfield used by GC. See below for field */
};
@@ -206,10 +205,12 @@ struct bucket {
*/
BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
-#define GC_MARK_RECLAIMABLE 0
-#define GC_MARK_DIRTY 1
-#define GC_MARK_METADATA 2
-BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13);
+#define GC_MARK_RECLAIMABLE 1
+#define GC_MARK_DIRTY 2
+#define GC_MARK_METADATA 3
+#define GC_SECTORS_USED_SIZE 13
+#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))
+BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
#include "journal.h"
@@ -423,14 +424,9 @@ struct cache {
* their new gen to disk. After prio_write() finishes writing the new
* gens/prios, they'll be moved to the free list (and possibly discarded
* in the process)
- *
- * unused: GC found nothing pointing into these buckets (possibly
- * because all the data they contained was overwritten), so we only
- * need to discard them before they can be moved to the free list.
*/
DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
- DECLARE_FIFO(long, unused);
size_t fifo_last_bucket;
@@ -440,12 +436,6 @@ struct cache {
DECLARE_HEAP(struct bucket *, heap);
/*
- * max(gen - disk_gen) for all buckets. When it gets too big we have to
- * call prio_write() to keep gens from wrapping.
- */
- uint8_t need_save_prio;
-
- /*
* If nonzero, we know we aren't going to find any buckets to invalidate
* until a gc finishes - otherwise we could pointlessly burn a ton of
* cpu
@@ -559,19 +549,16 @@ struct cache_set {
struct list_head btree_cache_freed;
/* Number of elements in btree_cache + btree_cache_freeable lists */
- unsigned bucket_cache_used;
+ unsigned btree_cache_used;
/*
* If we need to allocate memory for a new btree node and that
* allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation. However, only one thread can be doing this
- * at a time, for obvious reasons - try_harder and try_wait are
- * basically a lock for this that we can wait on asynchronously. The
- * btree_root() macro releases the lock when it returns.
+ * to satisfy the allocation - lock to guarantee only one thread does
+ * this at a time:
*/
- struct task_struct *try_harder;
- wait_queue_head_t try_wait;
- uint64_t try_harder_start;
+ wait_queue_head_t btree_cache_wait;
+ struct task_struct *btree_cache_alloc_lock;
/*
* When we free a btree node, we increment the gen of the bucket the
@@ -600,7 +587,7 @@ struct cache_set {
uint16_t min_prio;
/*
- * max(gen - gc_gen) for all buckets. When it gets too big we have to gc
+ * max(gen - last_gc) for all buckets. When it gets too big we have to gc
* to keep gens from wrapping around.
*/
uint8_t need_gc;
@@ -625,10 +612,13 @@ struct cache_set {
/* Number of moving GC bios in flight */
struct semaphore moving_in_flight;
+ struct workqueue_struct *moving_gc_wq;
+
struct btree *root;
#ifdef CONFIG_BCACHE_DEBUG
struct btree *verify_data;
+ struct bset *verify_ondisk;
struct mutex verify_lock;
#endif
@@ -644,13 +634,7 @@ struct cache_set {
*/
mempool_t *fill_iter;
- /*
- * btree_sort() is a merge sort and requires temporary space - single
- * element mempool
- */
- struct mutex sort_lock;
- struct bset *sort;
- unsigned sort_crit_factor;
+ struct bset_sort_state sort;
/* List of buckets we're currently writing data to */
struct list_head data_buckets;
@@ -666,11 +650,9 @@ struct cache_set {
unsigned congested_read_threshold_us;
unsigned congested_write_threshold_us;
- struct time_stats sort_time;
struct time_stats btree_gc_time;
struct time_stats btree_split_time;
struct time_stats btree_read_time;
- struct time_stats try_harder_time;
atomic_long_t cache_read_races;
atomic_long_t writeback_keys_done;
@@ -684,9 +666,9 @@ struct cache_set {
unsigned error_decay;
unsigned short journal_delay_ms;
+ bool expensive_debug_checks;
unsigned verify:1;
unsigned key_merging_disabled:1;
- unsigned expensive_debug_checks:1;
unsigned gc_always_rewrite:1;
unsigned shrinker_disabled:1;
unsigned copy_gc_enabled:1;
@@ -708,13 +690,8 @@ struct bbio {
struct bio bio;
};
-static inline unsigned local_clock_us(void)
-{
- return local_clock() >> 10;
-}
-
#define BTREE_PRIO USHRT_MAX
-#define INITIAL_PRIO 32768
+#define INITIAL_PRIO 32768U
#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
#define btree_blocks(b) \
@@ -727,17 +704,6 @@ static inline unsigned local_clock_us(void)
#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
#define block_bytes(c) ((c)->sb.block_size << 9)
-#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
-#define set_bytes(i) __set_bytes(i, i->keys)
-
-#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
-#define set_blocks(i, c) __set_blocks(i, (i)->keys, c)
-
-#define node(i, j) ((struct bkey *) ((i)->d + (j)))
-#define end(i) node(i, (i)->keys)
-
-#define btree_data_space(b) (PAGE_SIZE << (b)->page_order)
-
#define prios_per_bucket(c) \
((bucket_bytes(c) - sizeof(struct prio_set)) / \
sizeof(struct bucket_disk))
@@ -780,20 +746,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
}
-/* Btree key macros */
+static inline uint8_t gen_after(uint8_t a, uint8_t b)
+{
+ uint8_t r = a - b;
+ return r > 128U ? 0 : r;
+}
-static inline void bkey_init(struct bkey *k)
+static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
+ unsigned i)
{
- *k = ZERO_KEY;
+ return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
}
+static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
+ unsigned i)
+{
+ return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+}
+
+/* Btree key macros */
+
/*
* This is used for various on disk data structures - cache_sb, prio_set, bset,
* jset: The checksum is _always_ the first 8 bytes of these structs
*/
#define csum_set(i) \
bch_crc64(((void *) (i)) + sizeof(uint64_t), \
- ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t)))
+ ((void *) bset_bkey_last(i)) - \
+ (((void *) (i)) + sizeof(uint64_t)))
/* Error handling macros */
@@ -848,16 +828,13 @@ static inline bool cached_dev_get(struct cached_dev *dc)
return false;
/* Paired with the mb in cached_dev_attach */
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
return true;
}
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree (last_gc).
- *
- * bucket_disk_gen() returns the difference between the current gen and the gen
- * on disk; they're both used to make sure gens don't wrap around.
*/
static inline uint8_t bucket_gc_gen(struct bucket *b)
@@ -865,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
return b->gen - b->last_gc;
}
-static inline uint8_t bucket_disk_gen(struct bucket *b)
-{
- return b->gen - b->disk_gen;
-}
-
#define BUCKET_GC_GEN_MAX 96U
-#define BUCKET_DISK_GEN_MAX 64U
#define kobj_attribute_write(n, fn) \
static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
@@ -904,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
uint8_t bch_inc_gen(struct cache *, struct bucket *);
void bch_rescale_priorities(struct cache_set *, int);
-bool bch_bucket_add_unused(struct cache *, struct bucket *);
-long bch_bucket_alloc(struct cache *, unsigned, bool);
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
void bch_bucket_free(struct cache_set *, struct bkey *);
+long bch_bucket_alloc(struct cache *, unsigned, bool);
int __bch_bucket_alloc_set(struct cache_set *, unsigned,
struct bkey *, int, bool);
int bch_bucket_alloc_set(struct cache_set *, unsigned,
@@ -959,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *);
void bch_open_buckets_free(struct cache_set *);
int bch_cache_allocator_start(struct cache *ca);
-int bch_cache_allocator_init(struct cache *ca);
void bch_debug_exit(void);
int bch_debug_init(struct kobject *);
void bch_request_exit(void);
int bch_request_init(void);
-void bch_btree_exit(void);
-int bch_btree_init(void);
#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index e51a739f751..54541641530 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -5,30 +5,134 @@
* Copyright 2012 Google, Inc.
*/
-#include "bcache.h"
-#include "btree.h"
-#include "debug.h"
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+#include "util.h"
+#include "bset.h"
+
+#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>
+#ifdef CONFIG_BCACHE_DEBUG
+
+void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
+{
+ struct bkey *k, *next;
+
+ for (k = i->start; k < bset_bkey_last(i); k = next) {
+ next = bkey_next(k);
+
+ printk(KERN_ERR "block %u key %u/%u: ", set,
+ (unsigned) ((u64 *) k - i->d), i->keys);
+
+ if (b->ops->key_dump)
+ b->ops->key_dump(b, k);
+ else
+ printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
+
+ if (next < bset_bkey_last(i) &&
+ bkey_cmp(k, b->ops->is_extents ?
+ &START_KEY(next) : next) > 0)
+ printk(KERN_ERR "Key skipped backwards\n");
+ }
+}
+
+void bch_dump_bucket(struct btree_keys *b)
+{
+ unsigned i;
+
+ console_lock();
+ for (i = 0; i <= b->nsets; i++)
+ bch_dump_bset(b, b->set[i].data,
+ bset_sector_offset(b, b->set[i].data));
+ console_unlock();
+}
+
+int __bch_count_data(struct btree_keys *b)
+{
+ unsigned ret = 0;
+ struct btree_iter iter;
+ struct bkey *k;
+
+ if (b->ops->is_extents)
+ for_each_key(b, k, &iter)
+ ret += KEY_SIZE(k);
+ return ret;
+}
+
+void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
+{
+ va_list args;
+ struct bkey *k, *p = NULL;
+ struct btree_iter iter;
+ const char *err;
+
+ for_each_key(b, k, &iter) {
+ if (b->ops->is_extents) {
+ err = "Keys out of order";
+ if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
+ goto bug;
+
+ if (bch_ptr_invalid(b, k))
+ continue;
+
+ err = "Overlapping keys";
+ if (p && bkey_cmp(p, &START_KEY(k)) > 0)
+ goto bug;
+ } else {
+ if (bch_ptr_bad(b, k))
+ continue;
+
+ err = "Duplicate keys";
+ if (p && !bkey_cmp(p, k))
+ goto bug;
+ }
+ p = k;
+ }
+#if 0
+ err = "Key larger than btree node key";
+ if (p && bkey_cmp(p, &b->key) > 0)
+ goto bug;
+#endif
+ return;
+bug:
+ bch_dump_bucket(b);
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+
+ panic("bch_check_keys error: %s:\n", err);
+}
+
+static void bch_btree_iter_next_check(struct btree_iter *iter)
+{
+ struct bkey *k = iter->data->k, *next = bkey_next(k);
+
+ if (next < iter->data->end &&
+ bkey_cmp(k, iter->b->ops->is_extents ?
+ &START_KEY(next) : next) > 0) {
+ bch_dump_bucket(iter->b);
+ panic("Key skipped backwards\n");
+ }
+}
+
+#else
+
+static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
+
+#endif
+
/* Keylists */
-int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
+int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
{
size_t oldsize = bch_keylist_nkeys(l);
- size_t newsize = oldsize + 2 + nptrs;
+ size_t newsize = oldsize + u64s;
uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
uint64_t *new_keys;
- /* The journalling code doesn't handle the case where the keys to insert
- * is bigger than an empty write: If we just return -ENOMEM here,
- * bio_insert() and bio_invalidate() will insert the keys created so far
- * and finish the rest when the keylist is empty.
- */
- if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
- return -ENOMEM;
-
newsize = roundup_pow_of_two(newsize);
if (newsize <= KEYLIST_INLINE ||
@@ -71,140 +175,6 @@ void bch_keylist_pop_front(struct keylist *l)
bch_keylist_bytes(l));
}
-/* Pointer validation */
-
-static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
-{
- unsigned i;
-
- for (i = 0; i < KEY_PTRS(k); i++)
- if (ptr_available(c, k, i)) {
- struct cache *ca = PTR_CACHE(c, k, i);
- size_t bucket = PTR_BUCKET_NR(c, k, i);
- size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
-
- if (KEY_SIZE(k) + r > c->sb.bucket_size ||
- bucket < ca->sb.first_bucket ||
- bucket >= ca->sb.nbuckets)
- return true;
- }
-
- return false;
-}
-
-bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
-{
- char buf[80];
-
- if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
- goto bad;
-
- if (__ptr_invalid(c, k))
- goto bad;
-
- return false;
-bad:
- bch_bkey_to_text(buf, sizeof(buf), k);
- cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
- return true;
-}
-
-bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
-{
- char buf[80];
-
- if (!KEY_SIZE(k))
- return true;
-
- if (KEY_SIZE(k) > KEY_OFFSET(k))
- goto bad;
-
- if (__ptr_invalid(c, k))
- goto bad;
-
- return false;
-bad:
- bch_bkey_to_text(buf, sizeof(buf), k);
- cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
- return true;
-}
-
-static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
- unsigned ptr)
-{
- struct bucket *g = PTR_BUCKET(b->c, k, ptr);
- char buf[80];
-
- if (mutex_trylock(&b->c->bucket_lock)) {
- if (b->level) {
- if (KEY_DIRTY(k) ||
- g->prio != BTREE_PRIO ||
- (b->c->gc_mark_valid &&
- GC_MARK(g) != GC_MARK_METADATA))
- goto err;
-
- } else {
- if (g->prio == BTREE_PRIO)
- goto err;
-
- if (KEY_DIRTY(k) &&
- b->c->gc_mark_valid &&
- GC_MARK(g) != GC_MARK_DIRTY)
- goto err;
- }
- mutex_unlock(&b->c->bucket_lock);
- }
-
- return false;
-err:
- mutex_unlock(&b->c->bucket_lock);
- bch_bkey_to_text(buf, sizeof(buf), k);
- btree_bug(b,
-"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
- buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
- g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
- return true;
-}
-
-bool bch_ptr_bad(struct btree *b, const struct bkey *k)
-{
- struct bucket *g;
- unsigned i, stale;
-
- if (!bkey_cmp(k, &ZERO_KEY) ||
- !KEY_PTRS(k) ||
- bch_ptr_invalid(b, k))
- return true;
-
- for (i = 0; i < KEY_PTRS(k); i++)
- if (!ptr_available(b->c, k, i))
- return true;
-
- if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
- return false;
-
- for (i = 0; i < KEY_PTRS(k); i++) {
- g = PTR_BUCKET(b->c, k, i);
- stale = ptr_stale(b->c, k, i);
-
- btree_bug_on(stale > 96, b,
- "key too stale: %i, need_gc %u",
- stale, b->c->need_gc);
-
- btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
- b, "stale dirty pointer");
-
- if (stale)
- return true;
-
- if (expensive_debug_checks(b->c) &&
- ptr_bad_expensive_checks(b, k, i))
- return true;
- }
-
- return false;
-}
-
/* Key/pointer manipulation */
void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
@@ -259,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k)
return true;
}
-static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
+/* Auxiliary search trees */
+
+/* 32 bits total: */
+#define BKEY_MID_BITS 3
+#define BKEY_EXPONENT_BITS 7
+#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
+#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
+
+struct bkey_float {
+ unsigned exponent:BKEY_EXPONENT_BITS;
+ unsigned m:BKEY_MID_BITS;
+ unsigned mantissa:BKEY_MANTISSA_BITS;
+} __packed;
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE 128
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree_keys *b)
{
- return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
- ~((uint64_t)1 << 63);
+ return PAGE_SIZE << b->page_order;
}
-/* Tries to merge l and r: l should be lower than r
- * Returns true if we were able to merge. If we did merge, l will be the merged
- * key, r will be untouched.
- */
-bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
+static inline size_t btree_keys_cachelines(struct btree_keys *b)
{
- unsigned i;
+ return btree_keys_bytes(b) / BSET_CACHELINE;
+}
- if (key_merging_disabled(b->c))
- return false;
+/* Space required for the auxiliary search trees */
+static inline size_t bset_tree_bytes(struct btree_keys *b)
+{
+ return btree_keys_cachelines(b) * sizeof(struct bkey_float);
+}
- if (KEY_PTRS(l) != KEY_PTRS(r) ||
- KEY_DIRTY(l) != KEY_DIRTY(r) ||
- bkey_cmp(l, &START_KEY(r)))
- return false;
+/* Space required for the prev pointers */
+static inline size_t bset_prev_bytes(struct btree_keys *b)
+{
+ return btree_keys_cachelines(b) * sizeof(uint8_t);
+}
- for (i = 0; i < KEY_PTRS(l); i++)
- if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
- PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
- return false;
+/* Memory allocation */
- /* Keys with no pointers aren't restricted to one bucket and could
- * overflow KEY_SIZE
- */
- if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
- SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
- SET_KEY_SIZE(l, USHRT_MAX);
+void bch_btree_keys_free(struct btree_keys *b)
+{
+ struct bset_tree *t = b->set;
- bch_cut_front(l, r);
- return false;
- }
+ if (bset_prev_bytes(b) < PAGE_SIZE)
+ kfree(t->prev);
+ else
+ free_pages((unsigned long) t->prev,
+ get_order(bset_prev_bytes(b)));
- if (KEY_CSUM(l)) {
- if (KEY_CSUM(r))
- l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
- else
- SET_KEY_CSUM(l, 0);
- }
+ if (bset_tree_bytes(b) < PAGE_SIZE)
+ kfree(t->tree);
+ else
+ free_pages((unsigned long) t->tree,
+ get_order(bset_tree_bytes(b)));
- SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
- SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
+ free_pages((unsigned long) t->data, b->page_order);
- return true;
+ t->prev = NULL;
+ t->tree = NULL;
+ t->data = NULL;
}
+EXPORT_SYMBOL(bch_btree_keys_free);
+
+int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
+{
+ struct bset_tree *t = b->set;
+
+ BUG_ON(t->data);
+
+ b->page_order = page_order;
+
+ t->data = (void *) __get_free_pages(gfp, b->page_order);
+ if (!t->data)
+ goto err;
+
+ t->tree = bset_tree_bytes(b) < PAGE_SIZE
+ ? kmalloc(bset_tree_bytes(b), gfp)
+ : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
+ if (!t->tree)
+ goto err;
+
+ t->prev = bset_prev_bytes(b) < PAGE_SIZE
+ ? kmalloc(bset_prev_bytes(b), gfp)
+ : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
+ if (!t->prev)
+ goto err;
+
+ return 0;
+err:
+ bch_btree_keys_free(b);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(bch_btree_keys_alloc);
+
+void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
+ bool *expensive_debug_checks)
+{
+ unsigned i;
+
+ b->ops = ops;
+ b->expensive_debug_checks = expensive_debug_checks;
+ b->nsets = 0;
+ b->last_set_unwritten = 0;
+
+ /* XXX: shouldn't be needed */
+ for (i = 0; i < MAX_BSETS; i++)
+ b->set[i].size = 0;
+ /*
+ * Second loop starts at 1 because b->set[0].data is the memory we
+ * allocated
+ */
+ for (i = 1; i < MAX_BSETS; i++)
+ b->set[i].data = NULL;
+}
+EXPORT_SYMBOL(bch_btree_keys_init);
/* Binary tree stuff for auxiliary search trees */
@@ -459,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
return ((void *) k - (void *) t->data) / BSET_CACHELINE;
}
-static unsigned bkey_to_cacheline_offset(struct bkey *k)
+static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
+ unsigned cacheline,
+ struct bkey *k)
{
- return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
+ return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
}
static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
@@ -508,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
: tree_to_prev_bkey(t, j >> ffs(j));
struct bkey *r = is_power_of_2(j + 1)
- ? node(t->data, t->data->keys - bkey_u64s(&t->end))
+ ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
: tree_to_bkey(t, j >> (ffz(j) + 1));
BUG_ON(m < l || m > r);
@@ -532,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
f->exponent = 127;
}
-static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
{
- if (t != b->sets) {
+ if (t != b->set) {
unsigned j = roundup(t[-1].size,
64 / sizeof(struct bkey_float));
@@ -542,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
t->prev = t[-1].prev + j;
}
- while (t < b->sets + MAX_BSETS)
+ while (t < b->set + MAX_BSETS)
t++->size = 0;
}
-static void bset_build_unwritten_tree(struct btree *b)
+static void bch_bset_build_unwritten_tree(struct btree_keys *b)
{
- struct bset_tree *t = b->sets + b->nsets;
+ struct bset_tree *t = bset_tree_last(b);
+
+ BUG_ON(b->last_set_unwritten);
+ b->last_set_unwritten = 1;
bset_alloc_tree(b, t);
- if (t->tree != b->sets->tree + bset_tree_space(b)) {
- t->prev[0] = bkey_to_cacheline_offset(t->data->start);
+ if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
+ t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
t->size = 1;
}
}
-static void bset_build_written_tree(struct btree *b)
+void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
{
- struct bset_tree *t = b->sets + b->nsets;
- struct bkey *k = t->data->start;
+ if (i != b->set->data) {
+ b->set[++b->nsets].data = i;
+ i->seq = b->set->data->seq;
+ } else
+ get_random_bytes(&i->seq, sizeof(uint64_t));
+
+ i->magic = magic;
+ i->version = 0;
+ i->keys = 0;
+
+ bch_bset_build_unwritten_tree(b);
+}
+EXPORT_SYMBOL(bch_bset_init_next);
+
+void bch_bset_build_written_tree(struct btree_keys *b)
+{
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey *prev = NULL, *k = t->data->start;
unsigned j, cacheline = 1;
+ b->last_set_unwritten = 0;
+
bset_alloc_tree(b, t);
t->size = min_t(unsigned,
- bkey_to_cacheline(t, end(t->data)),
- b->sets->tree + bset_tree_space(b) - t->tree);
+ bkey_to_cacheline(t, bset_bkey_last(t->data)),
+ b->set->tree + btree_keys_cachelines(b) - t->tree);
if (t->size < 2) {
t->size = 0;
@@ -581,16 +656,14 @@ static void bset_build_written_tree(struct btree *b)
for (j = inorder_next(0, t->size);
j;
j = inorder_next(j, t->size)) {
- while (bkey_to_cacheline(t, k) != cacheline)
- k = bkey_next(k);
+ while (bkey_to_cacheline(t, k) < cacheline)
+ prev = k, k = bkey_next(k);
- t->prev[j] = bkey_u64s(k);
- k = bkey_next(k);
- cacheline++;
- t->tree[j].m = bkey_to_cacheline_offset(k);
+ t->prev[j] = bkey_u64s(prev);
+ t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
}
- while (bkey_next(k) != end(t->data))
+ while (bkey_next(k) != bset_bkey_last(t->data))
k = bkey_next(k);
t->end = *k;
@@ -601,14 +674,17 @@ static void bset_build_written_tree(struct btree *b)
j = inorder_next(j, t->size))
make_bfloat(t, j);
}
+EXPORT_SYMBOL(bch_bset_build_written_tree);
+
+/* Insert */
-void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
+void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
{
struct bset_tree *t;
unsigned inorder, j = 1;
- for (t = b->sets; t <= &b->sets[b->nsets]; t++)
- if (k < end(t->data))
+ for (t = b->set; t <= bset_tree_last(b); t++)
+ if (k < bset_bkey_last(t->data))
goto found_set;
BUG();
@@ -621,7 +697,7 @@ found_set:
if (k == t->data->start)
goto fix_left;
- if (bkey_next(k) == end(t->data)) {
+ if (bkey_next(k) == bset_bkey_last(t->data)) {
t->end = *k;
goto fix_right;
}
@@ -646,10 +722,12 @@ fix_right: do {
j = j * 2 + 1;
} while (j < t->size);
}
+EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
-void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
+static void bch_bset_fix_lookup_table(struct btree_keys *b,
+ struct bset_tree *t,
+ struct bkey *k)
{
- struct bset_tree *t = &b->sets[b->nsets];
unsigned shift = bkey_u64s(k);
unsigned j = bkey_to_cacheline(t, k);
@@ -661,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
* lookup table for the first key that is strictly greater than k:
* it's either k's cacheline or the next one
*/
- if (j < t->size &&
- table_to_bkey(t, j) <= k)
+ while (j < t->size &&
+ table_to_bkey(t, j) <= k)
j++;
/* Adjust all the lookup table entries, and find a new key for any that
@@ -677,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
while (k < cacheline_to_bkey(t, j, 0))
k = bkey_next(k);
- t->prev[j] = bkey_to_cacheline_offset(k);
+ t->prev[j] = bkey_to_cacheline_offset(t, j, k);
}
}
- if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
+ if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
return;
/* Possibly add a new entry to the end of the lookup table */
for (k = table_to_bkey(t, t->size - 1);
- k != end(t->data);
+ k != bset_bkey_last(t->data);
k = bkey_next(k))
if (t->size == bkey_to_cacheline(t, k)) {
- t->prev[t->size] = bkey_to_cacheline_offset(k);
+ t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
t->size++;
}
}
-void bch_bset_init_next(struct btree *b)
+/*
+ * Tries to merge l and r: l should be lower than r
+ * Returns true if we were able to merge. If we did merge, l will be the merged
+ * key, r will be untouched.
+ */
+bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
{
- struct bset *i = write_block(b);
+ if (!b->ops->key_merge)
+ return false;
- if (i != b->sets[0].data) {
- b->sets[++b->nsets].data = i;
- i->seq = b->sets[0].data->seq;
- } else
- get_random_bytes(&i->seq, sizeof(uint64_t));
+ /*
+ * Generic header checks
+ * Assumes left and right are in order
+ * Left and right must be exactly aligned
+ */
+ if (!bch_bkey_equal_header(l, r) ||
+ bkey_cmp(l, &START_KEY(r)))
+ return false;
- i->magic = bset_magic(&b->c->sb);
- i->version = 0;
- i->keys = 0;
+ return b->ops->key_merge(b, l, r);
+}
+EXPORT_SYMBOL(bch_bkey_try_merge);
+
+/*
+ * Insert the key @insert at position @where in the last set of @b.
+ *
+ * The last set must be unwritten, and the set grown by bkey_u64s(insert)
+ * must still fit inside the PAGE_SIZE << page_order buffer; both conditions
+ * are enforced with BUG_ON().
+ */
+void bch_bset_insert(struct btree_keys *b, struct bkey *where,
+ struct bkey *insert)
+{
+ struct bset_tree *t = bset_tree_last(b);
+
+ BUG_ON(!b->last_set_unwritten);
+ BUG_ON(bset_byte_offset(b, t->data) +
+ __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
+ PAGE_SIZE << b->page_order);
+
+ /* Open a gap of bkey_u64s(insert) u64s by shifting [where, end) up */
+ memmove((uint64_t *) where + bkey_u64s(insert),
+ where,
+ (void *) bset_bkey_last(t->data) - (void *) where);
+
+ t->data->keys += bkey_u64s(insert);
+ bkey_copy(where, insert);
+ /* Keys after @where moved, so the lookup table has to be adjusted */
+ bch_bset_fix_lookup_table(b, t, where);
+}
+EXPORT_SYMBOL(bch_bset_insert);
+
+/*
+ * Insert @k into the last set of @b, merging with a neighbour when possible.
+ *
+ * Returns one of the BTREE_INSERT_STATUS_* values:
+ *   NO_INSERT   - the insert_fixup op asked for the insert to be skipped
+ *   BACK_MERGE  - @k was merged into the key preceding the insert position
+ *   FRONT_MERGE - the key at the insert position was merged into @k, and the
+ *                 result overwrote that key
+ *   INSERT      - no merge was possible, @k was inserted as a new key
+ */
+unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
+			      struct bkey *replace_key)
+{
+	unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
+	struct bset *i = bset_tree_last(b)->data;
+	struct bkey *m, *prev = NULL;
+	struct btree_iter iter;
- bset_build_unwritten_tree(b);
+	BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
+
+	m = bch_btree_iter_init(b, &iter, b->ops->is_extents
+				? PRECEDING_KEY(&START_KEY(k))
+				: PRECEDING_KEY(k));
+
+	if (b->ops->insert_fixup(b, k, &iter, replace_key))
+		return status;
+
+	/* Walk forward in the last set to the insert position */
+	while (m != bset_bkey_last(i) &&
+	       bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
+		prev = m, m = bkey_next(m);
+
+	/* prev is in the tree, if we merge we're done */
+	status = BTREE_INSERT_STATUS_BACK_MERGE;
+	if (prev &&
+	    bch_bkey_try_merge(b, prev, k))
+		goto merged;
+#if 0
+	status = BTREE_INSERT_STATUS_OVERWROTE;
+	if (m != bset_bkey_last(i) &&
+	    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
+		goto copy;
+#endif
+	status = BTREE_INSERT_STATUS_FRONT_MERGE;
+	if (m != bset_bkey_last(i) &&
+	    bch_bkey_try_merge(b, k, m))
+		goto copy;
+
+	/*
+	 * Neither merge applied: this is a plain insert, so report it as such
+	 * instead of leaving the FRONT_MERGE status from the failed attempt.
+	 */
+	status = BTREE_INSERT_STATUS_INSERT;
+	bch_bset_insert(b, m, k);
+copy:	bkey_copy(m, k);
+merged:
+	return status;
}
+EXPORT_SYMBOL(bch_btree_insert_key);
+
+/* Lookup */
struct bset_search_iter {
struct bkey *l, *r;
};
-static struct bset_search_iter bset_search_write_set(struct btree *b,
- struct bset_tree *t,
+static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
const struct bkey *search)
{
unsigned li = 0, ri = t->size;
- BUG_ON(!b->nsets &&
- t->size < bkey_to_cacheline(t, end(t->data)));
-
while (li + 1 != ri) {
unsigned m = (li + ri) >> 1;
@@ -736,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b,
return (struct bset_search_iter) {
table_to_bkey(t, li),
- ri < t->size ? table_to_bkey(t, ri) : end(t->data)
+ ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
};
}
-static struct bset_search_iter bset_search_tree(struct btree *b,
- struct bset_tree *t,
+static struct bset_search_iter bset_search_tree(struct bset_tree *t,
const struct bkey *search)
{
struct bkey *l, *r;
@@ -788,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
f = &t->tree[inorder_next(j, t->size)];
r = cacheline_to_bkey(t, inorder, f->m);
} else
- r = end(t->data);
+ r = bset_bkey_last(t->data);
} else {
r = cacheline_to_bkey(t, inorder, f->m);
@@ -802,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
return (struct bset_search_iter) {l, r};
}
-struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
+struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
const struct bkey *search)
{
struct bset_search_iter i;
@@ -824,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
if (unlikely(!t->size)) {
i.l = t->data->start;
- i.r = end(t->data);
+ i.r = bset_bkey_last(t->data);
} else if (bset_written(b, t)) {
/*
* Each node in the auxiliary search tree covers a certain range
@@ -834,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
*/
if (unlikely(bkey_cmp(search, &t->end) >= 0))
- return end(t->data);
+ return bset_bkey_last(t->data);
if (unlikely(bkey_cmp(search, t->data->start) < 0))
return t->data->start;
- i = bset_search_tree(b, t, search);
- } else
- i = bset_search_write_set(b, t, search);
+ i = bset_search_tree(t, search);
+ } else {
+ BUG_ON(!b->nsets &&
+ t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
+
+ i = bset_search_write_set(t, search);
+ }
- if (expensive_debug_checks(b->c)) {
+ if (btree_keys_expensive_checks(b)) {
BUG_ON(bset_written(b, t) &&
i.l != t->data->start &&
bkey_cmp(tree_to_prev_bkey(t,
inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
search) > 0);
- BUG_ON(i.r != end(t->data) &&
+ BUG_ON(i.r != bset_bkey_last(t->data) &&
bkey_cmp(i.r, search) <= 0);
}
@@ -860,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
return i.l;
}
+EXPORT_SYMBOL(__bch_bset_search);
/* Btree iterator */
-/*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
- *
- * Necessary for btree_sort_fixup() - if there are multiple keys that compare
- * equal in different sets, we have to process them newest to oldest.
- */
+typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
+ struct btree_iter_set);
+
static inline bool btree_iter_cmp(struct btree_iter_set l,
struct btree_iter_set r)
{
- int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
-
- return c ? c > 0 : l.k < r.k;
+ return bkey_cmp(l.k, r.k) > 0;
}
static inline bool btree_iter_end(struct btree_iter *iter)
@@ -892,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
btree_iter_cmp));
}
-struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
- struct bkey *search, struct bset_tree *start)
+static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
+ struct btree_iter *iter,
+ struct bkey *search,
+ struct bset_tree *start)
{
struct bkey *ret = NULL;
iter->size = ARRAY_SIZE(iter->data);
@@ -903,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
iter->b = b;
#endif
- for (; start <= &b->sets[b->nsets]; start++) {
+ for (; start <= bset_tree_last(b); start++) {
ret = bch_bset_search(b, start, search);
- bch_btree_iter_push(iter, ret, end(start->data));
+ bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
}
return ret;
}
-struct bkey *bch_btree_iter_next(struct btree_iter *iter)
+/*
+ * Initialise @iter over every set of @b, starting from the first one
+ * (b->set), positioned according to @search. Returns whatever
+ * __bch_btree_iter_init() returns for that range of sets.
+ */
+struct bkey *bch_btree_iter_init(struct btree_keys *b,
+ struct btree_iter *iter,
+ struct bkey *search)
+{
+ return __bch_btree_iter_init(b, iter, search, b->set);
+}
+EXPORT_SYMBOL(bch_btree_iter_init);
+
+static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
+ btree_iter_cmp_fn *cmp)
{
struct btree_iter_set unused;
struct bkey *ret = NULL;
@@ -928,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
}
if (iter->data->k == iter->data->end)
- heap_pop(iter, unused, btree_iter_cmp);
+ heap_pop(iter, unused, cmp);
else
- heap_sift(iter, 0, btree_iter_cmp);
+ heap_sift(iter, 0, cmp);
}
return ret;
}
+/*
+ * Advance @iter using the default ordering, btree_iter_cmp, and return the
+ * result of __bch_btree_iter_next().
+ */
+struct bkey *bch_btree_iter_next(struct btree_iter *iter)
+{
+ return __bch_btree_iter_next(iter, btree_iter_cmp);
+
+}
+EXPORT_SYMBOL(bch_btree_iter_next);
+
struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
- struct btree *b, ptr_filter_fn fn)
+ struct btree_keys *b, ptr_filter_fn fn)
{
struct bkey *ret;
@@ -950,79 +1114,50 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
/* Mergesort */
-static void sort_key_next(struct btree_iter *iter,
- struct btree_iter_set *i)
+void bch_bset_sort_state_free(struct bset_sort_state *state)
{
- i->k = bkey_next(i->k);
-
- if (i->k == i->end)
- *i = iter->data[--iter->used];
+ if (state->pool)
+ mempool_destroy(state->pool);
}
-static struct bkey *btree_sort_fixup(struct btree_iter *iter, struct bkey *tmp)
+int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
{
- while (iter->used > 1) {
- struct btree_iter_set *top = iter->data, *i = top + 1;
+ spin_lock_init(&state->time.lock);
- if (iter->used > 2 &&
- btree_iter_cmp(i[0], i[1]))
- i++;
-
- if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
- break;
+ state->page_order = page_order;
+ state->crit_factor = int_sqrt(1 << page_order);
- if (!KEY_SIZE(i->k)) {
- sort_key_next(iter, i);
- heap_sift(iter, i - top, btree_iter_cmp);
- continue;
- }
-
- if (top->k > i->k) {
- if (bkey_cmp(top->k, i->k) >= 0)
- sort_key_next(iter, i);
- else
- bch_cut_front(top->k, i->k);
-
- heap_sift(iter, i - top, btree_iter_cmp);
- } else {
- /* can't happen because of comparison func */
- BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
-
- if (bkey_cmp(i->k, top->k) < 0) {
- bkey_copy(tmp, top->k);
-
- bch_cut_back(&START_KEY(i->k), tmp);
- bch_cut_front(i->k, top->k);
- heap_sift(iter, 0, btree_iter_cmp);
-
- return tmp;
- } else {
- bch_cut_back(&START_KEY(i->k), top->k);
- }
- }
- }
+ state->pool = mempool_create_page_pool(1, page_order);
+ if (!state->pool)
+ return -ENOMEM;
- return NULL;
+ return 0;
}
+EXPORT_SYMBOL(bch_bset_sort_state_init);
-static void btree_mergesort(struct btree *b, struct bset *out,
+static void btree_mergesort(struct btree_keys *b, struct bset *out,
struct btree_iter *iter,
bool fixup, bool remove_stale)
{
+ int i;
struct bkey *k, *last = NULL;
BKEY_PADDED(k) tmp;
- bool (*bad)(struct btree *, const struct bkey *) = remove_stale
+ bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
? bch_ptr_bad
: bch_ptr_invalid;
+ /* Heapify the iterator, using our comparison function */
+ for (i = iter->used / 2 - 1; i >= 0; --i)
+ heap_sift(iter, i, b->ops->sort_cmp);
+
while (!btree_iter_end(iter)) {
- if (fixup && !b->level)
- k = btree_sort_fixup(iter, &tmp.k);
+ if (b->ops->sort_fixup && fixup)
+ k = b->ops->sort_fixup(iter, &tmp.k);
else
k = NULL;
if (!k)
- k = bch_btree_iter_next(iter);
+ k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
if (bad(b, k))
continue;
@@ -1030,8 +1165,7 @@ static void btree_mergesort(struct btree *b, struct bset *out,
if (!last) {
last = out->start;
bkey_copy(last, k);
- } else if (b->level ||
- !bch_bkey_try_merge(b, last, k)) {
+ } else if (!bch_bkey_try_merge(b, last, k)) {
last = bkey_next(last);
bkey_copy(last, k);
}
@@ -1042,27 +1176,30 @@ static void btree_mergesort(struct btree *b, struct bset *out,
pr_debug("sorted %i keys", out->keys);
}
-static void __btree_sort(struct btree *b, struct btree_iter *iter,
- unsigned start, unsigned order, bool fixup)
+static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
+ unsigned start, unsigned order, bool fixup,
+ struct bset_sort_state *state)
{
uint64_t start_time;
- bool remove_stale = !b->written;
+ bool used_mempool = false;
struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
order);
if (!out) {
- mutex_lock(&b->c->sort_lock);
- out = b->c->sort;
- order = ilog2(bucket_pages(b->c));
+ struct page *outp;
+
+ BUG_ON(order > state->page_order);
+
+ outp = mempool_alloc(state->pool, GFP_NOIO);
+ out = page_address(outp);
+ used_mempool = true;
+ order = state->page_order;
}
start_time = local_clock();
- btree_mergesort(b, out, iter, fixup, remove_stale);
+ btree_mergesort(b, out, iter, fixup, false);
b->nsets = start;
- if (!fixup && !start && b->written)
- bch_btree_verify(b, out);
-
if (!start && order == b->page_order) {
/*
* Our temporary buffer is the same size as the btree node's
@@ -1070,84 +1207,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
* memcpy()
*/
- out->magic = bset_magic(&b->c->sb);
- out->seq = b->sets[0].data->seq;
- out->version = b->sets[0].data->version;
- swap(out, b->sets[0].data);
-
- if (b->c->sort == b->sets[0].data)
- b->c->sort = out;
+ out->magic = b->set->data->magic;
+ out->seq = b->set->data->seq;
+ out->version = b->set->data->version;
+ swap(out, b->set->data);
} else {
- b->sets[start].data->keys = out->keys;
- memcpy(b->sets[start].data->start, out->start,
- (void *) end(out) - (void *) out->start);
+ b->set[start].data->keys = out->keys;
+ memcpy(b->set[start].data->start, out->start,
+ (void *) bset_bkey_last(out) - (void *) out->start);
}
- if (out == b->c->sort)
- mutex_unlock(&b->c->sort_lock);
+ if (used_mempool)
+ mempool_free(virt_to_page(out), state->pool);
else
free_pages((unsigned long) out, order);
- if (b->written)
- bset_build_written_tree(b);
+ bch_bset_build_written_tree(b);
if (!start)
- bch_time_stats_update(&b->c->sort_time, start_time);
+ bch_time_stats_update(&state->time, start_time);
}
-void bch_btree_sort_partial(struct btree *b, unsigned start)
+void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
+ struct bset_sort_state *state)
{
size_t order = b->page_order, keys = 0;
struct btree_iter iter;
int oldsize = bch_count_data(b);
- __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
-
- BUG_ON(b->sets[b->nsets].data == write_block(b) &&
- (b->sets[b->nsets].size || b->nsets));
-
+ __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
if (start) {
unsigned i;
for (i = start; i <= b->nsets; i++)
- keys += b->sets[i].data->keys;
+ keys += b->set[i].data->keys;
- order = roundup_pow_of_two(__set_bytes(b->sets->data,
- keys)) / PAGE_SIZE;
- if (order)
- order = ilog2(order);
+ order = get_order(__set_bytes(b->set->data, keys));
}
- __btree_sort(b, &iter, start, order, false);
+ __btree_sort(b, &iter, start, order, false, state);
- EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize);
+ EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
}
+EXPORT_SYMBOL(bch_btree_sort_partial);
-void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
+void bch_btree_sort_and_fix_extents(struct btree_keys *b,
+ struct btree_iter *iter,
+ struct bset_sort_state *state)
{
- BUG_ON(!b->written);
- __btree_sort(b, iter, 0, b->page_order, true);
+ __btree_sort(b, iter, 0, b->page_order, true, state);
}
-void bch_btree_sort_into(struct btree *b, struct btree *new)
+void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
+ struct bset_sort_state *state)
{
uint64_t start_time = local_clock();
struct btree_iter iter;
bch_btree_iter_init(b, &iter, NULL);
- btree_mergesort(b, new->sets->data, &iter, false, true);
+ btree_mergesort(b, new->set->data, &iter, false, true);
- bch_time_stats_update(&b->c->sort_time, start_time);
+ bch_time_stats_update(&state->time, start_time);
- bkey_copy_key(&new->key, &b->key);
- new->sets->size = 0;
+ new->set->size = 0; // XXX: why?
}
#define SORT_CRIT (4096 / sizeof(uint64_t))
-void bch_btree_sort_lazy(struct btree *b)
+void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
{
unsigned crit = SORT_CRIT;
int i;
@@ -1156,50 +1285,32 @@ void bch_btree_sort_lazy(struct btree *b)
if (!b->nsets)
goto out;
- /* If not a leaf node, always sort */
- if (b->level) {
- bch_btree_sort(b);
- return;
- }
-
for (i = b->nsets - 1; i >= 0; --i) {
- crit *= b->c->sort_crit_factor;
+ crit *= state->crit_factor;
- if (b->sets[i].data->keys < crit) {
- bch_btree_sort_partial(b, i);
+ if (b->set[i].data->keys < crit) {
+ bch_btree_sort_partial(b, i, state);
return;
}
}
/* Sort if we'd overflow */
if (b->nsets + 1 == MAX_BSETS) {
- bch_btree_sort(b);
+ bch_btree_sort(b, state);
return;
}
out:
- bset_build_written_tree(b);
+ bch_bset_build_written_tree(b);
}
+EXPORT_SYMBOL(bch_btree_sort_lazy);
-/* Sysfs stuff */
-
-struct bset_stats {
- struct btree_op op;
- size_t nodes;
- size_t sets_written, sets_unwritten;
- size_t bytes_written, bytes_unwritten;
- size_t floats, failed;
-};
-
-static int btree_bset_stats(struct btree_op *op, struct btree *b)
+void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
{
- struct bset_stats *stats = container_of(op, struct bset_stats, op);
unsigned i;
- stats->nodes++;
-
for (i = 0; i <= b->nsets; i++) {
- struct bset_tree *t = &b->sets[i];
+ struct bset_tree *t = &b->set[i];
size_t bytes = t->data->keys * sizeof(uint64_t);
size_t j;
@@ -1217,32 +1328,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b)
stats->bytes_unwritten += bytes;
}
}
-
- return MAP_CONTINUE;
-}
-
-int bch_bset_print_stats(struct cache_set *c, char *buf)
-{
- struct bset_stats t;
- int ret;
-
- memset(&t, 0, sizeof(struct bset_stats));
- bch_btree_op_init(&t.op, -1);
-
- ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
- if (ret < 0)
- return ret;
-
- return snprintf(buf, PAGE_SIZE,
- "btree nodes: %zu\n"
- "written sets: %zu\n"
- "unwritten sets: %zu\n"
- "written key bytes: %zu\n"
- "unwritten key bytes: %zu\n"
- "floats: %zu\n"
- "failed: %zu\n",
- t.nodes,
- t.sets_written, t.sets_unwritten,
- t.bytes_written, t.bytes_unwritten,
- t.floats, t.failed);
}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 1d3c24f9fa0..5f6728d5d4d 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,7 +1,11 @@
#ifndef _BCACHE_BSET_H
#define _BCACHE_BSET_H
-#include <linux/slab.h>
+#include <linux/bcache.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "util.h" /* for time_stats */
/*
* BKEYS:
@@ -142,20 +146,13 @@
* first key in that range of bytes again.
*/
-/* Btree key comparison/iteration */
+struct btree_keys;
+struct btree_iter;
+struct btree_iter_set;
+struct bkey_float;
#define MAX_BSETS 4U
-struct btree_iter {
- size_t size, used;
-#ifdef CONFIG_BCACHE_DEBUG
- struct btree *b;
-#endif
- struct btree_iter_set {
- struct bkey *k, *end;
- } data[MAX_BSETS];
-};
-
struct bset_tree {
/*
* We construct a binary tree in an array as if the array
@@ -165,14 +162,14 @@ struct bset_tree {
*/
/* size of the binary tree and prev array */
- unsigned size;
+ unsigned size;
/* function of size - precalculated for to_inorder() */
- unsigned extra;
+ unsigned extra;
/* copy of the last key in the set */
- struct bkey end;
- struct bkey_float *tree;
+ struct bkey end;
+ struct bkey_float *tree;
/*
* The nodes in the bset tree point to specific keys - this
@@ -182,12 +179,219 @@ struct bset_tree {
* to keep bkey_float to 4 bytes and prev isn't used in the fast
* path.
*/
- uint8_t *prev;
+ uint8_t *prev;
/* The actual btree node, with pointers to each sorted set */
- struct bset *data;
+ struct bset *data;
};
+struct btree_keys_ops {
+ bool (*sort_cmp)(struct btree_iter_set,
+ struct btree_iter_set);
+ struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *);
+ bool (*insert_fixup)(struct btree_keys *, struct bkey *,
+ struct btree_iter *, struct bkey *);
+ bool (*key_invalid)(struct btree_keys *,
+ const struct bkey *);
+ bool (*key_bad)(struct btree_keys *, const struct bkey *);
+ bool (*key_merge)(struct btree_keys *,
+ struct bkey *, struct bkey *);
+ void (*key_to_text)(char *, size_t, const struct bkey *);
+ void (*key_dump)(struct btree_keys *, const struct bkey *);
+
+ /*
+ * Only used for deciding whether to use START_KEY(k) or just the key
+ * itself in a couple places
+ */
+ bool is_extents;
+};
+
+struct btree_keys {
+ const struct btree_keys_ops *ops;
+ uint8_t page_order;
+ uint8_t nsets;
+ unsigned last_set_unwritten:1;
+ bool *expensive_debug_checks;
+
+ /*
+ * Sets of sorted keys - the real btree node - plus a binary search tree
+ *
+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+ * to the memory we have allocated for this btree node. Additionally,
+ * set[0]->data points to the entire btree node as it exists on disk.
+ */
+ struct bset_tree set[MAX_BSETS];
+};
+
+static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
+{
+ return b->set + b->nsets;
+}
+
+static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
+{
+ return t <= b->set + b->nsets - b->last_set_unwritten;
+}
+
+static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
+{
+ return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
+}
+
+static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
+{
+ return ((size_t) i) - ((size_t) b->set->data);
+}
+
+static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
+{
+ return bset_byte_offset(b, i) >> 9;
+}
+
+/*
+ * Size in bytes of the object *(i) followed by k u64s; set_bytes() uses the
+ * keys count stored in the set itself. The argument is parenthesised so that
+ * expressions such as "p + 1" expand correctly.
+ */
+#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
+#define set_bytes(i)		__set_bytes(i, (i)->keys)
+
+/* The same sizes, rounded up to a whole number of block_bytes-sized blocks */
+#define __set_blocks(i, k, block_bytes)				\
+	DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
+#define set_blocks(i, block_bytes)				\
+	__set_blocks(i, (i)->keys, block_bytes)
+
+/*
+ * Number of u64s that can still be appended to the last set of @b before it
+ * reaches the end of the PAGE_SIZE << page_order buffer. Returns 0 when the
+ * last set is not flagged as unwritten.
+ */
+static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
+{
+ struct bset_tree *t = bset_tree_last(b);
+
+ /* The last set must never extend past the end of the buffer */
+ BUG_ON((PAGE_SIZE << b->page_order) <
+ (bset_byte_offset(b, t->data) + set_bytes(t->data)));
+
+ if (!b->last_set_unwritten)
+ return 0;
+
+ return ((PAGE_SIZE << b->page_order) -
+ (bset_byte_offset(b, t->data) + set_bytes(t->data))) /
+ sizeof(u64);
+}
+
+static inline struct bset *bset_next_set(struct btree_keys *b,
+ unsigned block_bytes)
+{
+ struct bset *i = bset_tree_last(b)->data;
+
+ return ((void *) i) + roundup(set_bytes(i), block_bytes);
+}
+
+void bch_btree_keys_free(struct btree_keys *);
+int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
+void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
+ bool *);
+
+void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
+void bch_bset_build_written_tree(struct btree_keys *);
+void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
+bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
+void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
+unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
+ struct bkey *);
+
+enum {
+ BTREE_INSERT_STATUS_NO_INSERT = 0,
+ BTREE_INSERT_STATUS_INSERT,
+ BTREE_INSERT_STATUS_BACK_MERGE,
+ BTREE_INSERT_STATUS_OVERWROTE,
+ BTREE_INSERT_STATUS_FRONT_MERGE,
+};
+
+/* Btree key iteration */
+
+struct btree_iter {
+ size_t size, used;
+#ifdef CONFIG_BCACHE_DEBUG
+ struct btree_keys *b;
+#endif
+ struct btree_iter_set {
+ struct bkey *k, *end;
+ } data[MAX_BSETS];
+};
+
+typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
+
+struct bkey *bch_btree_iter_next(struct btree_iter *);
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
+ struct btree_keys *, ptr_filter_fn);
+
+void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
+struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
+ struct bkey *);
+
+struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
+ const struct bkey *);
+
+/*
+ * Returns the first key that is strictly greater than search
+ */
+static inline struct bkey *bch_bset_search(struct btree_keys *b,
+ struct bset_tree *t,
+ const struct bkey *search)
+{
+ return search ? __bch_bset_search(b, t, search) : t->data->start;
+}
+
+#define for_each_key_filter(b, k, iter, filter) \
+ for (bch_btree_iter_init((b), (iter), NULL); \
+ ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
+
+#define for_each_key(b, k, iter) \
+ for (bch_btree_iter_init((b), (iter), NULL); \
+ ((k) = bch_btree_iter_next(iter));)
+
+/* Sorting */
+
+struct bset_sort_state {
+ mempool_t *pool;
+
+ unsigned page_order;
+ unsigned crit_factor;
+
+ struct time_stats time;
+};
+
+void bch_bset_sort_state_free(struct bset_sort_state *);
+int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
+void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
+void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
+ struct bset_sort_state *);
+void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
+ struct bset_sort_state *);
+void bch_btree_sort_partial(struct btree_keys *, unsigned,
+ struct bset_sort_state *);
+
+static inline void bch_btree_sort(struct btree_keys *b,
+ struct bset_sort_state *state)
+{
+ bch_btree_sort_partial(b, 0, state);
+}
+
+struct bset_stats {
+ size_t sets_written, sets_unwritten;
+ size_t bytes_written, bytes_unwritten;
+ size_t floats, failed;
+};
+
+void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
+
+/* Bkey utility code */
+
+#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
+
+static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
+{
+ return bkey_idx(i->start, idx);
+}
+
+static inline void bkey_init(struct bkey *k)
+{
+ *k = ZERO_KEY;
+}
+
static __always_inline int64_t bkey_cmp(const struct bkey *l,
const struct bkey *r)
{
@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
}
+void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
+ unsigned);
+bool __bch_cut_front(const struct bkey *, struct bkey *);
+bool __bch_cut_back(const struct bkey *, struct bkey *);
+
+static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+ BUG_ON(bkey_cmp(where, k) > 0);
+ return __bch_cut_front(where, k);
+}
+
+static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+ BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
+ return __bch_cut_back(where, k);
+}
+
+/*
+ * Helper for PRECEDING_KEY(): @tmp is a scratch key holding only an inode and
+ * an offset. Returns NULL when both are zero (no key can precede it);
+ * otherwise decrements the 128 bit high:low value by one in place and
+ * returns @tmp.
+ */
+static inline struct bkey *__bch_preceding_key(struct bkey *tmp)
+{
+	if (!KEY_INODE(tmp) && !KEY_OFFSET(tmp))
+		return NULL;
+
+	if (!tmp->low)
+		tmp->high--;
+	tmp->low--;
+
+	return tmp;
+}
+
+/*
+ * Evaluates to a pointer to a temporary key that sorts immediately before
+ * _k, or NULL if _k has inode 0 and offset 0.
+ *
+ * The temporary is a compound literal created in the caller's own expression,
+ * so it lives until the end of the block containing the macro invocation.
+ * Creating it inside a nested block of a statement expression would let its
+ * address escape after the object's lifetime had ended.
+ *
+ * Note: _k is evaluated more than once.
+ */
+#define PRECEDING_KEY(_k)						\
+	__bch_preceding_key(&KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0))
+
+static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
+{
+ return b->ops->key_invalid(b, k);
+}
+
+static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
+{
+ return b->ops->key_bad(b, k);
+}
+
+/*
+ * Format key @k into @buf through the key_to_text op of @b; @size is passed
+ * through as the buffer size.
+ */
+static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
+				    size_t size, const struct bkey *k)
+{
+	/* A void function must not return an expression, so just call the op */
+	b->ops->key_to_text(buf, size, k);
+}
+
+/*
+ * Returns true when @l and @r agree on the KEY_DIRTY, KEY_PTRS and KEY_CSUM
+ * header fields. Used as the generic precondition for merging two keys.
+ */
+static inline bool bch_bkey_equal_header(const struct bkey *l,
+					 const struct bkey *r)
+{
+	return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
+		KEY_PTRS(l) == KEY_PTRS(r) &&
+		KEY_CSUM(l) == KEY_CSUM(r));
+}
+
/* Keylists */
struct keylist {
@@ -218,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l)
l->top_p = l->keys_p = l->inline_keys;
}
+static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
+{
+ l->keys = k;
+ l->top = bkey_next(k);
+}
+
static inline void bch_keylist_push(struct keylist *l)
{
l->top = bkey_next(l->top);
@@ -257,136 +523,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l)
struct bkey *bch_keylist_pop(struct keylist *);
void bch_keylist_pop_front(struct keylist *);
-int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
-
-void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
- unsigned);
-bool __bch_cut_front(const struct bkey *, struct bkey *);
-bool __bch_cut_back(const struct bkey *, struct bkey *);
-
-static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
-{
- BUG_ON(bkey_cmp(where, k) > 0);
- return __bch_cut_front(where, k);
-}
-
-static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
-{
- BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
- return __bch_cut_back(where, k);
-}
-
-const char *bch_ptr_status(struct cache_set *, const struct bkey *);
-bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
-bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
-
-bool bch_ptr_bad(struct btree *, const struct bkey *);
-
-static inline uint8_t gen_after(uint8_t a, uint8_t b)
-{
- uint8_t r = a - b;
- return r > 128U ? 0 : r;
-}
-
-static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
- unsigned i)
-{
- return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
-}
-
-static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
- unsigned i)
-{
- return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
-}
-
+int __bch_keylist_realloc(struct keylist *, unsigned);
-typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
-
-struct bkey *bch_btree_iter_next(struct btree_iter *);
-struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
- struct btree *, ptr_filter_fn);
-
-void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
-struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
- struct bkey *, struct bset_tree *);
-
-/* 32 bits total: */
-#define BKEY_MID_BITS 3
-#define BKEY_EXPONENT_BITS 7
-#define BKEY_MANTISSA_BITS 22
-#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
-
-struct bkey_float {
- unsigned exponent:BKEY_EXPONENT_BITS;
- unsigned m:BKEY_MID_BITS;
- unsigned mantissa:BKEY_MANTISSA_BITS;
-} __packed;
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It definites the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliar search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
+/* Debug stuff */
-#define BSET_CACHELINE 128
-#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE)
+#ifdef CONFIG_BCACHE_DEBUG
-#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float))
-#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t))
+int __bch_count_data(struct btree_keys *);
+void __bch_check_keys(struct btree_keys *, const char *, ...);
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+void bch_dump_bucket(struct btree_keys *);
-void bch_bset_init_next(struct btree *);
+#else
-void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
-void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
+static inline int __bch_count_data(struct btree_keys *b) { return -1; }
+static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
+static inline void bch_dump_bucket(struct btree_keys *b) {}
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
-struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
- const struct bkey *);
+#endif
-/*
- * Returns the first key that is strictly greater than search
- */
-static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
- const struct bkey *search)
+static inline bool btree_keys_expensive_checks(struct btree_keys *b)
{
- return search ? __bch_bset_search(b, t, search) : t->data->start;
+#ifdef CONFIG_BCACHE_DEBUG
+ return *b->expensive_debug_checks;
+#else
+ return false;
+#endif
}
-#define PRECEDING_KEY(_k) \
-({ \
- struct bkey *_ret = NULL; \
- \
- if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
- _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
- \
- if (!_ret->low) \
- _ret->high--; \
- _ret->low--; \
- } \
- \
- _ret; \
-})
-
-bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
-void bch_btree_sort_lazy(struct btree *);
-void bch_btree_sort_into(struct btree *, struct btree *);
-void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
-void bch_btree_sort_partial(struct btree *, unsigned);
-
-static inline void bch_btree_sort(struct btree *b)
+static inline int bch_count_data(struct btree_keys *b)
{
- bch_btree_sort_partial(b, 0);
+ return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
}
-int bch_bset_print_stats(struct cache_set *, char *);
+/*
+ * bch_check_keys(b, fmt, ...) - call __bch_check_keys(b, ...) only when
+ * btree_keys_expensive_checks(b) is true.
+ *
+ * b is expanded twice, so it must be free of side effects. At least one
+ * argument after b is required because __VA_ARGS__ follows a comma. The
+ * do { } while (0) wrapper makes the macro usable as a single statement.
+ */
+#define bch_check_keys(b, ...) \
+do { \
+ if (btree_keys_expensive_checks(b)) \
+ __bch_check_keys(b, __VA_ARGS__); \
+} while (0)
#endif
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 8e2573a009f..7347b610096 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -23,7 +23,7 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
-#include "writeback.h"
+#include "extents.h"
#include <linux/slab.h>
#include <linux/bitops.h>
@@ -68,15 +68,11 @@
* alloc_bucket() cannot fail. This should be true but is not completely
* obvious.
*
- * Make sure all allocations get charged to the root cgroup
- *
* Plugging?
*
* If data write is less than hard sector size of ssd, round up offset in open
* bucket to the next whole sector
*
- * Also lookup by cgroup in get_open_bucket()
- *
* Superblock needs to be fleshed out for multiple cache devices
*
* Add a sysfs tunable for the number of writeback IOs in flight
@@ -89,13 +85,6 @@
* Test module load/unload
*/
-enum {
- BTREE_INSERT_STATUS_INSERT,
- BTREE_INSERT_STATUS_BACK_MERGE,
- BTREE_INSERT_STATUS_OVERWROTE,
- BTREE_INSERT_STATUS_FRONT_MERGE,
-};
-
#define MAX_NEED_GC 64
#define MAX_SAVE_PRIO 72
@@ -104,16 +93,6 @@ enum {
#define PTR_HASH(c, k) \
(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
-static struct workqueue_struct *btree_io_wq;
-
-static inline bool should_split(struct btree *b)
-{
- struct bset *i = write_block(b);
- return b->written >= btree_blocks(b) ||
- (b->written + __set_blocks(i, i->keys + 15, b->c)
- > btree_blocks(b));
-}
-
#define insert_lock(s, b) ((b)->level <= (s)->lock)
/*
@@ -138,7 +117,7 @@ static inline bool should_split(struct btree *b)
({ \
int _r, l = (b)->level - 1; \
bool _w = l <= (op)->lock; \
- struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \
+ struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
if (!IS_ERR(_child)) { \
_child->parent = (b); \
_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
@@ -167,20 +146,34 @@ static inline bool should_split(struct btree *b)
_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
} \
rw_unlock(_w, _b); \
+ bch_cannibalize_unlock(c); \
if (_r == -EINTR) \
schedule(); \
- bch_cannibalize_unlock(c); \
- if (_r == -ENOSPC) { \
- wait_event((c)->try_wait, \
- !(c)->try_harder); \
- _r = -EINTR; \
- } \
} while (_r == -EINTR); \
\
- finish_wait(&(c)->bucket_wait, &(op)->wait); \
+ finish_wait(&(c)->btree_cache_wait, &(op)->wait); \
_r; \
})
+/*
+ * Return the address b->written blocks past the node's first bset, where
+ * one block is block_bytes(b->c) bytes. The cast to void * makes the
+ * addition byte-granular (arithmetic on void * is a GNU C extension).
+ */
+static inline struct bset *write_block(struct btree *b)
+{
+ return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+}
+
+/*
+ * Sort the node's key sets, then start a new bset if the node still has
+ * unwritten blocks.
+ *
+ * A node with a non-zero level that holds more than one set is sorted
+ * unconditionally with bch_btree_sort(); every other node goes through
+ * bch_btree_sort_lazy(). The next bset is initialised at write_block(b)
+ * only while b->written is below btree_blocks(b).
+ */
+static void bch_btree_init_next(struct btree *b)
+{
+	struct btree_keys *keys = &b->keys;
+	bool interior_multi_set = b->level && keys->nsets;
+
+	if (!interior_multi_set)
+		bch_btree_sort_lazy(keys, &b->c->sort);
+	else
+		bch_btree_sort(keys, &b->c->sort);
+
+	/* Node is full on disk: there is no room for another bset. */
+	if (b->written >= btree_blocks(b))
+		return;
+
+	bch_bset_init_next(keys, write_block(b), bset_magic(&b->c->sb));
+}
+
/* Btree key manipulation */
void bkey_put(struct cache_set *c, struct bkey *k)
@@ -197,16 +190,16 @@ void bkey_put(struct cache_set *c, struct bkey *k)
static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
uint64_t crc = b->key.ptr[0];
- void *data = (void *) i + 8, *end = end(i);
+ void *data = (void *) i + 8, *end = bset_bkey_last(i);
crc = bch_crc64_update(crc, data, end - data);
return crc ^ 0xffffffffffffffffULL;
}
-static void bch_btree_node_read_done(struct btree *b)
+void bch_btree_node_read_done(struct btree *b)
{
const char *err = "bad btree header";
- struct bset *i = b->sets[0].data;
+ struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
@@ -214,21 +207,22 @@ static void bch_btree_node_read_done(struct btree *b)
iter->used = 0;
#ifdef CONFIG_BCACHE_DEBUG
- iter->b = b;
+ iter->b = &b->keys;
#endif
if (!i->seq)
goto err;
for (;
- b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
+ b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
i = write_block(b)) {
err = "unsupported bset version";
if (i->version > BCACHE_BSET_VERSION)
goto err;
err = "bad btree header";
- if (b->written + set_blocks(i, b->c) > btree_blocks(b))
+ if (b->written + set_blocks(i, block_bytes(b->c)) >
+ btree_blocks(b))
goto err;
err = "bad magic";
@@ -248,31 +242,32 @@ static void bch_btree_node_read_done(struct btree *b)
}
err = "empty set";
- if (i != b->sets[0].data && !i->keys)
+ if (i != b->keys.set[0].data && !i->keys)
goto err;
- bch_btree_iter_push(iter, i->start, end(i));
+ bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
- b->written += set_blocks(i, b->c);
+ b->written += set_blocks(i, block_bytes(b->c));
}
err = "corrupted btree";
for (i = write_block(b);
- bset_sector_offset(b, i) < KEY_SIZE(&b->key);
+ bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
i = ((void *) i) + block_bytes(b->c))
- if (i->seq == b->sets[0].data->seq)
+ if (i->seq == b->keys.set[0].data->seq)
goto err;
- bch_btree_sort_and_fix_extents(b, iter);
+ bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
- i = b->sets[0].data;
+ i = b->keys.set[0].data;
err = "short btree key";
- if (b->sets[0].size &&
- bkey_cmp(&b->key, &b->sets[0].end) < 0)
+ if (b->keys.set[0].size &&
+ bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
goto err;
if (b->written < btree_blocks(b))
- bch_bset_init_next(b);
+ bch_bset_init_next(&b->keys, write_block(b),
+ bset_magic(&b->c->sb));
out:
mempool_free(iter, b->c->fill_iter);
return;
@@ -290,7 +285,7 @@ static void btree_node_read_endio(struct bio *bio, int error)
closure_put(cl);
}
-void bch_btree_node_read(struct btree *b)
+static void bch_btree_node_read(struct btree *b)
{
uint64_t start_time = local_clock();
struct closure cl;
@@ -306,7 +301,7 @@ void bch_btree_node_read(struct btree *b)
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = &cl;
- bch_bio_map(bio, b->sets[0].data);
+ bch_bio_map(bio, b->keys.set[0].data);
bch_submit_bbio(bio, b->c, &b->key, 0);
closure_sync(&cl);
@@ -360,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl)
btree_complete_write(b, w);
if (btree_node_dirty(b))
- queue_delayed_work(btree_io_wq, &b->work,
- msecs_to_jiffies(30000));
+ schedule_delayed_work(&b->work, 30 * HZ);
closure_return_with_destructor(cl, btree_node_write_unlock);
}
@@ -393,7 +387,7 @@ static void btree_node_write_endio(struct bio *bio, int error)
static void do_btree_node_write(struct btree *b)
{
struct closure *cl = &b->io;
- struct bset *i = b->sets[b->nsets].data;
+ struct bset *i = btree_bset_last(b);
BKEY_PADDED(key) k;
i->version = BCACHE_BSET_VERSION;
@@ -405,7 +399,7 @@ static void do_btree_node_write(struct btree *b)
b->bio->bi_end_io = btree_node_write_endio;
b->bio->bi_private = cl;
b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
- b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c);
+ b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
bch_bio_map(b->bio, i);
/*
@@ -424,7 +418,8 @@ static void do_btree_node_write(struct btree *b)
*/
bkey_copy(&k.key, &b->key);
- SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
+ SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
+ bset_sector_offset(&b->keys, i));
if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
int j;
@@ -449,17 +444,19 @@ static void do_btree_node_write(struct btree *b)
}
}
-void bch_btree_node_write(struct btree *b, struct closure *parent)
+void __bch_btree_node_write(struct btree *b, struct closure *parent)
{
- struct bset *i = b->sets[b->nsets].data;
+ struct bset *i = btree_bset_last(b);
+
+ lockdep_assert_held(&b->write_lock);
trace_bcache_btree_write(b);
BUG_ON(current->bio_list);
BUG_ON(b->written >= btree_blocks(b));
BUG_ON(b->written && !i->keys);
- BUG_ON(b->sets->data->seq != i->seq);
- bch_check_keys(b, "writing");
+ BUG_ON(btree_bset_first(b)->seq != i->seq);
+ bch_check_keys(&b->keys, "writing");
cancel_delayed_work(&b->work);
@@ -472,14 +469,28 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
do_btree_node_write(b);
- b->written += set_blocks(i, b->c);
- atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
+ atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
- bch_btree_sort_lazy(b);
+ b->written += set_blocks(i, block_bytes(b->c));
+}
- if (b->written < btree_blocks(b))
- bch_bset_init_next(b);
+void bch_btree_node_write(struct btree *b, struct closure *parent)
+{
+ unsigned nsets = b->keys.nsets;
+
+ lockdep_assert_held(&b->lock);
+
+ __bch_btree_node_write(b, parent);
+
+ /*
+ * do verify if there was more than one set initially (i.e. we did a
+ * sort) and we sorted down to a single set:
+ */
+ if (nsets && !b->keys.nsets)
+ bch_btree_verify(b);
+
+ bch_btree_init_next(b);
}
static void bch_btree_node_write_sync(struct btree *b)
@@ -487,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b)
struct closure cl;
closure_init_stack(&cl);
+
+ mutex_lock(&b->write_lock);
bch_btree_node_write(b, &cl);
+ mutex_unlock(&b->write_lock);
+
closure_sync(&cl);
}
@@ -495,23 +510,24 @@ static void btree_node_write_work(struct work_struct *w)
{
struct btree *b = container_of(to_delayed_work(w), struct btree, work);
- rw_lock(true, b, b->level);
-
+ mutex_lock(&b->write_lock);
if (btree_node_dirty(b))
- bch_btree_node_write(b, NULL);
- rw_unlock(true, b);
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
}
static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
{
- struct bset *i = b->sets[b->nsets].data;
+ struct bset *i = btree_bset_last(b);
struct btree_write *w = btree_current_write(b);
+ lockdep_assert_held(&b->write_lock);
+
BUG_ON(!b->written);
BUG_ON(!i->keys);
if (!btree_node_dirty(b))
- queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
+ schedule_delayed_work(&b->work, 30 * HZ);
set_btree_node_dirty(b);
@@ -539,54 +555,19 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
* mca -> memory cache
*/
-static void mca_reinit(struct btree *b)
-{
- unsigned i;
-
- b->flags = 0;
- b->written = 0;
- b->nsets = 0;
-
- for (i = 0; i < MAX_BSETS; i++)
- b->sets[i].size = 0;
- /*
- * Second loop starts at 1 because b->sets[0]->data is the memory we
- * allocated
- */
- for (i = 1; i < MAX_BSETS; i++)
- b->sets[i].data = NULL;
-}
-
#define mca_reserve(c) (((c->root && c->root->level) \
? c->root->level : 1) * 8 + 16)
#define mca_can_free(c) \
- max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
+ max_t(int, 0, c->btree_cache_used - mca_reserve(c))
static void mca_data_free(struct btree *b)
{
- struct bset_tree *t = b->sets;
-
BUG_ON(b->io_mutex.count != 1);
- if (bset_prev_bytes(b) < PAGE_SIZE)
- kfree(t->prev);
- else
- free_pages((unsigned long) t->prev,
- get_order(bset_prev_bytes(b)));
-
- if (bset_tree_bytes(b) < PAGE_SIZE)
- kfree(t->tree);
- else
- free_pages((unsigned long) t->tree,
- get_order(bset_tree_bytes(b)));
+ bch_btree_keys_free(&b->keys);
- free_pages((unsigned long) t->data, b->page_order);
-
- t->prev = NULL;
- t->tree = NULL;
- t->data = NULL;
+ b->c->btree_cache_used--;
list_move(&b->list, &b->c->btree_cache_freed);
- b->c->bucket_cache_used--;
}
static void mca_bucket_free(struct btree *b)
@@ -605,34 +586,16 @@ static unsigned btree_order(struct bkey *k)
static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
- struct bset_tree *t = b->sets;
- BUG_ON(t->data);
-
- b->page_order = max_t(unsigned,
- ilog2(b->c->btree_pages),
- btree_order(k));
-
- t->data = (void *) __get_free_pages(gfp, b->page_order);
- if (!t->data)
- goto err;
-
- t->tree = bset_tree_bytes(b) < PAGE_SIZE
- ? kmalloc(bset_tree_bytes(b), gfp)
- : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
- if (!t->tree)
- goto err;
-
- t->prev = bset_prev_bytes(b) < PAGE_SIZE
- ? kmalloc(bset_prev_bytes(b), gfp)
- : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
- if (!t->prev)
- goto err;
-
- list_move(&b->list, &b->c->btree_cache);
- b->c->bucket_cache_used++;
- return;
-err:
- mca_data_free(b);
+ if (!bch_btree_keys_alloc(&b->keys,
+ max_t(unsigned,
+ ilog2(b->c->btree_pages),
+ btree_order(k)),
+ gfp)) {
+ b->c->btree_cache_used++;
+ list_move(&b->list, &b->c->btree_cache);
+ } else {
+ list_move(&b->list, &b->c->btree_cache_freed);
+ }
}
static struct btree *mca_bucket_alloc(struct cache_set *c,
@@ -644,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
init_rwsem(&b->lock);
lockdep_set_novalidate_class(&b->lock);
+ mutex_init(&b->write_lock);
+ lockdep_set_novalidate_class(&b->write_lock);
INIT_LIST_HEAD(&b->list);
INIT_DELAYED_WORK(&b->work, btree_node_write_work);
b->c = c;
@@ -663,9 +628,9 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
if (!down_write_trylock(&b->lock))
return -ENOMEM;
- BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
+ BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
- if (b->page_order < min_order)
+ if (b->keys.page_order < min_order)
goto out_unlock;
if (!flush) {
@@ -677,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
up(&b->io_mutex);
}
+ mutex_lock(&b->write_lock);
if (btree_node_dirty(b))
- bch_btree_node_write_sync(b);
+ __bch_btree_node_write(b, &cl);
+ mutex_unlock(&b->write_lock);
+
+ closure_sync(&cl);
/* wait for any in flight btree write */
down(&b->io_mutex);
@@ -701,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
if (c->shrinker_disabled)
return SHRINK_STOP;
- if (c->try_harder)
+ if (c->btree_cache_alloc_lock)
return SHRINK_STOP;
/* Return -1 if we can't do anything right now */
@@ -733,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
}
}
- for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
+ for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
if (list_empty(&c->btree_cache))
goto out;
@@ -762,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
if (c->shrinker_disabled)
return 0;
- if (c->try_harder)
+ if (c->btree_cache_alloc_lock)
return 0;
return mca_can_free(c) * c->btree_pages;
@@ -782,6 +751,8 @@ void bch_btree_cache_free(struct cache_set *c)
#ifdef CONFIG_BCACHE_DEBUG
if (c->verify_data)
list_move(&c->verify_data->list, &c->btree_cache);
+
+ free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
#endif
list_splice(&c->btree_cache_freeable,
@@ -822,10 +793,13 @@ int bch_btree_cache_alloc(struct cache_set *c)
#ifdef CONFIG_BCACHE_DEBUG
mutex_init(&c->verify_lock);
+ c->verify_ondisk = (void *)
+ __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
+
c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
if (c->verify_data &&
- c->verify_data->sets[0].data)
+ c->verify_data->keys.set->data)
list_del_init(&c->verify_data->list);
else
c->verify_data = NULL;
@@ -861,17 +835,30 @@ out:
return b;
}
-static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
+/*
+ * Try to become the owner recorded in c->btree_cache_alloc_lock.
+ *
+ * The owner field is claimed with cmpxchg(NULL -> current). Returns 0 when
+ * the field was free or already held current. Otherwise returns -EINTR,
+ * after first queueing @op (when non-NULL) on c->btree_cache_wait in
+ * TASK_UNINTERRUPTIBLE state.
+ */
+static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+{
+	struct task_struct *owner;
+
+	owner = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+	if (!owner || owner == current)
+		return 0;
+
+	/* Someone else holds it: arrange to be woken, then ask for a retry. */
+	if (op)
+		prepare_to_wait(&c->btree_cache_wait, &op->wait,
+				TASK_UNINTERRUPTIBLE);
+
+	return -EINTR;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+ struct bkey *k)
{
struct btree *b;
trace_bcache_btree_cache_cannibalize(c);
- if (!c->try_harder) {
- c->try_harder = current;
- c->try_harder_start = local_clock();
- } else if (c->try_harder != current)
- return ERR_PTR(-ENOSPC);
+ if (mca_cannibalize_lock(c, op))
+ return ERR_PTR(-EINTR);
list_for_each_entry_reverse(b, &c->btree_cache, list)
if (!mca_reap(b, btree_order(k), false))
@@ -881,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
if (!mca_reap(b, btree_order(k), true))
return b;
+ WARN(1, "btree cache cannibalize failed\n");
return ERR_PTR(-ENOMEM);
}
@@ -892,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
*/
static void bch_cannibalize_unlock(struct cache_set *c)
{
- if (c->try_harder == current) {
- bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
- c->try_harder = NULL;
- wake_up(&c->try_wait);
+ if (c->btree_cache_alloc_lock == current) {
+ c->btree_cache_alloc_lock = NULL;
+ wake_up(&c->btree_cache_wait);
}
}
-static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
+static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+ struct bkey *k, int level)
{
struct btree *b;
@@ -923,7 +911,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
list_for_each_entry(b, &c->btree_cache_freed, list)
if (!mca_reap(b, 0, false)) {
mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
- if (!b->sets[0].data)
+ if (!b->keys.set[0].data)
goto err;
else
goto out;
@@ -934,7 +922,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
goto err;
BUG_ON(!down_write_trylock(&b->lock));
- if (!b->sets->data)
+ if (!b->keys.set->data)
goto err;
out:
BUG_ON(b->io_mutex.count != 1);
@@ -945,17 +933,24 @@ out:
hlist_add_head_rcu(&b->hash, mca_hash(c, k));
lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
- b->level = level;
b->parent = (void *) ~0UL;
+ b->flags = 0;
+ b->written = 0;
+ b->level = level;
- mca_reinit(b);
+ if (!b->level)
+ bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
+ &b->c->expensive_debug_checks);
+ else
+ bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
+ &b->c->expensive_debug_checks);
return b;
err:
if (b)
rw_unlock(true, b);
- b = mca_cannibalize(c, k);
+ b = mca_cannibalize(c, op, k);
if (!IS_ERR(b))
goto out;
@@ -971,8 +966,8 @@ err:
* The btree node will have either a read or a write lock held, depending on
* level and op->lock.
*/
-struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
- int level, bool write)
+struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
+ struct bkey *k, int level, bool write)
{
int i = 0;
struct btree *b;
@@ -986,7 +981,7 @@ retry:
return ERR_PTR(-EAGAIN);
mutex_lock(&c->bucket_lock);
- b = mca_alloc(c, k, level);
+ b = mca_alloc(c, op, k, level);
mutex_unlock(&c->bucket_lock);
if (!b)
@@ -1009,13 +1004,13 @@ retry:
b->accessed = 1;
- for (; i <= b->nsets && b->sets[i].size; i++) {
- prefetch(b->sets[i].tree);
- prefetch(b->sets[i].data);
+ for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
+ prefetch(b->keys.set[i].tree);
+ prefetch(b->keys.set[i].data);
}
- for (; i <= b->nsets; i++)
- prefetch(b->sets[i].data);
+ for (; i <= b->keys.nsets; i++)
+ prefetch(b->keys.set[i].data);
if (btree_node_io_error(b)) {
rw_unlock(write, b);
@@ -1032,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
struct btree *b;
mutex_lock(&c->bucket_lock);
- b = mca_alloc(c, k, level);
+ b = mca_alloc(c, NULL, k, level);
mutex_unlock(&c->bucket_lock);
if (!IS_ERR_OR_NULL(b)) {
@@ -1045,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
static void btree_node_free(struct btree *b)
{
- unsigned i;
-
trace_bcache_btree_node_free(b);
BUG_ON(b == b->c->root);
+ mutex_lock(&b->write_lock);
+
if (btree_node_dirty(b))
btree_complete_write(b, btree_current_write(b));
clear_bit(BTREE_NODE_dirty, &b->flags);
+ mutex_unlock(&b->write_lock);
+
cancel_delayed_work(&b->work);
mutex_lock(&b->c->bucket_lock);
-
- for (i = 0; i < KEY_PTRS(&b->key); i++) {
- BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
-
- bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
- PTR_BUCKET(b->c, &b->key, i));
- }
-
bch_bucket_free(b->c, &b->key);
mca_bucket_free(b);
mutex_unlock(&b->c->bucket_lock);
}
-struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
+struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+ int level)
{
BKEY_PADDED(key) k;
struct btree *b = ERR_PTR(-EAGAIN);
mutex_lock(&c->bucket_lock);
retry:
- if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+ if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
goto err;
bkey_put(c, &k.key);
SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
- b = mca_alloc(c, &k.key, level);
+ b = mca_alloc(c, op, &k.key, level);
if (IS_ERR(b))
goto err_free;
@@ -1095,7 +1085,7 @@ retry:
}
b->accessed = 1;
- bch_bset_init_next(b);
+ bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
mutex_unlock(&c->bucket_lock);
@@ -1110,11 +1100,16 @@ err:
return b;
}
-static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
+static struct btree *btree_node_alloc_replacement(struct btree *b,
+ struct btree_op *op)
{
- struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
- if (!IS_ERR_OR_NULL(n))
- bch_btree_sort_into(b, n);
+ struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
+ if (!IS_ERR_OR_NULL(n)) {
+ mutex_lock(&n->write_lock);
+ bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
+ bkey_copy_key(&n->key, &b->key);
+ mutex_unlock(&n->write_lock);
+ }
return n;
}
@@ -1123,43 +1118,47 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
{
unsigned i;
+ mutex_lock(&b->c->bucket_lock);
+
+ atomic_inc(&b->c->prio_blocked);
+
bkey_copy(k, &b->key);
bkey_copy_key(k, &ZERO_KEY);
- for (i = 0; i < KEY_PTRS(k); i++) {
- uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
-
- SET_PTR_GEN(k, i, g);
- }
+ for (i = 0; i < KEY_PTRS(k); i++)
+ SET_PTR_GEN(k, i,
+ bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
+ PTR_BUCKET(b->c, &b->key, i)));
- atomic_inc(&b->c->prio_blocked);
+ mutex_unlock(&b->c->bucket_lock);
}
static int btree_check_reserve(struct btree *b, struct btree_op *op)
{
struct cache_set *c = b->c;
struct cache *ca;
- unsigned i, reserve = c->root->level * 2 + 1;
- int ret = 0;
+ unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i)
if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
if (op)
- prepare_to_wait(&c->bucket_wait, &op->wait,
+ prepare_to_wait(&c->btree_cache_wait, &op->wait,
TASK_UNINTERRUPTIBLE);
- ret = -EINTR;
- break;
+ mutex_unlock(&c->bucket_lock);
+ return -EINTR;
}
mutex_unlock(&c->bucket_lock);
- return ret;
+
+ return mca_cannibalize_lock(b->c, op);
}
/* Garbage collection */
-uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
+static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
+ struct bkey *k)
{
uint8_t stale = 0;
unsigned i;
@@ -1179,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
g = PTR_BUCKET(c, k, i);
- if (gen_after(g->gc_gen, PTR_GEN(k, i)))
- g->gc_gen = PTR_GEN(k, i);
+ if (gen_after(g->last_gc, PTR_GEN(k, i)))
+ g->last_gc = PTR_GEN(k, i);
if (ptr_stale(c, k, i)) {
stale = max(stale, ptr_stale(c, k, i));
@@ -1196,11 +1195,13 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
SET_GC_MARK(g, GC_MARK_METADATA);
else if (KEY_DIRTY(k))
SET_GC_MARK(g, GC_MARK_DIRTY);
+ else if (!GC_MARK(g))
+ SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
/* guard against overflow */
SET_GC_SECTORS_USED(g, min_t(unsigned,
GC_SECTORS_USED(g) + KEY_SIZE(k),
- (1 << 14) - 1));
+ MAX_GC_SECTORS_USED));
BUG_ON(!GC_SECTORS_USED(g));
}
@@ -1210,6 +1211,26 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
+/*
+ * Seed bucket state from key @k, then mark the key.
+ *
+ * For every pointer of @k that is available and not stale, the bucket's gen
+ * is overwritten with the pointer's gen and its prio is adjusted:
+ *  - non-zero @level and @k differs from ZERO_KEY: prio = BTREE_PRIO;
+ *  - @level is zero and prio is currently BTREE_PRIO: prio = INITIAL_PRIO.
+ * Finally __bch_btree_mark_key() is called for @k regardless of the above.
+ */
+void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
+{
+	unsigned n;
+
+	for (n = 0; n < KEY_PTRS(k); n++) {
+		struct bucket *g;
+
+		if (!ptr_available(c, k, n) || ptr_stale(c, k, n))
+			continue;
+
+		g = PTR_BUCKET(c, k, n);
+		g->gen = PTR_GEN(k, n);
+
+		if (level && bkey_cmp(k, &ZERO_KEY))
+			g->prio = BTREE_PRIO;
+		else if (!level && g->prio == BTREE_PRIO)
+			g->prio = INITIAL_PRIO;
+	}
+
+	__bch_btree_mark_key(c, level, k);
+}
+
static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{
uint8_t stale = 0;
@@ -1220,11 +1241,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
gc->nodes++;
- for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
stale = max(stale, btree_mark_key(b, k));
keys++;
- if (bch_ptr_bad(b, k))
+ if (bch_ptr_bad(&b->keys, k))
continue;
gc->key_bytes += bkey_u64s(k);
@@ -1234,9 +1255,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
gc->data += KEY_SIZE(k);
}
- for (t = b->sets; t <= &b->sets[b->nsets]; t++)
+ for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
btree_bug_on(t->size &&
- bset_written(b, t) &&
+ bset_written(&b->keys, t) &&
bkey_cmp(&b->key, &t->end) < 0,
b, "found short btree key in gc");
@@ -1263,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *,
struct keylist *, atomic_t *, struct bkey *);
static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
- struct keylist *keylist, struct gc_stat *gc,
- struct gc_merge_info *r)
+ struct gc_stat *gc, struct gc_merge_info *r)
{
unsigned i, nodes = 0, keys = 0, blocks;
struct btree *new_nodes[GC_MERGE_NODES];
+ struct keylist keylist;
struct closure cl;
struct bkey *k;
+ bch_keylist_init(&keylist);
+
+ if (btree_check_reserve(b, NULL))
+ return 0;
+
memset(new_nodes, 0, sizeof(new_nodes));
closure_init_stack(&cl);
@@ -1280,28 +1306,42 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
blocks = btree_default_blocks(b->c) * 2 / 3;
if (nodes < 2 ||
- __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
+ __set_blocks(b->keys.set[0].data, keys,
+ block_bytes(b->c)) > blocks * (nodes - 1))
return 0;
for (i = 0; i < nodes; i++) {
- new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
+ new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
if (IS_ERR_OR_NULL(new_nodes[i]))
goto out_nocoalesce;
}
+ /*
+ * We have to check the reserve here, after we've allocated our new
+ * nodes, to make sure the insert below will succeed - we also check
+ * before as an optimization to potentially avoid a bunch of expensive
+ * allocs/sorts
+ */
+ if (btree_check_reserve(b, NULL))
+ goto out_nocoalesce;
+
+ for (i = 0; i < nodes; i++)
+ mutex_lock(&new_nodes[i]->write_lock);
+
for (i = nodes - 1; i > 0; --i) {
- struct bset *n1 = new_nodes[i]->sets->data;
- struct bset *n2 = new_nodes[i - 1]->sets->data;
+ struct bset *n1 = btree_bset_first(new_nodes[i]);
+ struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
struct bkey *k, *last = NULL;
keys = 0;
if (i > 1) {
for (k = n2->start;
- k < end(n2);
+ k < bset_bkey_last(n2);
k = bkey_next(k)) {
if (__set_blocks(n1, n1->keys + keys +
- bkey_u64s(k), b->c) > blocks)
+ bkey_u64s(k),
+ block_bytes(b->c)) > blocks)
break;
last = k;
@@ -1317,7 +1357,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
* though)
*/
if (__set_blocks(n1, n1->keys + n2->keys,
- b->c) > btree_blocks(new_nodes[i]))
+ block_bytes(b->c)) >
+ btree_blocks(new_nodes[i]))
goto out_nocoalesce;
keys = n2->keys;
@@ -1325,47 +1366,54 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
last = &r->b->key;
}
- BUG_ON(__set_blocks(n1, n1->keys + keys,
- b->c) > btree_blocks(new_nodes[i]));
+ BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+ btree_blocks(new_nodes[i]));
if (last)
bkey_copy_key(&new_nodes[i]->key, last);
- memcpy(end(n1),
+ memcpy(bset_bkey_last(n1),
n2->start,
- (void *) node(n2, keys) - (void *) n2->start);
+ (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
n1->keys += keys;
r[i].keys = n1->keys;
memmove(n2->start,
- node(n2, keys),
- (void *) end(n2) - (void *) node(n2, keys));
+ bset_bkey_idx(n2, keys),
+ (void *) bset_bkey_last(n2) -
+ (void *) bset_bkey_idx(n2, keys));
n2->keys -= keys;
- if (bch_keylist_realloc(keylist,
- KEY_PTRS(&new_nodes[i]->key), b->c))
+ if (__bch_keylist_realloc(&keylist,
+ bkey_u64s(&new_nodes[i]->key)))
goto out_nocoalesce;
bch_btree_node_write(new_nodes[i], &cl);
- bch_keylist_add(keylist, &new_nodes[i]->key);
+ bch_keylist_add(&keylist, &new_nodes[i]->key);
}
- for (i = 0; i < nodes; i++) {
- if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
- goto out_nocoalesce;
+ for (i = 0; i < nodes; i++)
+ mutex_unlock(&new_nodes[i]->write_lock);
- make_btree_freeing_key(r[i].b, keylist->top);
- bch_keylist_push(keylist);
- }
+ closure_sync(&cl);
/* We emptied out this node */
- BUG_ON(new_nodes[0]->sets->data->keys);
+ BUG_ON(btree_bset_first(new_nodes[0])->keys);
btree_node_free(new_nodes[0]);
rw_unlock(true, new_nodes[0]);
- closure_sync(&cl);
+ for (i = 0; i < nodes; i++) {
+ if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
+ goto out_nocoalesce;
+
+ make_btree_freeing_key(r[i].b, keylist.top);
+ bch_keylist_push(&keylist);
+ }
+
+ bch_btree_insert_node(b, op, &keylist, NULL, NULL);
+ BUG_ON(!bch_keylist_empty(&keylist));
for (i = 0; i < nodes; i++) {
btree_node_free(r[i].b);
@@ -1374,22 +1422,22 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
r[i].b = new_nodes[i];
}
- bch_btree_insert_node(b, op, keylist, NULL, NULL);
- BUG_ON(!bch_keylist_empty(keylist));
-
memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
r[nodes - 1].b = ERR_PTR(-EINTR);
trace_bcache_btree_gc_coalesce(nodes);
gc->nodes--;
+ bch_keylist_free(&keylist);
+
/* Invalidated our iterator */
return -EINTR;
out_nocoalesce:
closure_sync(&cl);
+ bch_keylist_free(&keylist);
- while ((k = bch_keylist_pop(keylist)))
+ while ((k = bch_keylist_pop(&keylist)))
if (!bkey_cmp(k, &ZERO_KEY))
atomic_dec(&b->c->prio_blocked);
@@ -1401,13 +1449,49 @@ out_nocoalesce:
return 0;
}
+/*
+ * Rewrite btree node @replace into a newly allocated node and insert the
+ * new node's key into the parent @b.
+ *
+ * Returns 0 if nothing was rewritten: either btree_check_reserve() reported
+ * a problem (before or after the allocation) or the replacement node could
+ * not be allocated.  Returns -EINTR once the replacement has been inserted,
+ * because that insert invalidates the caller's iterator over @b.
+ */
+static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
+				 struct btree *replace)
+{
+	struct keylist keys;
+	struct btree *n;
+
+	if (btree_check_reserve(b, NULL))
+		return 0;
+
+	n = btree_node_alloc_replacement(replace, NULL);
+
+	/*
+	 * Allocation can fail; the other callers of
+	 * btree_node_alloc_replacement() guard with IS_ERR_OR_NULL() too.
+	 * There is no node to free or unlock here, so just skip the rewrite.
+	 */
+	if (IS_ERR_OR_NULL(n))
+		return 0;
+
+	/* recheck reserve after allocating replacement node */
+	if (btree_check_reserve(b, NULL)) {
+		btree_node_free(n);
+		rw_unlock(true, n);
+		return 0;
+	}
+
+	bch_btree_node_write_sync(n);
+
+	/* First key: the new node itself. */
+	bch_keylist_init(&keys);
+	bch_keylist_add(&keys, &n->key);
+
+	/* Second key: built by make_btree_freeing_key() for the old node. */
+	make_btree_freeing_key(replace, keys.top);
+	bch_keylist_push(&keys);
+
+	bch_btree_insert_node(b, op, &keys, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(&keys));
+
+	btree_node_free(replace);
+	rw_unlock(true, n);
+
+	/* Invalidated our iterator */
+	return -EINTR;
+}
+
static unsigned btree_gc_count_keys(struct btree *b)
{
struct bkey *k;
struct btree_iter iter;
unsigned ret = 0;
- for_each_key_filter(b, k, &iter, bch_ptr_bad)
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
ret += bkey_u64s(k);
return ret;
@@ -1416,26 +1500,23 @@ static unsigned btree_gc_count_keys(struct btree *b)
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc)
{
- unsigned i;
int ret = 0;
bool should_rewrite;
- struct btree *n;
struct bkey *k;
- struct keylist keys;
struct btree_iter iter;
struct gc_merge_info r[GC_MERGE_NODES];
- struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
+ struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
- bch_keylist_init(&keys);
- bch_btree_iter_init(b, &iter, &b->c->gc_done);
+ bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
- for (i = 0; i < GC_MERGE_NODES; i++)
- r[i].b = ERR_PTR(-EINTR);
+ for (i = r; i < r + ARRAY_SIZE(r); i++)
+ i->b = ERR_PTR(-EINTR);
while (1) {
- k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+ k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
if (k) {
- r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
+ r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
+ true);
if (IS_ERR(r->b)) {
ret = PTR_ERR(r->b);
break;
@@ -1443,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
r->keys = btree_gc_count_keys(r->b);
- ret = btree_gc_coalesce(b, op, &keys, gc, r);
+ ret = btree_gc_coalesce(b, op, gc, r);
if (ret)
break;
}
@@ -1453,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
if (!IS_ERR(last->b)) {
should_rewrite = btree_gc_mark_node(last->b, gc);
- if (should_rewrite &&
- !btree_check_reserve(b, NULL)) {
- n = btree_node_alloc_replacement(last->b,
- false);
-
- if (!IS_ERR_OR_NULL(n)) {
- bch_btree_node_write_sync(n);
- bch_keylist_add(&keys, &n->key);
-
- make_btree_freeing_key(last->b,
- keys.top);
- bch_keylist_push(&keys);
-
- btree_node_free(last->b);
-
- bch_btree_insert_node(b, op, &keys,
- NULL, NULL);
- BUG_ON(!bch_keylist_empty(&keys));
-
- rw_unlock(true, last->b);
- last->b = n;
-
- /* Invalidated our iterator */
- ret = -EINTR;
+ if (should_rewrite) {
+ ret = btree_gc_rewrite_node(b, op, last->b);
+ if (ret)
break;
- }
}
if (last->b->level) {
@@ -1493,8 +1552,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
* Must flush leaf nodes before gc ends, since replace
* operations aren't journalled
*/
+ mutex_lock(&last->b->write_lock);
if (btree_node_dirty(last->b))
bch_btree_node_write(last->b, writes);
+ mutex_unlock(&last->b->write_lock);
rw_unlock(true, last->b);
}
@@ -1507,15 +1568,15 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
}
}
- for (i = 0; i < GC_MERGE_NODES; i++)
- if (!IS_ERR_OR_NULL(r[i].b)) {
- if (btree_node_dirty(r[i].b))
- bch_btree_node_write(r[i].b, writes);
- rw_unlock(true, r[i].b);
+ for (i = r; i < r + ARRAY_SIZE(r); i++)
+ if (!IS_ERR_OR_NULL(i->b)) {
+ mutex_lock(&i->b->write_lock);
+ if (btree_node_dirty(i->b))
+ bch_btree_node_write(i->b, writes);
+ mutex_unlock(&i->b->write_lock);
+ rw_unlock(true, i->b);
}
- bch_keylist_free(&keys);
-
return ret;
}
@@ -1528,10 +1589,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
should_rewrite = btree_gc_mark_node(b, gc);
if (should_rewrite) {
- n = btree_node_alloc_replacement(b, false);
+ n = btree_node_alloc_replacement(b, NULL);
if (!IS_ERR_OR_NULL(n)) {
bch_btree_node_write_sync(n);
+
bch_btree_set_root(n);
btree_node_free(b);
rw_unlock(true, n);
@@ -1540,6 +1602,8 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
}
}
+ __bch_btree_mark_key(b->c, b->level + 1, &b->key);
+
if (b->level) {
ret = btree_gc_recurse(b, op, writes, gc);
if (ret)
@@ -1567,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c)
for_each_cache(ca, c, i)
for_each_bucket(b, ca) {
- b->gc_gen = b->gen;
+ b->last_gc = b->gen;
if (!atomic_read(&b->pin)) {
- SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+ SET_GC_MARK(b, 0);
SET_GC_SECTORS_USED(b, 0);
}
}
@@ -1577,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c)
mutex_unlock(&c->bucket_lock);
}
-size_t bch_btree_gc_finish(struct cache_set *c)
+static size_t bch_btree_gc_finish(struct cache_set *c)
{
size_t available = 0;
struct bucket *b;
@@ -1590,11 +1654,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
c->gc_mark_valid = 1;
c->need_gc = 0;
- if (c->root)
- for (i = 0; i < KEY_PTRS(&c->root->key); i++)
- SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
- GC_MARK_METADATA);
-
for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
GC_MARK_METADATA);
@@ -1634,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c)
SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
for_each_bucket(b, ca) {
- b->last_gc = b->gc_gen;
c->need_gc = max(c->need_gc, bucket_gc_gen(b));
- if (!atomic_read(&b->pin) &&
- GC_MARK(b) == GC_MARK_RECLAIMABLE) {
+ if (atomic_read(&b->pin))
+ continue;
+
+ BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+
+ if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
available++;
- if (!GC_SECTORS_USED(b))
- bch_bucket_add_unused(ca, b);
- }
}
}
@@ -1734,313 +1793,113 @@ int bch_gc_thread_start(struct cache_set *c)
/* Initial partial gc */
-static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
- unsigned long **seen)
+static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
{
int ret = 0;
- unsigned i;
struct bkey *k, *p = NULL;
- struct bucket *g;
struct btree_iter iter;
- for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
- for (i = 0; i < KEY_PTRS(k); i++) {
- if (!ptr_available(b->c, k, i))
- continue;
-
- g = PTR_BUCKET(b->c, k, i);
-
- if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
- seen[PTR_DEV(k, i)]) ||
- !ptr_stale(b->c, k, i)) {
- g->gen = PTR_GEN(k, i);
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
+ bch_initial_mark_key(b->c, b->level, k);
- if (b->level)
- g->prio = BTREE_PRIO;
- else if (g->prio == BTREE_PRIO)
- g->prio = INITIAL_PRIO;
- }
- }
-
- btree_mark_key(b, k);
- }
+ bch_initial_mark_key(b->c, b->level + 1, &b->key);
if (b->level) {
- bch_btree_iter_init(b, &iter, NULL);
+ bch_btree_iter_init(&b->keys, &iter, NULL);
do {
- k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+ k = bch_btree_iter_next_filter(&iter, &b->keys,
+ bch_ptr_bad);
if (k)
btree_node_prefetch(b->c, k, b->level - 1);
if (p)
- ret = btree(check_recurse, p, b, op, seen);
+ ret = btree(check_recurse, p, b, op);
p = k;
} while (p && !ret);
}
- return 0;
+ return ret;
}
int bch_btree_check(struct cache_set *c)
{
- int ret = -ENOMEM;
- unsigned i;
- unsigned long *seen[MAX_CACHES_PER_SET];
struct btree_op op;
- memset(seen, 0, sizeof(seen));
bch_btree_op_init(&op, SHRT_MAX);
- for (i = 0; c->cache[i]; i++) {
- size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
- seen[i] = kmalloc(n, GFP_KERNEL);
- if (!seen[i])
- goto err;
-
- /* Disables the seen array until prio_read() uses it too */
- memset(seen[i], 0xFF, n);
- }
-
- ret = btree_root(check_recurse, c, &op, seen);
-err:
- for (i = 0; i < MAX_CACHES_PER_SET; i++)
- kfree(seen[i]);
- return ret;
-}
-
-/* Btree insertion */
-
-static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
-{
- struct bset *i = b->sets[b->nsets].data;
-
- memmove((uint64_t *) where + bkey_u64s(insert),
- where,
- (void *) end(i) - (void *) where);
-
- i->keys += bkey_u64s(insert);
- bkey_copy(where, insert);
- bch_bset_fix_lookup_table(b, where);
+ return btree_root(check_recurse, c, &op);
}
-static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
- struct btree_iter *iter,
- struct bkey *replace_key)
+void bch_initial_gc_finish(struct cache_set *c)
{
- void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
- {
- if (KEY_DIRTY(k))
- bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
- offset, -sectors);
- }
-
- uint64_t old_offset;
- unsigned old_size, sectors_found = 0;
-
- while (1) {
- struct bkey *k = bch_btree_iter_next(iter);
- if (!k ||
- bkey_cmp(&START_KEY(k), insert) >= 0)
- break;
-
- if (bkey_cmp(k, &START_KEY(insert)) <= 0)
- continue;
-
- old_offset = KEY_START(k);
- old_size = KEY_SIZE(k);
-
- /*
- * We might overlap with 0 size extents; we can't skip these
- * because if they're in the set we're inserting to we have to
- * adjust them so they don't overlap with the key we're
- * inserting. But we don't want to check them for replace
- * operations.
- */
-
- if (replace_key && KEY_SIZE(k)) {
- /*
- * k might have been split since we inserted/found the
- * key we're replacing
- */
- unsigned i;
- uint64_t offset = KEY_START(k) -
- KEY_START(replace_key);
-
- /* But it must be a subset of the replace key */
- if (KEY_START(k) < KEY_START(replace_key) ||
- KEY_OFFSET(k) > KEY_OFFSET(replace_key))
- goto check_failed;
-
- /* We didn't find a key that we were supposed to */
- if (KEY_START(k) > KEY_START(insert) + sectors_found)
- goto check_failed;
-
- if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
- KEY_DIRTY(k) != KEY_DIRTY(replace_key))
- goto check_failed;
-
- /* skip past gen */
- offset <<= 8;
-
- BUG_ON(!KEY_PTRS(replace_key));
-
- for (i = 0; i < KEY_PTRS(replace_key); i++)
- if (k->ptr[i] != replace_key->ptr[i] + offset)
- goto check_failed;
-
- sectors_found = KEY_OFFSET(k) - KEY_START(insert);
- }
+ struct cache *ca;
+ struct bucket *b;
+ unsigned i;
- if (bkey_cmp(insert, k) < 0 &&
- bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
- /*
- * We overlapped in the middle of an existing key: that
- * means we have to split the old key. But we have to do
- * slightly different things depending on whether the
- * old key has been written out yet.
- */
+ bch_btree_gc_finish(c);
- struct bkey *top;
-
- subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
-
- if (bkey_written(b, k)) {
- /*
- * We insert a new key to cover the top of the
- * old key, and the old key is modified in place
- * to represent the bottom split.
- *
- * It's completely arbitrary whether the new key
- * is the top or the bottom, but it has to match
- * up with what btree_sort_fixup() does - it
- * doesn't check for this kind of overlap, it
- * depends on us inserting a new key for the top
- * here.
- */
- top = bch_bset_search(b, &b->sets[b->nsets],
- insert);
- shift_keys(b, top, k);
- } else {
- BKEY_PADDED(key) temp;
- bkey_copy(&temp.key, k);
- shift_keys(b, k, &temp.key);
- top = bkey_next(k);
- }
+ mutex_lock(&c->bucket_lock);
- bch_cut_front(insert, top);
- bch_cut_back(&START_KEY(insert), k);
- bch_bset_fix_invalidated_key(b, k);
- return false;
- }
+ /*
+ * We need to put some unused buckets directly on the prio freelist in
+ * order to get the allocator thread started - it needs freed buckets in
+ * order to rewrite the prios and gens, and it needs to rewrite prios
+ * and gens in order to free buckets.
+ *
+ * This is only safe for buckets that have no live data in them, which
+ * there should always be some of.
+ */
+ for_each_cache(ca, c, i) {
+ for_each_bucket(b, ca) {
+ if (fifo_full(&ca->free[RESERVE_PRIO]))
+ break;
- if (bkey_cmp(insert, k) < 0) {
- bch_cut_front(insert, k);
- } else {
- if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
- old_offset = KEY_START(insert);
-
- if (bkey_written(b, k) &&
- bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
- /*
- * Completely overwrote, so we don't have to
- * invalidate the binary search tree
- */
- bch_cut_front(k, k);
- } else {
- __bch_cut_back(&START_KEY(insert), k);
- bch_bset_fix_invalidated_key(b, k);
+ if (bch_can_invalidate_bucket(ca, b) &&
+ !GC_MARK(b)) {
+ __bch_invalidate_one_bucket(ca, b);
+ fifo_push(&ca->free[RESERVE_PRIO],
+ b - ca->buckets);
}
}
-
- subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
- }
-
-check_failed:
- if (replace_key) {
- if (!sectors_found) {
- return true;
- } else if (sectors_found < KEY_SIZE(insert)) {
- SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
- (KEY_SIZE(insert) - sectors_found));
- SET_KEY_SIZE(insert, sectors_found);
- }
}
- return false;
+ mutex_unlock(&c->bucket_lock);
}
-static bool btree_insert_key(struct btree *b, struct btree_op *op,
- struct bkey *k, struct bkey *replace_key)
+/* Btree insertion */
+
+static bool btree_insert_key(struct btree *b, struct bkey *k,
+ struct bkey *replace_key)
{
- struct bset *i = b->sets[b->nsets].data;
- struct bkey *m, *prev;
- unsigned status = BTREE_INSERT_STATUS_INSERT;
+ unsigned status;
BUG_ON(bkey_cmp(k, &b->key) > 0);
- BUG_ON(b->level && !KEY_PTRS(k));
- BUG_ON(!b->level && !KEY_OFFSET(k));
-
- if (!b->level) {
- struct btree_iter iter;
-
- /*
- * bset_search() returns the first key that is strictly greater
- * than the search key - but for back merging, we want to find
- * the previous key.
- */
- prev = NULL;
- m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
-
- if (fix_overlapping_extents(b, k, &iter, replace_key)) {
- op->insert_collision = true;
- return false;
- }
-
- if (KEY_DIRTY(k))
- bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
- KEY_START(k), KEY_SIZE(k));
- while (m != end(i) &&
- bkey_cmp(k, &START_KEY(m)) > 0)
- prev = m, m = bkey_next(m);
+ status = bch_btree_insert_key(&b->keys, k, replace_key);
+ if (status != BTREE_INSERT_STATUS_NO_INSERT) {
+ bch_check_keys(&b->keys, "%u for %s", status,
+ replace_key ? "replace" : "insert");
- if (key_merging_disabled(b->c))
- goto insert;
-
- /* prev is in the tree, if we merge we're done */
- status = BTREE_INSERT_STATUS_BACK_MERGE;
- if (prev &&
- bch_bkey_try_merge(b, prev, k))
- goto merged;
-
- status = BTREE_INSERT_STATUS_OVERWROTE;
- if (m != end(i) &&
- KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
- goto copy;
-
- status = BTREE_INSERT_STATUS_FRONT_MERGE;
- if (m != end(i) &&
- bch_bkey_try_merge(b, k, m))
- goto copy;
- } else {
- BUG_ON(replace_key);
- m = bch_bset_search(b, &b->sets[b->nsets], k);
- }
-
-insert: shift_keys(b, m, k);
-copy: bkey_copy(m, k);
-merged:
- bch_check_keys(b, "%u for %s", status,
- replace_key ? "replace" : "insert");
+ trace_bcache_btree_insert_key(b, k, replace_key != NULL,
+ status);
+ return true;
+ } else
+ return false;
+}
- if (b->level && !KEY_OFFSET(k))
- btree_current_write(b)->prio_blocked++;
+static size_t insert_u64s_remaining(struct btree *b)
+{
+ long ret = bch_btree_keys_u64s_remaining(&b->keys);
- trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
+ /*
+ * Might land in the middle of an existing extent and have to split it
+ */
+ if (b->keys.ops->is_extents)
+ ret -= KEY_MAX_U64S;
- return true;
+ return max(ret, 0L);
}
static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
@@ -2048,21 +1907,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
struct bkey *replace_key)
{
bool ret = false;
- int oldsize = bch_count_data(b);
+ int oldsize = bch_count_data(&b->keys);
while (!bch_keylist_empty(insert_keys)) {
- struct bset *i = write_block(b);
struct bkey *k = insert_keys->keys;
- if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
- > btree_blocks(b))
+ if (bkey_u64s(k) > insert_u64s_remaining(b))
break;
if (bkey_cmp(k, &b->key) <= 0) {
if (!b->level)
bkey_put(b->c, k);
- ret |= btree_insert_key(b, op, k, replace_key);
+ ret |= btree_insert_key(b, k, replace_key);
bch_keylist_pop_front(insert_keys);
} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
BKEY_PADDED(key) temp;
@@ -2071,16 +1928,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
bch_cut_back(&b->key, &temp.key);
bch_cut_front(&b->key, insert_keys->keys);
- ret |= btree_insert_key(b, op, &temp.key, replace_key);
+ ret |= btree_insert_key(b, &temp.key, replace_key);
break;
} else {
break;
}
}
+ if (!ret)
+ op->insert_collision = true;
+
BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
- BUG_ON(bch_count_data(b) < oldsize);
+ BUG_ON(bch_count_data(&b->keys) < oldsize);
return ret;
}
@@ -2097,31 +1957,38 @@ static int btree_split(struct btree *b, struct btree_op *op,
closure_init_stack(&cl);
bch_keylist_init(&parent_keys);
- if (!b->level &&
- btree_check_reserve(b, op))
- return -EINTR;
+ if (btree_check_reserve(b, op)) {
+ if (!b->level)
+ return -EINTR;
+ else
+ WARN(1, "insufficient reserve for split\n");
+ }
- n1 = btree_node_alloc_replacement(b, true);
+ n1 = btree_node_alloc_replacement(b, op);
if (IS_ERR(n1))
goto err;
- split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
+ split = set_blocks(btree_bset_first(n1),
+ block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
if (split) {
unsigned keys = 0;
- trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
+ trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
- n2 = bch_btree_node_alloc(b->c, b->level, true);
+ n2 = bch_btree_node_alloc(b->c, op, b->level);
if (IS_ERR(n2))
goto err_free1;
if (!b->parent) {
- n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
+ n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
if (IS_ERR(n3))
goto err_free2;
}
+ mutex_lock(&n1->write_lock);
+ mutex_lock(&n2->write_lock);
+
bch_btree_insert_keys(n1, op, insert_keys, replace_key);
/*
@@ -2129,62 +1996,64 @@ static int btree_split(struct btree *b, struct btree_op *op,
* search tree yet
*/
- while (keys < (n1->sets[0].data->keys * 3) / 5)
- keys += bkey_u64s(node(n1->sets[0].data, keys));
+ while (keys < (btree_bset_first(n1)->keys * 3) / 5)
+ keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
+ keys));
- bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
- keys += bkey_u64s(node(n1->sets[0].data, keys));
+ bkey_copy_key(&n1->key,
+ bset_bkey_idx(btree_bset_first(n1), keys));
+ keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
- n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
- n1->sets[0].data->keys = keys;
+ btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
+ btree_bset_first(n1)->keys = keys;
- memcpy(n2->sets[0].data->start,
- end(n1->sets[0].data),
- n2->sets[0].data->keys * sizeof(uint64_t));
+ memcpy(btree_bset_first(n2)->start,
+ bset_bkey_last(btree_bset_first(n1)),
+ btree_bset_first(n2)->keys * sizeof(uint64_t));
bkey_copy_key(&n2->key, &b->key);
bch_keylist_add(&parent_keys, &n2->key);
bch_btree_node_write(n2, &cl);
+ mutex_unlock(&n2->write_lock);
rw_unlock(true, n2);
} else {
- trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
+ trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
+ mutex_lock(&n1->write_lock);
bch_btree_insert_keys(n1, op, insert_keys, replace_key);
}
bch_keylist_add(&parent_keys, &n1->key);
bch_btree_node_write(n1, &cl);
+ mutex_unlock(&n1->write_lock);
if (n3) {
/* Depth increases, make a new root */
+ mutex_lock(&n3->write_lock);
bkey_copy_key(&n3->key, &MAX_KEY);
bch_btree_insert_keys(n3, op, &parent_keys, NULL);
bch_btree_node_write(n3, &cl);
+ mutex_unlock(&n3->write_lock);
closure_sync(&cl);
bch_btree_set_root(n3);
rw_unlock(true, n3);
-
- btree_node_free(b);
} else if (!b->parent) {
/* Root filled up but didn't need to be split */
closure_sync(&cl);
bch_btree_set_root(n1);
-
- btree_node_free(b);
} else {
/* Split a non root node */
closure_sync(&cl);
make_btree_freeing_key(b, parent_keys.top);
bch_keylist_push(&parent_keys);
- btree_node_free(b);
-
bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
BUG_ON(!bch_keylist_empty(&parent_keys));
}
+ btree_node_free(b);
rw_unlock(true, n1);
bch_time_stats_update(&b->c->btree_split_time, start_time);
@@ -2199,7 +2068,7 @@ err_free1:
btree_node_free(n1);
rw_unlock(true, n1);
err:
- WARN(1, "bcache: btree split failed");
+ WARN(1, "bcache: btree split failed (level %u)", b->level);
if (n3 == ERR_PTR(-EAGAIN) ||
n2 == ERR_PTR(-EAGAIN) ||
@@ -2214,31 +2083,54 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
atomic_t *journal_ref,
struct bkey *replace_key)
{
+ struct closure cl;
+
BUG_ON(b->level && replace_key);
- if (should_split(b)) {
- if (current->bio_list) {
- op->lock = b->c->root->level + 1;
- return -EAGAIN;
- } else if (op->lock <= b->c->root->level) {
- op->lock = b->c->root->level + 1;
- return -EINTR;
- } else {
- /* Invalidated all iterators */
- return btree_split(b, op, insert_keys, replace_key) ?:
- -EINTR;
- }
- } else {
- BUG_ON(write_block(b) != b->sets[b->nsets].data);
+ closure_init_stack(&cl);
- if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
- if (!b->level)
- bch_btree_leaf_dirty(b, journal_ref);
- else
- bch_btree_node_write_sync(b);
- }
+ mutex_lock(&b->write_lock);
- return 0;
+ if (write_block(b) != btree_bset_last(b) &&
+ b->keys.last_set_unwritten)
+ bch_btree_init_next(b); /* just wrote a set */
+
+ if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
+ mutex_unlock(&b->write_lock);
+ goto split;
+ }
+
+ BUG_ON(write_block(b) != btree_bset_last(b));
+
+ if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
+ if (!b->level)
+ bch_btree_leaf_dirty(b, journal_ref);
+ else
+ bch_btree_node_write(b, &cl);
+ }
+
+ mutex_unlock(&b->write_lock);
+
+ /* wait for btree node write if necessary, after unlock */
+ closure_sync(&cl);
+
+ return 0;
+split:
+ if (current->bio_list) {
+ op->lock = b->c->root->level + 1;
+ return -EAGAIN;
+ } else if (op->lock <= b->c->root->level) {
+ op->lock = b->c->root->level + 1;
+ return -EINTR;
+ } else {
+ /* Invalidated all iterators */
+ int ret = btree_split(b, op, insert_keys, replace_key);
+
+ if (bch_keylist_empty(insert_keys))
+ return 0;
+ else if (!ret)
+ return -EINTR;
+ return ret;
}
}
@@ -2368,9 +2260,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
struct bkey *k;
struct btree_iter iter;
- bch_btree_iter_init(b, &iter, from);
+ bch_btree_iter_init(&b->keys, &iter, from);
- while ((k = bch_btree_iter_next_filter(&iter, b,
+ while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
bch_ptr_bad))) {
ret = btree(map_nodes_recurse, k, b,
op, from, fn, flags);
@@ -2401,9 +2293,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
struct bkey *k;
struct btree_iter iter;
- bch_btree_iter_init(b, &iter, from);
+ bch_btree_iter_init(&b->keys, &iter, from);
- while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
+ while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
ret = !b->level
? fn(op, b, k)
: btree(map_keys_recurse, k, b, op, from, fn, flags);
@@ -2624,18 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf)
spin_lock_init(&buf->lock);
array_allocator_init(&buf->freelist);
}
-
-void bch_btree_exit(void)
-{
- if (btree_io_wq)
- destroy_workqueue(btree_io_wq);
-}
-
-int __init bch_btree_init(void)
-{
- btree_io_wq = create_singlethread_workqueue("bch_btree_io");
- if (!btree_io_wq)
- return -ENOMEM;
-
- return 0;
-}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 12c99b1a764..91dfa5e6968 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -127,20 +127,13 @@ struct btree {
struct cache_set *c;
struct btree *parent;
+ struct mutex write_lock;
+
unsigned long flags;
uint16_t written; /* would be nice to kill */
uint8_t level;
- uint8_t nsets;
- uint8_t page_order;
-
- /*
- * Set of sorted keys - the real btree node - plus a binary search tree
- *
- * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
- * to the memory we have allocated for this btree node. Additionally,
- * set[0]->data points to the entire btree node as it exists on disk.
- */
- struct bset_tree sets[MAX_BSETS];
+
+ struct btree_keys keys;
/* For outstanding btree writes, used as a lock - protects write_idx */
struct closure io;
@@ -180,44 +173,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b)
return b->writes + (btree_node_write_idx(b) ^ 1);
}
-static inline unsigned bset_offset(struct btree *b, struct bset *i)
-{
- return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
-}
-
static inline struct bset *btree_bset_first(struct btree *b)
{
- return b->sets->data;
+ return b->keys.set->data;
}
-static inline unsigned bset_byte_offset(struct btree *b, struct bset *i)
+static inline struct bset *btree_bset_last(struct btree *b)
{
- return ((size_t) i) - ((size_t) b->sets->data);
-}
-
-static inline unsigned bset_sector_offset(struct btree *b, struct bset *i)
-{
- return (((void *) i) - ((void *) btree_bset_first(b))) >> 9;
+ return bset_tree_last(&b->keys)->data;
}
static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
{
- return bset_sector_offset(b, i) >> b->c->block_bits;
-}
-
-static inline struct bset *write_block(struct btree *b)
-{
- return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
-}
-
-static inline bool bset_written(struct btree *b, struct bset_tree *t)
-{
- return t->data < write_block(b);
-}
-
-static inline bool bkey_written(struct btree *b, struct bkey *k)
-{
- return k < write_block(b)->start;
+ return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
}
static inline void set_gc_sectors(struct cache_set *c)
@@ -225,21 +193,6 @@ static inline void set_gc_sectors(struct cache_set *c)
atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
}
-static inline struct bkey *bch_btree_iter_init(struct btree *b,
- struct btree_iter *iter,
- struct bkey *search)
-{
- return __bch_btree_iter_init(b, iter, search, b->sets);
-}
-
-static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
-{
- if (b->level)
- return bch_btree_ptr_invalid(b->c, k);
- else
- return bch_extent_ptr_invalid(b->c, k);
-}
-
void bkey_put(struct cache_set *c, struct bkey *k);
/* Looping macros */
@@ -250,14 +203,6 @@ void bkey_put(struct cache_set *c, struct bkey *k);
iter++) \
hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
-#define for_each_key_filter(b, k, iter, filter) \
- for (bch_btree_iter_init((b), (iter), NULL); \
- ((k) = bch_btree_iter_next_filter((iter), b, filter));)
-
-#define for_each_key(b, k, iter) \
- for (bch_btree_iter_init((b), (iter), NULL); \
- ((k) = bch_btree_iter_next(iter));)
-
/* Recursing down the btree */
struct btree_op {
@@ -292,12 +237,14 @@ static inline void rw_unlock(bool w, struct btree *b)
(w ? up_write : up_read)(&b->lock);
}
-void bch_btree_node_read(struct btree *);
+void bch_btree_node_read_done(struct btree *);
+void __bch_btree_node_write(struct btree *, struct closure *);
void bch_btree_node_write(struct btree *, struct closure *);
void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, int, bool);
-struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
+struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
+ struct bkey *, int, bool);
int bch_btree_insert_check_key(struct btree *, struct btree_op *,
struct bkey *);
@@ -305,10 +252,10 @@ int bch_btree_insert(struct cache_set *, struct keylist *,
atomic_t *, struct bkey *);
int bch_gc_thread_start(struct cache_set *);
-size_t bch_btree_gc_finish(struct cache_set *);
+void bch_initial_gc_finish(struct cache_set *);
void bch_moving_gc(struct cache_set *);
int bch_btree_check(struct cache_set *);
-uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
+void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
static inline void wake_up_gc(struct cache_set *c)
{
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 7ef7461912b..a08e3eeac3c 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -243,7 +243,7 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
cl->fn = fn;
cl->wq = wq;
/* between atomic_dec() in closure_put() */
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
}
static inline void closure_queue(struct closure *cl)
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 473e8d5a7fe..8b1f1d5c181 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -8,6 +8,7 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
+#include "extents.h"
#include <linux/console.h>
#include <linux/debugfs.h>
@@ -17,147 +18,82 @@
static struct dentry *debug;
-const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
-{
- unsigned i;
-
- for (i = 0; i < KEY_PTRS(k); i++)
- if (ptr_available(c, k, i)) {
- struct cache *ca = PTR_CACHE(c, k, i);
- size_t bucket = PTR_BUCKET_NR(c, k, i);
- size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
-
- if (KEY_SIZE(k) + r > c->sb.bucket_size)
- return "bad, length too big";
- if (bucket < ca->sb.first_bucket)
- return "bad, short offset";
- if (bucket >= ca->sb.nbuckets)
- return "bad, offset past end of device";
- if (ptr_stale(c, k, i))
- return "stale";
- }
-
- if (!bkey_cmp(k, &ZERO_KEY))
- return "bad, null key";
- if (!KEY_PTRS(k))
- return "bad, no pointers";
- if (!KEY_SIZE(k))
- return "zeroed key";
- return "";
-}
-
-int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
-{
- unsigned i = 0;
- char *out = buf, *end = buf + size;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
-
- if (KEY_PTRS(k))
- while (1) {
- p("%llu:%llu gen %llu",
- PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
-
- if (++i == KEY_PTRS(k))
- break;
-
- p(", ");
- }
-
- p("]");
-
- if (KEY_DIRTY(k))
- p(" dirty");
- if (KEY_CSUM(k))
- p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
-#undef p
- return out - buf;
-}
-
#ifdef CONFIG_BCACHE_DEBUG
-static void dump_bset(struct btree *b, struct bset *i)
-{
- struct bkey *k, *next;
- unsigned j;
- char buf[80];
-
- for (k = i->start; k < end(i); k = next) {
- next = bkey_next(k);
-
- bch_bkey_to_text(buf, sizeof(buf), k);
- printk(KERN_ERR "block %u key %zi/%u: %s", bset_block_offset(b, i),
- (uint64_t *) k - i->d, i->keys, buf);
-
- for (j = 0; j < KEY_PTRS(k); j++) {
- size_t n = PTR_BUCKET_NR(b->c, k, j);
- printk(" bucket %zu", n);
-
- if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
- printk(" prio %i",
- PTR_BUCKET(b->c, k, j)->prio);
- }
-
- printk(" %s\n", bch_ptr_status(b->c, k));
-
- if (next < end(i) &&
- bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
- printk(KERN_ERR "Key skipped backwards\n");
- }
-}
-
-static void bch_dump_bucket(struct btree *b)
-{
- unsigned i;
-
- console_lock();
- for (i = 0; i <= b->nsets; i++)
- dump_bset(b, b->sets[i].data);
- console_unlock();
-}
+#define for_each_written_bset(b, start, i) \
+ for (i = (start); \
+ (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
+ i->seq == (start)->seq; \
+ i = (void *) i + set_blocks(i, block_bytes(b->c)) * \
+ block_bytes(b->c))
-void bch_btree_verify(struct btree *b, struct bset *new)
+void bch_btree_verify(struct btree *b)
{
struct btree *v = b->c->verify_data;
- struct closure cl;
- closure_init_stack(&cl);
+ struct bset *ondisk, *sorted, *inmemory;
+ struct bio *bio;
- if (!b->c->verify)
+ if (!b->c->verify || !b->c->verify_ondisk)
return;
down(&b->io_mutex);
mutex_lock(&b->c->verify_lock);
+ ondisk = b->c->verify_ondisk;
+ sorted = b->c->verify_data->keys.set->data;
+ inmemory = b->keys.set->data;
+
bkey_copy(&v->key, &b->key);
v->written = 0;
v->level = b->level;
+ v->keys.ops = b->keys.ops;
+
+ bio = bch_bbio_alloc(b->c);
+ bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev;
+ bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
+ bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9;
+ bch_bio_map(bio, sorted);
- bch_btree_node_read(v);
+ submit_bio_wait(REQ_META|READ_SYNC, bio);
+ bch_bbio_free(bio, b->c);
- if (new->keys != v->sets[0].data->keys ||
- memcmp(new->start,
- v->sets[0].data->start,
- (void *) end(new) - (void *) new->start)) {
- unsigned i, j;
+ memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
+
+ bch_btree_node_read_done(v);
+ sorted = v->keys.set->data;
+
+ if (inmemory->keys != sorted->keys ||
+ memcmp(inmemory->start,
+ sorted->start,
+ (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
+ struct bset *i;
+ unsigned j;
console_lock();
- printk(KERN_ERR "*** original memory node:\n");
- for (i = 0; i <= b->nsets; i++)
- dump_bset(b, b->sets[i].data);
+ printk(KERN_ERR "*** in memory:\n");
+ bch_dump_bset(&b->keys, inmemory, 0);
- printk(KERN_ERR "*** sorted memory node:\n");
- dump_bset(b, new);
+ printk(KERN_ERR "*** read back in:\n");
+ bch_dump_bset(&v->keys, sorted, 0);
- printk(KERN_ERR "*** on disk node:\n");
- dump_bset(v, v->sets[0].data);
+ for_each_written_bset(b, ondisk, i) {
+ unsigned block = ((void *) i - (void *) ondisk) /
+ block_bytes(b->c);
+
+ printk(KERN_ERR "*** on disk block %u:\n", block);
+ bch_dump_bset(&b->keys, i, block);
+ }
- for (j = 0; j < new->keys; j++)
- if (new->d[j] != v->sets[0].data->d[j])
+ printk(KERN_ERR "*** block %zu not written\n",
+ ((void *) i - (void *) ondisk) / block_bytes(b->c));
+
+ for (j = 0; j < inmemory->keys; j++)
+ if (inmemory->d[j] != sorted->d[j])
break;
+ printk(KERN_ERR "b->written %u\n", b->written);
+
console_unlock();
panic("verify failed at %u\n", j);
}
@@ -204,74 +140,6 @@ out_put:
bio_put(check);
}
-int __bch_count_data(struct btree *b)
-{
- unsigned ret = 0;
- struct btree_iter iter;
- struct bkey *k;
-
- if (!b->level)
- for_each_key(b, k, &iter)
- ret += KEY_SIZE(k);
- return ret;
-}
-
-void __bch_check_keys(struct btree *b, const char *fmt, ...)
-{
- va_list args;
- struct bkey *k, *p = NULL;
- struct btree_iter iter;
- const char *err;
-
- for_each_key(b, k, &iter) {
- if (!b->level) {
- err = "Keys out of order";
- if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
- goto bug;
-
- if (bch_ptr_invalid(b, k))
- continue;
-
- err = "Overlapping keys";
- if (p && bkey_cmp(p, &START_KEY(k)) > 0)
- goto bug;
- } else {
- if (bch_ptr_bad(b, k))
- continue;
-
- err = "Duplicate keys";
- if (p && !bkey_cmp(p, k))
- goto bug;
- }
- p = k;
- }
-
- err = "Key larger than btree node key";
- if (p && bkey_cmp(p, &b->key) > 0)
- goto bug;
-
- return;
-bug:
- bch_dump_bucket(b);
-
- va_start(args, fmt);
- vprintk(fmt, args);
- va_end(args);
-
- panic("bcache error: %s:\n", err);
-}
-
-void bch_btree_iter_next_check(struct btree_iter *iter)
-{
- struct bkey *k = iter->data->k, *next = bkey_next(k);
-
- if (next < iter->data->end &&
- bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
- bch_dump_bucket(iter->b);
- panic("Key skipped backwards\n");
- }
-}
-
#endif
#ifdef CONFIG_DEBUG_FS
@@ -318,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
if (!w)
break;
- bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
+ bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
bch_keybuf_del(&i->keys, w);
}
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 2ede60e3187..1f63c195d24 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -1,47 +1,30 @@
#ifndef _BCACHE_DEBUG_H
#define _BCACHE_DEBUG_H
-/* Btree/bkey debug printing */
-
-int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
+struct bio;
+struct cached_dev;
+struct cache_set;
#ifdef CONFIG_BCACHE_DEBUG
-void bch_btree_verify(struct btree *, struct bset *);
+void bch_btree_verify(struct btree *);
void bch_data_verify(struct cached_dev *, struct bio *);
-int __bch_count_data(struct btree *);
-void __bch_check_keys(struct btree *, const char *, ...);
-void bch_btree_iter_next_check(struct btree_iter *);
-#define EBUG_ON(cond) BUG_ON(cond)
#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
#define key_merging_disabled(c) ((c)->key_merging_disabled)
#define bypass_torture_test(d) ((d)->bypass_torture_test)
#else /* DEBUG */
-static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
+static inline void bch_btree_verify(struct btree *b) {}
static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
-static inline int __bch_count_data(struct btree *b) { return -1; }
-static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
-static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
-#define EBUG_ON(cond) do { if (cond); } while (0)
#define expensive_debug_checks(c) 0
#define key_merging_disabled(c) 0
#define bypass_torture_test(d) 0
#endif
-#define bch_count_data(b) \
- (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
-
-#define bch_check_keys(b, ...) \
-do { \
- if (expensive_debug_checks((b)->c)) \
- __bch_check_keys(b, __VA_ARGS__); \
-} while (0)
-
#ifdef CONFIG_DEBUG_FS
void bch_debug_init_cache_set(struct cache_set *);
#else
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
new file mode 100644
index 00000000000..3a0de4cf977
--- /dev/null
+++ b/drivers/md/bcache/extents.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+#include "writeback.h"
+
+static void sort_key_next(struct btree_iter *iter,
+ struct btree_iter_set *i)
+{
+ i->k = bkey_next(i->k);
+
+ if (i->k == i->end)
+ *i = iter->data[--iter->used];
+}
+
+static bool bch_key_sort_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
+{
+ int64_t c = bkey_cmp(l.k, r.k);
+
+ return c ? c > 0 : l.k < r.k;
+}
+
+static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+ unsigned i;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i)) {
+ struct cache *ca = PTR_CACHE(c, k, i);
+ size_t bucket = PTR_BUCKET_NR(c, k, i);
+ size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+ if (KEY_SIZE(k) + r > c->sb.bucket_size ||
+ bucket < ca->sb.first_bucket ||
+ bucket >= ca->sb.nbuckets)
+ return true;
+ }
+
+ return false;
+}
+
+/* Common among btree and extent ptrs */
+
+static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
+{
+ unsigned i;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i)) {
+ struct cache *ca = PTR_CACHE(c, k, i);
+ size_t bucket = PTR_BUCKET_NR(c, k, i);
+ size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+ if (KEY_SIZE(k) + r > c->sb.bucket_size)
+ return "bad, length too big";
+ if (bucket < ca->sb.first_bucket)
+ return "bad, short offset";
+ if (bucket >= ca->sb.nbuckets)
+ return "bad, offset past end of device";
+ if (ptr_stale(c, k, i))
+ return "stale";
+ }
+
+ if (!bkey_cmp(k, &ZERO_KEY))
+ return "bad, null key";
+ if (!KEY_PTRS(k))
+ return "bad, no pointers";
+ if (!KEY_SIZE(k))
+ return "zeroed key";
+ return "";
+}
+
+void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
+{
+ unsigned i = 0;
+ char *out = buf, *end = buf + size;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
+
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ if (i)
+ p(", ");
+
+ if (PTR_DEV(k, i) == PTR_CHECK_DEV)
+ p("check dev");
+ else
+ p("%llu:%llu gen %llu", PTR_DEV(k, i),
+ PTR_OFFSET(k, i), PTR_GEN(k, i));
+ }
+
+ p("]");
+
+ if (KEY_DIRTY(k))
+ p(" dirty");
+ if (KEY_CSUM(k))
+ p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
+#undef p
+}
+
+static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
+{
+ struct btree *b = container_of(keys, struct btree, keys);
+ unsigned j;
+ char buf[80];
+
+ bch_extent_to_text(buf, sizeof(buf), k);
+ printk(" %s", buf);
+
+ for (j = 0; j < KEY_PTRS(k); j++) {
+ size_t n = PTR_BUCKET_NR(b->c, k, j);
+ printk(" bucket %zu", n);
+
+ if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
+ printk(" prio %i",
+ PTR_BUCKET(b->c, k, j)->prio);
+ }
+
+ printk(" %s\n", bch_ptr_status(b->c, k));
+}
+
+/* Btree ptrs */
+
+bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+ char buf[80];
+
+ if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
+ goto bad;
+
+ if (__ptr_invalid(c, k))
+ goto bad;
+
+ return false;
+bad:
+ bch_extent_to_text(buf, sizeof(buf), k);
+ cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
+ return true;
+}
+
+static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ return __bch_btree_ptr_invalid(b->c, k);
+}
+
+static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
+{
+ unsigned i;
+ char buf[80];
+ struct bucket *g;
+
+ if (mutex_trylock(&b->c->bucket_lock)) {
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(b->c, k, i)) {
+ g = PTR_BUCKET(b->c, k, i);
+
+ if (KEY_DIRTY(k) ||
+ g->prio != BTREE_PRIO ||
+ (b->c->gc_mark_valid &&
+ GC_MARK(g) != GC_MARK_METADATA))
+ goto err;
+ }
+
+ mutex_unlock(&b->c->bucket_lock);
+ }
+
+ return false;
+err:
+ mutex_unlock(&b->c->bucket_lock);
+ bch_extent_to_text(buf, sizeof(buf), k);
+ btree_bug(b,
+"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
+ buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+ g->prio, g->gen, g->last_gc, GC_MARK(g));
+ return true;
+}
+
+static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ unsigned i;
+
+ if (!bkey_cmp(k, &ZERO_KEY) ||
+ !KEY_PTRS(k) ||
+ bch_ptr_invalid(bk, k))
+ return true;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (!ptr_available(b->c, k, i) ||
+ ptr_stale(b->c, k, i))
+ return true;
+
+ if (expensive_debug_checks(b->c) &&
+ btree_ptr_bad_expensive(b, k))
+ return true;
+
+ return false;
+}
+
+static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
+ struct bkey *insert,
+ struct btree_iter *iter,
+ struct bkey *replace_key)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+
+ if (!KEY_OFFSET(insert))
+ btree_current_write(b)->prio_blocked++;
+
+ return false;
+}
+
+const struct btree_keys_ops bch_btree_keys_ops = {
+ .sort_cmp = bch_key_sort_cmp,
+ .insert_fixup = bch_btree_ptr_insert_fixup,
+ .key_invalid = bch_btree_ptr_invalid,
+ .key_bad = bch_btree_ptr_bad,
+ .key_to_text = bch_extent_to_text,
+ .key_dump = bch_bkey_dump,
+};
+
+/* Extents */
+
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for btree_sort_fixup() - if there are multiple keys that compare
+ * equal in different sets, we have to process them newest to oldest.
+ */
+static bool bch_extent_sort_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
+{
+ int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
+
+ return c ? c > 0 : l.k < r.k;
+}
+
+static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
+ struct bkey *tmp)
+{
+ while (iter->used > 1) {
+ struct btree_iter_set *top = iter->data, *i = top + 1;
+
+ if (iter->used > 2 &&
+ bch_extent_sort_cmp(i[0], i[1]))
+ i++;
+
+ if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
+ break;
+
+ if (!KEY_SIZE(i->k)) {
+ sort_key_next(iter, i);
+ heap_sift(iter, i - top, bch_extent_sort_cmp);
+ continue;
+ }
+
+ if (top->k > i->k) {
+ if (bkey_cmp(top->k, i->k) >= 0)
+ sort_key_next(iter, i);
+ else
+ bch_cut_front(top->k, i->k);
+
+ heap_sift(iter, i - top, bch_extent_sort_cmp);
+ } else {
+ /* can't happen because of comparison func */
+ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
+
+ if (bkey_cmp(i->k, top->k) < 0) {
+ bkey_copy(tmp, top->k);
+
+ bch_cut_back(&START_KEY(i->k), tmp);
+ bch_cut_front(i->k, top->k);
+ heap_sift(iter, 0, bch_extent_sort_cmp);
+
+ return tmp;
+ } else {
+ bch_cut_back(&START_KEY(i->k), top->k);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void bch_subtract_dirty(struct bkey *k,
+ struct cache_set *c,
+ uint64_t offset,
+ int sectors)
+{
+ if (KEY_DIRTY(k))
+ bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
+ offset, -sectors);
+}
+
+static bool bch_extent_insert_fixup(struct btree_keys *b,
+ struct bkey *insert,
+ struct btree_iter *iter,
+ struct bkey *replace_key)
+{
+ struct cache_set *c = container_of(b, struct btree, keys)->c;
+
+ uint64_t old_offset;
+ unsigned old_size, sectors_found = 0;
+
+ BUG_ON(!KEY_OFFSET(insert));
+ BUG_ON(!KEY_SIZE(insert));
+
+ while (1) {
+ struct bkey *k = bch_btree_iter_next(iter);
+ if (!k)
+ break;
+
+ if (bkey_cmp(&START_KEY(k), insert) >= 0) {
+ if (KEY_SIZE(k))
+ break;
+ else
+ continue;
+ }
+
+ if (bkey_cmp(k, &START_KEY(insert)) <= 0)
+ continue;
+
+ old_offset = KEY_START(k);
+ old_size = KEY_SIZE(k);
+
+ /*
+ * We might overlap with 0 size extents; we can't skip these
+ * because if they're in the set we're inserting to we have to
+ * adjust them so they don't overlap with the key we're
+ * inserting. But we don't want to check them for replace
+ * operations.
+ */
+
+ if (replace_key && KEY_SIZE(k)) {
+ /*
+ * k might have been split since we inserted/found the
+ * key we're replacing
+ */
+ unsigned i;
+ uint64_t offset = KEY_START(k) -
+ KEY_START(replace_key);
+
+ /* But it must be a subset of the replace key */
+ if (KEY_START(k) < KEY_START(replace_key) ||
+ KEY_OFFSET(k) > KEY_OFFSET(replace_key))
+ goto check_failed;
+
+ /* We didn't find a key that we were supposed to */
+ if (KEY_START(k) > KEY_START(insert) + sectors_found)
+ goto check_failed;
+
+ if (!bch_bkey_equal_header(k, replace_key))
+ goto check_failed;
+
+ /* skip past gen */
+ offset <<= 8;
+
+ BUG_ON(!KEY_PTRS(replace_key));
+
+ for (i = 0; i < KEY_PTRS(replace_key); i++)
+ if (k->ptr[i] != replace_key->ptr[i] + offset)
+ goto check_failed;
+
+ sectors_found = KEY_OFFSET(k) - KEY_START(insert);
+ }
+
+ if (bkey_cmp(insert, k) < 0 &&
+ bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
+ /*
+ * We overlapped in the middle of an existing key: that
+ * means we have to split the old key. But we have to do
+ * slightly different things depending on whether the
+ * old key has been written out yet.
+ */
+
+ struct bkey *top;
+
+ bch_subtract_dirty(k, c, KEY_START(insert),
+ KEY_SIZE(insert));
+
+ if (bkey_written(b, k)) {
+ /*
+ * We insert a new key to cover the top of the
+ * old key, and the old key is modified in place
+ * to represent the bottom split.
+ *
+ * It's completely arbitrary whether the new key
+ * is the top or the bottom, but it has to match
+ * up with what btree_sort_fixup() does - it
+ * doesn't check for this kind of overlap, it
+ * depends on us inserting a new key for the top
+ * here.
+ */
+ top = bch_bset_search(b, bset_tree_last(b),
+ insert);
+ bch_bset_insert(b, top, k);
+ } else {
+ BKEY_PADDED(key) temp;
+ bkey_copy(&temp.key, k);
+ bch_bset_insert(b, k, &temp.key);
+ top = bkey_next(k);
+ }
+
+ bch_cut_front(insert, top);
+ bch_cut_back(&START_KEY(insert), k);
+ bch_bset_fix_invalidated_key(b, k);
+ goto out;
+ }
+
+ if (bkey_cmp(insert, k) < 0) {
+ bch_cut_front(insert, k);
+ } else {
+ if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
+ old_offset = KEY_START(insert);
+
+ if (bkey_written(b, k) &&
+ bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
+ /*
+ * Completely overwrote, so we don't have to
+ * invalidate the binary search tree
+ */
+ bch_cut_front(k, k);
+ } else {
+ __bch_cut_back(&START_KEY(insert), k);
+ bch_bset_fix_invalidated_key(b, k);
+ }
+ }
+
+ bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
+ }
+
+check_failed:
+ if (replace_key) {
+ if (!sectors_found) {
+ return true;
+ } else if (sectors_found < KEY_SIZE(insert)) {
+ SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
+ (KEY_SIZE(insert) - sectors_found));
+ SET_KEY_SIZE(insert, sectors_found);
+ }
+ }
+out:
+ if (KEY_DIRTY(insert))
+ bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
+ KEY_START(insert),
+ KEY_SIZE(insert));
+
+ return false;
+}
+
+static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ char buf[80];
+
+ if (!KEY_SIZE(k))
+ return true;
+
+ if (KEY_SIZE(k) > KEY_OFFSET(k))
+ goto bad;
+
+ if (__ptr_invalid(b->c, k))
+ goto bad;
+
+ return false;
+bad:
+ bch_extent_to_text(buf, sizeof(buf), k);
+ cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
+ return true;
+}
+
+static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
+ unsigned ptr)
+{
+ struct bucket *g = PTR_BUCKET(b->c, k, ptr);
+ char buf[80];
+
+ if (mutex_trylock(&b->c->bucket_lock)) {
+ if (b->c->gc_mark_valid &&
+ (!GC_MARK(g) ||
+ GC_MARK(g) == GC_MARK_METADATA ||
+ (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
+ goto err;
+
+ if (g->prio == BTREE_PRIO)
+ goto err;
+
+ mutex_unlock(&b->c->bucket_lock);
+ }
+
+ return false;
+err:
+ mutex_unlock(&b->c->bucket_lock);
+ bch_extent_to_text(buf, sizeof(buf), k);
+ btree_bug(b,
+"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
+ buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
+ g->prio, g->gen, g->last_gc, GC_MARK(g));
+ return true;
+}
+
+static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ struct bucket *g;
+ unsigned i, stale;
+
+ if (!KEY_PTRS(k) ||
+ bch_extent_invalid(bk, k))
+ return true;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (!ptr_available(b->c, k, i))
+ return true;
+
+ if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
+ return false;
+
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ g = PTR_BUCKET(b->c, k, i);
+ stale = ptr_stale(b->c, k, i);
+
+ btree_bug_on(stale > 96, b,
+ "key too stale: %i, need_gc %u",
+ stale, b->c->need_gc);
+
+ btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
+ b, "stale dirty pointer");
+
+ if (stale)
+ return true;
+
+ if (expensive_debug_checks(b->c) &&
+ bch_extent_bad_expensive(b, k, i))
+ return true;
+ }
+
+ return false;
+}
+
+static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
+{
+ return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
+ ~((uint64_t)1 << 63);
+}
+
+static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ unsigned i;
+
+ if (key_merging_disabled(b->c))
+ return false;
+
+ for (i = 0; i < KEY_PTRS(l); i++)
+ if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+ PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
+ return false;
+
+ /* Keys with no pointers aren't restricted to one bucket and could
+ * overflow KEY_SIZE
+ */
+ if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
+ SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
+ SET_KEY_SIZE(l, USHRT_MAX);
+
+ bch_cut_front(l, r);
+ return false;
+ }
+
+ if (KEY_CSUM(l)) {
+ if (KEY_CSUM(r))
+ l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
+ else
+ SET_KEY_CSUM(l, 0);
+ }
+
+ SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
+ SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
+
+ return true;
+}
+
+const struct btree_keys_ops bch_extent_keys_ops = {
+ .sort_cmp = bch_extent_sort_cmp,
+ .sort_fixup = bch_extent_sort_fixup,
+ .insert_fixup = bch_extent_insert_fixup,
+ .key_invalid = bch_extent_invalid,
+ .key_bad = bch_extent_bad,
+ .key_merge = bch_extent_merge,
+ .key_to_text = bch_extent_to_text,
+ .key_dump = bch_bkey_dump,
+ .is_extents = true,
+};
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
new file mode 100644
index 00000000000..e4e23409782
--- /dev/null
+++ b/drivers/md/bcache/extents.h
@@ -0,0 +1,13 @@
+#ifndef _BCACHE_EXTENTS_H
+#define _BCACHE_EXTENTS_H
+
+extern const struct btree_keys_ops bch_btree_keys_ops;
+extern const struct btree_keys_ops bch_extent_keys_ops;
+
+struct bkey;
+struct cache_set;
+
+void bch_extent_to_text(char *, size_t, const struct bkey *);
+bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+
+#endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 9d32d579082..59e82021b5b 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -95,7 +95,7 @@ reread: left = ca->sb.bucket_size - offset;
return ret;
}
- blocks = set_blocks(j, ca->set);
+ blocks = set_blocks(j, block_bytes(ca->set));
while (!list_empty(list)) {
i = list_first_entry(list,
@@ -237,8 +237,14 @@ bsearch:
for (i = 0; i < ca->sb.njournal_buckets; i++)
if (ja->seq[i] > seq) {
seq = ja->seq[i];
- ja->cur_idx = ja->discard_idx =
- ja->last_idx = i;
+ /*
+ * When journal_reclaim() goes to allocate for
+ * the first time, it'll use the bucket after
+ * ja->cur_idx
+ */
+ ja->cur_idx = i;
+ ja->last_idx = ja->discard_idx = (i + 1) %
+ ca->sb.njournal_buckets;
}
}
@@ -284,20 +290,15 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
}
for (k = i->j.start;
- k < end(&i->j);
+ k < bset_bkey_last(&i->j);
k = bkey_next(k)) {
unsigned j;
- for (j = 0; j < KEY_PTRS(k); j++) {
- struct bucket *g = PTR_BUCKET(c, k, j);
- atomic_inc(&g->pin);
+ for (j = 0; j < KEY_PTRS(k); j++)
+ if (ptr_available(c, k, j))
+ atomic_inc(&PTR_BUCKET(c, k, j)->pin);
- if (g->prio == BTREE_PRIO &&
- !ptr_stale(c, k, j))
- g->prio = INITIAL_PRIO;
- }
-
- __bch_btree_mark_key(c, 0, k);
+ bch_initial_mark_key(c, 0, k);
}
}
}
@@ -312,8 +313,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
struct keylist keylist;
- bch_keylist_init(&keylist);
-
list_for_each_entry(i, list, list) {
BUG_ON(i->pin && atomic_read(i->pin) != 1);
@@ -322,12 +321,11 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
n, i->j.seq - 1, start, end);
for (k = i->j.start;
- k < end(&i->j);
+ k < bset_bkey_last(&i->j);
k = bkey_next(k)) {
trace_bcache_journal_replay_key(k);
- bkey_copy(keylist.top, k);
- bch_keylist_push(&keylist);
+ bch_keylist_init_single(&keylist, k);
ret = bch_btree_insert(s, &keylist, i->pin, NULL);
if (ret)
@@ -383,16 +381,15 @@ retry:
b = best;
if (b) {
- rw_lock(true, b, b->level);
-
+ mutex_lock(&b->write_lock);
if (!btree_current_write(b)->journal) {
- rw_unlock(true, b);
+ mutex_unlock(&b->write_lock);
/* We raced */
goto retry;
}
- bch_btree_node_write(b, NULL);
- rw_unlock(true, b);
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
}
}
@@ -536,6 +533,7 @@ void bch_journal_next(struct journal *j)
atomic_set(&fifo_back(&j->pin), 1);
j->cur->data->seq = ++j->seq;
+ j->cur->dirty = false;
j->cur->need_write = false;
j->cur->data->keys = 0;
@@ -579,7 +577,8 @@ static void journal_write_unlocked(struct closure *cl)
struct cache *ca;
struct journal_write *w = c->journal.cur;
struct bkey *k = &c->journal.key;
- unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
+ unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
+ c->sb.block_size;
struct bio *bio;
struct bio_list list;
@@ -595,7 +594,7 @@ static void journal_write_unlocked(struct closure *cl)
continue_at(cl, journal_write, system_wq);
}
- c->journal.blocks_free -= set_blocks(w->data, c);
+ c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
w->data->btree_level = c->root->level;
@@ -685,7 +684,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
struct journal_write *w = c->journal.cur;
sectors = __set_blocks(w->data, w->data->keys + nkeys,
- c) * c->sb.block_size;
+ block_bytes(c)) * c->sb.block_size;
if (sectors <= min_t(size_t,
c->journal.blocks_free * c->sb.block_size,
@@ -730,7 +729,10 @@ static void journal_write_work(struct work_struct *work)
struct cache_set,
journal.work);
spin_lock(&c->journal.lock);
- journal_try_write(c);
+ if (c->journal.cur->dirty)
+ journal_try_write(c);
+ else
+ spin_unlock(&c->journal.lock);
}
/*
@@ -751,7 +753,7 @@ atomic_t *bch_journal(struct cache_set *c,
w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
- memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys));
+ memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
w->data->keys += bch_keylist_nkeys(keys);
ret = &fifo_back(&c->journal.pin);
@@ -760,7 +762,8 @@ atomic_t *bch_journal(struct cache_set *c,
if (parent) {
closure_wait(&w->wait, parent);
journal_try_write(c);
- } else if (!w->need_write) {
+ } else if (!w->dirty) {
+ w->dirty = true;
schedule_delayed_work(&c->journal.work,
msecs_to_jiffies(c->journal_delay_ms));
spin_unlock(&c->journal.lock);
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 9180c446507..e3c39457afb 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -95,6 +95,7 @@ struct journal_write {
struct cache_set *c;
struct closure_waitlist wait;
+ bool dirty;
bool need_write;
};
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 9eb60d102de..cd7490311e5 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -24,12 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
moving_gc_keys);
unsigned i;
- for (i = 0; i < KEY_PTRS(k); i++) {
- struct bucket *g = PTR_BUCKET(c, k, i);
-
- if (GC_MOVE(g))
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i) &&
+ GC_MOVE(PTR_BUCKET(c, k, i)))
return true;
- }
return false;
}
@@ -115,7 +113,7 @@ static void write_moving(struct closure *cl)
closure_call(&op->cl, bch_data_insert, NULL, cl);
}
- continue_at(cl, write_moving_finish, system_wq);
+ continue_at(cl, write_moving_finish, op->wq);
}
static void read_moving_submit(struct closure *cl)
@@ -125,7 +123,7 @@ static void read_moving_submit(struct closure *cl)
bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
- continue_at(cl, write_moving, system_wq);
+ continue_at(cl, write_moving, io->op.wq);
}
static void read_moving(struct cache_set *c)
@@ -160,6 +158,7 @@ static void read_moving(struct cache_set *c)
io->w = w;
io->op.inode = KEY_INODE(&w->key);
io->op.c = c;
+ io->op.wq = c->moving_gc_wq;
moving_init(io);
bio = &io->bio.bio;
@@ -216,7 +215,10 @@ void bch_moving_gc(struct cache_set *c)
ca->heap.used = 0;
for_each_bucket(b, ca) {
- if (!GC_SECTORS_USED(b))
+ if (GC_MARK(b) == GC_MARK_METADATA ||
+ !GC_SECTORS_USED(b) ||
+ GC_SECTORS_USED(b) == ca->sb.bucket_size ||
+ atomic_read(&b->pin))
continue;
if (!heap_full(&ca->heap)) {
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index cce02f19e6c..15fff4f68a7 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -12,11 +12,9 @@
#include "request.h"
#include "writeback.h"
-#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/hash.h>
#include <linux/random.h>
-#include "blk-cgroup.h"
#include <trace/events/bcache.h>
@@ -27,172 +25,13 @@ struct kmem_cache *bch_search_cache;
static void bch_data_insert_start(struct closure *);
-/* Cgroup interface */
-
-#ifdef CONFIG_CGROUP_BCACHE
-static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
-
-static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
-{
- struct cgroup_subsys_state *css;
- return cgroup &&
- (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
- ? container_of(css, struct bch_cgroup, css)
- : &bcache_default_cgroup;
-}
-
-struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
-{
- struct cgroup_subsys_state *css = bio->bi_css
- ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
- : task_subsys_state(current, bcache_subsys_id);
-
- return css
- ? container_of(css, struct bch_cgroup, css)
- : &bcache_default_cgroup;
-}
-
-static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes, loff_t *ppos)
-{
- char tmp[1024];
- int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
- cgroup_to_bcache(cgrp)->cache_mode + 1);
-
- if (len < 0)
- return len;
-
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
-}
-
-static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
- const char *buf)
-{
- int v = bch_read_string_list(buf, bch_cache_modes);
- if (v < 0)
- return v;
-
- cgroup_to_bcache(cgrp)->cache_mode = v - 1;
- return 0;
-}
-
-static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
-{
- return cgroup_to_bcache(cgrp)->verify;
-}
-
-static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
-{
- cgroup_to_bcache(cgrp)->verify = val;
- return 0;
-}
-
-static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
-{
- struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
- return atomic_read(&bcachecg->stats.cache_hits);
-}
-
-static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
-{
- struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
- return atomic_read(&bcachecg->stats.cache_misses);
-}
-
-static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
- struct cftype *cft)
-{
- struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
- return atomic_read(&bcachecg->stats.cache_bypass_hits);
-}
-
-static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
- struct cftype *cft)
-{
- struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
- return atomic_read(&bcachecg->stats.cache_bypass_misses);
-}
-
-static struct cftype bch_files[] = {
- {
- .name = "cache_mode",
- .read = cache_mode_read,
- .write_string = cache_mode_write,
- },
- {
- .name = "verify",
- .read_u64 = bch_verify_read,
- .write_u64 = bch_verify_write,
- },
- {
- .name = "cache_hits",
- .read_u64 = bch_cache_hits_read,
- },
- {
- .name = "cache_misses",
- .read_u64 = bch_cache_misses_read,
- },
- {
- .name = "cache_bypass_hits",
- .read_u64 = bch_cache_bypass_hits_read,
- },
- {
- .name = "cache_bypass_misses",
- .read_u64 = bch_cache_bypass_misses_read,
- },
- { } /* terminate */
-};
-
-static void init_bch_cgroup(struct bch_cgroup *cg)
-{
- cg->cache_mode = -1;
-}
-
-static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
-{
- struct bch_cgroup *cg;
-
- cg = kzalloc(sizeof(*cg), GFP_KERNEL);
- if (!cg)
- return ERR_PTR(-ENOMEM);
- init_bch_cgroup(cg);
- return &cg->css;
-}
-
-static void bcachecg_destroy(struct cgroup *cgroup)
-{
- struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
- free_css_id(&bcache_subsys, &cg->css);
- kfree(cg);
-}
-
-struct cgroup_subsys bcache_subsys = {
- .create = bcachecg_create,
- .destroy = bcachecg_destroy,
- .subsys_id = bcache_subsys_id,
- .name = "bcache",
- .module = THIS_MODULE,
-};
-EXPORT_SYMBOL_GPL(bcache_subsys);
-#endif
-
static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
{
-#ifdef CONFIG_CGROUP_BCACHE
- int r = bch_bio_to_cgroup(bio)->cache_mode;
- if (r >= 0)
- return r;
-#endif
return BDEV_CACHE_MODE(&dc->sb);
}
static bool verify(struct cached_dev *dc, struct bio *bio)
{
-#ifdef CONFIG_CGROUP_BCACHE
- if (bch_bio_to_cgroup(bio)->verify)
- return true;
-#endif
return dc->verify;
}
@@ -249,12 +88,30 @@ static void bch_data_insert_keys(struct closure *cl)
atomic_dec_bug(journal_ref);
if (!op->insert_data_done)
- continue_at(cl, bch_data_insert_start, bcache_wq);
+ continue_at(cl, bch_data_insert_start, op->wq);
bch_keylist_free(&op->insert_keys);
closure_return(cl);
}
+static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
+ struct cache_set *c)
+{
+ size_t oldsize = bch_keylist_nkeys(l);
+ size_t newsize = oldsize + u64s;
+
+ /*
+ * The journalling code doesn't handle the case where the keys to insert
+ * are bigger than an empty journal write. If we just return -ENOMEM
+ * here, bch_data_insert_start() and bch_data_invalidate() will insert the
+ * keys created so far and finish the rest once the keylist is empty.
+ */
+ if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+ return -ENOMEM;
+
+ return __bch_keylist_realloc(l, u64s);
+}
+
static void bch_data_invalidate(struct closure *cl)
{
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
@@ -267,7 +124,7 @@ static void bch_data_invalidate(struct closure *cl)
unsigned sectors = min(bio_sectors(bio),
1U << (KEY_SIZE_BITS - 1));
- if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
+ if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
goto out;
bio->bi_iter.bi_sector += sectors;
@@ -280,7 +137,7 @@ static void bch_data_invalidate(struct closure *cl)
op->insert_data_done = true;
bio_put(bio);
out:
- continue_at(cl, bch_data_insert_keys, bcache_wq);
+ continue_at(cl, bch_data_insert_keys, op->wq);
}
static void bch_data_insert_error(struct closure *cl)
@@ -323,7 +180,7 @@ static void bch_data_insert_endio(struct bio *bio, int error)
if (op->writeback)
op->error = error;
else if (!op->replace)
- set_closure_fn(cl, bch_data_insert_error, bcache_wq);
+ set_closure_fn(cl, bch_data_insert_error, op->wq);
else
set_closure_fn(cl, NULL, NULL);
}
@@ -336,14 +193,14 @@ static void bch_data_insert_start(struct closure *cl)
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
struct bio *bio = op->bio, *n;
- if (op->bypass)
- return bch_data_invalidate(cl);
-
if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
set_gc_sectors(op->c);
wake_up_gc(op->c);
}
+ if (op->bypass)
+ return bch_data_invalidate(cl);
+
/*
* Journal writes are marked REQ_FLUSH; if the original write was a
* flush, it'll wait on the journal write.
@@ -357,9 +214,9 @@ static void bch_data_insert_start(struct closure *cl)
/* 1 for the device pointer and 1 for the chksum */
if (bch_keylist_realloc(&op->insert_keys,
- 1 + (op->csum ? 1 : 0),
+ 3 + (op->csum ? 1 : 0),
op->c))
- continue_at(cl, bch_data_insert_keys, bcache_wq);
+ continue_at(cl, bch_data_insert_keys, op->wq);
k = op->insert_keys.top;
bkey_init(k);
@@ -396,7 +253,7 @@ static void bch_data_insert_start(struct closure *cl)
} while (n != bio);
op->insert_data_done = true;
- continue_at(cl, bch_data_insert_keys, bcache_wq);
+ continue_at(cl, bch_data_insert_keys, op->wq);
err:
/* bch_alloc_sectors() blocks if s->writeback = true */
BUG_ON(op->writeback);
@@ -425,7 +282,7 @@ err:
bio_put(bio);
if (!bch_keylist_empty(&op->insert_keys))
- continue_at(cl, bch_data_insert_keys, bcache_wq);
+ continue_at(cl, bch_data_insert_keys, op->wq);
else
closure_return(cl);
}
@@ -807,6 +664,7 @@ static inline struct search *search_alloc(struct bio *bio,
s->iop.error = 0;
s->iop.flags = 0;
s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+ s->iop.wq = bcache_wq;
return s;
}
@@ -1186,22 +1044,13 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
static int flash_dev_cache_miss(struct btree *b, struct search *s,
struct bio *bio, unsigned sectors)
{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- /* Zero fill bio */
-
- bio_for_each_segment(bv, bio, iter) {
- unsigned j = min(bv.bv_len >> 9, sectors);
-
- void *p = kmap(bv.bv_page);
- memset(p + bv.bv_offset, 0, j << 9);
- kunmap(bv.bv_page);
+ unsigned bytes = min(sectors, bio_sectors(bio)) << 9;
- sectors -= j;
- }
+ swap(bio->bi_iter.bi_size, bytes);
+ zero_fill_bio(bio);
+ swap(bio->bi_iter.bi_size, bytes);
- bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size));
+ bio_advance(bio, bytes);
if (!bio->bi_iter.bi_size)
return MAP_DONE;
@@ -1296,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d)
void bch_request_exit(void)
{
-#ifdef CONFIG_CGROUP_BCACHE
- cgroup_unload_subsys(&bcache_subsys);
-#endif
if (bch_search_cache)
kmem_cache_destroy(bch_search_cache);
}
@@ -1309,11 +1155,5 @@ int __init bch_request_init(void)
if (!bch_search_cache)
return -ENOMEM;
-#ifdef CONFIG_CGROUP_BCACHE
- cgroup_load_subsys(&bcache_subsys);
- init_bch_cgroup(&bcache_default_cgroup);
-
- cgroup_add_cftypes(&bcache_subsys, bch_files);
-#endif
return 0;
}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 39f21dbedc3..1ff36875c2b 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -1,12 +1,11 @@
#ifndef _BCACHE_REQUEST_H_
#define _BCACHE_REQUEST_H_
-#include <linux/cgroup.h>
-
struct data_insert_op {
struct closure cl;
struct cache_set *c;
struct bio *bio;
+ struct workqueue_struct *wq;
unsigned inode;
uint16_t write_point;
@@ -41,20 +40,4 @@ void bch_flash_dev_request_init(struct bcache_device *d);
extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
-struct bch_cgroup {
-#ifdef CONFIG_CGROUP_BCACHE
- struct cgroup_subsys_state css;
-#endif
- /*
- * We subtract one from the index into bch_cache_modes[], so that
- * default == -1; this makes it so the rest match up with d->cache_mode,
- * and we use d->cache_mode if cgrp->cache_mode < 0
- */
- short cache_mode;
- bool verify;
- struct cache_stat_collector stats;
-};
-
-struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
-
#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 84d0782f702..0ca072c20d0 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -201,9 +201,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
mark_cache_stats(&dc->accounting.collector, hit, bypass);
mark_cache_stats(&c->accounting.collector, hit, bypass);
-#ifdef CONFIG_CGROUP_BCACHE
- mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
-#endif
}
void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 677a604e7f3..926ded8ccbf 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -9,6 +9,7 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
+#include "extents.h"
#include "request.h"
#include "writeback.h"
@@ -383,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
break;
}
- bch_bkey_to_text(buf, sizeof(buf), k);
+ bch_extent_to_text(buf, sizeof(buf), k);
pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
@@ -399,7 +400,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
struct bkey *k = &j->uuid_bucket;
- if (bch_btree_ptr_invalid(c, k))
+ if (__bch_btree_ptr_invalid(c, k))
return "bad uuid pointer";
bkey_copy(&c->uuid_bucket, k);
@@ -540,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
closure_sync(cl);
}
-#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \
- fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
-
void bch_prio_write(struct cache *ca)
{
int i;
@@ -553,10 +551,6 @@ void bch_prio_write(struct cache *ca)
lockdep_assert_held(&ca->set->bucket_lock);
- for (b = ca->buckets;
- b < ca->buckets + ca->sb.nbuckets; b++)
- b->disk_gen = b->gen;
-
ca->disk_buckets->seq++;
atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
@@ -600,14 +594,17 @@ void bch_prio_write(struct cache *ca)
mutex_lock(&ca->set->bucket_lock);
- ca->need_save_prio = 0;
-
/*
* Don't want the old priorities to get garbage collected until after we
* finish writing the new ones, and they're journalled
*/
- for (i = 0; i < prio_buckets(ca); i++)
+ for (i = 0; i < prio_buckets(ca); i++) {
+ if (ca->prio_last_buckets[i])
+ __bch_bucket_free(ca,
+ &ca->buckets[ca->prio_last_buckets[i]]);
+
ca->prio_last_buckets[i] = ca->prio_buckets[i];
+ }
}
static void prio_read(struct cache *ca, uint64_t bucket)
@@ -638,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
}
b->prio = le16_to_cpu(d->prio);
- b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
+ b->gen = b->last_gc = d->gen;
}
}
@@ -842,6 +839,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
q->limits.max_segment_size = UINT_MAX;
q->limits.max_segments = BIO_MAX_PAGES;
q->limits.max_discard_sectors = UINT_MAX;
+ q->limits.discard_granularity = 512;
q->limits.io_min = block_size;
q->limits.logical_block_size = block_size;
q->limits.physical_block_size = block_size;
@@ -1351,9 +1349,11 @@ static void cache_set_free(struct closure *cl)
if (ca)
kobject_put(&ca->kobj);
+ bch_bset_sort_state_free(&c->sort);
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
- free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
+ if (c->moving_gc_wq)
+ destroy_workqueue(c->moving_gc_wq);
if (c->bio_split)
bioset_free(c->bio_split);
if (c->fill_iter)
@@ -1394,14 +1394,21 @@ static void cache_set_flush(struct closure *cl)
list_add(&c->root->list, &c->btree_cache);
/* Should skip this if we're unregistering because of an error */
- list_for_each_entry(b, &c->btree_cache, list)
+ list_for_each_entry(b, &c->btree_cache, list) {
+ mutex_lock(&b->write_lock);
if (btree_node_dirty(b))
- bch_btree_node_write(b, NULL);
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
+ }
for_each_cache(ca, c, i)
if (ca->alloc_thread)
kthread_stop(ca->alloc_thread);
+ cancel_delayed_work_sync(&c->journal.work);
+ /* flush last journal entry if needed */
+ c->journal.work.work.func(&c->journal.work.work);
+
closure_return(cl);
}
@@ -1477,25 +1484,20 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
- c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
+ c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
BTREE_MAX_PAGES);
- c->sort_crit_factor = int_sqrt(c->btree_pages);
-
sema_init(&c->sb_write_mutex, 1);
mutex_init(&c->bucket_lock);
- init_waitqueue_head(&c->try_wait);
+ init_waitqueue_head(&c->btree_cache_wait);
init_waitqueue_head(&c->bucket_wait);
sema_init(&c->uuid_write_mutex, 1);
- mutex_init(&c->sort_lock);
- spin_lock_init(&c->sort_time.lock);
spin_lock_init(&c->btree_gc_time.lock);
spin_lock_init(&c->btree_split_time.lock);
spin_lock_init(&c->btree_read_time.lock);
- spin_lock_init(&c->try_harder_time.lock);
bch_moving_init_cache_set(c);
@@ -1519,11 +1521,12 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
bucket_pages(c))) ||
!(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
!(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
- !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
+ !(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
bch_journal_alloc(c) ||
bch_btree_cache_alloc(c) ||
- bch_open_buckets_alloc(c))
+ bch_open_buckets_alloc(c) ||
+ bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
goto err;
c->congested_read_threshold_us = 2000;
@@ -1579,11 +1582,11 @@ static void run_cache_set(struct cache_set *c)
k = &j->btree_root;
err = "bad btree root";
- if (bch_btree_ptr_invalid(c, k))
+ if (__bch_btree_ptr_invalid(c, k))
goto err;
err = "error reading btree root";
- c->root = bch_btree_node_get(c, k, j->btree_level, true);
+ c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
if (IS_ERR_OR_NULL(c->root))
goto err;
@@ -1599,7 +1602,7 @@ static void run_cache_set(struct cache_set *c)
goto err;
bch_journal_mark(c, &journal);
- bch_btree_gc_finish(c);
+ bch_initial_gc_finish(c);
pr_debug("btree_check() done");
/*
@@ -1641,7 +1644,7 @@ static void run_cache_set(struct cache_set *c)
ca->sb.d[j] = ca->sb.first_bucket + j;
}
- bch_btree_gc_finish(c);
+ bch_initial_gc_finish(c);
err = "error starting allocator thread";
for_each_cache(ca, c, i)
@@ -1658,12 +1661,14 @@ static void run_cache_set(struct cache_set *c)
goto err;
err = "cannot allocate new btree root";
- c->root = bch_btree_node_alloc(c, 0, true);
+ c->root = bch_btree_node_alloc(c, NULL, 0);
if (IS_ERR_OR_NULL(c->root))
goto err;
+ mutex_lock(&c->root->write_lock);
bkey_copy_key(&c->root->key, &MAX_KEY);
bch_btree_node_write(c->root, &cl);
+ mutex_unlock(&c->root->write_lock);
bch_btree_set_root(c->root);
rw_unlock(true, c->root);
@@ -1785,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj)
vfree(ca->buckets);
free_heap(&ca->heap);
- free_fifo(&ca->unused);
free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++)
@@ -1822,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
- !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
!init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
!(ca->buckets = vzalloc(sizeof(struct bucket) *
ca->sb.nbuckets)) ||
@@ -1837,13 +1840,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
for_each_bucket(b, ca)
atomic_set(&b->pin, 0);
- if (bch_cache_allocator_init(ca))
- goto err;
-
return 0;
-err:
- kobject_put(&ca->kobj);
- return -ENOMEM;
}
static void register_cache(struct cache_sb *sb, struct page *sb_page,
@@ -1872,7 +1869,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
goto err;
+ mutex_lock(&bch_register_lock);
err = register_cache_set(ca);
+ mutex_unlock(&bch_register_lock);
+
if (err)
goto err;
@@ -1934,8 +1934,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!try_module_get(THIS_MODULE))
return -EBUSY;
- mutex_lock(&bch_register_lock);
-
if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
!(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
goto err;
@@ -1968,7 +1966,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!dc)
goto err_close;
+ mutex_lock(&bch_register_lock);
register_bdev(sb, sb_page, bdev, dc);
+ mutex_unlock(&bch_register_lock);
} else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
@@ -1981,7 +1981,6 @@ out:
put_page(sb_page);
kfree(sb);
kfree(path);
- mutex_unlock(&bch_register_lock);
module_put(THIS_MODULE);
return ret;
@@ -2060,7 +2059,6 @@ static void bcache_exit(void)
{
bch_debug_exit();
bch_request_exit();
- bch_btree_exit();
if (bcache_kobj)
kobject_put(bcache_kobj);
if (bcache_wq)
@@ -2090,7 +2088,6 @@ static int __init bcache_init(void)
if (!(bcache_wq = create_workqueue("bcache")) ||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
sysfs_create_files(bcache_kobj, files) ||
- bch_btree_init() ||
bch_request_init() ||
bch_debug_init(bcache_kobj))
goto err;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index d5dd282b176..b3ff57d61dd 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms);
sysfs_time_stats_attribute(btree_split, sec, us);
sysfs_time_stats_attribute(btree_sort, ms, us);
sysfs_time_stats_attribute(btree_read, ms, us);
-sysfs_time_stats_attribute(try_harder, ms, us);
read_attribute(btree_nodes);
read_attribute(btree_used_percent);
@@ -400,81 +399,123 @@ static struct attribute *bch_flash_dev_files[] = {
};
KTYPE(bch_flash_dev);
-SHOW(__bch_cache_set)
+struct bset_stats_op {
+ struct btree_op op;
+ size_t nodes;
+ struct bset_stats stats;
+};
+
+static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b)
{
- unsigned root_usage(struct cache_set *c)
- {
- unsigned bytes = 0;
- struct bkey *k;
- struct btree *b;
- struct btree_iter iter;
+ struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
- goto lock_root;
+ op->nodes++;
+ bch_btree_keys_stats(&b->keys, &op->stats);
- do {
- rw_unlock(false, b);
-lock_root:
- b = c->root;
- rw_lock(false, b, b->level);
- } while (b != c->root);
+ return MAP_CONTINUE;
+}
- for_each_key_filter(b, k, &iter, bch_ptr_bad)
- bytes += bkey_bytes(k);
+static int bch_bset_print_stats(struct cache_set *c, char *buf)
+{
+ struct bset_stats_op op;
+ int ret;
+
+ memset(&op, 0, sizeof(op));
+ bch_btree_op_init(&op.op, -1);
+ ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats);
+ if (ret < 0)
+ return ret;
+
+ return snprintf(buf, PAGE_SIZE,
+ "btree nodes: %zu\n"
+ "written sets: %zu\n"
+ "unwritten sets: %zu\n"
+ "written key bytes: %zu\n"
+ "unwritten key bytes: %zu\n"
+ "floats: %zu\n"
+ "failed: %zu\n",
+ op.nodes,
+ op.stats.sets_written, op.stats.sets_unwritten,
+ op.stats.bytes_written, op.stats.bytes_unwritten,
+ op.stats.floats, op.stats.failed);
+}
+
+static unsigned bch_root_usage(struct cache_set *c)
+{
+ unsigned bytes = 0;
+ struct bkey *k;
+ struct btree *b;
+ struct btree_iter iter;
+
+ goto lock_root;
+
+ do {
rw_unlock(false, b);
+lock_root:
+ b = c->root;
+ rw_lock(false, b, b->level);
+ } while (b != c->root);
- return (bytes * 100) / btree_bytes(c);
- }
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
+ bytes += bkey_bytes(k);
- size_t cache_size(struct cache_set *c)
- {
- size_t ret = 0;
- struct btree *b;
+ rw_unlock(false, b);
- mutex_lock(&c->bucket_lock);
- list_for_each_entry(b, &c->btree_cache, list)
- ret += 1 << (b->page_order + PAGE_SHIFT);
+ return (bytes * 100) / btree_bytes(c);
+}
- mutex_unlock(&c->bucket_lock);
- return ret;
- }
+static size_t bch_cache_size(struct cache_set *c)
+{
+ size_t ret = 0;
+ struct btree *b;
- unsigned cache_max_chain(struct cache_set *c)
- {
- unsigned ret = 0;
- struct hlist_head *h;
+ mutex_lock(&c->bucket_lock);
+ list_for_each_entry(b, &c->btree_cache, list)
+ ret += 1 << (b->keys.page_order + PAGE_SHIFT);
- mutex_lock(&c->bucket_lock);
+ mutex_unlock(&c->bucket_lock);
+ return ret;
+}
- for (h = c->bucket_hash;
- h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
- h++) {
- unsigned i = 0;
- struct hlist_node *p;
+static unsigned bch_cache_max_chain(struct cache_set *c)
+{
+ unsigned ret = 0;
+ struct hlist_head *h;
- hlist_for_each(p, h)
- i++;
+ mutex_lock(&c->bucket_lock);
- ret = max(ret, i);
- }
+ for (h = c->bucket_hash;
+ h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
+ h++) {
+ unsigned i = 0;
+ struct hlist_node *p;
- mutex_unlock(&c->bucket_lock);
- return ret;
- }
+ hlist_for_each(p, h)
+ i++;
- unsigned btree_used(struct cache_set *c)
- {
- return div64_u64(c->gc_stats.key_bytes * 100,
- (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+ ret = max(ret, i);
}
- unsigned average_key_size(struct cache_set *c)
- {
- return c->gc_stats.nkeys
- ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
- : 0;
- }
+ mutex_unlock(&c->bucket_lock);
+ return ret;
+}
+static unsigned bch_btree_used(struct cache_set *c)
+{
+ return div64_u64(c->gc_stats.key_bytes * 100,
+ (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+}
+
+static unsigned bch_average_key_size(struct cache_set *c)
+{
+ return c->gc_stats.nkeys
+ ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+ : 0;
+}
+
+SHOW(__bch_cache_set)
+{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
sysfs_print(synchronous, CACHE_SYNC(&c->sb));
@@ -482,21 +523,20 @@ lock_root:
sysfs_hprint(bucket_size, bucket_bytes(c));
sysfs_hprint(block_size, block_bytes(c));
sysfs_print(tree_depth, c->root->level);
- sysfs_print(root_usage_percent, root_usage(c));
+ sysfs_print(root_usage_percent, bch_root_usage(c));
- sysfs_hprint(btree_cache_size, cache_size(c));
- sysfs_print(btree_cache_max_chain, cache_max_chain(c));
+ sysfs_hprint(btree_cache_size, bch_cache_size(c));
+ sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c));
sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
- sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us);
+ sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
- sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
- sysfs_print(btree_used_percent, btree_used(c));
+ sysfs_print(btree_used_percent, bch_btree_used(c));
sysfs_print(btree_nodes, c->gc_stats.nodes);
- sysfs_hprint(average_key_size, average_key_size(c));
+ sysfs_hprint(average_key_size, bch_average_key_size(c));
sysfs_print(cache_read_races,
atomic_long_read(&c->cache_read_races));
@@ -667,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = {
sysfs_time_stats_attribute_list(btree_split, sec, us)
sysfs_time_stats_attribute_list(btree_sort, ms, us)
sysfs_time_stats_attribute_list(btree_read, ms, us)
- sysfs_time_stats_attribute_list(try_harder, ms, us)
&sysfs_btree_nodes,
&sysfs_btree_used_percent,
@@ -719,7 +758,9 @@ SHOW(__bch_cache)
int cmp(const void *l, const void *r)
{ return *((uint16_t *) r) - *((uint16_t *) l); }
- size_t n = ca->sb.nbuckets, i, unused, btree;
+ struct bucket *b;
+ size_t n = ca->sb.nbuckets, i;
+ size_t unused = 0, available = 0, dirty = 0, meta = 0;
uint64_t sum = 0;
/* Compute 31 quantiles */
uint16_t q[31], *p, *cached;
@@ -730,6 +771,17 @@ SHOW(__bch_cache)
return -ENOMEM;
mutex_lock(&ca->set->bucket_lock);
+ for_each_bucket(b, ca) {
+ if (!GC_SECTORS_USED(b))
+ unused++;
+ if (GC_MARK(b) == GC_MARK_RECLAIMABLE)
+ available++;
+ if (GC_MARK(b) == GC_MARK_DIRTY)
+ dirty++;
+ if (GC_MARK(b) == GC_MARK_METADATA)
+ meta++;
+ }
+
for (i = ca->sb.first_bucket; i < n; i++)
p[i] = ca->buckets[i].prio;
mutex_unlock(&ca->set->bucket_lock);
@@ -744,10 +796,7 @@ SHOW(__bch_cache)
while (cached < p + n &&
*cached == BTREE_PRIO)
- cached++;
-
- btree = cached - p;
- n -= btree;
+ cached++, n--;
for (i = 0; i < n; i++)
sum += INITIAL_PRIO - cached[i];
@@ -763,12 +812,16 @@ SHOW(__bch_cache)
ret = scnprintf(buf, PAGE_SIZE,
"Unused: %zu%%\n"
+ "Clean: %zu%%\n"
+ "Dirty: %zu%%\n"
"Metadata: %zu%%\n"
"Average: %llu\n"
"Sectors per Q: %zu\n"
"Quantiles: [",
unused * 100 / (size_t) ca->sb.nbuckets,
- btree * 100 / (size_t) ca->sb.nbuckets, sum,
+ available * 100 / (size_t) ca->sb.nbuckets,
+ dirty * 100 / (size_t) ca->sb.nbuckets,
+ meta * 100 / (size_t) ca->sb.nbuckets, sum,
n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
for (i = 0; i < ARRAY_SIZE(q); i++)
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index adbc3df17a8..b7820b0d262 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -45,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1030c6020e9..ac7d0d1f70d 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -2,6 +2,7 @@
#ifndef _BCACHE_UTIL_H
#define _BCACHE_UTIL_H
+#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/llist.h>
@@ -17,11 +18,13 @@ struct closure;
#ifdef CONFIG_BCACHE_DEBUG
+#define EBUG_ON(cond) BUG_ON(cond)
#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
#else /* DEBUG */
+#define EBUG_ON(cond) do { if (cond); } while (0)
#define atomic_dec_bug(v) atomic_dec(v)
#define atomic_inc_bug(v, i) atomic_inc(v)
@@ -391,6 +394,11 @@ struct time_stats {
void bch_time_stats_update(struct time_stats *stats, uint64_t time);
+static inline unsigned local_clock_us(void)
+{
+ return local_clock() >> 10;
+}
+
#define NSEC_PER_ns 1L
#define NSEC_PER_us NSEC_PER_USEC
#define NSEC_PER_ms NSEC_PER_MSEC