aboutsummaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2014-04-01 19:43:53 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2014-04-01 19:43:53 -0700
commit b33ce442993865180292df2a314ee5251ba38b50 (patch)
tree 004b703ea3cd19c932393086fe9cde96e8db8de7 /drivers/md
parent 7a48837732f87a574ee3e1855927dc250117f565 (diff)
parent e84987a1f941b8e2e3173bb38510ddf25cc8c7f0 (diff)
Merge branch 'for-3.15/drivers' of git://git.kernel.dk/linux-block
Pull block driver update from Jens Axboe: "On top of the core pull request, here's the pull request for the driver related changes for 3.15. It contains: - Improvements for msi-x registration for block drivers (mtip32xx, skd, cciss, nvme) from Alexander Gordeev. - A round of cleanups and improvements for drbd from Andreas Gruenbacher and Rashika Kheria. - A round of cleanups and improvements for bcache from Kent. - Removal of sleep_on() and friends in DAC960, ataflop, swim3 from Arnd Bergmann. - Bug fix for a bug in the mtip32xx async completion code from Sam Bradshaw. - Bug fix for accidentally bouncing IO on 32-bit platforms with mtip32xx from Felipe Franciosi" * 'for-3.15/drivers' of git://git.kernel.dk/linux-block: (103 commits) bcache: remove nested function usage bcache: Kill bucket->gc_gen bcache: Kill unused freelist bcache: Rework btree cache reserve handling bcache: Kill btree_io_wq bcache: btree locking rework bcache: Fix a race when freeing btree nodes bcache: Add a real GC_MARK_RECLAIMABLE bcache: Add bch_keylist_init_single() bcache: Improve priority_stats bcache: Better alloc tracepoints bcache: Kill dead cgroup code bcache: stop moving_gc marking buckets that can't be moved. bcache: Fix moving_pred() bcache: Fix moving_gc deadlocking with a foreground write bcache: Fix discard granularity bcache: Fix another bug recovering from unclean shutdown bcache: Fix a bug recovering from unclean shutdown bcache: Fix a journalling reclaim after recovery bug bcache: Fix a null ptr deref in journal replay ...
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/bcache/Kconfig 8
-rw-r--r-- drivers/md/bcache/alloc.c 173
-rw-r--r-- drivers/md/bcache/bcache.h 56
-rw-r--r-- drivers/md/bcache/bset.c 4
-rw-r--r-- drivers/md/bcache/bset.h 6
-rw-r--r-- drivers/md/bcache/btree.c 592
-rw-r--r-- drivers/md/bcache/btree.h 12
-rw-r--r-- drivers/md/bcache/extents.c 36
-rw-r--r-- drivers/md/bcache/journal.c 46
-rw-r--r-- drivers/md/bcache/journal.h 1
-rw-r--r-- drivers/md/bcache/movinggc.c 18
-rw-r--r-- drivers/md/bcache/request.c 201
-rw-r--r-- drivers/md/bcache/request.h 19
-rw-r--r-- drivers/md/bcache/stats.c 3
-rw-r--r-- drivers/md/bcache/super.c 64
-rw-r--r-- drivers/md/bcache/sysfs.c 155
-rw-r--r-- drivers/md/bcache/trace.c 2
17 files changed, 629 insertions, 767 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 2638417b19a..4d200883c50 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG
Keeps all active closures in a linked list and provides a debugfs
interface to list them, which makes it possible to see asynchronous
operations that get stuck.
-
-# cgroup code needs to be updated:
-#
-#config CGROUP_BCACHE
-# bool "Cgroup controls for bcache"
-# depends on BCACHE && BLK_CGROUP
-# ---help---
-# TODO
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index c0d37d08244..443d03fbac4 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
- if (CACHE_SYNC(&ca->set->sb)) {
- ca->need_save_prio = max(ca->need_save_prio,
- bucket_disk_gen(b));
- WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
- }
-
return ret;
}
@@ -120,51 +114,45 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
mutex_unlock(&c->bucket_lock);
}
-/* Allocation */
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
static inline bool can_inc_bucket_gen(struct bucket *b)
{
- return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
- bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
+ return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
}
-bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
- BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
-
- if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
- unsigned i;
-
- for (i = 0; i < RESERVE_NONE; i++)
- if (!fifo_full(&ca->free[i]))
- goto add;
+ BUG_ON(!ca->set->gc_mark_valid);
- return false;
- }
-add:
- b->prio = 0;
-
- if (can_inc_bucket_gen(b) &&
- fifo_push(&ca->unused, b - ca->buckets)) {
- atomic_inc(&b->pin);
- return true;
- }
-
- return false;
-}
-
-static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
-{
- return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
+ return (!GC_MARK(b) ||
+ GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
!atomic_read(&b->pin) &&
can_inc_bucket_gen(b);
}
-static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
+ lockdep_assert_held(&ca->set->bucket_lock);
+ BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
+
+ if (GC_SECTORS_USED(b))
+ trace_bcache_invalidate(ca, b - ca->buckets);
+
bch_inc_gen(ca, b);
b->prio = INITIAL_PRIO;
atomic_inc(&b->pin);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+ __bch_invalidate_one_bucket(ca, b);
+
fifo_push(&ca->free_inc, b - ca->buckets);
}
@@ -195,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca)
ca->heap.used = 0;
for_each_bucket(b, ca) {
- /*
- * If we fill up the unused list, if we then return before
- * adding anything to the free_inc list we'll skip writing
- * prios/gens and just go back to allocating from the unused
- * list:
- */
- if (fifo_full(&ca->unused))
- return;
-
- if (!can_invalidate_bucket(ca, b))
- continue;
-
- if (!GC_SECTORS_USED(b) &&
- bch_bucket_add_unused(ca, b))
+ if (!bch_can_invalidate_bucket(ca, b))
continue;
if (!heap_full(&ca->heap))
@@ -233,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca)
return;
}
- invalidate_one_bucket(ca, b);
+ bch_invalidate_one_bucket(ca, b);
}
}
@@ -249,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca)
b = ca->buckets + ca->fifo_last_bucket++;
- if (can_invalidate_bucket(ca, b))
- invalidate_one_bucket(ca, b);
+ if (bch_can_invalidate_bucket(ca, b))
+ bch_invalidate_one_bucket(ca, b);
if (++checked >= ca->sb.nbuckets) {
ca->invalidate_needs_gc = 1;
@@ -274,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca)
b = ca->buckets + n;
- if (can_invalidate_bucket(ca, b))
- invalidate_one_bucket(ca, b);
+ if (bch_can_invalidate_bucket(ca, b))
+ bch_invalidate_one_bucket(ca, b);
if (++checked >= ca->sb.nbuckets / 2) {
ca->invalidate_needs_gc = 1;
@@ -287,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca)
static void invalidate_buckets(struct cache *ca)
{
- if (ca->invalidate_needs_gc)
- return;
+ BUG_ON(ca->invalidate_needs_gc);
switch (CACHE_REPLACEMENT(&ca->sb)) {
case CACHE_REPLACEMENT_LRU:
@@ -301,8 +275,6 @@ static void invalidate_buckets(struct cache *ca)
invalidate_buckets_random(ca);
break;
}
-
- trace_bcache_alloc_invalidate(ca);
}
#define allocator_wait(ca, cond) \
@@ -350,17 +322,10 @@ static int bch_allocator_thread(void *arg)
* possibly issue discards to them, then we add the bucket to
* the free list:
*/
- while (1) {
+ while (!fifo_empty(&ca->free_inc)) {
long bucket;
- if ((!atomic_read(&ca->set->prio_blocked) ||
- !CACHE_SYNC(&ca->set->sb)) &&
- !fifo_empty(&ca->unused))
- fifo_pop(&ca->unused, bucket);
- else if (!fifo_empty(&ca->free_inc))
- fifo_pop(&ca->free_inc, bucket);
- else
- break;
+ fifo_pop(&ca->free_inc, bucket);
if (ca->discard) {
mutex_unlock(&ca->set->bucket_lock);
@@ -371,6 +336,7 @@ static int bch_allocator_thread(void *arg)
}
allocator_wait(ca, bch_allocator_push(ca, bucket));
+ wake_up(&ca->set->btree_cache_wait);
wake_up(&ca->set->bucket_wait);
}
@@ -380,9 +346,9 @@ static int bch_allocator_thread(void *arg)
* them to the free_inc list:
*/
+retry_invalidate:
allocator_wait(ca, ca->set->gc_mark_valid &&
- (ca->need_save_prio > 64 ||
- !ca->invalidate_needs_gc));
+ !ca->invalidate_needs_gc);
invalidate_buckets(ca);
/*
@@ -390,13 +356,28 @@ static int bch_allocator_thread(void *arg)
* new stuff to them:
*/
allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
- if (CACHE_SYNC(&ca->set->sb) &&
- (!fifo_empty(&ca->free_inc) ||
- ca->need_save_prio > 64))
+ if (CACHE_SYNC(&ca->set->sb)) {
+ /*
+ * This could deadlock if an allocation with a btree
+ * node locked ever blocked - having the btree node
+ * locked would block garbage collection, but here we're
+ * waiting on garbage collection before we invalidate
+ * and free anything.
+ *
+ * But this should be safe since the btree code always
+ * uses btree_check_reserve() before allocating now, and
+ * if it fails it blocks without btree nodes locked.
+ */
+ if (!fifo_full(&ca->free_inc))
+ goto retry_invalidate;
+
bch_prio_write(ca);
+ }
}
}
+/* Allocation */
+
long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
{
DEFINE_WAIT(w);
@@ -408,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
fifo_pop(&ca->free[reserve], r))
goto out;
- if (!wait)
+ if (!wait) {
+ trace_bcache_alloc_fail(ca, reserve);
return -1;
+ }
do {
prepare_to_wait(&ca->set->bucket_wait, &w,
@@ -425,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
out:
wake_up_process(ca->alloc_thread);
+ trace_bcache_alloc(ca, reserve);
+
if (expensive_debug_checks(ca->set)) {
size_t iter;
long i;
@@ -438,8 +423,6 @@ out:
BUG_ON(i == r);
fifo_for_each(i, &ca->free_inc, iter)
BUG_ON(i == r);
- fifo_for_each(i, &ca->unused, iter)
- BUG_ON(i == r);
}
b = ca->buckets + r;
@@ -461,17 +444,19 @@ out:
return r;
}
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+ SET_GC_MARK(b, 0);
+ SET_GC_SECTORS_USED(b, 0);
+}
+
void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
unsigned i;
- for (i = 0; i < KEY_PTRS(k); i++) {
- struct bucket *b = PTR_BUCKET(c, k, i);
-
- SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
- SET_GC_SECTORS_USED(b, 0);
- bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
- }
+ for (i = 0; i < KEY_PTRS(k); i++)
+ __bch_bucket_free(PTR_CACHE(c, k, i),
+ PTR_BUCKET(c, k, i));
}
int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
@@ -709,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca)
ca->alloc_thread = k;
return 0;
}
-
-int bch_cache_allocator_init(struct cache *ca)
-{
- /*
- * Reserve:
- * Prio/gen writes first
- * Then 8 for btree allocations
- * Then half for the moving garbage collector
- */
-#if 0
- ca->watermark[WATERMARK_PRIO] = 0;
-
- ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
-
- ca->watermark[WATERMARK_MOVINGGC] = 8 +
- ca->watermark[WATERMARK_METADATA];
-
- ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
- ca->watermark[WATERMARK_MOVINGGC];
-#endif
- return 0;
-}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index a4c7306ff43..82c9c5d3525 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -195,9 +195,7 @@ struct bucket {
atomic_t pin;
uint16_t prio;
uint8_t gen;
- uint8_t disk_gen;
uint8_t last_gc; /* Most out of date gen in the btree */
- uint8_t gc_gen;
uint16_t gc_mark; /* Bitfield used by GC. See below for field */
};
@@ -207,9 +205,9 @@ struct bucket {
*/
BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
-#define GC_MARK_RECLAIMABLE 0
-#define GC_MARK_DIRTY 1
-#define GC_MARK_METADATA 2
+#define GC_MARK_RECLAIMABLE 1
+#define GC_MARK_DIRTY 2
+#define GC_MARK_METADATA 3
#define GC_SECTORS_USED_SIZE 13
#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))
BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
@@ -426,14 +424,9 @@ struct cache {
* their new gen to disk. After prio_write() finishes writing the new
* gens/prios, they'll be moved to the free list (and possibly discarded
* in the process)
- *
- * unused: GC found nothing pointing into these buckets (possibly
- * because all the data they contained was overwritten), so we only
- * need to discard them before they can be moved to the free list.
*/
DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
- DECLARE_FIFO(long, unused);
size_t fifo_last_bucket;
@@ -443,12 +436,6 @@ struct cache {
DECLARE_HEAP(struct bucket *, heap);
/*
- * max(gen - disk_gen) for all buckets. When it gets too big we have to
- * call prio_write() to keep gens from wrapping.
- */
- uint8_t need_save_prio;
-
- /*
* If nonzero, we know we aren't going to find any buckets to invalidate
* until a gc finishes - otherwise we could pointlessly burn a ton of
* cpu
@@ -562,19 +549,16 @@ struct cache_set {
struct list_head btree_cache_freed;
/* Number of elements in btree_cache + btree_cache_freeable lists */
- unsigned bucket_cache_used;
+ unsigned btree_cache_used;
/*
* If we need to allocate memory for a new btree node and that
* allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation. However, only one thread can be doing this
- * at a time, for obvious reasons - try_harder and try_wait are
- * basically a lock for this that we can wait on asynchronously. The
- * btree_root() macro releases the lock when it returns.
+ * to satisfy the allocation - lock to guarantee only one thread does
+ * this at a time:
*/
- struct task_struct *try_harder;
- wait_queue_head_t try_wait;
- uint64_t try_harder_start;
+ wait_queue_head_t btree_cache_wait;
+ struct task_struct *btree_cache_alloc_lock;
/*
* When we free a btree node, we increment the gen of the bucket the
@@ -603,7 +587,7 @@ struct cache_set {
uint16_t min_prio;
/*
- * max(gen - gc_gen) for all buckets. When it gets too big we have to gc
+ * max(gen - last_gc) for all buckets. When it gets too big we have to gc
* to keep gens from wrapping around.
*/
uint8_t need_gc;
@@ -628,6 +612,8 @@ struct cache_set {
/* Number of moving GC bios in flight */
struct semaphore moving_in_flight;
+ struct workqueue_struct *moving_gc_wq;
+
struct btree *root;
#ifdef CONFIG_BCACHE_DEBUG
@@ -667,7 +653,6 @@ struct cache_set {
struct time_stats btree_gc_time;
struct time_stats btree_split_time;
struct time_stats btree_read_time;
- struct time_stats try_harder_time;
atomic_long_t cache_read_races;
atomic_long_t writeback_keys_done;
@@ -850,9 +835,6 @@ static inline bool cached_dev_get(struct cached_dev *dc)
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree (last_gc).
- *
- * bucket_disk_gen() returns the difference between the current gen and the gen
- * on disk; they're both used to make sure gens don't wrap around.
*/
static inline uint8_t bucket_gc_gen(struct bucket *b)
@@ -860,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
return b->gen - b->last_gc;
}
-static inline uint8_t bucket_disk_gen(struct bucket *b)
-{
- return b->gen - b->disk_gen;
-}
-
#define BUCKET_GC_GEN_MAX 96U
-#define BUCKET_DISK_GEN_MAX 64U
#define kobj_attribute_write(n, fn) \
static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
@@ -899,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
uint8_t bch_inc_gen(struct cache *, struct bucket *);
void bch_rescale_priorities(struct cache_set *, int);
-bool bch_bucket_add_unused(struct cache *, struct bucket *);
-long bch_bucket_alloc(struct cache *, unsigned, bool);
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
void bch_bucket_free(struct cache_set *, struct bkey *);
+long bch_bucket_alloc(struct cache *, unsigned, bool);
int __bch_bucket_alloc_set(struct cache_set *, unsigned,
struct bkey *, int, bool);
int bch_bucket_alloc_set(struct cache_set *, unsigned,
@@ -954,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *);
void bch_open_buckets_free(struct cache_set *);
int bch_cache_allocator_start(struct cache *ca);
-int bch_cache_allocator_init(struct cache *ca);
void bch_debug_exit(void);
int bch_debug_init(struct kobject *);
void bch_request_exit(void);
int bch_request_init(void);
-void bch_btree_exit(void);
-int bch_btree_init(void);
#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 3f74b4b0747..54541641530 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
for (k = i->start; k < bset_bkey_last(i); k = next) {
next = bkey_next(k);
- printk(KERN_ERR "block %u key %li/%u: ", set,
- (uint64_t *) k - i->d, i->keys);
+ printk(KERN_ERR "block %u key %u/%u: ", set,
+ (unsigned) ((u64 *) k - i->d), i->keys);
if (b->ops->key_dump)
b->ops->key_dump(b, k);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 003260f4ddf..5f6728d5d4d 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l)
l->top_p = l->keys_p = l->inline_keys;
}
+static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
+{
+ l->keys = k;
+ l->top = bkey_next(k);
+}
+
static inline void bch_keylist_push(struct keylist *l)
{
l->top = bkey_next(l->top);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5f9c2a665ca..7347b610096 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -68,15 +68,11 @@
* alloc_bucket() cannot fail. This should be true but is not completely
* obvious.
*
- * Make sure all allocations get charged to the root cgroup
- *
* Plugging?
*
* If data write is less than hard sector size of ssd, round up offset in open
* bucket to the next whole sector
*
- * Also lookup by cgroup in get_open_bucket()
- *
* Superblock needs to be fleshed out for multiple cache devices
*
* Add a sysfs tunable for the number of writeback IOs in flight
@@ -97,8 +93,6 @@
#define PTR_HASH(c, k) \
(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
-static struct workqueue_struct *btree_io_wq;
-
#define insert_lock(s, b) ((b)->level <= (s)->lock)
/*
@@ -123,7 +117,7 @@ static struct workqueue_struct *btree_io_wq;
({ \
int _r, l = (b)->level - 1; \
bool _w = l <= (op)->lock; \
- struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \
+ struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
if (!IS_ERR(_child)) { \
_child->parent = (b); \
_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
@@ -152,17 +146,12 @@ static struct workqueue_struct *btree_io_wq;
_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
} \
rw_unlock(_w, _b); \
+ bch_cannibalize_unlock(c); \
if (_r == -EINTR) \
schedule(); \
- bch_cannibalize_unlock(c); \
- if (_r == -ENOSPC) { \
- wait_event((c)->try_wait, \
- !(c)->try_harder); \
- _r = -EINTR; \
- } \
} while (_r == -EINTR); \
\
- finish_wait(&(c)->bucket_wait, &(op)->wait); \
+ finish_wait(&(c)->btree_cache_wait, &(op)->wait); \
_r; \
})
@@ -171,6 +160,20 @@ static inline struct bset *write_block(struct btree *b)
return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
}
+static void bch_btree_init_next(struct btree *b)
+{
+ /* If not a leaf node, always sort */
+ if (b->level && b->keys.nsets)
+ bch_btree_sort(&b->keys, &b->c->sort);
+ else
+ bch_btree_sort_lazy(&b->keys, &b->c->sort);
+
+ if (b->written < btree_blocks(b))
+ bch_bset_init_next(&b->keys, write_block(b),
+ bset_magic(&b->c->sb));
+
+}
+
/* Btree key manipulation */
void bkey_put(struct cache_set *c, struct bkey *k)
@@ -352,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl)
btree_complete_write(b, w);
if (btree_node_dirty(b))
- queue_delayed_work(btree_io_wq, &b->work,
- msecs_to_jiffies(30000));
+ schedule_delayed_work(&b->work, 30 * HZ);
closure_return_with_destructor(cl, btree_node_write_unlock);
}
@@ -442,10 +444,12 @@ static void do_btree_node_write(struct btree *b)
}
}
-void bch_btree_node_write(struct btree *b, struct closure *parent)
+void __bch_btree_node_write(struct btree *b, struct closure *parent)
{
struct bset *i = btree_bset_last(b);
+ lockdep_assert_held(&b->write_lock);
+
trace_bcache_btree_write(b);
BUG_ON(current->bio_list);
@@ -469,23 +473,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
b->written += set_blocks(i, block_bytes(b->c));
+}
- /* If not a leaf node, always sort */
- if (b->level && b->keys.nsets)
- bch_btree_sort(&b->keys, &b->c->sort);
- else
- bch_btree_sort_lazy(&b->keys, &b->c->sort);
+void bch_btree_node_write(struct btree *b, struct closure *parent)
+{
+ unsigned nsets = b->keys.nsets;
+
+ lockdep_assert_held(&b->lock);
+
+ __bch_btree_node_write(b, parent);
/*
* do verify if there was more than one set initially (i.e. we did a
* sort) and we sorted down to a single set:
*/
- if (i != b->keys.set->data && !b->keys.nsets)
+ if (nsets && !b->keys.nsets)
bch_btree_verify(b);
- if (b->written < btree_blocks(b))
- bch_bset_init_next(&b->keys, write_block(b),
- bset_magic(&b->c->sb));
+ bch_btree_init_next(b);
}
static void bch_btree_node_write_sync(struct btree *b)
@@ -493,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b)
struct closure cl;
closure_init_stack(&cl);
+
+ mutex_lock(&b->write_lock);
bch_btree_node_write(b, &cl);
+ mutex_unlock(&b->write_lock);
+
closure_sync(&cl);
}
@@ -501,11 +510,10 @@ static void btree_node_write_work(struct work_struct *w)
{
struct btree *b = container_of(to_delayed_work(w), struct btree, work);
- rw_lock(true, b, b->level);
-
+ mutex_lock(&b->write_lock);
if (btree_node_dirty(b))
- bch_btree_node_write(b, NULL);
- rw_unlock(true, b);
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
}
static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
@@ -513,11 +521,13 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
struct bset *i = btree_bset_last(b);
struct btree_write *w = btree_current_write(b);
+ lockdep_assert_held(&b->write_lock);
+
BUG_ON(!b->written);
BUG_ON(!i->keys);
if (!btree_node_dirty(b))
- queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
+ schedule_delayed_work(&b->work, 30 * HZ);
set_btree_node_dirty(b);
@@ -548,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
#define mca_reserve(c) (((c->root && c->root->level) \
? c->root->level : 1) * 8 + 16)
#define mca_can_free(c) \
- max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
+ max_t(int, 0, c->btree_cache_used - mca_reserve(c))
static void mca_data_free(struct btree *b)
{
@@ -556,7 +566,7 @@ static void mca_data_free(struct btree *b)
bch_btree_keys_free(&b->keys);
- b->c->bucket_cache_used--;
+ b->c->btree_cache_used--;
list_move(&b->list, &b->c->btree_cache_freed);
}
@@ -581,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
ilog2(b->c->btree_pages),
btree_order(k)),
gfp)) {
- b->c->bucket_cache_used++;
+ b->c->btree_cache_used++;
list_move(&b->list, &b->c->btree_cache);
} else {
list_move(&b->list, &b->c->btree_cache_freed);
@@ -597,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
init_rwsem(&b->lock);
lockdep_set_novalidate_class(&b->lock);
+ mutex_init(&b->write_lock);
+ lockdep_set_novalidate_class(&b->write_lock);
INIT_LIST_HEAD(&b->list);
INIT_DELAYED_WORK(&b->work, btree_node_write_work);
b->c = c;
@@ -630,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
up(&b->io_mutex);
}
+ mutex_lock(&b->write_lock);
if (btree_node_dirty(b))
- bch_btree_node_write_sync(b);
+ __bch_btree_node_write(b, &cl);
+ mutex_unlock(&b->write_lock);
+
+ closure_sync(&cl);
/* wait for any in flight btree write */
down(&b->io_mutex);
@@ -654,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
if (c->shrinker_disabled)
return SHRINK_STOP;
- if (c->try_harder)
+ if (c->btree_cache_alloc_lock)
return SHRINK_STOP;
/* Return -1 if we can't do anything right now */
@@ -686,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
}
}
- for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
+ for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
if (list_empty(&c->btree_cache))
goto out;
@@ -715,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
if (c->shrinker_disabled)
return 0;
- if (c->try_harder)
+ if (c->btree_cache_alloc_lock)
return 0;
return mca_can_free(c) * c->btree_pages;
@@ -819,17 +835,30 @@ out:
return b;
}
-static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
+static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+{
+ struct task_struct *old;
+
+ old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+ if (old && old != current) {
+ if (op)
+ prepare_to_wait(&c->btree_cache_wait, &op->wait,
+ TASK_UNINTERRUPTIBLE);
+ return -EINTR;
+ }
+
+ return 0;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+ struct bkey *k)
{
struct btree *b;
trace_bcache_btree_cache_cannibalize(c);
- if (!c->try_harder) {
- c->try_harder = current;
- c->try_harder_start = local_clock();
- } else if (c->try_harder != current)
- return ERR_PTR(-ENOSPC);
+ if (mca_cannibalize_lock(c, op))
+ return ERR_PTR(-EINTR);
list_for_each_entry_reverse(b, &c->btree_cache, list)
if (!mca_reap(b, btree_order(k), false))
@@ -839,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
if (!mca_reap(b, btree_order(k), true))
return b;
+ WARN(1, "btree cache cannibalize failed\n");
return ERR_PTR(-ENOMEM);
}
@@ -850,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
*/
static void bch_cannibalize_unlock(struct cache_set *c)
{
- if (c->try_harder == current) {
- bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
- c->try_harder = NULL;
- wake_up(&c->try_wait);
+ if (c->btree_cache_alloc_lock == current) {
+ c->btree_cache_alloc_lock = NULL;
+ wake_up(&c->btree_cache_wait);
}
}
-static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
+static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+ struct bkey *k, int level)
{
struct btree *b;
@@ -920,7 +950,7 @@ err:
if (b)
rw_unlock(true, b);
- b = mca_cannibalize(c, k);
+ b = mca_cannibalize(c, op, k);
if (!IS_ERR(b))
goto out;
@@ -936,8 +966,8 @@ err:
* The btree node will have either a read or a write lock held, depending on
* level and op->lock.
*/
-struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
- int level, bool write)
+struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
+ struct bkey *k, int level, bool write)
{
int i = 0;
struct btree *b;
@@ -951,7 +981,7 @@ retry:
return ERR_PTR(-EAGAIN);
mutex_lock(&c->bucket_lock);
- b = mca_alloc(c, k, level);
+ b = mca_alloc(c, op, k, level);
mutex_unlock(&c->bucket_lock);
if (!b)
@@ -997,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
struct btree *b;
mutex_lock(&c->bucket_lock);
- b = mca_alloc(c, k, level);
+ b = mca_alloc(c, NULL, k, level);
mutex_unlock(&c->bucket_lock);
if (!IS_ERR_OR_NULL(b)) {
@@ -1010,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
static void btree_node_free(struct btree *b)
{
- unsigned i;
-
trace_bcache_btree_node_free(b);
BUG_ON(b == b->c->root);
+ mutex_lock(&b->write_lock);
+
if (btree_node_dirty(b))
btree_complete_write(b, btree_current_write(b));
clear_bit(BTREE_NODE_dirty, &b->flags);
+ mutex_unlock(&b->write_lock);
+
cancel_delayed_work(&b->work);
mutex_lock(&b->c->bucket_lock);
-
- for (i = 0; i < KEY_PTRS(&b->key); i++) {
- BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
-
- bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
- PTR_BUCKET(b->c, &b->key, i));
- }
-
bch_bucket_free(b->c, &b->key);
mca_bucket_free(b);
mutex_unlock(&b->c->bucket_lock);
}
-struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
+struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+ int level)
{
BKEY_PADDED(key) k;
struct btree *b = ERR_PTR(-EAGAIN);
mutex_lock(&c->bucket_lock);
retry:
- if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.k