aboutsummaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-07-22 19:02:52 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2013-07-22 19:02:52 -0700
commitd4c90b1b9fe907da0d310008e5a769b591a14399 (patch)
treed37589ab70ada2778d315a0ad24d6e68c8615af6 /drivers/md
parent3b2f64d00c46e1e4e9bd0bb9bb12619adac27a4b (diff)
parent0878ae2db83a10894724cdeaba7ef9f1ac1c9ac8 (diff)
Merge branch 'for-3.11/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver bits from Jens Axboe: "As I mentioned in the core block pull request, due to real life circumstances the driver pull request would be late. Now it looks like -rc2 late... On the plus side, apart form the rsxx update, these are all things that I could argue could go in later in the cycle as they are fixes and not features. So even though things are late, it's not ALL bad. The pull request contains: - Updates to bcache, all bug fixes, from Kent. - A pile of drbd bug fixes (no big features this time!). - xen blk front/back fixes. - rsxx driver updates, some of them deferred form 3.10. So should be well cooked by now" * 'for-3.11/drivers' of git://git.kernel.dk/linux-block: (63 commits) bcache: Allocation kthread fixes bcache: Fix GC_SECTORS_USED() calculation bcache: Journal replay fix bcache: Shutdown fix bcache: Fix a sysfs splat on shutdown bcache: Advertise that flushes are supported bcache: check for allocation failures bcache: Fix a dumb race bcache: Use standard utility code bcache: Update email address bcache: Delete fuzz tester bcache: Document shrinker reserve better bcache: FUA fixes drbd: Allow online change of al-stripes and al-stripe-size drbd: Constants should be UPPERCASE drbd: Ignore the exit code of a fence-peer handler if it returns too late drbd: Fix rcu_read_lock balance on error path drbd: fix error return code in drbd_init() drbd: Do not sleep inside rcu bcache: Refresh usage docs ...
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/alloc.c46
-rw-r--r--drivers/md/bcache/bcache.h61
-rw-r--r--drivers/md/bcache/bset.c56
-rw-r--r--drivers/md/bcache/bset.h4
-rw-r--r--drivers/md/bcache/btree.c451
-rw-r--r--drivers/md/bcache/btree.h35
-rw-r--r--drivers/md/bcache/closure.c6
-rw-r--r--drivers/md/bcache/debug.c178
-rw-r--r--drivers/md/bcache/debug.h11
-rw-r--r--drivers/md/bcache/io.c68
-rw-r--r--drivers/md/bcache/journal.c25
-rw-r--r--drivers/md/bcache/movinggc.c24
-rw-r--r--drivers/md/bcache/request.c197
-rw-r--r--drivers/md/bcache/request.h2
-rw-r--r--drivers/md/bcache/super.c171
-rw-r--r--drivers/md/bcache/sysfs.c68
-rw-r--r--drivers/md/bcache/trace.c47
-rw-r--r--drivers/md/bcache/util.c17
-rw-r--r--drivers/md/bcache/util.h6
-rw-r--r--drivers/md/bcache/writeback.c133
-rw-r--r--drivers/md/bcache/writeback.h64
21 files changed, 869 insertions, 801 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 048f2947e08..e45f5575fd4 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,7 +63,10 @@
#include "bcache.h"
#include "btree.h"
+#include <linux/freezer.h>
+#include <linux/kthread.h>
#include <linux/random.h>
+#include <trace/events/bcache.h>
#define MAX_IN_FLIGHT_DISCARDS 8U
@@ -151,7 +154,7 @@ static void discard_finish(struct work_struct *w)
mutex_unlock(&ca->set->bucket_lock);
closure_wake_up(&ca->set->bucket_wait);
- wake_up(&ca->set->alloc_wait);
+ wake_up_process(ca->alloc_thread);
closure_put(&ca->set->cl);
}
@@ -350,38 +353,30 @@ static void invalidate_buckets(struct cache *ca)
break;
}
- pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
- fifo_used(&ca->free), ca->free.size,
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->unused), ca->unused.size);
+ trace_bcache_alloc_invalidate(ca);
}
#define allocator_wait(ca, cond) \
do { \
- DEFINE_WAIT(__wait); \
- \
while (1) { \
- prepare_to_wait(&ca->set->alloc_wait, \
- &__wait, TASK_INTERRUPTIBLE); \
+ set_current_state(TASK_INTERRUPTIBLE); \
if (cond) \
break; \
\
mutex_unlock(&(ca)->set->bucket_lock); \
- if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
- finish_wait(&ca->set->alloc_wait, &__wait); \
- closure_return(cl); \
- } \
+ if (kthread_should_stop()) \
+ return 0; \
\
+ try_to_freeze(); \
schedule(); \
mutex_lock(&(ca)->set->bucket_lock); \
} \
- \
- finish_wait(&ca->set->alloc_wait, &__wait); \
+ __set_current_state(TASK_RUNNING); \
} while (0)
-void bch_allocator_thread(struct closure *cl)
+static int bch_allocator_thread(void *arg)
{
- struct cache *ca = container_of(cl, struct cache, alloc);
+ struct cache *ca = arg;
mutex_lock(&ca->set->bucket_lock);
@@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
{
long r = -1;
again:
- wake_up(&ca->set->alloc_wait);
+ wake_up_process(ca->alloc_thread);
if (fifo_used(&ca->free) > ca->watermark[watermark] &&
fifo_pop(&ca->free, r)) {
@@ -476,9 +471,7 @@ again:
return r;
}
- pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
- atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
- fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+ trace_bcache_alloc_fail(ca);
if (cl) {
closure_wait(&ca->set->bucket_wait, cl);
@@ -552,6 +545,17 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
/* Init */
+int bch_cache_allocator_start(struct cache *ca)
+{
+ struct task_struct *k = kthread_run(bch_allocator_thread,
+ ca, "bcache_allocator");
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ ca->alloc_thread = k;
+ return 0;
+}
+
void bch_cache_allocator_exit(struct cache *ca)
{
struct discard *d;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d3e15b42a4a..b39f6f0b45f 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -178,7 +178,6 @@
#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
#include <linux/bio.h>
-#include <linux/blktrace_api.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mutex.h>
@@ -388,8 +387,6 @@ struct keybuf_key {
typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
struct keybuf {
- keybuf_pred_fn *key_predicate;
-
struct bkey last_scanned;
spinlock_t lock;
@@ -437,9 +434,12 @@ struct bcache_device {
/* If nonzero, we're detaching/unregistering from cache set */
atomic_t detaching;
+ int flush_done;
+
+ uint64_t nr_stripes;
+ unsigned stripe_size_bits;
+ atomic_t *stripe_sectors_dirty;
- atomic_long_t sectors_dirty;
- unsigned long sectors_dirty_gc;
unsigned long sectors_dirty_last;
long sectors_dirty_derivative;
@@ -531,6 +531,7 @@ struct cached_dev {
unsigned sequential_merge:1;
unsigned verify:1;
+ unsigned partial_stripes_expensive:1;
unsigned writeback_metadata:1;
unsigned writeback_running:1;
unsigned char writeback_percent;
@@ -565,8 +566,7 @@ struct cache {
unsigned watermark[WATERMARK_MAX];
- struct closure alloc;
- struct workqueue_struct *alloc_workqueue;
+ struct task_struct *alloc_thread;
struct closure prio;
struct prio_set *disk_buckets;
@@ -664,13 +664,9 @@ struct gc_stat {
* CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
* we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
* flushing dirty data).
- *
- * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down
- * the allocation thread.
*/
#define CACHE_SET_UNREGISTERING 0
#define CACHE_SET_STOPPING 1
-#define CACHE_SET_STOPPING_2 2
struct cache_set {
struct closure cl;
@@ -703,9 +699,6 @@ struct cache_set {
/* For the btree cache */
struct shrinker shrink;
- /* For the allocator itself */
- wait_queue_head_t alloc_wait;
-
/* For the btree cache and anything allocation related */
struct mutex bucket_lock;
@@ -823,10 +816,9 @@ struct cache_set {
/*
* A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - this is a single element mempool for btree_read_work()
+ * on the stack - have to dynamically allocate them
*/
- struct mutex fill_lock;
- struct btree_iter *fill_iter;
+ mempool_t *fill_iter;
/*
* btree_sort() is a merge sort and requires temporary space - single
@@ -834,6 +826,7 @@ struct cache_set {
*/
struct mutex sort_lock;
struct bset *sort;
+ unsigned sort_crit_factor;
/* List of buckets we're currently writing data to */
struct list_head data_buckets;
@@ -906,8 +899,6 @@ static inline unsigned local_clock_us(void)
return local_clock() >> 10;
}
-#define MAX_BSETS 4U
-
#define BTREE_PRIO USHRT_MAX
#define INITIAL_PRIO 32768
@@ -1112,23 +1103,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}
-/* Blktrace macros */
-
-#define blktrace_msg(c, fmt, ...) \
-do { \
- struct request_queue *q = bdev_get_queue(c->bdev); \
- if (q) \
- blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define blktrace_msg_all(s, fmt, ...) \
-do { \
- struct cache *_c; \
- unsigned i; \
- for_each_cache(_c, (s), i) \
- blktrace_msg(_c, fmt, ##__VA_ARGS__); \
-} while (0)
-
static inline void cached_dev_put(struct cached_dev *dc)
{
if (atomic_dec_and_test(&dc->count))
@@ -1173,10 +1147,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
static struct kobj_attribute ksysfs_##n = \
__ATTR(n, S_IWUSR|S_IRUSR, show, store)
-/* Forward declarations */
+static inline void wake_up_allocators(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ for_each_cache(ca, c, i)
+ wake_up_process(ca->alloc_thread);
+}
-void bch_writeback_queue(struct cached_dev *);
-void bch_writeback_add(struct cached_dev *, unsigned);
+/* Forward declarations */
void bch_count_io_errors(struct cache *, int, const char *);
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
@@ -1193,7 +1173,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
uint8_t bch_inc_gen(struct cache *, struct bucket *);
void bch_rescale_priorities(struct cache_set *, int);
bool bch_bucket_add_unused(struct cache *, struct bucket *);
-void bch_allocator_thread(struct closure *);
long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
void bch_bucket_free(struct cache_set *, struct bkey *);
@@ -1241,9 +1220,9 @@ void bch_cache_set_stop(struct cache_set *);
struct cache_set *bch_cache_set_alloc(struct cache_sb *);
void bch_btree_cache_free(struct cache_set *);
int bch_btree_cache_alloc(struct cache_set *);
-void bch_cached_dev_writeback_init(struct cached_dev *);
void bch_moving_init_cache_set(struct cache_set *);
+int bch_cache_allocator_start(struct cache *ca);
void bch_cache_allocator_exit(struct cache *ca);
int bch_cache_allocator_init(struct cache *ca);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 1d27d3af325..8010eed06a5 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
{
unsigned i;
+ char buf[80];
if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
goto bad;
@@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
return false;
bad:
- cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
+ bch_bkey_to_text(buf, sizeof(buf), k);
+ cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
return true;
}
@@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
#ifdef CONFIG_BCACHE_EDEBUG
bug:
mutex_unlock(&b->c->bucket_lock);
- btree_bug(b,
+
+ {
+ char buf[80];
+
+ bch_bkey_to_text(buf, sizeof(buf), k);
+ btree_bug(b,
"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
- pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
- g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+ buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+ g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+ }
return true;
#endif
}
@@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
new->sets->size = 0;
}
+#define SORT_CRIT (4096 / sizeof(uint64_t))
+
void bch_btree_sort_lazy(struct btree *b)
{
- if (b->nsets) {
- unsigned i, j, keys = 0, total;
+ unsigned crit = SORT_CRIT;
+ int i;
- for (i = 0; i <= b->nsets; i++)
- keys += b->sets[i].data->keys;
-
- total = keys;
+ /* Don't sort if nothing to do */
+ if (!b->nsets)
+ goto out;
- for (j = 0; j < b->nsets; j++) {
- if (keys * 2 < total ||
- keys < 1000) {
- bch_btree_sort_partial(b, j);
- return;
- }
+ /* If not a leaf node, always sort */
+ if (b->level) {
+ bch_btree_sort(b);
+ return;
+ }
- keys -= b->sets[j].data->keys;
- }
+ for (i = b->nsets - 1; i >= 0; --i) {
+ crit *= b->c->sort_crit_factor;
- /* Must sort if b->nsets == 3 or we'll overflow */
- if (b->nsets >= (MAX_BSETS - 1) - b->level) {
- bch_btree_sort(b);
+ if (b->sets[i].data->keys < crit) {
+ bch_btree_sort_partial(b, i);
return;
}
}
+ /* Sort if we'd overflow */
+ if (b->nsets + 1 == MAX_BSETS) {
+ bch_btree_sort(b);
+ return;
+ }
+
+out:
bset_build_written_tree(b);
}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 57a9cff4154..ae115a253d7 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,6 +1,8 @@
#ifndef _BCACHE_BSET_H
#define _BCACHE_BSET_H
+#include <linux/slab.h>
+
/*
* BKEYS:
*
@@ -142,6 +144,8 @@
/* Btree key comparison/iteration */
+#define MAX_BSETS 4U
+
struct btree_iter {
size_t size, used;
struct btree_iter_set {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e6..ee372884c40 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -24,6 +24,7 @@
#include "btree.h"
#include "debug.h"
#include "request.h"
+#include "writeback.h"
#include <linux/slab.h>
#include <linux/bitops.h>
@@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
return crc ^ 0xffffffffffffffffULL;
}
-static void btree_bio_endio(struct bio *bio, int error)
+static void bch_btree_node_read_done(struct btree *b)
{
- struct closure *cl = bio->bi_private;
- struct btree *b = container_of(cl, struct btree, io.cl);
-
- if (error)
- set_btree_node_io_error(b);
-
- bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
- ? "writing btree" : "reading btree");
- closure_put(cl);
-}
-
-static void btree_bio_init(struct btree *b)
-{
- BUG_ON(b->bio);
- b->bio = bch_bbio_alloc(b->c);
-
- b->bio->bi_end_io = btree_bio_endio;
- b->bio->bi_private = &b->io.cl;
-}
-
-void bch_btree_read_done(struct closure *cl)
-{
- struct btree *b = container_of(cl, struct btree, io.cl);
- struct bset *i = b->sets[0].data;
- struct btree_iter *iter = b->c->fill_iter;
const char *err = "bad btree header";
- BUG_ON(b->nsets || b->written);
-
- bch_bbio_free(b->bio, b->c);
- b->bio = NULL;
+ struct bset *i = b->sets[0].data;
+ struct btree_iter *iter;
- mutex_lock(&b->c->fill_lock);
+ iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+ iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
- if (btree_node_io_error(b) ||
- !i->seq)
+ if (!i->seq)
goto err;
for (;
@@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl)
if (b->written < btree_blocks(b))
bch_bset_init_next(b);
out:
-
- mutex_unlock(&b->c->fill_lock);
-
- spin_lock(&b->c->btree_read_time_lock);
- bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
- spin_unlock(&b->c->btree_read_time_lock);
-
- smp_wmb(); /* read_done is our write lock */
- set_btree_node_read_done(b);
-
- closure_return(cl);
+ mempool_free(iter, b->c->fill_iter);
+ return;
err:
set_btree_node_io_error(b);
bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@@ -247,48 +212,69 @@ err:
goto out;
}
-void bch_btree_read(struct btree *b)
+static void btree_node_read_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ closure_put(cl);
+}
+
+void bch_btree_node_read(struct btree *b)
{
- BUG_ON(b->nsets || b->written);
+ uint64_t start_time = local_clock();
+ struct closure cl;
+ struct bio *bio;
+
+ trace_bcache_btree_read(b);
+
+ closure_init_stack(&cl);
+
+ bio = bch_bbio_alloc(b->c);
+ bio->bi_rw = REQ_META|READ_SYNC;
+ bio->bi_size = KEY_SIZE(&b->key) << 9;
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = &cl;
+
+ bch_bio_map(bio, b->sets[0].data);
+
+ bch_submit_bbio(bio, b->c, &b->key, 0);
+ closure_sync(&cl);
- if (!closure_trylock(&b->io.cl, &b->c->cl))
- BUG();
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ set_btree_node_io_error(b);
- b->io_start_time = local_clock();
+ bch_bbio_free(bio, b->c);
- btree_bio_init(b);
- b->bio->bi_rw = REQ_META|READ_SYNC;
- b->bio->bi_size = KEY_SIZE(&b->key) << 9;
+ if (btree_node_io_error(b))
+ goto err;
- bch_bio_map(b->bio, b->sets[0].data);
+ bch_btree_node_read_done(b);
- pr_debug("%s", pbtree(b));
- trace_bcache_btree_read(b->bio);
- bch_submit_bbio(b->bio, b->c, &b->key, 0);
+ spin_lock(&b->c->btree_read_time_lock);
+ bch_time_stats_update(&b->c->btree_read_time, start_time);
+ spin_unlock(&b->c->btree_read_time_lock);
- continue_at(&b->io.cl, bch_btree_read_done, system_wq);
+ return;
+err:
+ bch_cache_set_error(b->c, "io error reading bucket %lu",
+ PTR_BUCKET_NR(b->c, &b->key, 0));
}
static void btree_complete_write(struct btree *b, struct btree_write *w)
{
if (w->prio_blocked &&
!atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
- wake_up(&b->c->alloc_wait);
+ wake_up_allocators(b->c);
if (w->journal) {
atomic_dec_bug(w->journal);
__closure_wake_up(&b->c->journal.wait);
}
- if (w->owner)
- closure_put(w->owner);
-
w->prio_blocked = 0;
w->journal = NULL;
- w->owner = NULL;
}
-static void __btree_write_done(struct closure *cl)
+static void __btree_node_write_done(struct closure *cl)
{
struct btree *b = container_of(cl, struct btree, io.cl);
struct btree_write *w = btree_prev_write(b);
@@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl)
closure_return(cl);
}
-static void btree_write_done(struct closure *cl)
+static void btree_node_write_done(struct closure *cl)
{
struct btree *b = container_of(cl, struct btree, io.cl);
struct bio_vec *bv;
@@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl)
__bio_for_each_segment(bv, b->bio, n, 0)
__free_page(bv->bv_page);
- __btree_write_done(cl);
+ __btree_node_write_done(cl);
}
-static void do_btree_write(struct btree *b)
+static void btree_node_write_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ struct btree *b = container_of(cl, struct btree, io.cl);
+
+ if (error)
+ set_btree_node_io_error(b);
+
+ bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+ closure_put(cl);
+}
+
+static void do_btree_node_write(struct btree *b)
{
struct closure *cl = &b->io.cl;
struct bset *i = b->sets[b->nsets].data;
@@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b)
i->version = BCACHE_BSET_VERSION;
i->csum = btree_csum_set(b, i);
- btree_bio_init(b);
- b->bio->bi_rw = REQ_META|WRITE_SYNC;
- b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
+ BUG_ON(b->bio);
+ b->bio = bch_bbio_alloc(b->c);
+
+ b->bio->bi_end_io = btree_node_write_endio;
+ b->bio->bi_private = &b->io.cl;
+ b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
+ b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
bch_bio_map(b->bio, i);
+ /*
+ * If we're appending to a leaf node, we don't technically need FUA -
+ * this write just needs to be persisted before the next journal write,
+ * which will be marked FLUSH|FUA.
+ *
+ * Similarly if we're writing a new btree root - the pointer is going to
+ * be in the next journal entry.
+ *
+ * But if we're writing a new btree node (that isn't a root) or
+ * appending to a non leaf btree node, we need either FUA or a flush
+ * when we write the parent with the new pointer. FUA is cheaper than a
+ * flush, and writes appending to leaf nodes aren't blocking anything so
+ * just make all btree node writes FUA to keep things sane.
+ */
+
bkey_copy(&k.key, &b->key);
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
- if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
+ if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
int j;
struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b)
memcpy(page_address(bv->bv_page),
base + j * PAGE_SIZE, PAGE_SIZE);
- trace_bcache_btree_write(b->bio);
bch_submit_bbio(b->bio, b->c, &k.key, 0);
- continue_at(cl, btree_write_done, NULL);
+ continue_at(cl, btree_node_write_done, NULL);
} else {
b->bio->bi_vcnt = 0;
bch_bio_map(b->bio, i);
- trace_bcache_btree_write(b->bio);
bch_submit_bbio(b->bio, b->c, &k.key, 0);
closure_sync(cl);
- __btree_write_done(cl);
+ __btree_node_write_done(cl);
}
}
-static void __btree_write(struct btree *b)
+void bch_btree_node_write(struct btree *b, struct closure *parent)
{
struct bset *i = b->sets[b->nsets].data;
+ trace_bcache_btree_write(b);
+
BUG_ON(current->bio_list);
+ BUG_ON(b->written >= btree_blocks(b));
+ BUG_ON(b->written && !i->keys);
+ BUG_ON(b->sets->data->seq != i->seq);
+ bch_check_key_order(b, i);
- closure_lock(&b->io, &b->c->cl);
cancel_delayed_work(&b->work);
+ /* If caller isn't waiting for write, parent refcount is cache set */
+ closure_lock(&b->io, parent ?: &b->c->cl);
+
clear_bit(BTREE_NODE_dirty, &b->flags);
change_bit(BTREE_NODE_write_idx, &b->flags);
- bch_check_key_order(b, i);
- BUG_ON(b->written && !i->keys);
-
- do_btree_write(b);
-
- pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
+ do_btree_node_write(b);
b->written += set_blocks(i, b->c);
atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
@@ -387,37 +405,31 @@ static void __btree_write(struct btree *b)
bch_bset_init_next(b);
}
-static void btree_write_work(struct work_struct *w)
+static void btree_node_write_work(struct work_struct *w)
{
struct btree *b = container_of(to_delayed_work(w), struct btree, work);
- down_write(&b->lock);
+ rw_lock(true, b, b->level);
if (btree_node_dirty(b))
- __btree_write(b);
- up_write(&b->lock);
+ bch_btree_node_write(b, NULL);
+ rw_unlock(true, b);
}
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
+static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
{
struct bset *i = b->sets[b->nsets].data;
struct btree_write *w = btree_current_write(b);
- BUG_ON(b->written &&
- (b->written >= btree_blocks(b) ||
- i->seq != b->sets[0].data->seq ||
- !i->keys));
+ BUG_ON(!b->written);
+ BUG_ON(!i->keys);
- if (!btree_node_dirty(b)) {
- set_btree_node_dirty(b);
- queue_delayed_work(btree_io_wq, &b->work,
- msecs_to_jiffies(30000));
- }
+ if (!btree_node_dirty(b))
+ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
- w->prio_blocked += b->prio_blocked;
- b->prio_blocked = 0;
+ set_btree_node_dirty(b);
- if (op && op->journal && !b->level) {
+ if (op && op->journal) {
if (w->journal &&
journal_pin_cmp(b->c, w, op)) {
atomic_dec_bug(w->journal);
@@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
}
}
- if (current->bio_list)
- return;
-
/* Force write if set is too big */
- if (now ||
- b->level ||
- set_bytes(i) > PAGE_SIZE - 48) {
- if (op && now) {
- /* Must wait on multiple writes */
- BUG_ON(w->owner);
- w->owner = &op->cl;
- closure_get(&op->cl);
- }
-
- __btree_write(b);
- }
- BUG_ON(!b->written);
+ if (set_bytes(i) > PAGE_SIZE - 48 &&
+ !current->bio_list)
+ bch_btree_node_write(b, NULL);
}
/*
@@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
init_rwsem(&b->lock);
lockdep_set_novalidate_class(&b->lock);
INIT_LIST_HEAD(&b->list);
- INIT_DELAYED_WORK(&b->work, btree_write_work);
+ INIT_DELAYED_WORK(&b->work, btree_node_write_work);
b->c = c;
closure_init_unlocked(&b->io);
@@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
if (cl && btree_node_dirty(b))
- bch_btree_write(b, true, NULL);
+ bch_btree_node_write(b, NULL);
if (cl)
closure_wait_event_async(&b->io.wait, cl,
@@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
else if (!mutex_trylock(&c->bucket_lock))
return -1;
+ /*
+ * It's _really_ critical that we don't free too many btree nodes - we
+ * have to always leave ourselves a reserve. The reserve is how we
+ * guarantee that allocating memory for a new btree node can always
+ * succeed, so that inserting keys into the btree can always succeed and
+ * IO can always make forward progress:
+ */
nr /= c->btree_pages;
nr = min_t(unsigned long, nr, mca_can_free(c));
@@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
int ret = -ENOMEM;
struct btree *i;
+ trace_bcache_btree_cache_cannibalize(c);
+
if (!cl)
return ERR_PTR(-ENOMEM);
@@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
return ERR_PTR(-EAGAIN);
}
- /* XXX: tracepoint */
c->try_harder = cl;
c->try_harder_start = local_clock();
retry:
@@ -905,6 +912,9 @@ retry:
b = mca_find(c, k);
if (!b) {
+ if (current->bio_list)
+ return ERR_PTR(-EAGAIN);
+
mutex_lock(&c->bucket_lock);
b = mca_alloc(c, k, level, &op->cl);
mutex_unlock(&c->bucket_lock);
@@ -914,7 +924,7 @@ retry:
if (IS_ERR(b))
return b;
- bch_btree_read(b);
+ bch_btree_node_read(b);
if (!write)
downgrade_write(&b->lock);
@@ -937,15 +947,12 @@ retry:
for (; i <= b->nsets; i++)
prefetch(b->sets[i].data);
- if (!closure_wait_event(&b->io.wait, &op->cl,
- btree_node_read_done(b))) {
- rw_unlock(write, b);
- b = ERR_PTR(-EAGAIN);
- } else if (btree_node_io_error(b)) {
+ if (btree_node_io_error(b)) {
rw_unlock(write, b);
- b = ERR_PTR(-EIO);
- } else
- BUG_ON(!b->written);
+ return ERR_PTR(-EIO);
+ }
+
+ BUG_ON(!b->written);
return b;
}
@@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
mutex_unlock(&c->bucket_lock);
if (!IS_ERR_OR_NULL(b)) {
- bch_btree_read(b);
+ bch_btree_node_read(b);
rw_unlock(true, b);
}
}
@@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
{
unsigned i;
+ trace_bcache_btree_node_free(b);
+
/*
* The BUG_ON() in btree_node_get() implies that we must have a write
* lock on parent to free or even invalidate a node
*/
BUG_ON(op->lock <= b->level);
BUG_ON(b == b->c->root);
- pr_debug("bucket %s", pbtree(b));
if (btree_node_dirty(b))
btree_complete_write(b, btree_current_write(b));
clear_bit(BTREE_NODE_dirty, &b->flags);
- if (b->prio_blocked &&
- !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
- wake_up(&b->c->alloc_wait);
-
- b->prio_blocked = 0;
-
cancel_delayed_work(&b->work);
mutex_lock(&b->c->bucket_lock);
@@ -1028,17 +1030,20 @@ retry:
goto retry;
}
- set_btree_node_read_done(b);
b->accessed = 1;
bch_bset_init_next(b);
mutex_unlock(&c->bucket_lock);
+
+ trace_bcache_btree_node_alloc(b);
return b;
err_free:
bch_bucket_free(c, &k.key);
__bkey_put(c, &k.key);
err:
mutex_unlock(&c->bucket_lock);
+
+ trace_bcache_btree_node_alloc_fail(b);
return b;
}
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
gc->nkeys++;
gc->data += KEY_SIZE(k);
- if (KEY_DIRTY(k)) {
+ if (KEY_DIRTY(k))
gc->dirty += KEY_SIZE(k);
- if (d)
- d->sectors_dirty_gc += KEY_SIZE(k);
- }
}
for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1166,14 +1168,11 @@ static struct btree