author     Linus Torvalds <torvalds@linux-foundation.org>   2013-07-22 19:02:52 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-07-22 19:02:52 -0700
commit     d4c90b1b9fe907da0d310008e5a769b591a14399 (patch)
tree       d37589ab70ada2778d315a0ad24d6e68c8615af6 /drivers/md
parent     3b2f64d00c46e1e4e9bd0bb9bb12619adac27a4b (diff)
parent     0878ae2db83a10894724cdeaba7ef9f1ac1c9ac8 (diff)
Merge branch 'for-3.11/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver bits from Jens Axboe:
"As I mentioned in the core block pull request, due to real life
circumstances the driver pull request would be late. Now it looks
like -rc2 late... On the plus side, apart from the rsxx update, these
are all things that I could argue could go in later in the cycle as
they are fixes and not features. So even though things are late, it's
not ALL bad.
The pull request contains:
- Updates to bcache, all bug fixes, from Kent.
- A pile of drbd bug fixes (no big features this time!).
- xen blk front/back fixes.
rsxx driver updates, some of them deferred from 3.10, so they should be
well cooked by now"
* 'for-3.11/drivers' of git://git.kernel.dk/linux-block: (63 commits)
bcache: Allocation kthread fixes
bcache: Fix GC_SECTORS_USED() calculation
bcache: Journal replay fix
bcache: Shutdown fix
bcache: Fix a sysfs splat on shutdown
bcache: Advertise that flushes are supported
bcache: check for allocation failures
bcache: Fix a dumb race
bcache: Use standard utility code
bcache: Update email address
bcache: Delete fuzz tester
bcache: Document shrinker reserve better
bcache: FUA fixes
drbd: Allow online change of al-stripes and al-stripe-size
drbd: Constants should be UPPERCASE
drbd: Ignore the exit code of a fence-peer handler if it returns too late
drbd: Fix rcu_read_lock balance on error path
drbd: fix error return code in drbd_init()
drbd: Do not sleep inside rcu
bcache: Refresh usage docs
...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/alloc.c     |  46
-rw-r--r--  drivers/md/bcache/bcache.h    |  61
-rw-r--r--  drivers/md/bcache/bset.c      |  56
-rw-r--r--  drivers/md/bcache/bset.h      |   4
-rw-r--r--  drivers/md/bcache/btree.c     | 451
-rw-r--r--  drivers/md/bcache/btree.h     |  35
-rw-r--r--  drivers/md/bcache/closure.c   |   6
-rw-r--r--  drivers/md/bcache/debug.c     | 178
-rw-r--r--  drivers/md/bcache/debug.h     |  11
-rw-r--r--  drivers/md/bcache/io.c        |  68
-rw-r--r--  drivers/md/bcache/journal.c   |  25
-rw-r--r--  drivers/md/bcache/movinggc.c  |  24
-rw-r--r--  drivers/md/bcache/request.c   | 197
-rw-r--r--  drivers/md/bcache/request.h   |   2
-rw-r--r--  drivers/md/bcache/super.c     | 171
-rw-r--r--  drivers/md/bcache/sysfs.c     |  68
-rw-r--r--  drivers/md/bcache/trace.c     |  47
-rw-r--r--  drivers/md/bcache/util.c      |  17
-rw-r--r--  drivers/md/bcache/util.h      |   6
-rw-r--r--  drivers/md/bcache/writeback.c | 133
-rw-r--r--  drivers/md/bcache/writeback.h |  64
21 files changed, 869 insertions(+), 801 deletions(-)
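The alloc.c hunks below drop the old closure-based allocator (the CACHE_SET_STOPPING_2 / alloc_wait machinery) in favour of a dedicated kernel thread started with kthread_run() and woken with wake_up_process(). A minimal sketch of that generic kthread pattern follows; it is an editorial illustration, not bcache code, and all names (my_alloc_state, my_allocator_thread, my_allocator_start) are hypothetical:

```c
/*
 * Editorial sketch, not part of this commit: the generic "dedicated
 * allocator kthread" pattern that the alloc.c changes switch to.
 */
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>
#include <linux/err.h>

struct my_alloc_state {
	struct task_struct *alloc_thread;	/* analogous to cache->alloc_thread */
};

static int my_allocator_thread(void *arg)
{
	while (1) {
		/*
		 * Mark the task sleeping before testing the stop flag so a
		 * concurrent wake_up_process() or kthread_stop() is not lost.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			return 0;
		}

		try_to_freeze();
		schedule();	/* sleep until someone wakes this thread */

		/* ... do the actual allocation work on 'arg' here ... */
	}
}

static int my_allocator_start(struct my_alloc_state *s)
{
	struct task_struct *k = kthread_run(my_allocator_thread, s,
					    "my_allocator");
	if (IS_ERR(k))
		return PTR_ERR(k);

	s->alloc_thread = k;
	return 0;
}
```

Stopping such a thread is done with kthread_stop(), which sets the flag read by kthread_should_stop() and wakes the thread; checking that flag after set_current_state(TASK_INTERRUPTIBLE) but before schedule() is what keeps a stop or wake request from being missed.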
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 048f2947e08..e45f5575fd4 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -63,7 +63,10 @@ #include "bcache.h" #include "btree.h" +#include <linux/freezer.h> +#include <linux/kthread.h> #include <linux/random.h> +#include <trace/events/bcache.h> #define MAX_IN_FLIGHT_DISCARDS 8U @@ -151,7 +154,7 @@ static void discard_finish(struct work_struct *w) mutex_unlock(&ca->set->bucket_lock); closure_wake_up(&ca->set->bucket_wait); - wake_up(&ca->set->alloc_wait); + wake_up_process(ca->alloc_thread); closure_put(&ca->set->cl); } @@ -350,38 +353,30 @@ static void invalidate_buckets(struct cache *ca) break; } - pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", - fifo_used(&ca->free), ca->free.size, - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->unused), ca->unused.size); + trace_bcache_alloc_invalidate(ca); } #define allocator_wait(ca, cond) \ do { \ - DEFINE_WAIT(__wait); \ - \ while (1) { \ - prepare_to_wait(&ca->set->alloc_wait, \ - &__wait, TASK_INTERRUPTIBLE); \ + set_current_state(TASK_INTERRUPTIBLE); \ if (cond) \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ - if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ - finish_wait(&ca->set->alloc_wait, &__wait); \ - closure_return(cl); \ - } \ + if (kthread_should_stop()) \ + return 0; \ \ + try_to_freeze(); \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ } \ - \ - finish_wait(&ca->set->alloc_wait, &__wait); \ + __set_current_state(TASK_RUNNING); \ } while (0) -void bch_allocator_thread(struct closure *cl) +static int bch_allocator_thread(void *arg) { - struct cache *ca = container_of(cl, struct cache, alloc); + struct cache *ca = arg; mutex_lock(&ca->set->bucket_lock); @@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) { long r = -1; again: - wake_up(&ca->set->alloc_wait); + wake_up_process(ca->alloc_thread); if (fifo_used(&ca->free) > ca->watermark[watermark] && fifo_pop(&ca->free, r)) { @@ -476,9 +471,7 @@ again: return r; } - pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", - atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), - fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + trace_bcache_alloc_fail(ca); if (cl) { closure_wait(&ca->set->bucket_wait, cl); @@ -552,6 +545,17 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, /* Init */ +int bch_cache_allocator_start(struct cache *ca) +{ + struct task_struct *k = kthread_run(bch_allocator_thread, + ca, "bcache_allocator"); + if (IS_ERR(k)) + return PTR_ERR(k); + + ca->alloc_thread = k; + return 0; +} + void bch_cache_allocator_exit(struct cache *ca) { struct discard *d; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d3e15b42a4a..b39f6f0b45f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -178,7 +178,6 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ #include <linux/bio.h> -#include <linux/blktrace_api.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mutex.h> @@ -388,8 +387,6 @@ struct keybuf_key { typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); struct keybuf { - keybuf_pred_fn *key_predicate; - struct bkey last_scanned; spinlock_t lock; @@ -437,9 +434,12 @@ struct bcache_device { /* If nonzero, we're detaching/unregistering from cache set */ atomic_t detaching; + int flush_done; + + uint64_t nr_stripes; + unsigned stripe_size_bits; + atomic_t 
*stripe_sectors_dirty; - atomic_long_t sectors_dirty; - unsigned long sectors_dirty_gc; unsigned long sectors_dirty_last; long sectors_dirty_derivative; @@ -531,6 +531,7 @@ struct cached_dev { unsigned sequential_merge:1; unsigned verify:1; + unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; unsigned writeback_running:1; unsigned char writeback_percent; @@ -565,8 +566,7 @@ struct cache { unsigned watermark[WATERMARK_MAX]; - struct closure alloc; - struct workqueue_struct *alloc_workqueue; + struct task_struct *alloc_thread; struct closure prio; struct prio_set *disk_buckets; @@ -664,13 +664,9 @@ struct gc_stat { * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. * flushing dirty data). - * - * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down - * the allocation thread. */ #define CACHE_SET_UNREGISTERING 0 #define CACHE_SET_STOPPING 1 -#define CACHE_SET_STOPPING_2 2 struct cache_set { struct closure cl; @@ -703,9 +699,6 @@ struct cache_set { /* For the btree cache */ struct shrinker shrink; - /* For the allocator itself */ - wait_queue_head_t alloc_wait; - /* For the btree cache and anything allocation related */ struct mutex bucket_lock; @@ -823,10 +816,9 @@ struct cache_set { /* * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - this is a single element mempool for btree_read_work() + * on the stack - have to dynamically allocate them */ - struct mutex fill_lock; - struct btree_iter *fill_iter; + mempool_t *fill_iter; /* * btree_sort() is a merge sort and requires temporary space - single @@ -834,6 +826,7 @@ struct cache_set { */ struct mutex sort_lock; struct bset *sort; + unsigned sort_crit_factor; /* List of buckets we're currently writing data to */ struct list_head data_buckets; @@ -906,8 +899,6 @@ static inline unsigned local_clock_us(void) return local_clock() >> 10; } -#define MAX_BSETS 4U - #define BTREE_PRIO USHRT_MAX #define INITIAL_PRIO 32768 @@ -1112,23 +1103,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k) atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); } -/* Blktrace macros */ - -#define blktrace_msg(c, fmt, ...) \ -do { \ - struct request_queue *q = bdev_get_queue(c->bdev); \ - if (q) \ - blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \ -} while (0) - -#define blktrace_msg_all(s, fmt, ...) 
\ -do { \ - struct cache *_c; \ - unsigned i; \ - for_each_cache(_c, (s), i) \ - blktrace_msg(_c, fmt, ##__VA_ARGS__); \ -} while (0) - static inline void cached_dev_put(struct cached_dev *dc) { if (atomic_dec_and_test(&dc->count)) @@ -1173,10 +1147,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b) static struct kobj_attribute ksysfs_##n = \ __ATTR(n, S_IWUSR|S_IRUSR, show, store) -/* Forward declarations */ +static inline void wake_up_allocators(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + for_each_cache(ca, c, i) + wake_up_process(ca->alloc_thread); +} -void bch_writeback_queue(struct cached_dev *); -void bch_writeback_add(struct cached_dev *, unsigned); +/* Forward declarations */ void bch_count_io_errors(struct cache *, int, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, @@ -1193,7 +1173,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); bool bch_bucket_add_unused(struct cache *, struct bucket *); -void bch_allocator_thread(struct closure *); long bch_bucket_alloc(struct cache *, unsigned, struct closure *); void bch_bucket_free(struct cache_set *, struct bkey *); @@ -1241,9 +1220,9 @@ void bch_cache_set_stop(struct cache_set *); struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); -void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); +int bch_cache_allocator_start(struct cache *ca); void bch_cache_allocator_exit(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 1d27d3af325..8010eed06a5 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) { unsigned i; + char buf[80]; if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) goto bad; @@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) return false; bad: - cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); + bch_bkey_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); return true; } @@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) #ifdef CONFIG_BCACHE_EDEBUG bug: mutex_unlock(&b->c->bucket_lock); - btree_bug(b, + + { + char buf[80]; + + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + } return true; #endif } @@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) new->sets->size = 0; } +#define SORT_CRIT (4096 / sizeof(uint64_t)) + void bch_btree_sort_lazy(struct btree *b) { - if (b->nsets) { - unsigned i, j, keys = 0, total; + unsigned crit = SORT_CRIT; + int i; - for (i = 0; i <= b->nsets; i++) - keys += b->sets[i].data->keys; - - total = keys; + /* Don't sort if nothing to do */ + if (!b->nsets) + goto out; - for (j = 0; j < b->nsets; j++) { - if 
(keys * 2 < total || - keys < 1000) { - bch_btree_sort_partial(b, j); - return; - } + /* If not a leaf node, always sort */ + if (b->level) { + bch_btree_sort(b); + return; + } - keys -= b->sets[j].data->keys; - } + for (i = b->nsets - 1; i >= 0; --i) { + crit *= b->c->sort_crit_factor; - /* Must sort if b->nsets == 3 or we'll overflow */ - if (b->nsets >= (MAX_BSETS - 1) - b->level) { - bch_btree_sort(b); + if (b->sets[i].data->keys < crit) { + bch_btree_sort_partial(b, i); return; } } + /* Sort if we'd overflow */ + if (b->nsets + 1 == MAX_BSETS) { + bch_btree_sort(b); + return; + } + +out: bset_build_written_tree(b); } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 57a9cff4154..ae115a253d7 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,6 +1,8 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H +#include <linux/slab.h> + /* * BKEYS: * @@ -142,6 +144,8 @@ /* Btree key comparison/iteration */ +#define MAX_BSETS 4U + struct btree_iter { size_t size, used; struct btree_iter_set { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f04e6..ee372884c40 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -24,6 +24,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include <linux/slab.h> #include <linux/bitops.h> @@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) return crc ^ 0xffffffffffffffffULL; } -static void btree_bio_endio(struct bio *bio, int error) +static void bch_btree_node_read_done(struct btree *b) { - struct closure *cl = bio->bi_private; - struct btree *b = container_of(cl, struct btree, io.cl); - - if (error) - set_btree_node_io_error(b); - - bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) - ? 
"writing btree" : "reading btree"); - closure_put(cl); -} - -static void btree_bio_init(struct btree *b) -{ - BUG_ON(b->bio); - b->bio = bch_bbio_alloc(b->c); - - b->bio->bi_end_io = btree_bio_endio; - b->bio->bi_private = &b->io.cl; -} - -void bch_btree_read_done(struct closure *cl) -{ - struct btree *b = container_of(cl, struct btree, io.cl); - struct bset *i = b->sets[0].data; - struct btree_iter *iter = b->c->fill_iter; const char *err = "bad btree header"; - BUG_ON(b->nsets || b->written); - - bch_bbio_free(b->bio, b->c); - b->bio = NULL; + struct bset *i = b->sets[0].data; + struct btree_iter *iter; - mutex_lock(&b->c->fill_lock); + iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); + iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; - if (btree_node_io_error(b) || - !i->seq) + if (!i->seq) goto err; for (; @@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl) if (b->written < btree_blocks(b)) bch_bset_init_next(b); out: - - mutex_unlock(&b->c->fill_lock); - - spin_lock(&b->c->btree_read_time_lock); - bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); - spin_unlock(&b->c->btree_read_time_lock); - - smp_wmb(); /* read_done is our write lock */ - set_btree_node_read_done(b); - - closure_return(cl); + mempool_free(iter, b->c->fill_iter); + return; err: set_btree_node_io_error(b); bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", @@ -247,48 +212,69 @@ err: goto out; } -void bch_btree_read(struct btree *b) +static void btree_node_read_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + closure_put(cl); +} + +void bch_btree_node_read(struct btree *b) { - BUG_ON(b->nsets || b->written); + uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; + + trace_bcache_btree_read(b); + + closure_init_stack(&cl); + + bio = bch_bbio_alloc(b->c); + bio->bi_rw = REQ_META|READ_SYNC; + bio->bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = &cl; + + bch_bio_map(bio, b->sets[0].data); + + bch_submit_bbio(bio, b->c, &b->key, 0); + closure_sync(&cl); - if (!closure_trylock(&b->io.cl, &b->c->cl)) - BUG(); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_btree_node_io_error(b); - b->io_start_time = local_clock(); + bch_bbio_free(bio, b->c); - btree_bio_init(b); - b->bio->bi_rw = REQ_META|READ_SYNC; - b->bio->bi_size = KEY_SIZE(&b->key) << 9; + if (btree_node_io_error(b)) + goto err; - bch_bio_map(b->bio, b->sets[0].data); + bch_btree_node_read_done(b); - pr_debug("%s", pbtree(b)); - trace_bcache_btree_read(b->bio); - bch_submit_bbio(b->bio, b->c, &b->key, 0); + spin_lock(&b->c->btree_read_time_lock); + bch_time_stats_update(&b->c->btree_read_time, start_time); + spin_unlock(&b->c->btree_read_time_lock); - continue_at(&b->io.cl, bch_btree_read_done, system_wq); + return; +err: + bch_cache_set_error(b->c, "io error reading bucket %lu", + PTR_BUCKET_NR(b->c, &b->key, 0)); } static void btree_complete_write(struct btree *b, struct btree_write *w) { if (w->prio_blocked && !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); + wake_up_allocators(b->c); if (w->journal) { atomic_dec_bug(w->journal); __closure_wake_up(&b->c->journal.wait); } - if (w->owner) - closure_put(w->owner); - w->prio_blocked = 0; w->journal = NULL; - w->owner = NULL; } -static void __btree_write_done(struct closure *cl) +static void __btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct 
btree_write *w = btree_prev_write(b); @@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl) closure_return(cl); } -static void btree_write_done(struct closure *cl) +static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct bio_vec *bv; @@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl) __bio_for_each_segment(bv, b->bio, n, 0) __free_page(bv->bv_page); - __btree_write_done(cl); + __btree_node_write_done(cl); } -static void do_btree_write(struct btree *b) +static void btree_node_write_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io.cl); + + if (error) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); + closure_put(cl); +} + +static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io.cl; struct bset *i = b->sets[b->nsets].data; @@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b) i->version = BCACHE_BSET_VERSION; i->csum = btree_csum_set(b, i); - btree_bio_init(b); - b->bio->bi_rw = REQ_META|WRITE_SYNC; - b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_node_write_endio; + b->bio->bi_private = &b->io.cl; + b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; + b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. 
+ */ + bkey_copy(&k.key, &b->key); SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); - if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { + if (!bio_alloc_pages(b->bio, GFP_NOIO)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); @@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); - continue_at(cl, btree_write_done, NULL); + continue_at(cl, btree_node_write_done, NULL); } else { b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); - __btree_write_done(cl); + __btree_node_write_done(cl); } } -static void __btree_write(struct btree *b) +void bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = b->sets[b->nsets].data; + trace_bcache_btree_write(b); + BUG_ON(current->bio_list); + BUG_ON(b->written >= btree_blocks(b)); + BUG_ON(b->written && !i->keys); + BUG_ON(b->sets->data->seq != i->seq); + bch_check_key_order(b, i); - closure_lock(&b->io, &b->c->cl); cancel_delayed_work(&b->work); + /* If caller isn't waiting for write, parent refcount is cache set */ + closure_lock(&b->io, parent ?: &b->c->cl); + clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); - bch_check_key_order(b, i); - BUG_ON(b->written && !i->keys); - - do_btree_write(b); - - pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); + do_btree_node_write(b); b->written += set_blocks(i, b->c); atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, @@ -387,37 +405,31 @@ static void __btree_write(struct btree *b) bch_bset_init_next(b); } -static void btree_write_work(struct work_struct *w) +static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); - down_write(&b->lock); + rw_lock(true, b, b->level); if (btree_node_dirty(b)) - __btree_write(b); - up_write(&b->lock); + bch_btree_node_write(b, NULL); + rw_unlock(true, b); } -void bch_btree_write(struct btree *b, bool now, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) { struct bset *i = b->sets[b->nsets].data; struct btree_write *w = btree_current_write(b); - BUG_ON(b->written && - (b->written >= btree_blocks(b) || - i->seq != b->sets[0].data->seq || - !i->keys)); + BUG_ON(!b->written); + BUG_ON(!i->keys); - if (!btree_node_dirty(b)) { - set_btree_node_dirty(b); - queue_delayed_work(btree_io_wq, &b->work, - msecs_to_jiffies(30000)); - } + if (!btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); - w->prio_blocked += b->prio_blocked; - b->prio_blocked = 0; + set_btree_node_dirty(b); - if (op && op->journal && !b->level) { + if (op && op->journal) { if (w->journal && journal_pin_cmp(b->c, w, op)) { atomic_dec_bug(w->journal); @@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op) } } - if (current->bio_list) - return; - /* Force write if set is too big */ - if (now || - b->level || - set_bytes(i) > PAGE_SIZE - 48) { - if (op && now) { - /* Must wait on multiple writes */ - BUG_ON(w->owner); - w->owner = &op->cl; - closure_get(&op->cl); - } - - __btree_write(b); - } - BUG_ON(!b->written); + if (set_bytes(i) > PAGE_SIZE - 48 && + !current->bio_list) + bch_btree_node_write(b, NULL); } /* @@ -559,7 +558,7 @@ static struct btree 
*mca_bucket_alloc(struct cache_set *c, init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); INIT_LIST_HEAD(&b->list); - INIT_DELAYED_WORK(&b->work, btree_write_work); + INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; closure_init_unlocked(&b->io); @@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) BUG_ON(btree_node_dirty(b) && !b->sets[0].data); if (cl && btree_node_dirty(b)) - bch_btree_write(b, true, NULL); + bch_btree_node_write(b, NULL); if (cl) closure_wait_event_async(&b->io.wait, cl, @@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) else if (!mutex_trylock(&c->bucket_lock)) return -1; + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ nr /= c->btree_pages; nr = min_t(unsigned long, nr, mca_can_free(c)); @@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, int ret = -ENOMEM; struct btree *i; + trace_bcache_btree_cache_cannibalize(c); + if (!cl) return ERR_PTR(-ENOMEM); @@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, return ERR_PTR(-EAGAIN); } - /* XXX: tracepoint */ c->try_harder = cl; c->try_harder_start = local_clock(); retry: @@ -905,6 +912,9 @@ retry: b = mca_find(c, k); if (!b) { + if (current->bio_list) + return ERR_PTR(-EAGAIN); + mutex_lock(&c->bucket_lock); b = mca_alloc(c, k, level, &op->cl); mutex_unlock(&c->bucket_lock); @@ -914,7 +924,7 @@ retry: if (IS_ERR(b)) return b; - bch_btree_read(b); + bch_btree_node_read(b); if (!write) downgrade_write(&b->lock); @@ -937,15 +947,12 @@ retry: for (; i <= b->nsets; i++) prefetch(b->sets[i].data); - if (!closure_wait_event(&b->io.wait, &op->cl, - btree_node_read_done(b))) { - rw_unlock(write, b); - b = ERR_PTR(-EAGAIN); - } else if (btree_node_io_error(b)) { + if (btree_node_io_error(b)) { rw_unlock(write, b); - b = ERR_PTR(-EIO); - } else - BUG_ON(!b->written); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); return b; } @@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { - bch_btree_read(b); + bch_btree_node_read(b); rw_unlock(true, b); } } @@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op) { unsigned i; + trace_bcache_btree_node_free(b); + /* * The BUG_ON() in btree_node_get() implies that we must have a write * lock on parent to free or even invalidate a node */ BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); - pr_debug("bucket %s", pbtree(b)); if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); - if (b->prio_blocked && - !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); - - b->prio_blocked = 0; - cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); @@ -1028,17 +1030,20 @@ retry: goto retry; } - set_btree_node_read_done(b); b->accessed = 1; bch_bset_init_next(b); mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc(b); return b; err_free: bch_bucket_free(c, &k.key); __bkey_put(c, &k.key); err: mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc_fail(b); return b; } 
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, gc->nkeys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) { + if (KEY_DIRTY(k)) gc->dirty += KEY_SIZE(k); - if (d) - d->sectors_dirty_gc += KEY_SIZE(k); - } } for (t = b->sets; t <= &b->sets[b->nsets]; t++) @@ -1166,14 +1168,11 @@ static struct btree |