Diffstat (limited to 'drivers/md/bcache/btree.c')
-rw-r--r--   drivers/md/bcache/btree.c | 2195
1 file changed, 1114 insertions, 1081 deletions
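Most of this patch is mechanical renaming (b->sets becomes b->keys.set, set_blocks() gains a block_bytes() argument, and so on), but the structural piece is that btree traversal now funnels through the btree() and btree_root() macros added near the top of the file: each recursive step locks the child at the level the operation asked for, and any step that returns -EINTR makes btree_root() drop its locks, release the cannibalize lock, and restart the descent from the (possibly new) root. The stand-alone sketch below models only that retry-on-EINTR control flow; the node layout, the names, and the two-level tree are invented for illustration and are not taken from the kernel sources.

/*
 * User-space model of the retry-on-EINTR descent implemented by the new
 * btree()/btree_root() macros.  Only the control flow mirrors the macros:
 * choose a read or write lock from the requested lock level, and when any
 * step reports -EINTR, drop every lock and start the descent over.
 * Build with -pthread.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct node {
	pthread_rwlock_t	lock;
	int			level;
	struct node		*child;
};

/* Models "set op->lock and return -EINTR" from the comment in the diff. */
static int visit(struct node *n, int *op_lock)
{
	if (n->level == 0 && *op_lock < n->level) {
		*op_lock = n->level;	/* need a write lock on the leaf */
		return -EINTR;
	}
	return 0;
}

static int descend(struct node *n, int *op_lock)
{
	int ret = visit(n, op_lock);

	if (!ret && n->level) {
		struct node *c = n->child;

		if (c->level <= *op_lock)
			pthread_rwlock_wrlock(&c->lock);
		else
			pthread_rwlock_rdlock(&c->lock);

		ret = descend(c, op_lock);
		pthread_rwlock_unlock(&c->lock);
	}

	return ret;
}

static int traverse(struct node *root, int op_lock)
{
	int ret;

	do {
		if (root->level <= op_lock)
			pthread_rwlock_wrlock(&root->lock);
		else
			pthread_rwlock_rdlock(&root->lock);

		ret = descend(root, &op_lock);
		pthread_rwlock_unlock(&root->lock);	/* all locks dropped */
	} while (ret == -EINTR);			/* ...then try again */

	return ret;
}

int main(void)
{
	struct node leaf = { PTHREAD_RWLOCK_INITIALIZER, 0, NULL };
	struct node root = { PTHREAD_RWLOCK_INITIALIZER, 1, &leaf };

	/* -1 means read locks everywhere, as the removed
	 * bch_btree_op_init_stack() used to set op->lock = -1 */
	printf("traverse() returned %d\n", traverse(&root, -1));
	return 0;
}

The same convention is what lets the rewritten paths further down simply call prepare_to_wait() and return -EINTR from btree_check_reserve() or mca_cannibalize_lock(), relying on btree_root() to schedule() and retry instead of threading closures through every caller.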
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f9764e61978..7347b610096 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -23,12 +23,13 @@  #include "bcache.h"  #include "btree.h"  #include "debug.h" -#include "request.h" -#include "writeback.h" +#include "extents.h"  #include <linux/slab.h>  #include <linux/bitops.h> +#include <linux/freezer.h>  #include <linux/hash.h> +#include <linux/kthread.h>  #include <linux/prefetch.h>  #include <linux/random.h>  #include <linux/rcupdate.h> @@ -67,15 +68,11 @@   * alloc_bucket() cannot fail. This should be true but is not completely   * obvious.   * - * Make sure all allocations get charged to the root cgroup - *   * Plugging?   *   * If data write is less than hard sector size of ssd, round up offset in open   * bucket to the next whole sector   * - * Also lookup by cgroup in get_open_bucket() - *   * Superblock needs to be fleshed out for multiple cache devices   *   * Add a sysfs tunable for the number of writeback IOs in flight @@ -88,15 +85,6 @@   * Test module load/unload   */ -static const char * const op_types[] = { -	"insert", "replace" -}; - -static const char *op_type(struct btree_op *op) -{ -	return op_types[op->type]; -} -  #define MAX_NEED_GC		64  #define MAX_SAVE_PRIO		72 @@ -105,23 +93,96 @@ static const char *op_type(struct btree_op *op)  #define PTR_HASH(c, k)							\  	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -struct workqueue_struct *bch_gc_wq; -static struct workqueue_struct *btree_io_wq; +#define insert_lock(s, b)	((b)->level <= (s)->lock) + +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn:		function to call, which will be passed the child node + * @key:	key to recurse on + * @b:		parent btree node + * @op:		pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...)					\ +({									\ +	int _r, l = (b)->level - 1;					\ +	bool _w = l <= (op)->lock;					\ +	struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ +	if (!IS_ERR(_child)) {						\ +		_child->parent = (b);					\ +		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\ +		rw_unlock(_w, _child);					\ +	} else								\ +		_r = PTR_ERR(_child);					\ +	_r;								\ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn:		function to call, which will be passed the child node + * @c:		cache set + * @op:		pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...)					
\ +({									\ +	int _r = -EINTR;						\ +	do {								\ +		struct btree *_b = (c)->root;				\ +		bool _w = insert_lock(op, _b);				\ +		rw_lock(_w, _b, _b->level);				\ +		if (_b == (c)->root &&					\ +		    _w == insert_lock(op, _b)) {			\ +			_b->parent = NULL;				\ +			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\ +		}							\ +		rw_unlock(_w, _b);					\ +		bch_cannibalize_unlock(c);				\ +		if (_r == -EINTR)					\ +			schedule();					\ +	} while (_r == -EINTR);						\ +									\ +	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\ +	_r;								\ +}) + +static inline struct bset *write_block(struct btree *b) +{ +	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); +} + +static void bch_btree_init_next(struct btree *b) +{ +	/* If not a leaf node, always sort */ +	if (b->level && b->keys.nsets) +		bch_btree_sort(&b->keys, &b->c->sort); +	else +		bch_btree_sort_lazy(&b->keys, &b->c->sort); + +	if (b->written < btree_blocks(b)) +		bch_bset_init_next(&b->keys, write_block(b), +				   bset_magic(&b->c->sb)); -void bch_btree_op_init_stack(struct btree_op *op) -{ -	memset(op, 0, sizeof(struct btree_op)); -	closure_init_stack(&op->cl); -	op->lock = -1; -	bch_keylist_init(&op->keys);  }  /* Btree key manipulation */ -static void bkey_put(struct cache_set *c, struct bkey *k, int level) +void bkey_put(struct cache_set *c, struct bkey *k)  { -	if ((level && KEY_OFFSET(k)) || !level) -		__bkey_put(c, k); +	unsigned i; + +	for (i = 0; i < KEY_PTRS(k); i++) +		if (ptr_available(c, k, i)) +			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);  }  /* Btree IO */ @@ -129,38 +190,43 @@ static void bkey_put(struct cache_set *c, struct bkey *k, int level)  static uint64_t btree_csum_set(struct btree *b, struct bset *i)  {  	uint64_t crc = b->key.ptr[0]; -	void *data = (void *) i + 8, *end = end(i); +	void *data = (void *) i + 8, *end = bset_bkey_last(i);  	crc = bch_crc64_update(crc, data, end - data);  	return crc ^ 0xffffffffffffffffULL;  } -static void bch_btree_node_read_done(struct btree *b) +void bch_btree_node_read_done(struct btree *b)  {  	const char *err = "bad btree header"; -	struct bset *i = b->sets[0].data; +	struct bset *i = btree_bset_first(b);  	struct btree_iter *iter;  	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);  	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;  	iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG +	iter->b = &b->keys; +#endif +  	if (!i->seq)  		goto err;  	for (; -	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; +	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;  	     i = write_block(b)) {  		err = "unsupported bset version";  		if (i->version > BCACHE_BSET_VERSION)  			goto err;  		err = "bad btree header"; -		if (b->written + set_blocks(i, b->c) > btree_blocks(b)) +		if (b->written + set_blocks(i, block_bytes(b->c)) > +		    btree_blocks(b))  			goto err;  		err = "bad magic"; -		if (i->magic != bset_magic(b->c)) +		if (i->magic != bset_magic(&b->c->sb))  			goto err;  		err = "bad checksum"; @@ -176,39 +242,40 @@ static void bch_btree_node_read_done(struct btree *b)  		}  		err = "empty set"; -		if (i != b->sets[0].data && !i->keys) +		if (i != b->keys.set[0].data && !i->keys)  			goto err; -		bch_btree_iter_push(iter, i->start, end(i)); +		bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); -		b->written += set_blocks(i, b->c); +		b->written += set_blocks(i, block_bytes(b->c));  	}  	err = "corrupted btree";  	for (i = write_block(b); -	     index(i, b) < btree_blocks(b); +	     
bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);  	     i = ((void *) i) + block_bytes(b->c)) -		if (i->seq == b->sets[0].data->seq) +		if (i->seq == b->keys.set[0].data->seq)  			goto err; -	bch_btree_sort_and_fix_extents(b, iter); +	bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); -	i = b->sets[0].data; +	i = b->keys.set[0].data;  	err = "short btree key"; -	if (b->sets[0].size && -	    bkey_cmp(&b->key, &b->sets[0].end) < 0) +	if (b->keys.set[0].size && +	    bkey_cmp(&b->key, &b->keys.set[0].end) < 0)  		goto err;  	if (b->written < btree_blocks(b)) -		bch_bset_init_next(b); +		bch_bset_init_next(&b->keys, write_block(b), +				   bset_magic(&b->c->sb));  out:  	mempool_free(iter, b->c->fill_iter);  	return;  err:  	set_btree_node_io_error(b); -	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", +	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",  			    err, PTR_BUCKET_NR(b->c, &b->key, 0), -			    index(i, b), i->keys); +			    bset_block_offset(b, i), i->keys);  	goto out;  } @@ -218,7 +285,7 @@ static void btree_node_read_endio(struct bio *bio, int error)  	closure_put(cl);  } -void bch_btree_node_read(struct btree *b) +static void bch_btree_node_read(struct btree *b)  {  	uint64_t start_time = local_clock();  	struct closure cl; @@ -230,11 +297,11 @@ void bch_btree_node_read(struct btree *b)  	bio = bch_bbio_alloc(b->c);  	bio->bi_rw	= REQ_META|READ_SYNC; -	bio->bi_size	= KEY_SIZE(&b->key) << 9; +	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;  	bio->bi_end_io	= btree_node_read_endio;  	bio->bi_private	= &cl; -	bch_bio_map(bio, b->sets[0].data); +	bch_bio_map(bio, b->keys.set[0].data);  	bch_submit_bbio(bio, b->c, &b->key, 0);  	closure_sync(&cl); @@ -248,14 +315,11 @@ void bch_btree_node_read(struct btree *b)  		goto err;  	bch_btree_node_read_done(b); - -	spin_lock(&b->c->btree_read_time_lock);  	bch_time_stats_update(&b->c->btree_read_time, start_time); -	spin_unlock(&b->c->btree_read_time_lock);  	return;  err: -	bch_cache_set_error(b->c, "io error reading bucket %lu", +	bch_cache_set_error(b->c, "io error reading bucket %zu",  			    PTR_BUCKET_NR(b->c, &b->key, 0));  } @@ -274,9 +338,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)  	w->journal	= NULL;  } +static void btree_node_write_unlock(struct closure *cl) +{ +	struct btree *b = container_of(cl, struct btree, io); + +	up(&b->io_mutex); +} +  static void __btree_node_write_done(struct closure *cl)  { -	struct btree *b = container_of(cl, struct btree, io.cl); +	struct btree *b = container_of(cl, struct btree, io);  	struct btree_write *w = btree_prev_write(b);  	bch_bbio_free(b->bio, b->c); @@ -284,19 +355,18 @@ static void __btree_node_write_done(struct closure *cl)  	btree_complete_write(b, w);  	if (btree_node_dirty(b)) -		queue_delayed_work(btree_io_wq, &b->work, -				   msecs_to_jiffies(30000)); +		schedule_delayed_work(&b->work, 30 * HZ); -	closure_return(cl); +	closure_return_with_destructor(cl, btree_node_write_unlock);  }  static void btree_node_write_done(struct closure *cl)  { -	struct btree *b = container_of(cl, struct btree, io.cl); +	struct btree *b = container_of(cl, struct btree, io);  	struct bio_vec *bv;  	int n; -	__bio_for_each_segment(bv, b->bio, n, 0) +	bio_for_each_segment_all(bv, b->bio, n)  		__free_page(bv->bv_page);  	__btree_node_write_done(cl); @@ -305,7 +375,7 @@ static void btree_node_write_done(struct closure *cl)  static void btree_node_write_endio(struct bio *bio, int error)  {  	struct closure *cl = 
bio->bi_private; -	struct btree *b = container_of(cl, struct btree, io.cl); +	struct btree *b = container_of(cl, struct btree, io);  	if (error)  		set_btree_node_io_error(b); @@ -316,8 +386,8 @@ static void btree_node_write_endio(struct bio *bio, int error)  static void do_btree_node_write(struct btree *b)  { -	struct closure *cl = &b->io.cl; -	struct bset *i = b->sets[b->nsets].data; +	struct closure *cl = &b->io; +	struct bset *i = btree_bset_last(b);  	BKEY_PADDED(key) k;  	i->version	= BCACHE_BSET_VERSION; @@ -327,9 +397,9 @@ static void do_btree_node_write(struct btree *b)  	b->bio = bch_bbio_alloc(b->c);  	b->bio->bi_end_io	= btree_node_write_endio; -	b->bio->bi_private	= &b->io.cl; +	b->bio->bi_private	= cl;  	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA; -	b->bio->bi_size		= set_blocks(i, b->c) * block_bytes(b->c); +	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));  	bch_bio_map(b->bio, i);  	/* @@ -348,14 +418,15 @@ static void do_btree_node_write(struct btree *b)  	 */  	bkey_copy(&k.key, &b->key); -	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); +	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + +		       bset_sector_offset(&b->keys, i));  	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {  		int j;  		struct bio_vec *bv;  		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); -		bio_for_each_segment(bv, b->bio, j) +		bio_for_each_segment_all(bv, b->bio, j)  			memcpy(page_address(bv->bv_page),  			       base + j * PAGE_SIZE, PAGE_SIZE); @@ -369,75 +440,106 @@ static void do_btree_node_write(struct btree *b)  		bch_submit_bbio(b->bio, b->c, &k.key, 0);  		closure_sync(cl); -		__btree_node_write_done(cl); +		continue_at_nobarrier(cl, __btree_node_write_done, NULL);  	}  } -void bch_btree_node_write(struct btree *b, struct closure *parent) +void __bch_btree_node_write(struct btree *b, struct closure *parent)  { -	struct bset *i = b->sets[b->nsets].data; +	struct bset *i = btree_bset_last(b); + +	lockdep_assert_held(&b->write_lock);  	trace_bcache_btree_write(b);  	BUG_ON(current->bio_list);  	BUG_ON(b->written >= btree_blocks(b));  	BUG_ON(b->written && !i->keys); -	BUG_ON(b->sets->data->seq != i->seq); -	bch_check_key_order(b, i); +	BUG_ON(btree_bset_first(b)->seq != i->seq); +	bch_check_keys(&b->keys, "writing");  	cancel_delayed_work(&b->work);  	/* If caller isn't waiting for write, parent refcount is cache set */ -	closure_lock(&b->io, parent ?: &b->c->cl); +	down(&b->io_mutex); +	closure_init(&b->io, parent ?: &b->c->cl);  	clear_bit(BTREE_NODE_dirty,	 &b->flags);  	change_bit(BTREE_NODE_write_idx, &b->flags);  	do_btree_node_write(b); -	b->written += set_blocks(i, b->c); -	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, +	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,  			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); -	bch_btree_sort_lazy(b); +	b->written += set_blocks(i, block_bytes(b->c)); +} -	if (b->written < btree_blocks(b)) -		bch_bset_init_next(b); +void bch_btree_node_write(struct btree *b, struct closure *parent) +{ +	unsigned nsets = b->keys.nsets; + +	lockdep_assert_held(&b->lock); + +	__bch_btree_node_write(b, parent); + +	/* +	 * do verify if there was more than one set initially (i.e. 
we did a +	 * sort) and we sorted down to a single set: +	 */ +	if (nsets && !b->keys.nsets) +		bch_btree_verify(b); + +	bch_btree_init_next(b); +} + +static void bch_btree_node_write_sync(struct btree *b) +{ +	struct closure cl; + +	closure_init_stack(&cl); + +	mutex_lock(&b->write_lock); +	bch_btree_node_write(b, &cl); +	mutex_unlock(&b->write_lock); + +	closure_sync(&cl);  }  static void btree_node_write_work(struct work_struct *w)  {  	struct btree *b = container_of(to_delayed_work(w), struct btree, work); -	rw_lock(true, b, b->level); - +	mutex_lock(&b->write_lock);  	if (btree_node_dirty(b)) -		bch_btree_node_write(b, NULL); -	rw_unlock(true, b); +		__bch_btree_node_write(b, NULL); +	mutex_unlock(&b->write_lock);  } -static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)  { -	struct bset *i = b->sets[b->nsets].data; +	struct bset *i = btree_bset_last(b);  	struct btree_write *w = btree_current_write(b); +	lockdep_assert_held(&b->write_lock); +  	BUG_ON(!b->written);  	BUG_ON(!i->keys);  	if (!btree_node_dirty(b)) -		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); +		schedule_delayed_work(&b->work, 30 * HZ);  	set_btree_node_dirty(b); -	if (op && op->journal) { +	if (journal_ref) {  		if (w->journal && -		    journal_pin_cmp(b->c, w, op)) { +		    journal_pin_cmp(b->c, w->journal, journal_ref)) {  			atomic_dec_bug(w->journal);  			w->journal = NULL;  		}  		if (!w->journal) { -			w->journal = op->journal; +			w->journal = journal_ref;  			atomic_inc(w->journal);  		}  	} @@ -453,53 +555,19 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)   * mca -> memory cache   */ -static void mca_reinit(struct btree *b) -{ -	unsigned i; - -	b->flags	= 0; -	b->written	= 0; -	b->nsets	= 0; - -	for (i = 0; i < MAX_BSETS; i++) -		b->sets[i].size = 0; -	/* -	 * Second loop starts at 1 because b->sets[0]->data is the memory we -	 * allocated -	 */ -	for (i = 1; i < MAX_BSETS; i++) -		b->sets[i].data = NULL; -} -  #define mca_reserve(c)	(((c->root && c->root->level)		\  			  ? c->root->level : 1) * 8 + 16)  #define mca_can_free(c)						\ -	max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) +	max_t(int, 0, c->btree_cache_used - mca_reserve(c))  static void mca_data_free(struct btree *b)  { -	struct bset_tree *t = b->sets; -	BUG_ON(!closure_is_unlocked(&b->io.cl)); - -	if (bset_prev_bytes(b) < PAGE_SIZE) -		kfree(t->prev); -	else -		free_pages((unsigned long) t->prev, -			   get_order(bset_prev_bytes(b))); - -	if (bset_tree_bytes(b) < PAGE_SIZE) -		kfree(t->tree); -	else -		free_pages((unsigned long) t->tree, -			   get_order(bset_tree_bytes(b))); +	BUG_ON(b->io_mutex.count != 1); -	free_pages((unsigned long) t->data, b->page_order); +	bch_btree_keys_free(&b->keys); -	t->prev = NULL; -	t->tree = NULL; -	t->data = NULL; +	b->c->btree_cache_used--;  	list_move(&b->list, &b->c->btree_cache_freed); -	b->c->bucket_cache_used--;  }  static void mca_bucket_free(struct btree *b) @@ -518,34 +586,16 @@ static unsigned btree_order(struct bkey *k)  static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)  { -	struct bset_tree *t = b->sets; -	BUG_ON(t->data); - -	b->page_order = max_t(unsigned, -			      ilog2(b->c->btree_pages), -			      btree_order(k)); - -	t->data = (void *) __get_free_pages(gfp, b->page_order); -	if (!t->data) -		goto err; - -	t->tree = bset_tree_bytes(b) < PAGE_SIZE -		? 
kmalloc(bset_tree_bytes(b), gfp) -		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); -	if (!t->tree) -		goto err; - -	t->prev = bset_prev_bytes(b) < PAGE_SIZE -		? kmalloc(bset_prev_bytes(b), gfp) -		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); -	if (!t->prev) -		goto err; - -	list_move(&b->list, &b->c->btree_cache); -	b->c->bucket_cache_used++; -	return; -err: -	mca_data_free(b); +	if (!bch_btree_keys_alloc(&b->keys, +				  max_t(unsigned, +					ilog2(b->c->btree_pages), +					btree_order(k)), +				  gfp)) { +		b->c->btree_cache_used++; +		list_move(&b->list, &b->c->btree_cache); +	} else { +		list_move(&b->list, &b->c->btree_cache_freed); +	}  }  static struct btree *mca_bucket_alloc(struct cache_set *c, @@ -557,44 +607,56 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,  	init_rwsem(&b->lock);  	lockdep_set_novalidate_class(&b->lock); +	mutex_init(&b->write_lock); +	lockdep_set_novalidate_class(&b->write_lock);  	INIT_LIST_HEAD(&b->list);  	INIT_DELAYED_WORK(&b->work, btree_node_write_work);  	b->c = c; -	closure_init_unlocked(&b->io); +	sema_init(&b->io_mutex, 1);  	mca_data_alloc(b, k, gfp);  	return b;  } -static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +static int mca_reap(struct btree *b, unsigned min_order, bool flush)  { +	struct closure cl; + +	closure_init_stack(&cl);  	lockdep_assert_held(&b->c->bucket_lock);  	if (!down_write_trylock(&b->lock))  		return -ENOMEM; -	if (b->page_order < min_order) { -		rw_unlock(true, b); -		return -ENOMEM; -	} +	BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); -	BUG_ON(btree_node_dirty(b) && !b->sets[0].data); +	if (b->keys.page_order < min_order) +		goto out_unlock; -	if (cl && btree_node_dirty(b)) -		bch_btree_node_write(b, NULL); - -	if (cl) -		closure_wait_event_async(&b->io.wait, cl, -			 atomic_read(&b->io.cl.remaining) == -1); +	if (!flush) { +		if (btree_node_dirty(b)) +			goto out_unlock; -	if (btree_node_dirty(b) || -	    !closure_is_unlocked(&b->io.cl) || -	    work_pending(&b->work.work)) { -		rw_unlock(true, b); -		return -EAGAIN; +		if (down_trylock(&b->io_mutex)) +			goto out_unlock; +		up(&b->io_mutex);  	} +	mutex_lock(&b->write_lock); +	if (btree_node_dirty(b)) +		__bch_btree_node_write(b, &cl); +	mutex_unlock(&b->write_lock); + +	closure_sync(&cl); + +	/* wait for any in flight btree write */ +	down(&b->io_mutex); +	up(&b->io_mutex); +  	return 0; +out_unlock: +	rw_unlock(true, b); +	return -ENOMEM;  }  static unsigned long bch_mca_scan(struct shrinker *shrink, @@ -608,11 +670,11 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,  	if (c->shrinker_disabled)  		return SHRINK_STOP; -	if (c->try_harder) +	if (c->btree_cache_alloc_lock)  		return SHRINK_STOP;  	/* Return -1 if we can't do anything right now */ -	if (sc->gfp_mask & __GFP_WAIT) +	if (sc->gfp_mask & __GFP_IO)  		mutex_lock(&c->bucket_lock);  	else if (!mutex_trylock(&c->bucket_lock))  		return -1; @@ -633,26 +695,22 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,  			break;  		if (++i > 3 && -		    !mca_reap(b, NULL, 0)) { +		    !mca_reap(b, 0, false)) {  			mca_data_free(b);  			rw_unlock(true, b);  			freed++;  		}  	} -	/* -	 * Can happen right when we first start up, before we've read in any -	 * btree nodes -	 */ -	if (list_empty(&c->btree_cache)) -		goto out; +	for (i = 0; (nr--) && i < c->btree_cache_used; i++) { +		if (list_empty(&c->btree_cache)) +			goto out; -	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {  		b = 
list_first_entry(&c->btree_cache, struct btree, list);  		list_rotate_left(&c->btree_cache);  		if (!b->accessed && -		    !mca_reap(b, NULL, 0)) { +		    !mca_reap(b, 0, false)) {  			mca_bucket_free(b);  			mca_data_free(b);  			rw_unlock(true, b); @@ -673,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,  	if (c->shrinker_disabled)  		return 0; -	if (c->try_harder) +	if (c->btree_cache_alloc_lock)  		return 0;  	return mca_can_free(c) * c->btree_pages; @@ -693,6 +751,8 @@ void bch_btree_cache_free(struct cache_set *c)  #ifdef CONFIG_BCACHE_DEBUG  	if (c->verify_data)  		list_move(&c->verify_data->list, &c->btree_cache); + +	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));  #endif  	list_splice(&c->btree_cache_freeable, @@ -723,12 +783,9 @@ int bch_btree_cache_alloc(struct cache_set *c)  {  	unsigned i; -	/* XXX: doesn't check for errors */ - -	closure_init_unlocked(&c->gc); -  	for (i = 0; i < mca_reserve(c); i++) -		mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); +		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) +			return -ENOMEM;  	list_splice_init(&c->btree_cache,  			 &c->btree_cache_freeable); @@ -736,10 +793,13 @@ int bch_btree_cache_alloc(struct cache_set *c)  #ifdef CONFIG_BCACHE_DEBUG  	mutex_init(&c->verify_lock); +	c->verify_ondisk = (void *) +		__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); +  	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);  	if (c->verify_data && -	    c->verify_data->sets[0].data) +	    c->verify_data->keys.set->data)  		list_del_init(&c->verify_data->list);  	else  		c->verify_data = NULL; @@ -775,52 +835,41 @@ out:  	return b;  } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, -				     int level, struct closure *cl) +static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)  { -	int ret = -ENOMEM; -	struct btree *i; +	struct task_struct *old; -	trace_bcache_btree_cache_cannibalize(c); +	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); +	if (old && old != current) { +		if (op) +			prepare_to_wait(&c->btree_cache_wait, &op->wait, +					TASK_UNINTERRUPTIBLE); +		return -EINTR; +	} + +	return 0; +} -	if (!cl) -		return ERR_PTR(-ENOMEM); +static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, +				     struct bkey *k) +{ +	struct btree *b; -	/* -	 * Trying to free up some memory - i.e. reuse some btree nodes - may -	 * require initiating IO to flush the dirty part of the node. If we're -	 * running under generic_make_request(), that IO will never finish and -	 * we would deadlock. Returning -EAGAIN causes the cache lookup code to -	 * punt to workqueue and retry. 
-	 */ -	if (current->bio_list) -		return ERR_PTR(-EAGAIN); +	trace_bcache_btree_cache_cannibalize(c); -	if (c->try_harder && c->try_harder != cl) { -		closure_wait_event_async(&c->try_wait, cl, !c->try_harder); -		return ERR_PTR(-EAGAIN); -	} +	if (mca_cannibalize_lock(c, op)) +		return ERR_PTR(-EINTR); -	c->try_harder = cl; -	c->try_harder_start = local_clock(); -retry: -	list_for_each_entry_reverse(i, &c->btree_cache, list) { -		int r = mca_reap(i, cl, btree_order(k)); -		if (!r) -			return i; -		if (r != -ENOMEM) -			ret = r; -	} +	list_for_each_entry_reverse(b, &c->btree_cache, list) +		if (!mca_reap(b, btree_order(k), false)) +			return b; -	if (ret == -EAGAIN && -	    closure_blocking(cl)) { -		mutex_unlock(&c->bucket_lock); -		closure_sync(cl); -		mutex_lock(&c->bucket_lock); -		goto retry; -	} +	list_for_each_entry_reverse(b, &c->btree_cache, list) +		if (!mca_reap(b, btree_order(k), true)) +			return b; -	return ERR_PTR(ret); +	WARN(1, "btree cache cannibalize failed\n"); +	return ERR_PTR(-ENOMEM);  }  /* @@ -829,20 +878,21 @@ retry:   * cannibalize_bucket() will take. This means every time we unlock the root of   * the btree, we need to release this lock if we have it held.   */ -void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) +static void bch_cannibalize_unlock(struct cache_set *c)  { -	if (c->try_harder == cl) { -		bch_time_stats_update(&c->try_harder_time, c->try_harder_start); -		c->try_harder = NULL; -		__closure_wake_up(&c->try_wait); +	if (c->btree_cache_alloc_lock == current) { +		c->btree_cache_alloc_lock = NULL; +		wake_up(&c->btree_cache_wait);  	}  } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, -			       int level, struct closure *cl) +static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, +			       struct bkey *k, int level)  {  	struct btree *b; +	BUG_ON(current->bio_list); +  	lockdep_assert_held(&c->bucket_lock);  	if (mca_find(c, k)) @@ -852,16 +902,16 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,  	 * the list. Check if there's any freed nodes there:  	 */  	list_for_each_entry(b, &c->btree_cache_freeable, list) -		if (!mca_reap(b, NULL, btree_order(k))) +		if (!mca_reap(b, btree_order(k), false))  			goto out;  	/* We never free struct btree itself, just the memory that holds the on  	 * disk node. 
Check the freed list before allocating a new one:  	 */  	list_for_each_entry(b, &c->btree_cache_freed, list) -		if (!mca_reap(b, NULL, 0)) { +		if (!mca_reap(b, 0, false)) {  			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); -			if (!b->sets[0].data) +			if (!b->keys.set[0].data)  				goto err;  			else  				goto out; @@ -872,10 +922,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,  		goto err;  	BUG_ON(!down_write_trylock(&b->lock)); -	if (!b->sets->data) +	if (!b->keys.set->data)  		goto err;  out: -	BUG_ON(!closure_is_unlocked(&b->io.cl)); +	BUG_ON(b->io_mutex.count != 1);  	bkey_copy(&b->key, k);  	list_move(&b->list, &c->btree_cache); @@ -883,16 +933,24 @@ out:  	hlist_add_head_rcu(&b->hash, mca_hash(c, k));  	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); +	b->parent	= (void *) ~0UL; +	b->flags	= 0; +	b->written	= 0;  	b->level	= level; -	mca_reinit(b); +	if (!b->level) +		bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, +				    &b->c->expensive_debug_checks); +	else +		bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, +				    &b->c->expensive_debug_checks);  	return b;  err:  	if (b)  		rw_unlock(true, b); -	b = mca_cannibalize(c, k, level, cl); +	b = mca_cannibalize(c, op, k);  	if (!IS_ERR(b))  		goto out; @@ -903,17 +961,15 @@ err:   * bch_btree_node_get - find a btree node in the cache and lock it, reading it   * in from disk if necessary.   * - * If IO is necessary, it uses the closure embedded in struct btree_op to wait; - * if that closure is in non blocking mode, will return -EAGAIN. + * If IO is necessary and running under generic_make_request, returns -EAGAIN.   *   * The btree node will have either a read or a write lock held, depending on   * level and op->lock.   */ -struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, -				 int level, struct btree_op *op) +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, +				 struct bkey *k, int level, bool write)  {  	int i = 0; -	bool write = level <= op->lock;  	struct btree *b;  	BUG_ON(level < 0); @@ -925,7 +981,7 @@ retry:  			return ERR_PTR(-EAGAIN);  		mutex_lock(&c->bucket_lock); -		b = mca_alloc(c, k, level, &op->cl); +		b = mca_alloc(c, op, k, level);  		mutex_unlock(&c->bucket_lock);  		if (!b) @@ -948,13 +1004,13 @@ retry:  	b->accessed = 1; -	for (; i <= b->nsets && b->sets[i].size; i++) { -		prefetch(b->sets[i].tree); -		prefetch(b->sets[i].data); +	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { +		prefetch(b->keys.set[i].tree); +		prefetch(b->keys.set[i].data);  	} -	for (; i <= b->nsets; i++) -		prefetch(b->sets[i].data); +	for (; i <= b->keys.nsets; i++) +		prefetch(b->keys.set[i].data);  	if (btree_node_io_error(b)) {  		rw_unlock(write, b); @@ -971,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)  	struct btree *b;  	mutex_lock(&c->bucket_lock); -	b = mca_alloc(c, k, level, NULL); +	b = mca_alloc(c, NULL, k, level);  	mutex_unlock(&c->bucket_lock);  	if (!IS_ERR_OR_NULL(b)) { @@ -982,65 +1038,54 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)  /* Btree alloc */ -static void btree_node_free(struct btree *b, struct btree_op *op) +static void btree_node_free(struct btree *b)  { -	unsigned i; -  	trace_bcache_btree_node_free(b); -	/* -	 * The BUG_ON() in btree_node_get() implies that we must have a write -	 * lock on parent to free or even invalidate a node -	 */ -	BUG_ON(op->lock <= b->level);  	BUG_ON(b == b->c->root); +	
mutex_lock(&b->write_lock); +  	if (btree_node_dirty(b))  		btree_complete_write(b, btree_current_write(b));  	clear_bit(BTREE_NODE_dirty, &b->flags); +	mutex_unlock(&b->write_lock); +  	cancel_delayed_work(&b->work);  	mutex_lock(&b->c->bucket_lock); - -	for (i = 0; i < KEY_PTRS(&b->key); i++) { -		BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); - -		bch_inc_gen(PTR_CACHE(b->c, &b->key, i), -			    PTR_BUCKET(b->c, &b->key, i)); -	} -  	bch_bucket_free(b->c, &b->key);  	mca_bucket_free(b);  	mutex_unlock(&b->c->bucket_lock);  } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level, -				   struct closure *cl) +struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, +				   int level)  {  	BKEY_PADDED(key) k;  	struct btree *b = ERR_PTR(-EAGAIN);  	mutex_lock(&c->bucket_lock);  retry: -	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) +	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))  		goto err; +	bkey_put(c, &k.key);  	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); -	b = mca_alloc(c, &k.key, level, cl); +	b = mca_alloc(c, op, &k.key, level);  	if (IS_ERR(b))  		goto err_free;  	if (!b) {  		cache_bug(c,  			"Tried to allocate bucket that was in btree cache"); -		__bkey_put(c, &k.key);  		goto retry;  	}  	b->accessed = 1; -	bch_bset_init_next(b); +	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));  	mutex_unlock(&c->bucket_lock); @@ -1048,7 +1093,6 @@ retry:  	return b;  err_free:  	bch_bucket_free(c, &k.key); -	__bkey_put(c, &k.key);  err:  	mutex_unlock(&c->bucket_lock); @@ -1057,18 +1101,64 @@ err:  }  static struct btree *btree_node_alloc_replacement(struct btree *b, -						  struct closure *cl) +						  struct btree_op *op)  { -	struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); -	if (!IS_ERR_OR_NULL(n)) -		bch_btree_sort_into(b, n); +	struct btree *n = bch_btree_node_alloc(b->c, op, b->level); +	if (!IS_ERR_OR_NULL(n)) { +		mutex_lock(&n->write_lock); +		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); +		bkey_copy_key(&n->key, &b->key); +		mutex_unlock(&n->write_lock); +	}  	return n;  } +static void make_btree_freeing_key(struct btree *b, struct bkey *k) +{ +	unsigned i; + +	mutex_lock(&b->c->bucket_lock); + +	atomic_inc(&b->c->prio_blocked); + +	bkey_copy(k, &b->key); +	bkey_copy_key(k, &ZERO_KEY); + +	for (i = 0; i < KEY_PTRS(k); i++) +		SET_PTR_GEN(k, i, +			    bch_inc_gen(PTR_CACHE(b->c, &b->key, i), +					PTR_BUCKET(b->c, &b->key, i))); + +	mutex_unlock(&b->c->bucket_lock); +} + +static int btree_check_reserve(struct btree *b, struct btree_op *op) +{ +	struct cache_set *c = b->c; +	struct cache *ca; +	unsigned i, reserve = (c->root->level - b->level) * 2 + 1; + +	mutex_lock(&c->bucket_lock); + +	for_each_cache(ca, c, i) +		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { +			if (op) +				prepare_to_wait(&c->btree_cache_wait, &op->wait, +						TASK_UNINTERRUPTIBLE); +			mutex_unlock(&c->bucket_lock); +			return -EINTR; +		} + +	mutex_unlock(&c->bucket_lock); + +	return mca_cannibalize_lock(b->c, op); +} +  /* Garbage collection */ -uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) +static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, +				    struct bkey *k)  {  	uint8_t stale = 0;  	unsigned i; @@ -1088,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)  		g = PTR_BUCKET(c, k, i); -		if (gen_after(g->gc_gen, PTR_GEN(k, i))) -			g->gc_gen = PTR_GEN(k, i); +		if 
(gen_after(g->last_gc, PTR_GEN(k, i))) +			g->last_gc = PTR_GEN(k, i);  		if (ptr_stale(c, k, i)) {  			stale = max(stale, ptr_stale(c, k, i)); @@ -1105,11 +1195,13 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)  			SET_GC_MARK(g, GC_MARK_METADATA);  		else if (KEY_DIRTY(k))  			SET_GC_MARK(g, GC_MARK_DIRTY); +		else if (!GC_MARK(g)) +			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);  		/* guard against overflow */  		SET_GC_SECTORS_USED(g, min_t(unsigned,  					     GC_SECTORS_USED(g) + KEY_SIZE(k), -					     (1 << 14) - 1)); +					     MAX_GC_SECTORS_USED));  		BUG_ON(!GC_SECTORS_USED(g));  	} @@ -1119,120 +1211,143 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)  #define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k) -static int btree_gc_mark_node(struct btree *b, unsigned *keys, -			      struct gc_stat *gc) +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) +{ +	unsigned i; + +	for (i = 0; i < KEY_PTRS(k); i++) +		if (ptr_available(c, k, i) && +		    !ptr_stale(c, k, i)) { +			struct bucket *b = PTR_BUCKET(c, k, i); + +			b->gen = PTR_GEN(k, i); + +			if (level && bkey_cmp(k, &ZERO_KEY)) +				b->prio = BTREE_PRIO; +			else if (!level && b->prio == BTREE_PRIO) +				b->prio = INITIAL_PRIO; +		} + +	__bch_btree_mark_key(c, level, k); +} + +static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)  {  	uint8_t stale = 0; -	unsigned last_dev = -1; -	struct bcache_device *d = NULL; +	unsigned keys = 0, good_keys = 0;  	struct bkey *k;  	struct btree_iter iter;  	struct bset_tree *t;  	gc->nodes++; -	for_each_key_filter(b, k, &iter, bch_ptr_invalid) { -		if (last_dev != KEY_INODE(k)) { -			last_dev = KEY_INODE(k); - -			d = KEY_INODE(k) < b->c->nr_uuids -				? b->c->devices[last_dev] -				: NULL; -		} - +	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {  		stale = max(stale, btree_mark_key(b, k)); +		keys++; -		if (bch_ptr_bad(b, k)) +		if (bch_ptr_bad(&b->keys, k))  			continue; -		*keys += bkey_u64s(k); -  		gc->key_bytes += bkey_u64s(k);  		gc->nkeys++; +		good_keys++;  		gc->data += KEY_SIZE(k); -		if (KEY_DIRTY(k)) -			gc->dirty += KEY_SIZE(k);  	} -	for (t = b->sets; t <= &b->sets[b->nsets]; t++) +	for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)  		btree_bug_on(t->size && -			     bset_written(b, t) && +			     bset_written(&b->keys, t) &&  			     bkey_cmp(&b->key, &t->end) < 0,  			     b, "found short btree key in gc"); -	return stale; -} - -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, -				    struct btree_op *op) -{ -	/* -	 * We block priorities from being written for the duration of garbage -	 * collection, so we can't sleep in btree_alloc() -> -	 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it -	 * our closure. -	 */ -	struct btree *n = btree_node_alloc_replacement(b, NULL); - -	if (!IS_ERR_OR_NULL(n)) { -		swap(b, n); -		__bkey_put(b->c, &b->key); +	if (b->c->gc_always_rewrite) +		return true; -		memcpy(k->ptr, b->key.ptr, -		       sizeof(uint64_t) * KEY_PTRS(&b->key)); +	if (stale > 10) +		return true; -		btree_node_free(n, op); -		up_write(&n->lock); -	} +	if ((keys - good_keys) * 2 > keys) +		return true; -	return b; +	return false;  } -/* - * Leaving this at 2 until we've got incremental garbage collection done; it - * could be higher (and has been tested with 4) except that garbage collection - * could take much longer, adversely affecting latency. 
- */ -#define GC_MERGE_NODES	2U +#define GC_MERGE_NODES	4U  struct gc_merge_info {  	struct btree	*b; -	struct bkey	*k;  	unsigned	keys;  }; -static void btree_gc_coalesce(struct btree *b, struct btree_op *op, -			      struct gc_stat *gc, struct gc_merge_info *r) +static int bch_btree_insert_node(struct btree *, struct btree_op *, +				 struct keylist *, atomic_t *, struct bkey *); + +static int btree_gc_coalesce(struct btree *b, struct btree_op *op, +			     struct gc_stat *gc, struct gc_merge_info *r)  { -	unsigned nodes = 0, keys = 0, blocks; -	int i; +	unsigned i, nodes = 0, keys = 0, blocks; +	struct btree *new_nodes[GC_MERGE_NODES]; +	struct keylist keylist; +	struct closure cl; +	struct bkey *k; + +	bch_keylist_init(&keylist); + +	if (btree_check_reserve(b, NULL)) +		return 0; + +	memset(new_nodes, 0, sizeof(new_nodes)); +	closure_init_stack(&cl); -	while (nodes < GC_MERGE_NODES && r[nodes].b) +	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))  		keys += r[nodes++].keys;  	blocks = btree_default_blocks(b->c) * 2 / 3;  	if (nodes < 2 || -	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) -		return; - -	for (i = nodes - 1; i >= 0; --i) { -		if (r[i].b->written) -			r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); +	    __set_blocks(b->keys.set[0].data, keys, +			 block_bytes(b->c)) > blocks * (nodes - 1)) +		return 0; -		if (r[i].b->written) -			return; +	for (i = 0; i < nodes; i++) { +		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); +		if (IS_ERR_OR_NULL(new_nodes[i])) +			goto out_nocoalesce;  	} +	/* +	 * We have to check the reserve here, after we've allocated our new +	 * nodes, to make sure the insert below will succeed - we also check +	 * before as an optimization to potentially avoid a bunch of expensive +	 * allocs/sorts +	 */ +	if (btree_check_reserve(b, NULL)) +		goto out_nocoalesce; + +	for (i = 0; i < nodes; i++) +		mutex_lock(&new_nodes[i]->write_lock); +  	for (i = nodes - 1; i > 0; --i) { -		struct bset *n1 = r[i].b->sets->data; -		struct bset *n2 = r[i - 1].b->sets->data; +		struct bset *n1 = btree_bset_first(new_nodes[i]); +		struct bset *n2 = btree_bset_first(new_nodes[i - 1]);  		struct bkey *k, *last = NULL;  		keys = 0; -		if (i == 1) { +		if (i > 1) { +			for (k = n2->start; +			     k < bset_bkey_last(n2); +			     k = bkey_next(k)) { +				if (__set_blocks(n1, n1->keys + keys + +						 bkey_u64s(k), +						 block_bytes(b->c)) > blocks) +					break; + +				last = k; +				keys += bkey_u64s(k); +			} +		} else {  			/*  			 * Last node we're not getting rid of - we're getting  			 * rid of the node at r[0]. 
Have to try and fit all of @@ -1241,133 +1356,226 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,  			 * length keys (shouldn't be possible in practice,  			 * though)  			 */ -			if (__set_blocks(n1, n1->keys + r->keys, -					 b->c) > btree_blocks(r[i].b)) -				return; +			if (__set_blocks(n1, n1->keys + n2->keys, +					 block_bytes(b->c)) > +			    btree_blocks(new_nodes[i])) +				goto out_nocoalesce;  			keys = n2->keys; +			/* Take the key of the node we're getting rid of */  			last = &r->b->key; -		} else -			for (k = n2->start; -			     k < end(n2); -			     k = bkey_next(k)) { -				if (__set_blocks(n1, n1->keys + keys + -						 bkey_u64s(k), b->c) > blocks) -					break; - -				last = k; -				keys += bkey_u64s(k); -			} +		} -		BUG_ON(__set_blocks(n1, n1->keys + keys, -				    b->c) > btree_blocks(r[i].b)); +		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > +		       btree_blocks(new_nodes[i])); -		if (last) { -			bkey_copy_key(&r[i].b->key, last); -			bkey_copy_key(r[i].k, last); -		} +		if (last) +			bkey_copy_key(&new_nodes[i]->key, last); -		memcpy(end(n1), +		memcpy(bset_bkey_last(n1),  		       n2->start, -		       (void *) node(n2, keys) - (void *) n2->start); +		       (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);  		n1->keys += keys; +		r[i].keys = n1->keys;  		memmove(n2->start, -			node(n2, keys), -			(void *) end(n2) - (void *) node(n2, keys)); +			bset_bkey_idx(n2, keys), +			(void *) bset_bkey_last(n2) - +			(void *) bset_bkey_idx(n2, keys));  		n2->keys -= keys; -		r[i].keys	= n1->keys; -		r[i - 1].keys	= n2->keys; +		if (__bch_keylist_realloc(&keylist, +					  bkey_u64s(&new_nodes[i]->key))) +			goto out_nocoalesce; + +		bch_btree_node_write(new_nodes[i], &cl); +		bch_keylist_add(&keylist, &new_nodes[i]->key);  	} -	btree_node_free(r->b, op); -	up_write(&r->b->lock); +	for (i = 0; i < nodes; i++) +		mutex_unlock(&new_nodes[i]->write_lock); -	trace_bcache_btree_gc_coalesce(nodes); +	closure_sync(&cl); + +	/* We emptied out this node */ +	BUG_ON(btree_bset_first(new_nodes[0])->keys); +	btree_node_free(new_nodes[0]); +	rw_unlock(true, new_nodes[0]); +	for (i = 0; i < nodes; i++) { +		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) +			goto out_nocoalesce; + +		make_btree_freeing_key(r[i].b, keylist.top); +		bch_keylist_push(&keylist); +	} + +	bch_btree_insert_node(b, op, &keylist, NULL, NULL); +	BUG_ON(!bch_keylist_empty(&keylist)); + +	for (i = 0; i < nodes; i++) { +		btree_node_free(r[i].b); +		rw_unlock(true, r[i].b); + +		r[i].b = new_nodes[i]; +	} + +	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); +	r[nodes - 1].b = ERR_PTR(-EINTR); + +	trace_bcache_btree_gc_coalesce(nodes);  	gc->nodes--; -	nodes--; -	memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); -	memset(&r[nodes], 0, sizeof(struct gc_merge_info)); +	bch_keylist_free(&keylist); + +	/* Invalidated our iterator */ +	return -EINTR; + +out_nocoalesce: +	closure_sync(&cl); +	bch_keylist_free(&keylist); + +	while ((k = bch_keylist_pop(&keylist))) +		if (!bkey_cmp(k, &ZERO_KEY)) +			atomic_dec(&b->c->prio_blocked); + +	for (i = 0; i < nodes; i++) +		if (!IS_ERR_OR_NULL(new_nodes[i])) { +			btree_node_free(new_nodes[i]); +			rw_unlock(true, new_nodes[i]); +		} +	return 0;  } -static int btree_gc_recurse(struct btree *b, struct btree_op *op, -			    struct closure *writes, struct gc_stat *gc) +static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, +				 struct btree *replace)  { -	void write(struct btree *r) -	{ -		if (!r->written) 
-			bch_btree_node_write(r, &op->cl); -		else if (btree_node_dirty(r)) -			bch_btree_node_write(r, writes); +	struct keylist keys; +	struct btree *n; + +	if (btree_check_reserve(b, NULL)) +		return 0; -		up_write(&r->lock); +	n = btree_node_alloc_replacement(replace, NULL); + +	/* recheck reserve after allocating replacement node */ +	if (btree_check_reserve(b, NULL)) { +		btree_node_free(n); +		rw_unlock(true, n); +		return 0;  	} -	int ret = 0, stale; -	unsigned i; -	struct gc_merge_info r[GC_MERGE_NODES]; +	bch_btree_node_write_sync(n); -	memset(r, 0, sizeof(r)); +	bch_keylist_init(&keys); +	bch_keylist_add(&keys, &n->key); -	while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { -		r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); +	make_btree_freeing_key(replace, keys.top); +	bch_keylist_push(&keys); -		if (IS_ERR(r->b)) { -			ret = PTR_ERR(r->b); -			break; -		} +	bch_btree_insert_node(b, op, &keys, NULL, NULL); +	BUG_ON(!bch_keylist_empty(&keys)); -		r->keys	= 0; -		stale = btree_gc_mark_node(r->b, &r->keys, gc); +	btree_node_free(replace); +	rw_unlock(true, n); -		if (!b->written && -		    (r->b->level || stale > 10 || -		     b->c->gc_always_rewrite)) -			r->b = btree_gc_alloc(r->b, r->k, op); +	/* Invalidated our iterator */ +	return -EINTR; +} + +static unsigned btree_gc_count_keys(struct btree *b) +{ +	struct bkey *k; +	struct btree_iter iter; +	unsigned ret = 0; -		if (r->b->level) -			ret = btree_gc_recurse(r->b, op, writes, gc); +	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) +		ret += bkey_u64s(k); -		if (ret) { -			write(r->b); -			break; +	return ret; +} + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, +			    struct closure *writes, struct gc_stat *gc) +{ +	int ret = 0; +	bool should_rewrite; +	struct bkey *k; +	struct btree_iter iter; +	struct gc_merge_info r[GC_MERGE_NODES]; +	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; + +	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + +	for (i = r; i < r + ARRAY_SIZE(r); i++) +		i->b = ERR_PTR(-EINTR); + +	while (1) { +		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); +		if (k) { +			r->b = bch_btree_node_get(b->c, op, k, b->level - 1, +						  true); +			if (IS_ERR(r->b)) { +				ret = PTR_ERR(r->b); +				break; +			} + +			r->keys = btree_gc_count_keys(r->b); + +			ret = btree_gc_coalesce(b, op, gc, r); +			if (ret) +				break;  		} -		bkey_copy_key(&b->c->gc_done, r->k); +		if (!last->b) +			break; -		if (!b->written) -			btree_gc_coalesce(b, op, gc, r); +		if (!IS_ERR(last->b)) { +			should_rewrite = btree_gc_mark_node(last->b, gc); +			if (should_rewrite) { +				ret = btree_gc_rewrite_node(b, op, last->b); +				if (ret) +					break; +			} -		if (r[GC_MERGE_NODES - 1].b) -			write(r[GC_MERGE_NODES - 1].b); +			if (last->b->level) { +				ret = btree_gc_recurse(last->b, op, writes, gc); +				if (ret) +					break; +			} -		memmove(&r[1], &r[0], -			sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); +			bkey_copy_key(&b->c->gc_done, &last->b->key); + +			/* +			 * Must flush leaf nodes before gc ends, since replace +			 * operations aren't journalled +			 */ +			mutex_lock(&last->b->write_lock); +			if (btree_node_dirty(last->b)) +				bch_btree_node_write(last->b, writes); +			mutex_unlock(&last->b->write_lock); +			rw_unlock(true, last->b); +		} + +		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); +		r->b = NULL; -		/* When we've got incremental GC working, we'll want to do -		 * if (should_resched()) -		 *	return -EAGAIN; -		 */ -		cond_resched(); 
-#if 0  		if (need_resched()) {  			ret = -EAGAIN;  			break;  		} -#endif  	} -	for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) -		write(r[i].b); - -	/* Might have freed some children, must remove their keys */ -	if (!b->written) -		bch_btree_sort(b); +	for (i = r; i < r + ARRAY_SIZE(r); i++) +		if (!IS_ERR_OR_NULL(i->b)) { +			mutex_lock(&i->b->write_lock); +			if (btree_node_dirty(i->b)) +				bch_btree_node_write(i->b, writes); +			mutex_unlock(&i->b->write_lock); +			rw_unlock(true, i->b); +		}  	return ret;  } @@ -1376,29 +1584,34 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,  			     struct closure *writes, struct gc_stat *gc)  {  	struct btree *n = NULL; -	unsigned keys = 0; -	int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); +	int ret = 0; +	bool should_rewrite; -	if (b->level || stale > 10) +	should_rewrite = btree_gc_mark_node(b, gc); +	if (should_rewrite) {  		n = btree_node_alloc_replacement(b, NULL); -	if (!IS_ERR_OR_NULL(n)) -		swap(b, n); +		if (!IS_ERR_OR_NULL(n)) { +			bch_btree_node_write_sync(n); -	if (b->level) -		ret = btree_gc_recurse(b, op, writes, gc); +			bch_btree_set_root(n); +			btree_node_free(b); +			rw_unlock(true, n); -	if (!b->written || btree_node_dirty(b)) { -		bch_btree_node_write(b, n ? &op->cl : NULL); +			return -EINTR; +		}  	} -	if (!IS_ERR_OR_NULL(n)) { -		closure_sync(&op->cl); -		bch_btree_set_root(b); -		btree_node_free(n, op); -		rw_unlock(true, b); +	__bch_btree_mark_key(b->c, b->level + 1, &b->key); + +	if (b->level) { +		ret = btree_gc_recurse(b, op, writes, gc); +		if (ret) +			return ret;  	} +	bkey_copy_key(&b->c->gc_done, &b->key); +  	return ret;  } @@ -1418,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c)  	for_each_cache(ca, c, i)  		for_each_bucket(b, ca) { -			b->gc_gen = b->gen; +			b->last_gc = b->gen;  			if (!atomic_read(&b->pin)) { -				SET_GC_MARK(b, GC_MARK_RECLAIMABLE); +				SET_GC_MARK(b, 0);  				SET_GC_SECTORS_USED(b, 0);  			}  		} @@ -1428,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c)  	mutex_unlock(&c->bucket_lock);  } -size_t bch_btree_gc_finish(struct cache_set *c) +static size_t bch_btree_gc_finish(struct cache_set *c)  {  	size_t available = 0;  	struct bucket *b; @@ -1441,15 +1654,32 @@ size_t bch_btree_gc_finish(struct cache_set *c)  	c->gc_mark_valid = 1;  	c->need_gc	= 0; -	if (c->root) -		for (i = 0; i < KEY_PTRS(&c->root->key); i++) -			SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), -				    GC_MARK_METADATA); -  	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)  		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),  			    GC_MARK_METADATA); +	/* don't reclaim buckets to which writeback keys point */ +	rcu_read_lock(); +	for (i = 0; i < c->nr_uuids; i++) { +		struct bcache_device *d = c->devices[i]; +		struct cached_dev *dc; +		struct keybuf_key *w, *n; +		unsigned j; + +		if (!d || UUID_FLASH_ONLY(&c->uuids[i])) +			continue; +		dc = container_of(d, struct cached_dev, disk); + +		spin_lock(&dc->writeback_keys.lock); +		rbtree_postorder_for_each_entry_safe(w, n, +					&dc->writeback_keys.keys, node) +			for (j = 0; j < KEY_PTRS(&w->key); j++) +				SET_GC_MARK(PTR_BUCKET(c, &w->key, j), +					    GC_MARK_DIRTY); +		spin_unlock(&dc->writeback_keys.lock); +	} +	rcu_read_unlock(); +  	for_each_cache(ca, c, i) {  		uint64_t *i; @@ -1463,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c)  			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);  		for_each_bucket(b, ca) { -			b->last_gc	= b->gc_gen;  			c->need_gc	= max(c->need_gc, bucket_gc_gen(b)); -			if 
(!atomic_read(&b->pin) && -			    GC_MARK(b) == GC_MARK_RECLAIMABLE) { +			if (atomic_read(&b->pin)) +				continue; + +			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + +			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)  				available++; -				if (!GC_SECTORS_USED(b)) -					bch_bucket_add_unused(ca, b); -			}  		}  	} @@ -1479,9 +1709,8 @@ size_t bch_btree_gc_finish(struct cache_set *c)  	return available;  } -static void bch_btree_gc(struct closure *cl) +static void bch_btree_gc(struct cache_set *c)  { -	struct cache_set *c = container_of(cl, struct cache_set, gc.cl);  	int ret;  	unsigned long available;  	struct gc_stat stats; @@ -1493,632 +1722,505 @@ static void bch_btree_gc(struct closure *cl)  	memset(&stats, 0, sizeof(struct gc_stat));  	closure_init_stack(&writes); -	bch_btree_op_init_stack(&op); -	op.lock = SHRT_MAX; +	bch_btree_op_init(&op, SHRT_MAX);  	btree_gc_start(c); -	atomic_inc(&c->prio_blocked); - -	ret = btree_root(gc_root, c, &op, &writes, &stats); -	closure_sync(&op.cl); -	closure_sync(&writes); - -	if (ret) { -		pr_warn("gc failed!"); -		continue_at(cl, bch_btree_gc, bch_gc_wq); -	} +	do { +		ret = btree_root(gc_root, c, &op, &writes, &stats); +		closure_sync(&writes); -	/* Possibly wait for new UUIDs or whatever to hit disk */ -	bch_journal_meta(c, &op.cl); -	closure_sync(&op.cl); +		if (ret && ret != -EAGAIN) +			pr_warn("gc failed!"); +	} while (ret);  	available = bch_btree_gc_finish(c); - -	atomic_dec(&c->prio_blocked);  	wake_up_allocators(c);  	bch_time_stats_update(&c->btree_gc_time, start_time);  	stats.key_bytes *= sizeof(uint64_t); -	stats.dirty	<<= 9;  	stats.data	<<= 9;  	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;  	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));  	trace_bcache_gc_end(c); -	continue_at(cl, bch_moving_gc, bch_gc_wq); +	bch_moving_gc(c);  } -void bch_queue_gc(struct cache_set *c) -{ -	closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); -} - -/* Initial partial gc */ - -static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, -				   unsigned long **seen) +static int bch_gc_thread(void *arg)  { -	int ret; +	struct cache_set *c = arg; +	struct cache *ca;  	unsigned i; -	struct bkey *k; -	struct bucket *g; -	struct btree_iter iter; -	for_each_key_filter(b, k, &iter, bch_ptr_invalid) { -		for (i = 0; i < KEY_PTRS(k); i++) { -			if (!ptr_available(b->c, k, i)) -				continue; +	while (1) { +again: +		bch_btree_gc(c); -			g = PTR_BUCKET(b->c, k, i); +		set_current_state(TASK_INTERRUPTIBLE); +		if (kthread_should_stop()) +			break; -			if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), -						seen[PTR_DEV(k, i)]) || -			    !ptr_stale(b->c, k, i)) { -				g->gen = PTR_GEN(k, i); +		mutex_lock(&c->bucket_lock); -				if (b->level) -					g->prio = BTREE_PRIO; -				else if (g->prio == BTREE_PRIO) -					g->prio = INITIAL_PRIO; +		for_each_cache(ca, c, i) +			if (ca->invalidate_needs_gc) { +				mutex_unlock(&c->bucket_lock); +				set_current_state(TASK_RUNNING); +				goto again;  			} -		} - -		btree_mark_key(b, k); -	} - -	if (b->level) { -		k = bch_next_recurse_key(b, &ZERO_KEY); -		while (k) { -			struct bkey *p = bch_next_recurse_key(b, k); -			if (p) -				btree_node_prefetch(b->c, p, b->level - 1); - -			ret = btree(check_recurse, k, b, op, seen); -			if (ret) -				return ret; +		mutex_unlock(&c->bucket_lock); -			k = p; -		} +		try_to_freeze(); +		schedule();  	}  	return 0;  } -int bch_btree_check(struct cache_set *c, struct btree_op *op) +int bch_gc_thread_start(struct cache_set *c)  { -	
int ret = -ENOMEM; -	unsigned i; -	unsigned long *seen[MAX_CACHES_PER_SET]; +	c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); +	if (IS_ERR(c->gc_thread)) +		return PTR_ERR(c->gc_thread); -	memset(seen, 0, sizeof(seen)); - -	for (i = 0; c->cache[i]; i++) { -		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); -		seen[i] = kmalloc(n, GFP_KERNEL); -		if (!seen[i]) -			goto err; - -		/* Disables the seen array until prio_read() uses it too */ -		memset(seen[i], 0xFF, n); -	} - -	ret = btree_root(check_recurse, c, op, seen); -err: -	for (i = 0; i < MAX_CACHES_PER_SET; i++) -		kfree(seen[i]); -	return ret; +	set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); +	return 0;  } -/* Btree insertion */ - -static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) -{ -	struct bset *i = b->sets[b->nsets].data; - -	memmove((uint64_t *) where + bkey_u64s(insert), -		where, -		(void *) end(i) - (void *) where); - -	i->keys += bkey_u64s(insert); -	bkey_copy(where, insert); -	bch_bset_fix_lookup_table(b, where); -} +/* Initial partial gc */ -static bool fix_overlapping_extents(struct btree *b, -				    struct bkey *insert, -				    struct btree_iter *iter, -				    struct btree_op *op) +static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)  { -	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) -	{ -		if (KEY_DIRTY(k)) -			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), -						     offset, -sectors); -	} - -	uint64_t old_offset; -	unsigned old_size, sectors_found = 0; - -	while (1) { -		struct bkey *k = bch_btree_iter_next(iter); -		if (!k || -		    bkey_cmp(&START_KEY(k), insert) >= 0) -			break; +	int ret = 0; +	struct bkey *k, *p = NULL; +	struct btree_iter iter; -		if (bkey_cmp(k, &START_KEY(insert)) <= 0) -			continue; +	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) +		bch_initial_mark_key(b->c, b->level, k); -		old_offset = KEY_START(k); -		old_size = KEY_SIZE(k); +	bch_initial_mark_key(b->c, b->level + 1, &b->key); -		/* -		 * We might overlap with 0 size extents; we can't skip these -		 * because if they're in the set we're inserting to we have to -		 * adjust them so they don't overlap with the key we're -		 * inserting. But we don't want to check them for BTREE_REPLACE -		 * operations. 
-		 */ +	if (b->level) { +		bch_btree_iter_init(&b->keys, &iter, NULL); -		if (op->type == BTREE_REPLACE && -		    KEY_SIZE(k)) { -			/* -			 * k might have been split since we inserted/found the -			 * key we're replacing -			 */ -			unsigned i; -			uint64_t offset = KEY_START(k) - -				KEY_START(&op->replace); +		do { +			k = bch_btree_iter_next_filter(&iter, &b->keys, +						       bch_ptr_bad); +			if (k) +				btree_node_prefetch(b->c, k, b->level - 1); -			/* But it must be a subset of the replace key */ -			if (KEY_START(k) < KEY_START(&op->replace) || -			    KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) -				goto check_failed; +			if (p) +				ret = btree(check_recurse, p, b, op); -			/* We didn't find a key that we were supposed to */ -			if (KEY_START(k) > KEY_START(insert) + sectors_found) -				goto check_failed; +			p = k; +		} while (p && !ret); +	} -			if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) -				goto check_failed; +	return ret; +} -			/* skip past gen */ -			offset <<= 8; +int bch_btree_check(struct cache_set *c) +{ +	struct btree_op op; -			BUG_ON(!KEY_PTRS(&op->replace)); +	bch_btree_op_init(&op, SHRT_MAX); -			for (i = 0; i < KEY_PTRS(&op->replace); i++) -				if (k->ptr[i] != op->replace.ptr[i] + offset) -					goto check_failed; +	return btree_root(check_recurse, c, &op); +} -			sectors_found = KEY_OFFSET(k) - KEY_START(insert); -		} +void bch_initial_gc_finish(struct cache_set *c) +{ +	struct cache *ca; +	struct bucket *b; +	unsigned i; -		if (bkey_cmp(insert, k) < 0 && -		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { -			/* -			 * We overlapped in the middle of an existing key: that -			 * means we have to split the old key. But we have to do -			 * slightly different things depending on whether the -			 * old key has been written out yet. -			 */ +	bch_btree_gc_finish(c); -			struct bkey *top; - -			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); - -			if (bkey_written(b, k)) { -				/* -				 * We insert a new key to cover the top of the -				 * old key, and the old key is modified in place -				 * to represent the bottom split. -				 * -				 * It's completely arbitrary whether the new key -				 * is the top or the bottom, but it has to match -				 * up with what btree_sort_fixup() does - it -				 * doesn't check for this kind of overlap, it -				 * depends on us inserting a new key for the top -				 * here. -				 */ -				top = bch_bset_search(b, &b->sets[b->nsets], -						      insert); -				shift_keys(b, top, k); -			} else { -				BKEY_PADDED(key) temp; -				bkey_copy(&temp.key, k); -				shift_keys(b, k, &temp.key); -				top = bkey_next(k); -			} +	mutex_lock(&c->bucket_lock); -			bch_cut_front(insert, top); -			bch_cut_back(&START_KEY(insert), k); -			bch_bset_fix_invalidated_key(b, k); -			return false; -		} +	/* +	 * We need to put some unused buckets directly on the prio freelist in +	 * order to get the allocator thread started - it needs freed buckets in +	 * order to rewrite the prios and gens, and it needs to rewrite prios +	 * and gens in order to free buckets. +	 * +	 * This is only safe for buckets that have no live data in them, which +	 * there should always be some of. 
 
-static bool btree_insert_key(struct btree *b, struct btree_op *op,
-			     struct bkey *k)
+/* Btree insertion */
+
+static bool btree_insert_key(struct btree *b, struct bkey *k,
+			     struct bkey *replace_key)
 {
-	struct bset *i = b->sets[b->nsets].data;
-	struct bkey *m, *prev;
-	unsigned status = BTREE_INSERT_STATUS_INSERT;
+	unsigned status;
 
 	BUG_ON(bkey_cmp(k, &b->key) > 0);
-	BUG_ON(b->level && !KEY_PTRS(k));
-	BUG_ON(!b->level && !KEY_OFFSET(k));
 
-	if (!b->level) {
-		struct btree_iter iter;
-		struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
+	status = bch_btree_insert_key(&b->keys, k, replace_key);
+	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
+		bch_check_keys(&b->keys, "%u for %s", status,
+			       replace_key ? "replace" : "insert");
 
-		/*
-		 * bset_search() returns the first key that is strictly greater
-		 * than the search key - but for back merging, we want to find
-		 * the first key that is greater than or equal to KEY_START(k) -
-		 * unless KEY_START(k) is 0.
-		 */
-		if (KEY_OFFSET(&search))
-			SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
-
-		prev = NULL;
-		m = bch_btree_iter_init(b, &iter, &search);
-
-		if (fix_overlapping_extents(b, k, &iter, op))
-			return false;
-
-		while (m != end(i) &&
-		       bkey_cmp(k, &START_KEY(m)) > 0)
-			prev = m, m = bkey_next(m);
-
-		if (key_merging_disabled(b->c))
-			goto insert;
-
-		/* prev is in the tree, if we merge we're done */
-		status = BTREE_INSERT_STATUS_BACK_MERGE;
-		if (prev &&
-		    bch_bkey_try_merge(b, prev, k))
-			goto merged;
-
-		status = BTREE_INSERT_STATUS_OVERWROTE;
-		if (m != end(i) &&
-		    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
-			goto copy;
-
-		status = BTREE_INSERT_STATUS_FRONT_MERGE;
-		if (m != end(i) &&
-		    bch_bkey_try_merge(b, k, m))
-			goto copy;
+		trace_bcache_btree_insert_key(b, k, replace_key != NULL,
+					      status);
+		return true;
 	} else
-		m = bch_bset_search(b, &b->sets[b->nsets], k);
-
-insert:	shift_keys(b, m, k);
-copy:	bkey_copy(m, k);
-merged:
-	if (KEY_DIRTY(k))
-		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
-					     KEY_START(k), KEY_SIZE(k));
-
-	bch_check_keys(b, "%u for %s", status, op_type(op));
-
-	if (b->level && !KEY_OFFSET(k))
-		btree_current_write(b)->prio_blocked++;
-
-	trace_bcache_btree_insert_key(b, k, op->type, status);
-
-	return true;
+		return false;
 }
 
-static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
+static size_t insert_u64s_remaining(struct btree *b)
 {
-	bool ret = false;
-	struct bkey *k;
-	unsigned oldsize = bch_count_data(b);
+	long ret = bch_btree_keys_u64s_remaining(&b->keys);
 
-	while ((k = bch_keylist_pop(&op->keys))) {
-		bkey_put(b->c, k, b->level);
-		ret |= btree_insert_key(b, op, k);
-	}
+	/*
+	 * Might land in the middle of an existing extent and have to split it
+	 */
+	if (b->keys.ops->is_extents)
+		ret -= KEY_MAX_U64S;
 
-	BUG_ON(bch_count_data(b) < oldsize);
-	return ret;
+	return max(ret, 0L);
 }
 
-bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
-				   struct bio *bio)
+static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
+				  struct keylist *insert_keys,
+				  struct bkey *replace_key)
 {
 	bool ret = false;
-	uint64_t btree_ptr = b->key.ptr[0];
-	unsigned long seq = b->seq;
-	BKEY_PADDED(k) tmp;
+	int oldsize = bch_count_data(&b->keys);
 
-	rw_unlock(false, b);
-	rw_lock(true, b, b->level);
+	while (!bch_keylist_empty(insert_keys)) {
+		struct bkey *k = insert_keys->keys;
 
-	if (b->key.ptr[0] != btree_ptr ||
-	    b->seq != seq + 1 ||
-	    should_split(b))
-		goto out;
+		if (bkey_u64s(k) > insert_u64s_remaining(b))
+			break;
+
+		if (bkey_cmp(k, &b->key) <= 0) {
+			if (!b->level)
+				bkey_put(b->c, k);
+
+			ret |= btree_insert_key(b, k, replace_key);
+			bch_keylist_pop_front(insert_keys);
+		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
+			BKEY_PADDED(key) temp;
+			bkey_copy(&temp.key, insert_keys->keys);
 
-	op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
+			bch_cut_back(&b->key, &temp.key);
+			bch_cut_front(&b->key, insert_keys->keys);
 
-	SET_KEY_PTRS(&op->replace, 1);
-	get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
+			ret |= btree_insert_key(b, &temp.key, replace_key);
+			break;
+		} else {
+			break;
+		}
+	}
 
-	SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);
+	if (!ret)
+		op->insert_collision = true;
 
-	bkey_copy(&tmp.k, &op->replace);
+	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
 
-	BUG_ON(op->type != BTREE_INSERT);
-	BUG_ON(!btree_insert_key(b, op, &tmp.k));
-	ret = true;
-out:
-	downgrade_write(&b->lock);
+	BUG_ON(bch_count_data(&b->keys) < oldsize);
 	return ret;
 }
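The KEY_MAX_U64S reservation in insert_u64s_remaining() above is easiest to see with a concrete extent. Recall that a bcache extent key stores its end offset, so KEY_START(k) = KEY_OFFSET(k) - KEY_SIZE(k); the numbers below are only an illustration, not taken from this diff.

/*
 * Worked example (illustrative values):
 *
 *	already in the leaf:  KEY(1, 128, 128)   covers sectors   0..128
 *	key being inserted:   KEY(1,  64,  32)   covers sectors  32..64
 *
 * The insert lands in the middle of the existing extent, which has to be
 * rewritten as two fragments (0..32 and 64..128).  The bset therefore grows
 * by one key beyond what was inserted, so on extent btrees a node only counts
 * as having room if it still has KEY_MAX_U64S of slack on top of the keys
 * actually queued for insertion.
 */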
 
-static int btree_split(struct btree *b, struct btree_op *op)
+static int btree_split(struct btree *b, struct btree_op *op,
+		       struct keylist *insert_keys,
+		       struct bkey *replace_key)
 {
-	bool split, root = b == b->c->root;
+	bool split;
 	struct btree *n1, *n2 = NULL, *n3 = NULL;
 	uint64_t start_time = local_clock();
+	struct closure cl;
+	struct keylist parent_keys;
 
-	if (b->level)
-		set_closure_blocking(&op->cl);
+	closure_init_stack(&cl);
+	bch_keylist_init(&parent_keys);
 
-	n1 = btree_node_alloc_replacement(b, &op->cl);
+	if (btree_check_reserve(b, op)) {
+		if (!b->level)
+			return -EINTR;
+		else
+			WARN(1, "insufficient reserve for split\n");
+	}
+
+	n1 = btree_node_alloc_replacement(b, op);
 	if (IS_ERR(n1))
 		goto err;
 
-	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
+	split = set_blocks(btree_bset_first(n1),
+			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
 
 	if (split) {
 		unsigned keys = 0;
 
-		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
+		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
-		n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
+		n2 = bch_btree_node_alloc(b->c, op, b->level);
 		if (IS_ERR(n2))
 			goto err_free1;
 
-		if (root) {
-			n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
+		if (!b->parent) {
+			n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
 			if (IS_ERR(n3))
 				goto err_free2;
 		}
 
-		bch_btree_insert_keys(n1, op);
+		mutex_lock(&n1->write_lock);
+		mutex_lock(&n2->write_lock);
+
+		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
 
-		/* Has to be a linear search because we don't have an auxiliary
+		/*
+		 * Has to be a linear search because we don't have an auxiliary
 		 * search tree yet
 		 */
 
-		while (keys < (n1->sets[0].data->keys * 3) / 5)
-			keys += bkey_u64s(node(n1->sets[0].data, keys));
+		while (keys < (btree_bset_first(n1)->keys * 3) / 5)
+			keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
+							keys));
 
-		bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
-		keys += bkey_u64s(node(n1->sets[0].data, keys));
+		bkey_copy_key(&n1->key,
+			      bset_bkey_idx(btree_bset_first(n1), keys));
+		keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
 
-		n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
-		n1->sets[0].data->keys = keys;
+		btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
+		btree_bset_first(n1)->keys = keys;
 
-		memcpy(n2->sets[0].data->start,
-		       end(n1->sets[0].data),
-		       n2->sets[0].data->keys * sizeof(uint64_t));
+		memcpy(btree_bset_first(n2)->start,
+		       bset_bkey_last(btree_bset_first(n1)),
+		       btree_bset_first(n2)->keys * sizeof(uint64_t));
 
 		bkey_copy_key(&n2->key, &b->key);
 
-		bch_keylist_add(&op->keys, &n2->key);
-		bch_btree_node_write(n2, &op->cl);
+		bch_keylist_add(&parent_keys, &n2->key);
+		bch_btree_node_write(n2, &cl);
+		mutex_unlock(&n2->write_lock);
 		rw_unlock(true, n2);
 	} else {
-		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
+		trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
 
-		bch_btree_insert_keys(n1, op);
+		mutex_lock(&n1->write_lock);
+		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
 	}
 
-	bch_keylist_add(&op->keys, &n1->key);
-	bch_btree_node_write(n1, &op->cl);
+	bch_keylist_add(&parent_keys, &n1->key);
+	bch_btree_node_write(n1, &cl);
+	mutex_unlock(&n1->write_lock);
 
 	if (n3) {
+		/* Depth increases, make a new root */
+		mutex_lock(&n3->write_lock);
 		bkey_copy_key(&n3->key, &MAX_KEY);
-		bch_btree_insert_keys(n3, op);
-		bch_btree_node_write(n3, &op->cl);
+		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
+		bch_btree_node_write(n3, &cl);
+		mutex_unlock(&n3->write_lock);
 
-		closure_sync(&op->cl);
+		closure_sync(&cl);
 		bch_btree_set_root(n3);
 		rw_unlock(true, n3);
-	} else if (root) {
-		op->keys.top = op->keys.bottom;
-		closure_sync(&op->cl);
+	} else if (!b->parent) {
+		/* Root filled up but didn't need to be split */
+		closure_sync(&cl);
 		bch_btree_set_root(n1);
 	} else {
-		unsigned i;
-
-		bkey_copy(op->keys.top, &b->key);
-		bkey_copy_key(op->keys.top, &ZERO_KEY);
-
-		for (i = 0; i < KEY_PTRS(&b->key); i++) {
-			uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
+		/* Split a non root node */
+		closure_sync(&cl);
+		make_btree_freeing_key(b, parent_keys.top);
+		bch_keylist_push(&parent_keys);
 
-			SET_PTR_GEN(op->keys.top, i, g);
-		}
-
-		bch_keylist_push(&op->keys);
-		closure_sync(&op->cl);
-		atomic_inc(&b->c->prio_blocked);
+		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
+		BUG_ON(!bch_keylist_empty(&parent_keys));
 	}
 
+	btree_node_free(b);
 	rw_unlock(true, n1);
-	btree_node_free(b, op);
 
 	bch_time_stats_update(&b->c->btree_split_time, start_time);
 
 	return 0;
 err_free2:
-	__bkey_put(n2->c, &n2->key);
-	btree_node_free(n2, op);
+	bkey_put(b->c, &n2->key);
+	btree_node_free(n2);
 	rw_unlock(true, n2);
 err_free1:
-	__bkey_put(n1->c, &n1->key);
-	btree_node_free(n1, op);
+	bkey_put(b->c, &n1->key);
+	btree_node_free(n1);
 	rw_unlock(true, n1);
 err:
+	WARN(1, "bcache: btree split failed (level %u)", b->level);
+
 	if (n3 == ERR_PTR(-EAGAIN) ||
 	    n2 == ERR_PTR(-EAGAIN) ||
 	    n1 == ERR_PTR(-EAGAIN))
 		return -EAGAIN;
 
-	pr_warn("couldn't split");
 	return -ENOMEM;
 }
 
-static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
-				    struct keylist *stack_keys)
+static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
+				 struct keylist *insert_keys,
+				 atomic_t *journal_ref,
+				 struct bkey *replace_key)
 {
-	if (b->level) {
-		int ret;
-		struct bkey *insert = op->keys.bottom;
-		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
+	struct closure cl;
 
-		if (!k) {
-			btree_bug(b, "no key to recurse on at level %i/%i",
-				  b->level, b->c->root->level);
+	BUG_ON(b->level && replace_key);
 
-			op->keys.top = op->keys.bottom;
-			return -EIO;
-		}
+	closure_init_stack(&cl);
 
-		if (bkey_cmp(insert, k) > 0) {
-			unsigned i;
+	mutex_lock(&b->write_lock);
 
-			if (op->type == BTREE_REPLACE) {
-				__bkey_put(b->c, insert);
-				op->keys.top = op->keys.bottom;
-				op->insert_collision = true;
-				return 0;
-			}
+	if (write_block(b) != btree_bset_last(b) &&
+	    b->keys.last_set_unwritten)
+		bch_btree_init_next(b); /* just wrote a set */
 
-			for (i = 0; i < KEY_PTRS(insert); i++)
-				atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);
+	if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
+		mutex_unlock(&b->write_lock);
+		goto split;
+	}
 
-			bkey_copy(stack_keys->top, insert);
+	BUG_ON(write_block(b) != btree_bset_last(b));
 
-			bch_cut_back(k, insert);
-			bch_cut_front(k, stack_keys->top);
+	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
+		if (!b->level)
+			bch_btree_leaf_dirty(b, journal_ref);
+		else
+			bch_btree_node_write(b, &cl);
+	}
 
-			bch_keylist_push(stack_keys);
-		}
+	mutex_unlock(&b->write_lock);
 
-		ret = btree(insert_recurse, k, b, op, stack_keys);
-		if (ret)
-			return ret;
+	/* wait for btree node write if necessary, after unlock */
+	closure_sync(&cl);
+
+	return 0;
+split:
+	if (current->bio_list) {
+		op->lock = b->c->root->level + 1;
+		return -EAGAIN;
+	} else if (op->lock <= b->c->root->level) {
+		op->lock = b->c->root->level + 1;
+		return -EINTR;
+	} else {
+		/* Invalidated all iterators */
+		int ret = btree_split(b, op, insert_keys, replace_key);
+
+		if (bch_keylist_empty(insert_keys))
+			return 0;
+		else if (!ret)
+			return -EINTR;
+		return ret;
 	}
+}
 
-	if (!bch_keylist_empty(&op->keys)) {
-		if (should_split(b)) {
-			if (op->lock <= b->c->root->level) {
-				BUG_ON(b->level);
-				op->lock = b->c->root->level + 1;
-				return -EINTR;
-			}
-			return btree_split(b, op);
-		}
+int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
+			       struct bkey *check_key)
+{
+	int ret = -EINTR;
+	uint64_t btree_ptr = b->key.ptr[0];
+	unsigned long seq = b->seq;
+	struct keylist insert;
+	bool upgrade = op->lock == -1;
 
-		BUG_ON(write_block(b) != b->sets[b->nsets].data);
+	bch_keylist_init(&insert);
 
-		if (bch_btree_insert_keys(b, op)) {
-			if (!b->level)
-				bch_btree_leaf_dirty(b, op);
-			else
-				bch_btree_node_write(b, &op->cl);
-		}
+	if (upgrade) {
+		rw_unlock(false, b);
+		rw_lock(true, b, b->level);
+
+		if (b->key.ptr[0] != btree_ptr ||
+		    b->seq != seq + 1)
+			goto out;
 	}
 
-	return 0;
+	SET_KEY_PTRS(check_key, 1);
+	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
+
+	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
+
+	bch_keylist_add(&insert, check_key);
+
+	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
+
+	BUG_ON(!ret && !bch_keylist_empty(&insert));
+out:
+	if (upgrade)
+		downgrade_write(&b->lock);
+	return ret;
 }
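bch_btree_insert_check_key() now takes the check key from its caller instead of deriving it from a bio: the caller describes the range it is about to read, and the helper fills in the PTR_CHECK_DEV pointer and pushes the key through bch_btree_insert_node(), upgrading to a write lock only when op->lock was -1 (returning -EINTR if the node changed under it). Below is a hypothetical caller modelled on the bio-based version being removed above; the real call site is in the request code, and the function name and parameters here are illustrative only.

/* Sketch only: modelled on the removed code, not taken from this diff. */
static int example_reserve_cache_miss(struct btree *b, struct btree_op *op,
				      unsigned inode, struct bio *bio)
{
	BKEY_PADDED(key) check;

	/* zero-pointer key spanning the sectors the miss will read */
	check.key = KEY(inode, bio_end_sector(bio), bio_sectors(bio));

	/* the helper adds the PTR_CHECK_DEV pointer itself */
	return bch_btree_insert_check_key(b, op, &check.key);
}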
 
-int bch_btree_insert(struct btree_op *op, struct cache_set *c)
+struct btree_insert_op {
+	struct btree_op	op;
+	struct keylist	*keys;
+	atomic_t	*journal_ref;
+	struct bkey	*replace_key;
+};
+
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
 {
-	int ret = 0;
-	struct keylist stack_keys;
+	struct btree_insert_op *op = container_of(b_op,
+					struct btree_insert_op, op);
 
-	/*
-	 * Don't want to block with the btree locked unless we have to,
-	 * otherwise we get deadlocks with try_harder and between split/gc
-	 */
-	clear_closure_blocking(&op->cl);
-
-	BUG_ON(bch_keylist_empty(&op->keys));
-	bch_keylist_copy(&stack_keys, &op->keys);
-	bch_keylist_init(&op->keys);
-
-	while (!bch_keylist_empty(&stack_keys) ||
-	       !bch_keylist_empty(&op->keys)) {
-		if (bch_keylist_empty(&op->keys)) {
-			bch_keylist_add(&op->keys,
-					bch_keylist_pop(&stack_keys));
-			op->lock = 0;
-		}
+	int ret = bch_btree_insert_node(b, &op->op, op->keys,
+					op->journal_ref, op->replace_key);
+	if (ret && !bch_keylist_empty(op->keys))
+		return ret;
+	else
+		return MAP_DONE;
+}
 
-		ret = btree_root(insert_recurse, c, op, &stack_keys);
+int bch_btree_insert(struct cache_set *c, struct keylist *keys,
+		     atomic_t *journal_ref, struct bkey *replace_key)
+{
+	struct btree_insert_op op;
+	int ret = 0;
 
-		if (ret == -EAGAIN) {
-			ret = 0;
-			closure_sync(&op->cl);
-		} else if (ret) {
-			struct bkey *k;
+	BUG_ON(current->bio_list);
+	BUG_ON(bch_keylist_empty(keys));
+
+	bch_btree_op_init(&op.op, 0);
+	op.keys		= keys;
+	op.journal_ref	= journal_ref;
+	op.replace_key	= replace_key;
+
+	while (!ret && !bch_keylist_empty(keys)) {
+		op.op.lock = 0;
+		ret = bch_btree_map_leaf_nodes(&op.op, c,
+					       &START_KEY(keys->keys),
+					       btree_insert_fn);
+	}
 
-			pr_err("error %i trying to insert key for %s",
-			       ret, op_type(op));
+	if (ret) {
+		struct bkey *k;
 
-			while ((k = bch_keylist_pop(&stack_keys) ?:
-				    bch_keylist_pop(&op->keys)))
-				bkey_put(c, k, 0);
-		}
-	}
+		pr_err("error %i", ret);
 
-	bch_keylist_free(&stack_keys);
+		while ((k = bch_keylist_pop(keys)))
+			bkey_put(c, k);
+	} else if (op.op.insert_collision)
+		ret = -ESRCH;
 
-	if (op->journal)
-		atomic_dec_bug(op->journal);
-	op->journal = NULL;
 	return ret;
 }
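bch_btree_insert() is now a thin wrapper that walks the leaves with bch_btree_map_leaf_nodes() and feeds each one to bch_btree_insert_node() via the embedded btree_insert_op. A sketch of the calling convention follows, assuming the caller has already filled a keylist; real callers (e.g. the write and journal-replay paths) build keys that point at allocated buckets, and the wrapper name below is illustrative.

/* Illustrative wrapper; only the bch_* calls are from this diff. */
static int example_insert_keys(struct cache_set *c, struct keylist *keys)
{
	int ret;

	/*
	 * NULL journal_ref: the insert is not tied to a journal entry.
	 * NULL replace_key: plain insert rather than compare-and-replace.
	 */
	ret = bch_btree_insert(c, keys, NULL, NULL);
	if (ret == -ESRCH)	/* nothing inserted, see op.op.insert_collision */
		pr_debug("bcache: insert collision");

	return ret;
}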
@@ -2141,132 +2243,81 @@ void bch_btree_set_root(struct btree *b)
 	mutex_unlock(&b->c->bucket_lock);
 
 	b->c->root = b;
-	__bkey_put(b->c, &b->key);
 
 	bch_journal_meta(b->c, &cl);
 	closure_sync(&cl);
 }
 
-/* Cache lookup */
+/* Map across nodes or keys */
 
-static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
-				     struct bkey *k)
+static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
+				       struct bkey *from,
+				       btree_map_nodes_fn *fn, int flags)
 {
-	struct search *s = container_of(op, struct search, op);
-	struct bio *bio = &s->bio.bio;
-	int ret = 0;
+	int ret = MAP_CONTINUE;
+
+	if (b->level) {
+		struct bkey *k;
+		struct btree_iter iter;
 
-	while (!ret &&
-	       !op->lookup_done) {
-		unsigned sectors = INT_MAX;
+		bch_btree_iter_init(&b->keys, &iter, from);
 
-		if (KEY_INODE(k) == op->inode) {
-			if (KEY_START(k) <= bio->bi_sector)
-				break;
+		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
+						       bch_ptr_bad))) {
+			ret = btree(map_nodes_recurse, k, b,
+				    op, from, fn, flags);
+			from = NULL;
 
-			sectors = min_t(uint64_t, sectors,
-					KEY_START(k) - bio->bi_sector);
+			if (ret != MAP_CONTINUE)
+				return ret;
 		}
-
-		ret = s->d->cache_miss(b, s, bio, sectors);
 	}
 
+	if (!b->level || flags == MAP_ALL_NODES)
+		ret = fn(op, b);
+
 	return ret;
 }
 
-/*
- * Read from a single key, handling the initial cache miss if the key starts in
- * the middle of the bio
- */
-static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
-				    struct bkey *k)
+int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
+			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
 {
-	struct search *s = container_of(op, struct search, op);
-	struct bio *bio = &s->bio.bio;
-	unsigned ptr;
-	struct bio *n;
-
-	int ret = submit_partial_cache_miss(b, op, k);
-	if (ret || op->lookup_done)
-		return ret;
-
-	/* XXX: figure out best pointer - for multiple cache devices */
-	ptr = 0;
-
-	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
-
-	while (!op->lookup_done &&
-	       KEY_INODE(k) == op->inode &&
-	       bio->bi_sector < KEY_OFFSET(k)) {
-		struct bkey *bio_key;
-		sector_t sector = PTR_OFFSET(k, ptr) +
-			(bio->bi_sector - KEY_START(k));
-		unsigned sectors = min_t(uint64_t, INT_MAX,
-					 KEY_OFFSET(k) - bio->bi_sector);
-
-		n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
-		if (n == bio)
-			op->lookup_done = true;
-
-		bio_key = &container_of(n, struct bbio, bio)->key;
-
-		/*
-		 * The bucket we're reading from might be reused while our bio
-		 * is in flight, and we could then end up reading the wrong
-		 * data.
-		 *
-		 * We guard against this by checking (in cache_read_endio()) if
-		 * the pointer is stale again; if so, we treat it as an error
-		 * and reread from the backing device (but we don't pass that
-		 * error up anywhere).
-		 */
-
-		bch_bkey_copy_single_ptr(bio_key, k, ptr);
-		SET_PTR_OFFSET(bio_key, 0, sector);
-
-		n->bi_end_io	= bch_cache_read_endio;
-		n->bi_private	= &s->cl;
-
-		__bch_submit_bbio(n, b->c);
-	}
-
-	return 0;
+	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
 }
 
-int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
+static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+				      struct bkey *from, btree_map_keys_fn *fn,
+				      int flags)
 {
-	struct search *s = container_of(op, struct search, op);
-	struct bio *bio = &s->bio.bio;
-
-	int ret = 0;
+	int ret = MAP_CONTINUE;
 	struct bkey *k;
 	struct btree_iter iter;
 
-	bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
-	do {
-		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
-		if (!k) {
-			/*
-			 * b->key would be exactly what we want, except that
-			 * pointers to btree nodes have nonzero size - we
-			 * wouldn't go far enough
-			 */
+	bch_btree_iter_init(&b->keys, &iter, from);
 
-			ret = submit_partial_cache_miss(b, op,
-					&KEY(KEY_INODE(&b->key),
-					     KEY_OFFSET(&b->key), 0));
-			break;
-		}
+	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
+		ret = !b->level
+			? fn(op, b, k)
+			: btree(map_keys_recurse, k, b, op, from, fn, flags);
+		from = NULL;
 
-		ret = b->level
-			? btree(search_recurse, k, b, op)
-			: submit_partial_cache_hit(b, op, k);
-	} while (!ret &&
-		 !op->lookup_done);
+		if (ret != MAP_CONTINUE)
+			return ret;
+	}
+
+	if (!b->level && (flags & MAP_END_KEY))
+		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
+				     KEY_OFFSET(&b->key), 0));
 
 	return ret;
 }
 
+int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
+		       struct bkey *from, btree_map_keys_fn *fn, int flags)
+{
+	return btree_root(map_keys_recurse, c, op, from, fn, flags);
+}
+
 /* Keybuf code */
 
 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
@@ -2285,80 +2336,79 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
 	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
 }
 
-static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
-				   struct keybuf *buf, struct bkey *end,
-				   keybuf_pred_fn *pred)
-{
-	struct btree_iter iter;
-	bch_btree_iter_init(b, &iter, &buf->last_scanned);
-
-	while (!array_freelist_empty(&buf->freelist)) {
-		struct bkey *k = bch_btree_iter_next_filter(&iter, b,
-							    bch_ptr_bad);
-
-		if (!b->level) {
-			if (!k) {
-				buf->last_scanned = b->key;
-				break;
-			}
+struct refill {
+	struct btree_op	op;
+	unsigned	nr_found;
+	struct keybuf	*buf;
+	struct bkey	*end;
+	keybuf_pred_fn	*pred;
+};
 
-			buf->last_scanned = *k;
-			if (bkey_cmp(&buf->last_scanned, end) >= 0)
-				break;
+static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
+			    struct bkey *k)
+{
+	struct refill *refill = container_of(op, struct refill, op);
+	struct keybuf *buf = refill->buf;
+	int ret = MAP_CONTINUE;
 
-			if (pred(buf, k)) {
-				struct keybuf_key *w;
+	if (bkey_cmp(k, refill->end) >= 0) {
+		ret = MAP_DONE;
+		goto out;
+	}
 
-				spin_lock(&buf->lock);
+	if (!KEY_SIZE(k)) /* end key */
+		goto out;
 
-				w = array_alloc(&buf->freelist);
+	if (refill->pred(buf, k)) {
+		struct keybuf_key *w;
 
-				w->private = NULL;
-				bkey_copy(&w->key, k);
+		spin_lock(&buf->lock);
 
-				if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
-					array_free(&buf->freelist, w);
+		w = array_alloc(&buf->freelist);
+		if (!w) {
+			spin_unlock(&buf->lock);
+			return MAP_DONE;
+		}
 
-				spin_unlock(&buf->lock);
-			}
-		} else {
-			if (!k)
-				break;
+		w->private = NULL;
+		bkey_copy(&w->key, k);
 
-			btree(refill_keybuf, k, b, op, buf, end, pred);
-			/*
-			 * Might get an error here, but can't really do anything
-			 * and it'll get logged elsewhere. Just read what we
-			 * can.
-			 */
+		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
+			array_free(&buf->freelist, w);
+		else
+			refill->nr_found++;
 
-			if (bkey_cmp(&buf->last_scanned, end) >= 0)
-				break;
+		if (array_freelist_empty(&buf->freelist))
+			ret = MAP_DONE;
 
-			cond_resched();
-		}
+		spin_unlock(&buf->lock);
 	}
-
-	return 0;
+out:
+	buf->last_scanned = *k;
+	return ret;
 }
 
 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
 		       struct bkey *end, keybuf_pred_fn *pred)
 {
 	struct bkey start = buf->last_scanned;
-	struct btree_op op;
-	bch_btree_op_init_stack(&op);
+	struct refill refill;
 
 	cond_resched();
 
-	btree_root(refill_keybuf, c, &op, buf, end, pred);
-	closure_sync(&op.cl);
+	bch_btree_op_init(&refill.op, -1);
+	refill.nr_found	= 0;
+	refill.buf	= buf;
+	refill.end	= end;
+	refill.pred	= pred;
+
+	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
+			   refill_keybuf_fn, MAP_END_KEY);
 
-	pr_debug("found %s keys from %llu:%llu to %llu:%llu",
-		 RB_EMPTY_ROOT(&buf->keys) ? "no" :
-		 array_freelist_empty(&buf->freelist) ? "some" : "a few",
-		 KEY_INODE(&start), KEY_OFFSET(&start),
-		 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
+	trace_bcache_keyscan(refill.nr_found,
+			     KEY_INODE(&start), KEY_OFFSET(&start),
+			     KEY_INODE(&buf->last_scanned),
+			     KEY_OFFSET(&buf->last_scanned));
 
 	spin_lock(&buf->lock);
@@ -2436,9 +2486,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
 }
 
 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
-					     struct keybuf *buf,
-					     struct bkey *end,
-					     keybuf_pred_fn *pred)
+					  struct keybuf *buf,
+					  struct bkey *end,
+					  keybuf_pred_fn *pred)
 {
 	struct keybuf_key *ret;
@@ -2466,20 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf)
 	spin_lock_init(&buf->lock);
 	array_allocator_init(&buf->freelist);
 }
-
-void bch_btree_exit(void)
-{
-	if (btree_io_wq)
-		destroy_workqueue(btree_io_wq);
-	if (bch_gc_wq)
-		destroy_workqueue(bch_gc_wq);
-}
-
-int __init bch_btree_init(void)
-{
-	if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
-	    !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
-		return -ENOMEM;
-
-	return 0;
-}
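refill_keybuf_fn()/bch_refill_keybuf() above are the template for the new map interface: embed a struct btree_op at the start of your own op structure, recover it with container_of() in the callback, and return MAP_CONTINUE or MAP_DONE. Below is a minimal hypothetical user that just counts keys up to some end key; everything except the bch_*/MAP_* names is made up for illustration.

struct count_op {
	struct btree_op	op;
	struct bkey	*end;
	unsigned	nr_keys;
};

static int count_keys_fn(struct btree_op *b_op, struct btree *b,
			 struct bkey *k)
{
	struct count_op *op = container_of(b_op, struct count_op, op);

	if (bkey_cmp(k, op->end) >= 0)
		return MAP_DONE;

	op->nr_keys++;
	return MAP_CONTINUE;
}

static unsigned example_count_keys(struct cache_set *c, struct bkey *end)
{
	struct count_op op = { .end = end, .nr_keys = 0 };

	bch_btree_op_init(&op.op, -1);	/* -1: read locks only, as in bch_refill_keybuf() */

	/* errors from the btree walk are ignored here for brevity */
	bch_btree_map_keys(&op.op, c, &ZERO_KEY, count_keys_fn, 0);

	return op.nr_keys;
}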
