Merge branch 'for-3.15/drivers' of git://git.kernel.dk/linux-block

Pull block driver update from Jens Axboe: "On top of the core pull request, here's the pull request for the driver related changes for 3.15. It contains: - Improvements for msi-x registration for block drivers (mtip32xx, skd, cciss, nvme) from Alexander Gordeev. - A round of cleanups and improvements for drbd from Andreas Gruenbacher and Rashika Kheria. - A round of cleanups and improvements for bcache from Kent. - Removal of sleep_on() and friends in DAC960, ataflop, swim3 from Arnd Bergmann. - Bug fix for a bug in the mtip32xx async completion code from Sam Bradshaw. - Bug fix for accidentally bouncing IO on 32-bit platforms with mtip32xx from Felipe Franciosi" * 'for-3.15/drivers' of git://git.kernel.dk/linux-block: (103 commits) bcache: remove nested function usage bcache: Kill bucket->gc_gen bcache: Kill unused freelist bcache: Rework btree cache reserve handling bcache: Kill btree_io_wq bcache: btree locking rework bcache: Fix a race when freeing btree nodes bcache: Add a real GC_MARK_RECLAIMABLE bcache: Add bch_keylist_init_single() bcache: Improve priority_stats bcache: Better alloc tracepoints bcache: Kill dead cgroup code bcache: stop moving_gc marking buckets that can't be moved. bcache: Fix moving_pred() bcache: Fix moving_gc deadlocking with a foreground write bcache: Fix discard granularity bcache: Fix another bug recovering from unclean shutdown bcache: Fix a bug recovering from unclean shutdown bcache: Fix a journalling reclaim after recovery bug bcache: Fix a null ptr deref in journal replay ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2014-04-01 19:43:53 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-04-01 19:43:53 -0700
commit: b33ce442993865180292df2a314ee5251ba38b50 (patch)
tree: 004b703ea3cd19c932393086fe9cde96e8db8de7 /drivers/md
parent: 7a48837732f87a574ee3e1855927dc250117f565 (diff)
parent: e84987a1f941b8e2e3173bb38510ddf25cc8c7f0 (diff)
17 files changed, 629 insertions, 767 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 2638417b19a..4d200883c50 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG
 	Keeps all active closures in a linked list and provides a debugfs
 	interface to list them, which makes it possible to see asynchronous
 	operations that get stuck.
-
-# cgroup code needs to be updated:
-#
-#config CGROUP_BCACHE
-#	bool "Cgroup controls for bcache"
-#	depends on BCACHE && BLK_CGROUP
-#	---help---
-#	TODO
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index c0d37d08244..443d03fbac4 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
 	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
 	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
 
-	if (CACHE_SYNC(&ca->set->sb)) {
-		ca->need_save_prio = max(ca->need_save_prio,
-					 bucket_disk_gen(b));
-		WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
-	}
-
 	return ret;
 }
 
@@ -120,51 +114,45 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
 	mutex_unlock(&c->bucket_lock);
 }
 
-/* Allocation */
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
 
 static inline bool can_inc_bucket_gen(struct bucket *b)
 {
-	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
-		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
+	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
 }
 
-bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
-
-	if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
-		unsigned i;
-
-		for (i = 0; i < RESERVE_NONE; i++)
-			if (!fifo_full(&ca->free[i]))
-				goto add;
+	BUG_ON(!ca->set->gc_mark_valid);
 
-		return false;
-	}
-add:
-	b->prio = 0;
-
-	if (can_inc_bucket_gen(b) &&
-	    fifo_push(&ca->unused, b - ca->buckets)) {
-		atomic_inc(&b->pin);
-		return true;
-	}
-
-	return false;
-}
-
-static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
-{
-	return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
+	return (!GC_MARK(b) ||
+		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
 		can_inc_bucket_gen(b);
 }
 
-static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 {
+	lockdep_assert_held(&ca->set->bucket_lock);
+	BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
+
+	if (GC_SECTORS_USED(b))
+		trace_bcache_invalidate(ca, b - ca->buckets);
+
 	bch_inc_gen(ca, b);
 	b->prio = INITIAL_PRIO;
 	atomic_inc(&b->pin);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+	__bch_invalidate_one_bucket(ca, b);
+
 	fifo_push(&ca->free_inc, b - ca->buckets);
 }
 
@@ -195,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca)
 	ca->heap.used = 0;
 
 	for_each_bucket(b, ca) {
-		/*
-		 * If we fill up the unused list, if we then return before
-		 * adding anything to the free_inc list we'll skip writing
-		 * prios/gens and just go back to allocating from the unused
-		 * list:
-		 */
-		if (fifo_full(&ca->unused))
-			return;
-
-		if (!can_invalidate_bucket(ca, b))
-			continue;
-
-		if (!GC_SECTORS_USED(b) &&
-		    bch_bucket_add_unused(ca, b))
+		if (!bch_can_invalidate_bucket(ca, b))
 			continue;
 
 		if (!heap_full(&ca->heap))
@@ -233,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca)
 			return;
 		}
 
-		invalidate_one_bucket(ca, b);
+		bch_invalidate_one_bucket(ca, b);
 	}
 }
 
@@ -249,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca)
 
 		b = ca->buckets + ca->fifo_last_bucket++;
 
-		if (can_invalidate_bucket(ca, b))
-			invalidate_one_bucket(ca, b);
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
 
 		if (++checked >= ca->sb.nbuckets) {
 			ca->invalidate_needs_gc = 1;
@@ -274,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca)
 
 		b = ca->buckets + n;
 
-		if (can_invalidate_bucket(ca, b))
-			invalidate_one_bucket(ca, b);
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
 
 		if (++checked >= ca->sb.nbuckets / 2) {
 			ca->invalidate_needs_gc = 1;
@@ -287,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca)
 
 static void invalidate_buckets(struct cache *ca)
 {
-	if (ca->invalidate_needs_gc)
-		return;
+	BUG_ON(ca->invalidate_needs_gc);
 
 	switch (CACHE_REPLACEMENT(&ca->sb)) {
 	case CACHE_REPLACEMENT_LRU:
@@ -301,8 +275,6 @@ static void invalidate_buckets(struct cache *ca)
 		invalidate_buckets_random(ca);
 		break;
 	}
-
-	trace_bcache_alloc_invalidate(ca);
 }
 
 #define allocator_wait(ca, cond)					\
@@ -350,17 +322,10 @@ static int bch_allocator_thread(void *arg)
 		 * possibly issue discards to them, then we add the bucket to
 		 * the free list:
 		 */
-		while (1) {
+		while (!fifo_empty(&ca->free_inc)) {
 			long bucket;
 
-			if ((!atomic_read(&ca->set->prio_blocked) ||
-			     !CACHE_SYNC(&ca->set->sb)) &&
-			    !fifo_empty(&ca->unused))
-				fifo_pop(&ca->unused, bucket);
-			else if (!fifo_empty(&ca->free_inc))
-				fifo_pop(&ca->free_inc, bucket);
-			else
-				break;
+			fifo_pop(&ca->free_inc, bucket);
 
 			if (ca->discard) {
 				mutex_unlock(&ca->set->bucket_lock);
@@ -371,6 +336,7 @@ static int bch_allocator_thread(void *arg)
 			}
 
 			allocator_wait(ca, bch_allocator_push(ca, bucket));
+			wake_up(&ca->set->btree_cache_wait);
 			wake_up(&ca->set->bucket_wait);
 		}
 
@@ -380,9 +346,9 @@ static int bch_allocator_thread(void *arg)
 		 * them to the free_inc list:
 		 */
 
+retry_invalidate:
 		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       (ca->need_save_prio > 64 ||
-				!ca->invalidate_needs_gc));
+			       !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
@@ -390,13 +356,28 @@ static int bch_allocator_thread(void *arg)
 		 * new stuff to them:
 		 */
 		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
-		if (CACHE_SYNC(&ca->set->sb) &&
-		    (!fifo_empty(&ca->free_inc) ||
-		     ca->need_save_prio > 64))
+		if (CACHE_SYNC(&ca->set->sb)) {
+			/*
+			 * This could deadlock if an allocation with a btree
+			 * node locked ever blocked - having the btree node
+			 * locked would block garbage collection, but here we're
+			 * waiting on garbage collection before we invalidate
+			 * and free anything.
+			 *
+			 * But this should be safe since the btree code always
+			 * uses btree_check_reserve() before allocating now, and
+			 * if it fails it blocks without btree nodes locked.
+			 */
+			if (!fifo_full(&ca->free_inc))
+				goto retry_invalidate;
+
 			bch_prio_write(ca);
+		}
 	}
 }
 
+/* Allocation */
+
 long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 {
 	DEFINE_WAIT(w);
@@ -408,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 	    fifo_pop(&ca->free[reserve], r))
 		goto out;
 
-	if (!wait)
+	if (!wait) {
+		trace_bcache_alloc_fail(ca, reserve);
 		return -1;
+	}
 
 	do {
 		prepare_to_wait(&ca->set->bucket_wait, &w,
@@ -425,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 out:
 	wake_up_process(ca->alloc_thread);
 
+	trace_bcache_alloc(ca, reserve);
+
 	if (expensive_debug_checks(ca->set)) {
 		size_t iter;
 		long i;
@@ -438,8 +423,6 @@ out:
 				BUG_ON(i == r);
 		fifo_for_each(i, &ca->free_inc, iter)
 			BUG_ON(i == r);
-		fifo_for_each(i, &ca->unused, iter)
-			BUG_ON(i == r);
 	}
 
 	b = ca->buckets + r;
@@ -461,17 +444,19 @@ out:
 	return r;
 }
 
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+	SET_GC_MARK(b, 0);
+	SET_GC_SECTORS_USED(b, 0);
+}
+
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
 {
 	unsigned i;
 
-	for (i = 0; i < KEY_PTRS(k); i++) {
-		struct bucket *b = PTR_BUCKET(c, k, i);
-
-		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
-		SET_GC_SECTORS_USED(b, 0);
-		bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
-	}
+	for (i = 0; i < KEY_PTRS(k); i++)
+		__bch_bucket_free(PTR_CACHE(c, k, i),
+				  PTR_BUCKET(c, k, i));
 }
 
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
@@ -709,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca)
 	ca->alloc_thread = k;
 	return 0;
 }
-
-int bch_cache_allocator_init(struct cache *ca)
-{
-	/*
-	 * Reserve:
-	 * Prio/gen writes first
-	 * Then 8 for btree allocations
-	 * Then half for the moving garbage collector
-	 */
-#if 0
-	ca->watermark[WATERMARK_PRIO] = 0;
-
-	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
-
-	ca->watermark[WATERMARK_MOVINGGC] = 8 +
-		ca->watermark[WATERMARK_METADATA];
-
-	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
-		ca->watermark[WATERMARK_MOVINGGC];
-#endif
-	return 0;
-}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index a4c7306ff43..82c9c5d3525 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -195,9 +195,7 @@ struct bucket {
 	atomic_t	pin;
 	uint16_t	prio;
 	uint8_t		gen;
-	uint8_t		disk_gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
-	uint8_t		gc_gen;
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
 };
 
@@ -207,9 +205,9 @@ struct bucket {
  */
 
 BITMASK(GC_MARK,	 struct bucket, gc_mark, 0, 2);
-#define GC_MARK_RECLAIMABLE	0
-#define GC_MARK_DIRTY		1
-#define GC_MARK_METADATA	2
+#define GC_MARK_RECLAIMABLE	1
+#define GC_MARK_DIRTY		2
+#define GC_MARK_METADATA	3
 #define GC_SECTORS_USED_SIZE	13
 #define MAX_GC_SECTORS_USED	(~(~0ULL << GC_SECTORS_USED_SIZE))
 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
@@ -426,14 +424,9 @@ struct cache {
 	 * their new gen to disk. After prio_write() finishes writing the new
 	 * gens/prios, they'll be moved to the free list (and possibly discarded
 	 * in the process)
-	 *
-	 * unused: GC found nothing pointing into these buckets (possibly
-	 * because all the data they contained was overwritten), so we only
-	 * need to discard them before they can be moved to the free list.
 	 */
 	DECLARE_FIFO(long, free)[RESERVE_NR];
 	DECLARE_FIFO(long, free_inc);
-	DECLARE_FIFO(long, unused);
 
 	size_t			fifo_last_bucket;
 
@@ -443,12 +436,6 @@ struct cache {
 	DECLARE_HEAP(struct bucket *, heap);
 
 	/*
-	 * max(gen - disk_gen) for all buckets. When it gets too big we have to
-	 * call prio_write() to keep gens from wrapping.
-	 */
-	uint8_t			need_save_prio;
-
-	/*
 	 * If nonzero, we know we aren't going to find any buckets to invalidate
 	 * until a gc finishes - otherwise we could pointlessly burn a ton of
 	 * cpu
@@ -562,19 +549,16 @@ struct cache_set {
 	struct list_head	btree_cache_freed;
 
 	/* Number of elements in btree_cache + btree_cache_freeable lists */
-	unsigned		bucket_cache_used;
+	unsigned		btree_cache_used;
 
 	/*
 	 * If we need to allocate memory for a new btree node and that
 	 * allocation fails, we can cannibalize another node in the btree cache
-	 * to satisfy the allocation. However, only one thread can be doing this
-	 * at a time, for obvious reasons - try_harder and try_wait are
-	 * basically a lock for this that we can wait on asynchronously. The
-	 * btree_root() macro releases the lock when it returns.
+	 * to satisfy the allocation - lock to guarantee only one thread does
+	 * this at a time:
 	 */
-	struct task_struct	*try_harder;
-	wait_queue_head_t	try_wait;
-	uint64_t		try_harder_start;
+	wait_queue_head_t	btree_cache_wait;
+	struct task_struct	*btree_cache_alloc_lock;
 
 	/*
 	 * When we free a btree node, we increment the gen of the bucket the
@@ -603,7 +587,7 @@ struct cache_set {
 	uint16_t		min_prio;
 
 	/*
-	 * max(gen - gc_gen) for all buckets. When it gets too big we have to gc
+	 * max(gen - last_gc) for all buckets. When it gets too big we have to gc
 	 * to keep gens from wrapping around.
 	 */
 	uint8_t			need_gc;
@@ -628,6 +612,8 @@ struct cache_set {
 	/* Number of moving GC bios in flight */
 	struct semaphore	moving_in_flight;
 
+	struct workqueue_struct	*moving_gc_wq;
+
 	struct btree		*root;
 
 #ifdef CONFIG_BCACHE_DEBUG
@@ -667,7 +653,6 @@ struct cache_set {
 	struct time_stats	btree_gc_time;
 	struct time_stats	btree_split_time;
 	struct time_stats	btree_read_time;
-	struct time_stats	try_harder_time;
 
 	atomic_long_t		cache_read_races;
 	atomic_long_t		writeback_keys_done;
@@ -850,9 +835,6 @@ static inline bool cached_dev_get(struct cached_dev *dc)
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree (last_gc).
- *
- * bucket_disk_gen() returns the difference between the current gen and the gen
- * on disk; they're both used to make sure gens don't wrap around.
  */
 
 static inline uint8_t bucket_gc_gen(struct bucket *b)
@@ -860,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
 	return b->gen - b->last_gc;
 }
 
-static inline uint8_t bucket_disk_gen(struct bucket *b)
-{
-	return b->gen - b->disk_gen;
-}
-
 #define BUCKET_GC_GEN_MAX	96U
-#define BUCKET_DISK_GEN_MAX	64U
 
 #define kobj_attribute_write(n, fn)					\
 	static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
@@ -899,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
 
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
-bool bch_bucket_add_unused(struct cache *, struct bucket *);
 
-long bch_bucket_alloc(struct cache *, unsigned, bool);
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
 
+long bch_bucket_alloc(struct cache *, unsigned, bool);
 int __bch_bucket_alloc_set(struct cache_set *, unsigned,
 			   struct bkey *, int, bool);
 int bch_bucket_alloc_set(struct cache_set *, unsigned,
@@ -954,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *);
 void bch_open_buckets_free(struct cache_set *);
 
 int bch_cache_allocator_start(struct cache *ca);
-int bch_cache_allocator_init(struct cache *ca);
 
 void bch_debug_exit(void);
 int bch_debug_init(struct kobject *);
 void bch_request_exit(void);
 int bch_request_init(void);
-void bch_btree_exit(void);
-int bch_btree_init(void);
 
 #endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 3f74b4b0747..54541641530 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
 	for (k = i->start; k < bset_bkey_last(i); k = next) {
 		next = bkey_next(k);
 
-		printk(KERN_ERR "block %u key %li/%u: ", set,
-		       (uint64_t *) k - i->d, i->keys);
+		printk(KERN_ERR "block %u key %u/%u: ", set,
+		       (unsigned) ((u64 *) k - i->d), i->keys);
 
 		if (b->ops->key_dump)
 			b->ops->key_dump(b, k);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 003260f4ddf..5f6728d5d4d 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l)
 	l->top_p = l->keys_p = l->inline_keys;
 }
 
+static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
+{
+	l->keys = k;
+	l->top = bkey_next(k);
+}
+
 static inline void bch_keylist_push(struct keylist *l)
 {
 	l->top = bkey_next(l->top);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5f9c2a665ca..7347b610096 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -68,15 +68,11 @@
  * alloc_bucket() cannot fail. This should be true but is not completely
  * obvious.
  *
- * Make sure all allocations get charged to the root cgroup
- *
  * Plugging?
  *
  * If data write is less than hard sector size of ssd, round up offset in open
  * bucket to the next whole sector
  *
- * Also lookup by cgroup in get_open_bucket()
- *
  * Superblock needs to be fleshed out for multiple cache devices
  *
  * Add a sysfs tunable for the number of writeback IOs in flight
@@ -97,8 +93,6 @@
 #define PTR_HASH(c, k)							\
 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
 
-static struct workqueue_struct *btree_io_wq;
-
 #define insert_lock(s, b)	((b)->level <= (s)->lock)
 
 /*
@@ -123,7 +117,7 @@ static struct workqueue_struct *btree_io_wq;
 ({									\
 	int _r, l = (b)->level - 1;					\
 	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
+	struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
 	if (!IS_ERR(_child)) {						\
 		_child->parent = (b);					\
 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
@@ -152,17 +146,12 @@ static struct workqueue_struct *btree_io_wq;
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		}							\
 		rw_unlock(_w, _b);					\
+		bch_cannibalize_unlock(c);				\
 		if (_r == -EINTR)					\
 			schedule();					\
-		bch_cannibalize_unlock(c);				\
-		if (_r == -ENOSPC) {					\
-			wait_event((c)->try_wait,			\
-				   !(c)->try_harder);			\
-			_r = -EINTR;					\
-		}							\
 	} while (_r == -EINTR);						\
 									\
-	finish_wait(&(c)->bucket_wait, &(op)->wait);			\
+	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
 	_r;								\
 })
 
@@ -171,6 +160,20 @@ static inline struct bset *write_block(struct btree *b)
 	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
 }
 
+static void bch_btree_init_next(struct btree *b)
+{
+	/* If not a leaf node, always sort */
+	if (b->level && b->keys.nsets)
+		bch_btree_sort(&b->keys, &b->c->sort);
+	else
+		bch_btree_sort_lazy(&b->keys, &b->c->sort);
+
+	if (b->written < btree_blocks(b))
+		bch_bset_init_next(&b->keys, write_block(b),
+				   bset_magic(&b->c->sb));
+
+}
+
 /* Btree key manipulation */
 
 void bkey_put(struct cache_set *c, struct bkey *k)
@@ -352,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl)
 	btree_complete_write(b, w);
 
 	if (btree_node_dirty(b))
-		queue_delayed_work(btree_io_wq, &b->work,
-				   msecs_to_jiffies(30000));
+		schedule_delayed_work(&b->work, 30 * HZ);
 
 	closure_return_with_destructor(cl, btree_node_write_unlock);
 }
@@ -442,10 +444,12 @@ static void do_btree_node_write(struct btree *b)
 	}
 }
 
-void bch_btree_node_write(struct btree *b, struct closure *parent)
+void __bch_btree_node_write(struct btree *b, struct closure *parent)
 {
 	struct bset *i = btree_bset_last(b);
 
+	lockdep_assert_held(&b->write_lock);
+
 	trace_bcache_btree_write(b);
 
 	BUG_ON(current->bio_list);
@@ -469,23 +473,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 
 	b->written += set_blocks(i, block_bytes(b->c));
+}
 
-	/* If not a leaf node, always sort */
-	if (b->level && b->keys.nsets)
-		bch_btree_sort(&b->keys, &b->c->sort);
-	else
-		bch_btree_sort_lazy(&b->keys, &b->c->sort);
+void bch_btree_node_write(struct btree *b, struct closure *parent)
+{
+	unsigned nsets = b->keys.nsets;
+
+	lockdep_assert_held(&b->lock);
+
+	__bch_btree_node_write(b, parent);
 
 	/*
 	 * do verify if there was more than one set initially (i.e. we did a
 	 * sort) and we sorted down to a single set:
 	 */
-	if (i != b->keys.set->data && !b->keys.nsets)
+	if (nsets && !b->keys.nsets)
 		bch_btree_verify(b);
 
-	if (b->written < btree_blocks(b))
-		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+	bch_btree_init_next(b);
 }
 
 static void bch_btree_node_write_sync(struct btree *b)
@@ -493,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b)
 	struct closure cl;
 
 	closure_init_stack(&cl);
+
+	mutex_lock(&b->write_lock);
 	bch_btree_node_write(b, &cl);
+	mutex_unlock(&b->write_lock);
+
 	closure_sync(&cl);
 }
 
@@ -501,11 +510,10 @@ static void btree_node_write_work(struct work_struct *w)
 {
 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
 
-	rw_lock(true, b, b->level);
-
+	mutex_lock(&b->write_lock);
 	if (btree_node_dirty(b))
-		bch_btree_node_write(b, NULL);
-	rw_unlock(true, b);
+		__bch_btree_node_write(b, NULL);
+	mutex_unlock(&b->write_lock);
 }
 
 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
@@ -513,11 +521,13 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 	struct bset *i = btree_bset_last(b);
 	struct btree_write *w = btree_current_write(b);
 
+	lockdep_assert_held(&b->write_lock);
+
 	BUG_ON(!b->written);
 	BUG_ON(!i->keys);
 
 	if (!btree_node_dirty(b))
-		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
+		schedule_delayed_work(&b->work, 30 * HZ);
 
 	set_btree_node_dirty(b);
 
@@ -548,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 #define mca_reserve(c)	(((c->root && c->root->level)		\
 			  ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c)						\
-	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
+	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
 
 static void mca_data_free(struct btree *b)
 {
@@ -556,7 +566,7 @@ static void mca_data_free(struct btree *b)
 
 	bch_btree_keys_free(&b->keys);
 
-	b->c->bucket_cache_used--;
+	b->c->btree_cache_used--;
 	list_move(&b->list, &b->c->btree_cache_freed);
 }
 
@@ -581,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 					ilog2(b->c->btree_pages),
 					btree_order(k)),
 				  gfp)) {
-		b->c->bucket_cache_used++;
+		b->c->btree_cache_used++;
 		list_move(&b->list, &b->c->btree_cache);
 	} else {
 		list_move(&b->list, &b->c->btree_cache_freed);
@@ -597,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 
 	init_rwsem(&b->lock);
 	lockdep_set_novalidate_class(&b->lock);
+	mutex_init(&b->write_lock);
+	lockdep_set_novalidate_class(&b->write_lock);
 	INIT_LIST_HEAD(&b->list);
 	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
 	b->c = c;
@@ -630,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
 		up(&b->io_mutex);
 	}
 
+	mutex_lock(&b->write_lock);
 	if (btree_node_dirty(b))
-		bch_btree_node_write_sync(b);
+		__bch_btree_node_write(b, &cl);
+	mutex_unlock(&b->write_lock);
+
+	closure_sync(&cl);
 
 	/* wait for any in flight btree write */
 	down(&b->io_mutex);
@@ -654,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 	if (c->shrinker_disabled)
 		return SHRINK_STOP;
 
-	if (c->try_harder)
+	if (c->btree_cache_alloc_lock)
 		return SHRINK_STOP;
 
 	/* Return -1 if we can't do anything right now */
@@ -686,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 		}
 	}
 
-	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
+	for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
 		if (list_empty(&c->btree_cache))
 			goto out;
 
@@ -715,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
 	if (c->shrinker_disabled)
 		return 0;
 
-	if (c->try_harder)
+	if (c->btree_cache_alloc_lock)
 		return 0;
 
 	return mca_can_free(c) * c->btree_pages;
@@ -819,17 +835,30 @@ out:
 	return b;
 }
 
-static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
+static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+{
+	struct task_struct *old;
+
+	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+	if (old && old != current) {
+		if (op)
+			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+					TASK_UNINTERRUPTIBLE);
+		return -EINTR;
+	}
+
+	return 0;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+				     struct bkey *k)
 {
 	struct btree *b;
 
 	trace_bcache_btree_cache_cannibalize(c);
 
-	if (!c->try_harder) {
-		c->try_harder = current;
-		c->try_harder_start = local_clock();
-	} else if (c->try_harder != current)
-		return ERR_PTR(-ENOSPC);
+	if (mca_cannibalize_lock(c, op))
+		return ERR_PTR(-EINTR);
 
 	list_for_each_entry_reverse(b, &c->btree_cache, list)
 		if (!mca_reap(b, btree_order(k), false))
@@ -839,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
 		if (!mca_reap(b, btree_order(k), true))
 			return b;
 
+	WARN(1, "btree cache cannibalize failed\n");
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -850,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
  */
 static void bch_cannibalize_unlock(struct cache_set *c)
 {
-	if (c->try_harder == current) {
-		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
-		c->try_harder = NULL;
-		wake_up(&c->try_wait);
+	if (c->btree_cache_alloc_lock == current) {
+		c->btree_cache_alloc_lock = NULL;
+		wake_up(&c->btree_cache_wait);
 	}
 }
 
-static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
+static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+			       struct bkey *k, int level)
 {
 	struct btree *b;
 
@@ -920,7 +950,7 @@ err:
 	if (b)
 		rw_unlock(true, b);
 
-	b = mca_cannibalize(c, k);
+	b = mca_cannibalize(c, op, k);
 	if (!IS_ERR(b))
 		goto out;
 
@@ -936,8 +966,8 @@ err:
  * The btree node will have either a read or a write lock held, depending on
  * level and op->lock.
  */
-struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
-				 int level, bool write)
+struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
+				 struct bkey *k, int level, bool write)
 {
 	int i = 0;
 	struct btree *b;
@@ -951,7 +981,7 @@ retry:
 			return ERR_PTR(-EAGAIN);
 
 		mutex_lock(&c->bucket_lock);
-		b = mca_alloc(c, k, level);
+		b = mca_alloc(c, op, k, level);
 		mutex_unlock(&c->bucket_lock);
 
 		if (!b)
@@ -997,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 	struct btree *b;
 
 	mutex_lock(&c->bucket_lock);
-	b = mca_alloc(c, k, level);
+	b = mca_alloc(c, NULL, k, level);
 	mutex_unlock(&c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
@@ -1010,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 
 static void btree_node_free(struct btree *b)
 {
-	unsigned i;
-
 	trace_bcache_btree_node_free(b);
 
 	BUG_ON(b == b->c->root);
 
+	mutex_lock(&b->write_lock);
+
 	if (btree_node_dirty(b))
 		btree_complete_write(b, btree_current_write(b));
 	clear_bit(BTREE_NODE_dirty, &b->flags);
 
+	mutex_unlock(&b->write_lock);
+
 	cancel_delayed_work(&b->work);
 
 	mutex_lock(&b->c->bucket_lock);
-
-	for (i = 0; i < KEY_PTRS(&b->key); i++) {
-		BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
-
-		bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
-			    PTR_BUCKET(b->c, &b->key, i));
-	}
-
 	bch_bucket_free(b->c, &b->key);
 	mca_bucket_free(b);
 	mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
+struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+				   int level)
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.k
author	Linus Torvalds <torvalds@linux-foundation.org>	2014-04-01 19:43:53 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-04-01 19:43:53 -0700
commit	b33ce442993865180292df2a314ee5251ba38b50 (patch)
tree	004b703ea3cd19c932393086fe9cde96e8db8de7 /drivers/md
parent	7a48837732f87a574ee3e1855927dc250117f565 (diff)
parent	e84987a1f941b8e2e3173bb38510ddf25cc8c7f0 (diff)