aboutsummaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorJoe Thornber <thornber@redhat.com>2011-10-31 20:19:11 +0000
committerAlasdair G Kergon <agk@redhat.com>2011-10-31 20:19:11 +0000
commit3241b1d3e0aaafbfcd320f4d71ade629728cc4f4 (patch)
tree499461f724d4db3d7118641f4a20f5be23549edd /drivers/md
parent95d402f057f2e208e4631893f6cd4a59c7c05e41 (diff)
dm: add persistent data library
The persistent-data library offers a re-usable framework for the storage and management of on-disk metadata in device-mapper targets. It's used by the thin-provisioning target in the next patch and in an upcoming hierarchical storage target. For further information, please read Documentation/device-mapper/persistent-data.txt Signed-off-by: Joe Thornber <thornber@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/persistent-data/Kconfig8
-rw-r--r--drivers/md/persistent-data/Makefile11
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c620
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h123
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h137
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c566
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c244
-rw-r--r--drivers/md/persistent-data/dm-btree.c805
-rw-r--r--drivers/md/persistent-data/dm-btree.h145
-rw-r--r--drivers/md/persistent-data/dm-persistent-data-internal.h19
-rw-r--r--drivers/md/persistent-data/dm-space-map-checker.c437
-rw-r--r--drivers/md/persistent-data/dm-space-map-checker.h26
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c705
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.h126
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c335
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.h25
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c596
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.h33
-rw-r--r--drivers/md/persistent-data/dm-space-map.h134
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c400
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h130
21 files changed, 5625 insertions, 0 deletions
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
new file mode 100644
index 00000000000..ceb359050a5
--- /dev/null
+++ b/drivers/md/persistent-data/Kconfig
@@ -0,0 +1,8 @@
+config DM_PERSISTENT_DATA
+ tristate
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ select LIBCRC32C
+ select DM_BUFIO
+ ---help---
+ Library providing immutable on-disk data structure support for
+ device-mapper targets such as the thin provisioning target.
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
new file mode 100644
index 00000000000..cfa95f66223
--- /dev/null
+++ b/drivers/md/persistent-data/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
+dm-persistent-data-objs := \
+ dm-block-manager.o \
+ dm-space-map-checker.o \
+ dm-space-map-common.o \
+ dm-space-map-disk.o \
+ dm-space-map-metadata.o \
+ dm-transaction-manager.o \
+ dm-btree.o \
+ dm-btree-remove.o \
+ dm-btree-spine.o
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
new file mode 100644
index 00000000000..0317ecdc6e5
--- /dev/null
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#include "dm-block-manager.h"
+#include "dm-persistent-data-internal.h"
+#include "../dm-bufio.h"
+
+#include <linux/crc32c.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/device-mapper.h>
+#include <linux/stacktrace.h>
+
+#define DM_MSG_PREFIX "block manager"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * This is a read/write semaphore with a couple of differences.
+ *
+ * i) There is a restriction on the number of concurrent read locks that
+ * may be held at once. This is just an implementation detail.
+ *
+ * ii) Recursive locking attempts are detected and return EINVAL. A stack
+ * trace is also emitted for the previous lock acquisition.
+ *
+ * iii) Priority is given to write locks.
+ */
+#define MAX_HOLDERS 4 /* slots in block_lock.holders[]; also the cap on concurrent read holders */
+#define MAX_STACK 10 /* entries recorded per saved stack trace */
+
+typedef unsigned long stack_entries[MAX_STACK]; /* backing storage for one stack trace */
+
+struct block_lock {
+ spinlock_t lock; /* held by the bl_* functions while they examine or change the fields below */
+ __s32 count; /* 0 = unlocked, -1 = write locked, > 0 = number of read holders */
+ struct list_head waiters; /* queue of struct waiter, serviced from the head by __wake_many() */
+ struct task_struct *holders[MAX_HOLDERS]; /* tasks currently holding the lock; NULL marks a free slot */
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ struct stack_trace traces[MAX_HOLDERS]; /* traces[i] records where holders[i] took the lock */
+ stack_entries entries[MAX_HOLDERS]; /* entries[i] is the storage behind traces[i] */
+#endif
+};
+
+struct waiter {
+ struct list_head list; /* link in block_lock.waiters */
+ struct task_struct *task; /* the waiting task; __wake_waiter() sets this to NULL once the lock has been granted */
+ int wants_write; /* non-zero when queued for the write lock, zero for a read lock */
+};
+
+/*
+ * Return the index of the holders[] slot that contains @task.  Pass NULL
+ * to locate a free slot.  It is a BUG if no slot matches.
+ */
+static unsigned __find_holder(struct block_lock *lock,
+			      struct task_struct *task)
+{
+	unsigned slot = 0;
+
+	while (slot < MAX_HOLDERS && lock->holders[slot] != task)
+		slot++;
+
+	BUG_ON(slot == MAX_HOLDERS);
+	return slot;
+}
+
+/*
+ * Record @task in a free holders[] slot, taking a reference on the task.
+ * Call this *after* you increment lock->count.
+ */
+static void __add_holder(struct block_lock *lock, struct task_struct *task)
+{
+	unsigned slot = __find_holder(lock, NULL);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	struct stack_trace *trace = &lock->traces[slot];
+#endif
+
+	get_task_struct(task);
+	lock->holders[slot] = task;
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	/* Save where the lock was taken so __check_holder() can print it. */
+	trace->nr_entries = 0;
+	trace->max_entries = MAX_STACK;
+	trace->entries = lock->entries[slot];
+	trace->skip = 2;
+	save_stack_trace(trace);
+#endif
+}
+
+/*
+ * Clear @task's holders[] slot and drop the task reference taken by
+ * __add_holder().  Call this *before* you decrement lock->count.
+ */
+static void __del_holder(struct block_lock *lock, struct task_struct *task)
+{
+	lock->holders[__find_holder(lock, task)] = NULL;
+	put_task_struct(task);
+}
+
+/*
+ * Detect an attempt by the current task to take a lock it already holds.
+ *
+ * Returns 0 if current is not a holder of @lock, or -EINVAL (after logging
+ * an error) if it is.  With CONFIG_DM_DEBUG_BLOCK_STACK_TRACING the trace
+ * saved at the earlier acquisition and a trace of this attempt are printed.
+ */
+static int __check_holder(struct block_lock *lock)
+{
+	unsigned i;
+
+	for (i = 0; i < MAX_HOLDERS; i++) {
+		if (lock->holders[i] == current) {
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+			/*
+			 * Scratch space is kept on the stack: callers hold
+			 * only this block's spinlock, so storage shared by
+			 * every block_lock could be overwritten by another
+			 * CPU reporting on a different block.
+			 */
+			stack_entries entries;
+			struct stack_trace t = {
+				.nr_entries = 0,
+				.max_entries = MAX_STACK,
+				.entries = entries,
+				.skip = 3,
+			};
+#endif
+			DMERR("recursive lock detected in pool metadata");
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+			DMERR("previously held here:");
+			print_stack_trace(lock->traces + i, 4);
+
+			DMERR("subsequent aquisition attempted here:");
+			save_stack_trace(&t);
+			print_stack_trace(&t, 4);
+#endif
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static void __wait(struct waiter *w) /* sleep until __wake_waiter() has cleared w->task */
+{
+ for (;;) {
+ set_task_state(current, TASK_UNINTERRUPTIBLE); /* state is set before w->task is tested, then re-set on every pass */
+
+ if (!w->task) /* NULL means __wake_waiter() has run for this waiter */
+ break;
+
+ schedule();
+ }
+
+ set_task_state(current, TASK_RUNNING);
+}
+
+static void __wake_waiter(struct waiter *w) /* unlink @w from the waiter list and wake its task */
+{
+ struct task_struct *task;
+
+ list_del(&w->list);
+ task = w->task; /* copied out first: bl_down_read/bl_down_write keep the waiter on their stack and return once w->task is NULL */
+ smp_mb();
+ w->task = NULL; /* the condition __wait() is looping on */
+ wake_up_process(task);
+}
+
+/*
+ * We either wake a few readers or a single writer.
+ *
+ * Walks the waiter list from the head and grants the lock to each waiter
+ * in turn: lock->count and holders[] are updated on the waiter's behalf
+ * before it is woken.  The walk stops at the first writer (which is only
+ * granted the lock if there are no read holders) or once MAX_HOLDERS
+ * readers hold the lock.  It is a BUG to call this while write locked.
+ */
+static void __wake_many(struct block_lock *lock)
+{
+ struct waiter *w, *tmp;
+
+ BUG_ON(lock->count < 0);
+ list_for_each_entry_safe(w, tmp, &lock->waiters, list) { /* _safe: __wake_waiter() unlinks w */
+ if (lock->count >= MAX_HOLDERS)
+ return;
+
+ if (w->wants_write) {
+ if (lock->count > 0)
+ return; /* still read locked */
+
+ lock->count = -1;
+ __add_holder(lock, w->task);
+ __wake_waiter(w);
+ return;
+ }
+
+ lock->count++;
+ __add_holder(lock, w->task);
+ __wake_waiter(w);
+ }
+}
+
+/* Put @lock into the unlocked state: no holders and no waiters. */
+static void bl_init(struct block_lock *lock)
+{
+	int slot;
+
+	spin_lock_init(&lock->lock);
+	INIT_LIST_HEAD(&lock->waiters);
+	lock->count = 0;
+
+	for (slot = 0; slot < MAX_HOLDERS; slot++)
+		lock->holders[slot] = NULL;
+}
+
+/*
+ * A read lock can be granted straight away only if the lock is not write
+ * locked, a holder slot is spare, and nobody is already queued.
+ */
+static int __available_for_read(struct block_lock *lock)
+{
+	if (lock->count < 0 || lock->count >= MAX_HOLDERS)
+		return 0;
+
+	return list_empty(&lock->waiters) != 0;
+}
+
+/*
+ * Take a read lock, sleeping until it is granted.
+ *
+ * Returns 0 on success or -EINVAL if the current task already holds this
+ * lock.
+ */
+static int bl_down_read(struct block_lock *lock)
+{
+	struct waiter w;
+	int r;
+
+	spin_lock(&lock->lock);
+
+	r = __check_holder(lock);
+	if (r)
+		goto out_unlock;
+
+	if (__available_for_read(lock)) {
+		/* Uncontended: become a holder immediately. */
+		lock->count++;
+		__add_holder(lock, current);
+		goto out_unlock;
+	}
+
+	/* Queue behind the existing waiters and sleep. */
+	get_task_struct(current);
+	w.wants_write = 0;
+	w.task = current;
+	list_add_tail(&w.list, &lock->waiters);
+	spin_unlock(&lock->lock);
+
+	/* __wake_many() registers us as a holder before waking us. */
+	__wait(&w);
+	put_task_struct(current);
+	return 0;
+
+out_unlock:
+	spin_unlock(&lock->lock);
+	return r;
+}
+
+/*
+ * Take a read lock only if that can be done without sleeping.
+ *
+ * Returns 0 on success, -EINVAL if the current task already holds this
+ * lock, or -EWOULDBLOCK if the lock is not immediately available.
+ */
+static int bl_down_read_nonblock(struct block_lock *lock)
+{
+	int r;
+
+	spin_lock(&lock->lock);
+
+	r = __check_holder(lock);
+	if (!r) {
+		if (__available_for_read(lock)) {
+			lock->count++;
+			__add_holder(lock, current);
+		} else {
+			r = -EWOULDBLOCK;
+		}
+	}
+
+	spin_unlock(&lock->lock);
+	return r;
+}
+
+/*
+ * Release a read lock held by the current task, then hand the lock on to
+ * any queued waiters.  It is a BUG to call this when there are no read
+ * holders.
+ */
+static void bl_up_read(struct block_lock *lock)
+{
+	spin_lock(&lock->lock);
+
+	BUG_ON(lock->count <= 0);
+	__del_holder(lock, current);
+	lock->count--;
+
+	if (!list_empty(&lock->waiters))
+		__wake_many(lock);
+
+	spin_unlock(&lock->lock);
+}
+
+/*
+ * Take the write lock, sleeping until it is granted.
+ *
+ * Returns 0 on success or -EINVAL if the current task already holds this
+ * lock.
+ */
+static int bl_down_write(struct block_lock *lock)
+{
+	struct waiter w;
+	int r;
+
+	spin_lock(&lock->lock);
+
+	r = __check_holder(lock);
+	if (r)
+		goto out_unlock;
+
+	if (lock->count == 0 && list_empty(&lock->waiters)) {
+		/* Uncontended: take it straight away. */
+		lock->count = -1;
+		__add_holder(lock, current);
+		goto out_unlock;
+	}
+
+	get_task_struct(current);
+	w.wants_write = 1;
+	w.task = current;
+
+	/*
+	 * Writers given priority. We know there's only one mutator in the
+	 * system, so ignoring the ordering reversal.
+	 */
+	list_add(&w.list, &lock->waiters);
+	spin_unlock(&lock->lock);
+
+	/* __wake_many() registers us as the holder before waking us. */
+	__wait(&w);
+	put_task_struct(current);
+	return 0;
+
+out_unlock:
+	spin_unlock(&lock->lock);
+	return r;
+}
+
+/*
+ * Release the write lock held by the current task, then hand the lock on
+ * to any queued waiters.
+ */
+static void bl_up_write(struct block_lock *lock)
+{
+	spin_lock(&lock->lock);
+
+	__del_holder(lock, current);
+	lock->count = 0;
+
+	if (!list_empty(&lock->waiters))
+		__wake_many(lock);
+
+	spin_unlock(&lock->lock);
+}
+
+/*
+ * Log which block was involved when a lock attempt failed with -EINVAL,
+ * the code the bl_down_* functions return for recursive locking.  Any
+ * other value of @r is ignored.
+ */
+static void report_recursive_bug(dm_block_t b, int r)
+{
+	if (r != -EINVAL)
+		return;
+
+	DMERR("recursive acquisition of block %llu requested.",
+	      (unsigned long long) b);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Block manager is currently implemented using dm-bufio. struct
+ * dm_block_manager and struct dm_block map directly onto a couple of
+ * structs in the bufio interface. I want to retain the freedom to move
+ * away from bufio in the future. So these structs are just cast within
+ * this .c file, rather than making it through to the public interface.
+ */
+/* Reinterpret the opaque dm_block handle as the bufio buffer behind it. */
+static struct dm_buffer *to_buffer(struct dm_block *b)
+{
+	struct dm_buffer *buf = (struct dm_buffer *) b;
+
+	return buf;
+}
+
+/* Reinterpret the opaque block manager handle as the bufio client behind it. */
+static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm)
+{
+	struct dm_bufio_client *client = (struct dm_bufio_client *) bm;
+
+	return client;
+}
+
+/* Return the block number that bufio reports for @b. */
+dm_block_t dm_block_location(struct dm_block *b)
+{
+	struct dm_buffer *buf = to_buffer(b);
+
+	return dm_bufio_get_block_number(buf);
+}
+EXPORT_SYMBOL_GPL(dm_block_location);
+
+/* Return the data pointer that bufio reports for @b. */
+void *dm_block_data(struct dm_block *b)
+{
+	struct dm_buffer *buf = to_buffer(b);
+
+	return dm_bufio_get_block_data(buf);
+}
+EXPORT_SYMBOL_GPL(dm_block_data);
+
+struct buffer_aux {
+ struct dm_block_validator *validator; /* validator bound to this buffer, NULL until one is attached */
+ struct block_lock lock; /* the per-block read/write lock */
+ int write_locked; /* set to 1 by the write-lock paths and 0 by the read-lock paths; read by dm_bm_unlock() */
+};
+
+/*
+ * Passed to dm_bufio_client_create() as the allocation callback: resets
+ * the buffer's lock and clears its validator.  write_locked is not
+ * touched here.
+ */
+static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
+{
+	struct buffer_aux *aux;
+
+	aux = dm_bufio_get_aux_data(buf);
+	bl_init(&aux->lock);
+	aux->validator = NULL;
+}
+
+/*
+ * Passed to dm_bufio_client_create() as the write callback: gives the
+ * buffer's validator, if it has one, the chance to prepare the block.
+ */
+static void dm_block_manager_write_callback(struct dm_buffer *buf)
+{
+	struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+	struct dm_block_validator *v = aux->validator;
+
+	if (!v)
+		return;
+
+	v->prepare_for_write(v, (struct dm_block *) buf,
+			     dm_bufio_get_block_size(dm_bufio_get_client(buf)));
+}
+
+/*----------------------------------------------------------------
+ * Public interface
+ *--------------------------------------------------------------*/
+/*
+ * Create a block manager on top of @bdev.  The handle returned is really
+ * a dm_bufio client: whatever dm_bufio_client_create() returns is passed
+ * straight back to the caller, cast to the public type.
+ *
+ * @cache_size is currently unused.  @max_held_per_thread is handed to
+ * bufio as its third argument.
+ */
+struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
+						 unsigned block_size,
+						 unsigned cache_size,
+						 unsigned max_held_per_thread)
+{
+	struct dm_bufio_client *client;
+
+	client = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
+					sizeof(struct buffer_aux),
+					dm_block_manager_alloc_callback,
+					dm_block_manager_write_callback);
+
+	return (struct dm_block_manager *) client;
+}
+EXPORT_SYMBOL_GPL(dm_block_manager_create);
+
+/*
+ * Destroy a block manager created by dm_block_manager_create() by
+ * destroying the bufio client that backs it.
+ */
+void dm_block_manager_destroy(struct dm_block_manager *bm)
+{
+	/* No 'return <expr>;' here: ISO C forbids it in a void function. */
+	dm_bufio_client_destroy(to_bufio(bm));
+}
+EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
+
+/* Return the block size that bufio reports for this block manager. */
+unsigned dm_bm_block_size(struct dm_block_manager *bm)
+{
+	struct dm_bufio_client *client = to_bufio(bm);
+
+	return dm_bufio_get_block_size(client);
+}
+EXPORT_SYMBOL_GPL(dm_bm_block_size);
+
+/* Return the device size that bufio reports for this block manager. */
+dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
+{
+	struct dm_bufio_client *client = to_bufio(bm);
+
+	return dm_bufio_get_device_size(client);
+}
+
+/*
+ * Bind validator @v to a buffer, or confirm that it matches the validator
+ * already bound.
+ *
+ * If the buffer has no validator yet and @v is non-NULL, @v->check() is
+ * run and, on success, @v is remembered in @aux.  If a validator is
+ * already bound it must be the same one.
+ *
+ * Returns 0 on success, the error from @v->check(), or -EINVAL on a
+ * validator mismatch.
+ */
+static int dm_bm_validate_buffer(struct dm_block_manager *bm,
+				 struct dm_buffer *buf,
+				 struct buffer_aux *aux,
+				 struct dm_block_validator *v)
+{
+	int r;
+
+	if (likely(aux->validator)) {
+		if (likely(aux->validator == v))
+			return 0;
+
+		DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
+		      aux->validator->name, v ? v->name : "NULL",
+		      (unsigned long long)
+		      dm_bufio_get_block_number(buf));
+		return -EINVAL;
+	}
+
+	/* Nothing bound yet, and nothing to bind. */
+	if (!v)
+		return 0;
+
+	r = v->check(v, (struct dm_block *) buf,
+		     dm_bufio_get_block_size(to_bufio(bm)));
+	if (unlikely(r))
+		return r;
+
+	aux->validator = v;
+	return 0;
+}
+/*
+ * Read block @b through bufio, take a read lock on it and validate it
+ * with @v.  On success the locked block is returned through @result.
+ *
+ * Returns 0 on success, the error from dm_bufio_read(), -EINVAL for a
+ * recursive lock attempt, or the error from validation.  On any failure
+ * after the read, the buffer is released again.
+ */
+int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
+		    struct dm_block_validator *v,
+		    struct dm_block **result)
+{
+	struct dm_buffer *buf;
+	struct buffer_aux *aux;
+	void *data;
+	int r;
+
+	data = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(data)))
+		return PTR_ERR(data);
+
+	buf = to_buffer(*result);
+	aux = dm_bufio_get_aux_data(buf);
+
+	r = bl_down_read(&aux->lock);
+	if (unlikely(r)) {
+		dm_bufio_release(buf);
+		report_recursive_bug(b, r);
+		return r;
+	}
+
+	aux->write_locked = 0;
+
+	r = dm_bm_validate_buffer(bm, buf, aux, v);
+	if (unlikely(r)) {
+		bl_up_read(&aux->lock);
+		dm_bufio_release(buf);
+	}
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_bm_read_lock);
+
+/*
+ * Read block @b through bufio, take the write lock on it and validate it
+ * with @v.  On success the locked block is returned through @result.
+ *
+ * Returns 0 on success, the error from dm_bufio_read(), -EINVAL for a
+ * recursive lock attempt, or the error from validation.  On any failure
+ * after the read, the buffer is released again.
+ */
+int dm_bm_write_lock(struct dm_block_manager *bm,
+		     dm_block_t b, struct dm_block_validator *v,
+		     struct dm_block **result)
+{
+	struct dm_buffer *buf;
+	struct buffer_aux *aux;
+	void *data;
+	int r;
+
+	data = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(data)))
+		return PTR_ERR(data);
+
+	buf = to_buffer(*result);
+	aux = dm_bufio_get_aux_data(buf);
+
+	r = bl_down_write(&aux->lock);
+	if (r) {
+		dm_bufio_release(buf);
+		report_recursive_bug(b, r);
+		return r;
+	}
+
+	aux->write_locked = 1;
+
+	r = dm_bm_validate_buffer(bm, buf, aux, v);
+	if (unlikely(r)) {
+		bl_up_write(&aux->lock);
+		dm_bufio_release(buf);
+	}
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_bm_write_lock);
+
+/*
+ * Non-blocking variant of dm_bm_read_lock().
+ *
+ * Returns -EWOULDBLOCK if dm_bufio_get() yields NULL for the block or the
+ * read lock cannot be taken immediately, -EINVAL for a recursive lock
+ * attempt, the error from dm_bufio_get() or from validation, and 0 on
+ * success with the locked block returned through @result.
+ */
+int dm_bm_read_try_lock(struct dm_block_manager *bm,
+			dm_block_t b, struct dm_block_validator *v,
+			struct dm_block **result)
+{
+	struct dm_buffer *buf;
+	struct buffer_aux *aux;
+	void *data;
+	int r;
+
+	data = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(data)))
+		return PTR_ERR(data);
+	if (unlikely(!data))
+		return -EWOULDBLOCK;
+
+	buf = to_buffer(*result);
+	aux = dm_bufio_get_aux_data(buf);
+
+	r = bl_down_read_nonblock(&aux->lock);
+	if (r < 0) {
+		dm_bufio_release(buf);
+		report_recursive_bug(b, r);
+		return r;
+	}
+	aux->write_locked = 0;
+
+	r = dm_bm_validate_buffer(bm, buf, aux, v);
+	if (unlikely(r)) {
+		bl_up_read(&aux->lock);
+		dm_bufio_release(buf);
+	}
+
+	return r;
+}
+
+/*
+ * Write-lock block @b without reading it from disk and hand back a zeroed
+ * buffer through @result.  @v becomes the block's validator, replacing any
+ * validator previously bound to the buffer.
+ *
+ * Returns 0 on success, the error from dm_bufio_new(), or -EINVAL if the
+ * current task already holds a lock on this block (in which case the
+ * buffer is released and its contents are left untouched).
+ */
+int dm_bm_write_lock_zero(struct dm_block_manager *bm,
+			  dm_block_t b, struct dm_block_validator *v,
+			  struct dm_block **result)
+{
+	int r;
+	struct buffer_aux *aux;
+	void *p;
+
+	p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(p)))
+		return PTR_ERR(p);
+
+	aux = dm_bufio_get_aux_data(to_buffer(*result));
+	r = bl_down_write(&aux->lock);
+	if (r) {
+		dm_bufio_release(to_buffer(*result));
+		/* Same diagnostic as the other lock entry points. */
+		report_recursive_bug(b, r);
+		return r;
+	}
+
+	/*
+	 * Only wipe the data once the write lock is held, so a rejected
+	 * attempt cannot clobber a block that is still locked.
+	 */
+	memset(p, 0, dm_bm_block_size(bm));
+
+	aux->write_locked = 1;
+	aux->validator = v;
+
+	return 0;
+}
+
+/*
+ * Drop the lock taken by one of the dm_bm_*_lock() functions and release
+ * the buffer.  A write-locked block is marked dirty before its lock is
+ * dropped.  Always returns 0.
+ */
+int dm_bm_unlock(struct dm_block *b)
+{
+	struct dm_buffer *buf = to_buffer(b);
+	struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+
+	if (!aux->write_locked) {
+		bl_up_read(&aux->lock);
+	} else {
+		dm_bufio_mark_buffer_dirty(buf);
+		bl_up_write(&aux->lock);
+	}
+
+	dm_bufio_release(buf);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bm_unlock);
+
+/*
+ * Like dm_bm_unlock(), but the buffer is released with
+ * dm_bufio_release_move() so that bufio moves it to block @n.
+ * Always returns 0.
+ */
+int dm_bm_unlock_move(struct dm_block *b, dm_block_t n)
+{
+	struct dm_buffer *buf = to_buffer(b);
+	struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+
+	if (!aux->write_locked) {
+		bl_up_read(&aux->lock);
+	} else {
+		dm_bufio_mark_buffer_dirty(buf);
+		bl_up_write(&aux->lock);
+	}
+
+	dm_bufio_release_move(buf, n);
+	return 0;
+}
+
+/*
+ * Commit helper: write out and flush every dirty block, then unlock
+ * @superblock and write out and flush again so that the superblock reaches
+ * the disk last.
+ *
+ * @superblock is unlocked on every path, including when the first
+ * write/flush fails, so the caller never has to guess whether it still
+ * holds the lock.
+ *
+ * Returns 0 on success or the first error reported by bufio.
+ */
+int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
+			   struct dm_block *superblock)
+{
+	int r;
+
+	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
+	if (likely(!r))
+		r = dm_bufio_issue_flush(to_bufio(bm));
+
+	/* Drop the superblock lock whether or not the first pass worked. */
+	dm_bm_unlock(superblock);
+	if (unlikely(r))
+		return r;
+
+	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
+	if (unlikely(r))
+		return r;
+
+	return dm_bufio_issue_flush(to_bufio(bm));
+}
+
+/*
+ * crc32c of @len bytes at @data, seeded with all ones, with the result
+ * XORed with @init_xor.
+ */
+u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
+{
+	u32 crc = crc32c(~(u32) 0, data, len);
+
+	return crc ^ init_xor;
+}
+EXPORT_SYMBOL_GPL(dm_bm_checksum);
+
+/*----------------------------------------------------------------*/
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_DESCRIPTION("Immutable metadata library for dm");
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
new file mode 100644
index 00000000000..924833d2dfa
--- /dev/null
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_BLOCK_MANAGER_H
+#define _LINUX_DM_BLOCK_MANAGER_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Block number.
+ */
+typedef uint64_t dm_block_t;
+struct dm_block;
+
+dm_block_t dm_block_location(struct dm_block *b);
+void *dm_block_data(struct dm_block *b);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * @name should be a unique identifier for the block manager, no longer
+ * than 32 chars.
+ *
+ * @max_held_per_thread should be the maximum number of locks, read or
+ * write, that an individual thread holds at any one time.
+ */
+struct dm_block_manager;
+struct dm_block_manager *dm_block_manager_create(
+ struct block_device *bdev, unsigned block_size,
+ unsigned cache_size, unsigned max_held_per_thread);
+void dm_block_manager_destroy(struct dm_block_manager *bm);
+
+unsigned dm_bm_block_size(struct dm_block_manager *bm);
+dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The validator allows the caller to verify newly-read data and modify
+ * the data just before writing, e.g. to calculate checksums. It's
+ * important to be consistent with your use of validators. The only time
+ * you can change validators is if you call dm_bm_write_lock_zero.
+ */
+struct dm_block_validator {
+ const char *name; /* printed by the block manager in its "validator mismatch" error */
+ void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); /* invoked from the block manager's bufio write callback */
+
+ /*
+ * Return 0 if the checksum is valid or < 0 on error.
+ * The block manager calls this when it first binds the validator
+ * to a buffer; a non-zero result fails the lock request.
+ */
+ int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * You can have multiple concurrent readers or a single writer holding a
+ * block lock.
+ */
+
+/*
+ * dm_bm_read_lock() and dm_bm_write_lock() lock a block and return through
+ * @result a pointer to memory that holds a copy of that block. If you have
+ * write-locked the block then any changes you make to memory pointed to by
+ * @result will be written back to the disk sometime after dm_bm_unlock is
+ * called.
+ */
+int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+/*
+ * The *_try_lock variants return -EWOULDBLOCK if the block isn't
+ * available immediately.
+ */
+int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+/*
+ * Use dm_bm_write_lock_zero() when you know you're going to
+ * overwrite the block completely. It saves a disk read.
+ */
+int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+int dm_bm_unlock(struct dm_block *b);
+
+/*
+ * An optimisation; we often want to copy a block's contents to a new
+ * block. eg, as part of the shadowing operation. It's far better for
+ * bufio to do this move behind the scenes than hold 2 locks and memcpy the
+ * data.
+ */
+int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
+
+/*
+ * It's a common idiom to have a superblock that should be committed last.
+ *
+ * @superblock should be write-locked on entry. It will be unlocked during
+ * this function. All dirty blocks are guaranteed to be written and flushed
+ * before the superblock.
+ *
+ * This method always blocks.
+ */
+int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
+ struct dm_block *superblock);
+
+u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
+
+/*----------------------------------------------------------------*/
+
+#endif /* _LINUX_DM_BLOCK_MANAGER_H */
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
new file mode 100644
index 00000000000..d279c768f8f
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BTREE_INTERNAL_H
+#define DM_BTREE_INTERNAL_H
+
+#include "dm-btree.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * We'll need 2 accessor functions for n->csum and n->blocknr
+ * to support dm-btree-spine.c in that case.
+ */
+
+enum node_flags {
+ INTERNAL_NODE = 1, /* NOTE(review): presumably bit values for node_header.flags - confirm against dm-btree.c */
+ LEAF_NODE = 1 << 1
+};
+
+/*
+ * Every btree node begins with this structure. Make sure it's a multiple
+ * of 8-bytes in size, otherwise the 64bit keys will be mis-aligned.
+ */
+struct node_header {
+ __le32 csum; /* NOTE(review): presumably the node checksum handled by btree_node_validator - confirm */
+ __le32 flags; /* NOTE(review): presumably enum node_flags bits - confirm */
+ __le64 blocknr; /* Block this node is supposed to live in. */
+
+ __le32 nr_entries; /* NOTE(review): presumably the number of entries currently in use - confirm */
+ __le32 max_entries; /* number of key slots; value_base() places the values at keys[max_entries] */
+ __le32 value_size; /* size of each value in bytes, as checked by value_ptr() */
+ __le32 padding; /* brings the header to 32 bytes, a multiple of 8 */
+} __packed;
+
+struct node {
+ struct node_header header;
+ __le64 keys[0]; /* key array; value_base() locates the values at keys[header.max_entries] */
+} __packed;
+
+
+void inc_children(struct dm_transaction_manager *tm, struct node *n,
+ struct dm_btree_value_type *vt);
+
+int new_block(struct dm_btree_info *info, struct dm_block **result);
+int unlock_block(struct dm_btree_info *info, struct dm_block *b);
+
+/*
+ * Spines keep track of the rolling locks. There are 2 variants, read-only
+ * and one that uses shadowing. These are separate structs to allow the
+ * type checker to spot misuse, for example accidentally calling read_lock
+ * on a shadow spine.
+ */
+struct ro_spine {
+ struct dm_btree_info *info;
+
+ int count; /* NOTE(review): presumably the number of valid entries in nodes[] - confirm in dm-btree-spine.c */
+ struct dm_block *nodes[2]; /* room for two blocks */
+};
+
+void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
+int exit_ro_spine(struct ro_spine *s);
+int ro_step(struct ro_spine *s, dm_block_t new_child);
+struct node *ro_node(struct ro_spine *s);
+
+struct shadow_spine {
+ struct dm_btree_info *info;
+
+ int count; /* NOTE(review): presumably the number of valid entries in nodes[] - confirm in dm-btree-spine.c */
+ struct dm_block *nodes[2]; /* room for two blocks */
+
+ dm_block_t root; /* NOTE(review): presumably what shadow_root() reports - confirm in dm-btree-spine.c */
+};
+
+void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info);
+int exit_shadow_spine(struct shadow_spine *s);
+
+int shadow_step(struct shadow_spine *s, dm_block_t b,
+ struct dm_btree_value_type *vt);
+
+/*
+ * The spine must have at least one entry before calling this.
+ */
+struct dm_block *shadow_current(struct shadow_spine *s);
+
+/*
+ * The spine must have at least two entries before calling this.
+ */
+struct dm_block *shadow_parent(struct shadow_spine *s);
+
+int shadow_has_parent(struct shadow_spine *s);
+
+int shadow_root(struct shadow_spine *s);
+
+/*
+ * Some inlines.
+ */
+/* Address of the key stored at position @index in node @n. */
+static inline __le64 *key_ptr(struct node *n, uint32_t index)
+{
+	return &n->keys[index];
+}
+
+/* Start of the value area, which follows the max_entries key slots. */
+static inline void *value_base(struct node *n)
+{
+	uint32_t max_entries = le32_to_cpu(n->header.max_entries);
+
+	return n->keys + max_entries;
+}
+
+/*
+ * Address of the value stored at position @index in node @n.  It is a
+ * BUG if @value_size disagrees with the size recorded in the node header.
+ *
+ * FIXME: Now that value size is stored in node we don't need the third parm.
+ */
+static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
+{
+	char *values = value_base(n);
+
+	BUG_ON(value_size != le32_to_cpu(n->header.value_size));
+	return values + (value_size * index);
+}
+
+/*
+ * Fetch the 64-bit little-endian value at position @index and convert it
+ * to core format.  Assumes the values are suitably-aligned.
+ */
+static inline uint64_t value64(struct node *n, uint32_t index)
+{
+	return le64_to_cpu(((__le64 *) value_base(n))[index]);
+}
+
+/*
+ * Searching for a key within a single node.
+ */
+int lower_bound(struct node *n, uint64_t key);
+
+extern struct dm_block_validator btree_node_validator;
+
+#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
new file mode 100644
index 00000000000..65fd85ec651
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -0,0 +1,566 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-btree.h"
+#include "dm-btree-internal.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/module.h>
+
+/*
+ * Removing an entry from a btree
+ * ==============================
+ *
+ * A very important constraint for our btree is that no node, except the
+ * root, may have fewer than a certain number of entries.
+ * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
+ *
+ * Ensuring this is complicated by the way we want to only ever hold the
+ * locks on 2 nodes concurrently, and only change nodes in a top to bottom
+ * fashion.
+ *
+ * Each node may have a left or right sibling. When descending the spine,
+ * if a node contains only MIN_ENTRIES then we try and increase this to at
+ * least MIN_ENTRIES + 1. We do this in the following ways:
+ *
+ * [A] No siblings => this can only happen if the node is the root, in which
+ * case we copy the child's contents over the root.
+ *
+ * [B] No left sibling
+ * ==> rebalance(node, right sibling)
+ *
+ * [C] No right sibling
+ * ==> rebalance(left sibling, node)
+ *
+ * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
+ * ==> delete node adding its contents to left and right
+ *
+ * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
+ * ==> rebalance(left, node, right)
+ *
+ * After these operations it's possible that our original node no
+ * longer contains the desired sub tree. For this reason this rebalancing
+ * is performed on the children of the current node. This also avoids
+ * having a special case for the root.
+ *
+ * Once this rebalancing has occurred we can then step into the child node
+ * for internal nodes. Or delete the entry for leaf nodes.
+ */
+
+/*
+ * Some little utilities for moving node data around.
+ */
+static void node_shift(struct node *n, int shift)
+{
+ uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
+ uint32_t value_size = le32_to_cpu(n->header.value_size);
+
+ if (shift < 0) {
+ shift = -shift;
+ BUG_ON(shift > nr_entries);
+ BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
+ memmove(key_ptr(n, 0),
+ key_ptr(n, shift),
+ (nr_entries - shift) * sizeof(__le64));
+ memmove(value_ptr(n, 0, value_size),
+ value_ptr(n, shift, value_size),
+ (nr_entries - shift) * value_size);
+ } else {
+ BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
+ memmove(key_ptr(n, shift),
+ key_ptr(n, 0),
+ nr_entries * sizeof(__le64));
+ memmove(value_ptr(n, shift, value_size),
+ value_ptr(n, 0, value_size),