aboutsummaryrefslogtreecommitdiff
path: root/drivers/block
diff options
context:
space:
mode:
authorPhilipp Reisner <philipp.reisner@linbit.com>2009-09-25 16:07:19 -0700
committerJens Axboe <jens.axboe@oracle.com>2009-10-01 21:17:49 +0200
commitb411b3637fa71fce9cf2acf0639009500f5892fe (patch)
tree6b88e5202e0f137fef50e95b0441bcafdbf91990 /drivers/block
parent1a35e0f6443f4266dad4c569c55c57a9032596fa (diff)
The DRBD driver
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/Kconfig2
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/drbd/Kconfig82
-rw-r--r--drivers/block/drbd/Makefile8
-rw-r--r--drivers/block/drbd/drbd_actlog.c1484
-rw-r--r--drivers/block/drbd/drbd_bitmap.c1327
-rw-r--r--drivers/block/drbd/drbd_int.h2258
-rw-r--r--drivers/block/drbd/drbd_main.c3735
-rw-r--r--drivers/block/drbd/drbd_nl.c2365
-rw-r--r--drivers/block/drbd/drbd_proc.c266
-rw-r--r--drivers/block/drbd/drbd_receiver.c4456
-rw-r--r--drivers/block/drbd/drbd_req.c1132
-rw-r--r--drivers/block/drbd/drbd_req.h327
-rw-r--r--drivers/block/drbd/drbd_strings.c113
-rw-r--r--drivers/block/drbd/drbd_tracing.c752
-rw-r--r--drivers/block/drbd/drbd_tracing.h87
-rw-r--r--drivers/block/drbd/drbd_vli.h351
-rw-r--r--drivers/block/drbd/drbd_worker.c1529
-rw-r--r--drivers/block/drbd/drbd_wrappers.h91
19 files changed, 20366 insertions, 0 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e079c5..77bfce52e9c 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
instead, which can be configured to be on-disk compatible with the
cryptoloop device.
+source "drivers/block/drbd/Kconfig"
+
config BLK_DEV_NBD
tristate "Network block device support"
depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8fddf..aff5ac925c3 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 00000000000..4e6f90f487c
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,82 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
+ depends on !PROC_FS || !INET || !CONNECTOR
+
+config BLK_DEV_DRBD
+ tristate "DRBD Distributed Replicated Block Device support"
+ depends on PROC_FS && INET && CONNECTOR
+ select LRU_CACHE
+ default n
+ help
+
+ NOTE: In order to authenticate connections you have to select
+ CRYPTO_HMAC and a hash function as well.
+
+ DRBD is a shared-nothing, synchronously replicated block device. It
+ is designed to serve as a building block for high availability
+ clusters and in this context, is a "drop-in" replacement for shared
+ storage. Simplistically, you could see it as a network RAID 1.
+
+ Each minor device has a role, which can be 'primary' or 'secondary'.
+ On the node with the primary device the application is supposed to
+ run and to access the device (/dev/drbdX). Every write is sent to
+ the local 'lower level block device' and, across the network, to the
+ node with the device in 'secondary' state. The secondary device
+ simply writes the data to its lower level block device.
+
+ DRBD can also be used in dual-Primary mode (device writable on both
+ nodes), which means it can exhibit shared disk semantics in a
+ shared-nothing cluster. Needless to say, on top of dual-Primary
+ DRBD utilizing a cluster file system is necessary to maintain for
+ cache coherency.
+
+ For automatic failover you need a cluster manager (e.g. heartbeat).
+ See also: http://www.drbd.org/, http://www.linux-ha.org
+
+ If unsure, say N.
+
+config DRBD_TRACE
+ tristate "DRBD tracing"
+ depends on BLK_DEV_DRBD
+ select TRACEPOINTS
+ default n
+ help
+
+ Say Y here if you want to be able to trace various events in DRBD.
+
+ If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+ bool "DRBD fault injection"
+ depends on BLK_DEV_DRBD
+ help
+
+ Say Y here if you want to simulate IO errors, in order to test DRBD's
+ behavior.
+
+ The actual simulation of IO errors is done by writing 3 values to
+ /sys/module/drbd/parameters/
+
+ enable_faults: bitmask of...
+ 1 meta data write
+ 2 read
+ 4 resync data write
+ 8 read
+ 16 data write
+ 32 data read
+ 64 read ahead
+ 128 kmalloc of bitmap
+ 256 allocation of EE (epoch_entries)
+
+ fault_devs: bitmask of minor numbers
+ fault_rate: frequency in percent
+
+ Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+ echo 16 > /sys/module/drbd/parameters/enable_faults
+ echo 1 > /sys/module/drbd/parameters/fault_devs
+ echo 5 > /sys/module/drbd/parameters/fault_rate
+
+ If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 00000000000..7d86ef8a8b4
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,8 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+
+drbd_trace-y := drbd_tracing.o
+
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
+obj-$(CONFIG_DRBD_TRACE) += drbd_trace.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 00000000000..74b4835d310
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1484 @@
+/*
+ drbd_actlog.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_wrappers.h"
+
+/* We maintain a trivial check sum in our on disk activity log.
+ * With that we can ensure correct operation even when the storage
+ * device might do a partial (last) sector write while loosing power.
+ */
+struct __packed al_transaction {
+ u32 magic;
+ u32 tr_number;
+ struct __packed {
+ u32 pos;
+ u32 extent; } updates[1 + AL_EXTENTS_PT];
+ u32 xor_sum;
+};
+
+struct update_odbm_work {
+ struct drbd_work w;
+ unsigned int enr;
+};
+
+struct update_al_work {
+ struct drbd_work w;
+ struct lc_element *al_ext;
+ struct completion event;
+ unsigned int enr;
+ /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
+ unsigned int old_enr;
+};
+
+struct drbd_atodb_wait {
+ atomic_t count;
+ struct completion io_done;
+ struct drbd_conf *mdev;
+ int error;
+};
+
+
+int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+
+/* The actual tracepoint needs to have constant number of known arguments...
+ */
+void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ trace__drbd_resync(mdev, level, fmt, ap);
+ va_end(ap);
+}
+
+static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
+ struct drbd_backing_dev *bdev,
+ struct page *page, sector_t sector,
+ int rw, int size)
+{
+ struct bio *bio;
+ struct drbd_md_io md_io;
+ int ok;
+
+ md_io.mdev = mdev;
+ init_completion(&md_io.event);
+ md_io.error = 0;
+
+ if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
+ rw |= (1 << BIO_RW_BARRIER);
+ rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
+
+ retry:
+ bio = bio_alloc(GFP_NOIO, 1);
+ bio->bi_bdev = bdev->md_bdev;
+ bio->bi_sector = sector;
+ ok = (bio_add_page(bio, page, size, 0) == size);
+ if (!ok)
+ goto out;
+ bio->bi_private = &md_io;
+ bio->bi_end_io = drbd_md_io_complete;
+ bio->bi_rw = rw;
+
+ trace_drbd_bio(mdev, "Md", bio, 0, NULL);
+
+ if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+ bio_endio(bio, -EIO);
+ else
+ submit_bio(rw, bio);
+ wait_for_completion(&md_io.event);
+ ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+
+ /* check for unsupported barrier op.
+ * would rather check on EOPNOTSUPP, but that is not reliable.
+ * don't try again for ANY return value != 0 */
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
+ /* Try again with no barrier */
+ dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
+ set_bit(MD_NO_BARRIER, &mdev->flags);
+ rw &= ~(1 << BIO_RW_BARRIER);
+ bio_put(bio);
+ goto retry;
+ }
+ out:
+ bio_put(bio);
+ return ok;
+}
+
+int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+ sector_t sector, int rw)
+{
+ int logical_block_size, mask, ok;
+ int offset = 0;
+ struct page *iop = mdev->md_io_page;
+
+ D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+
+ BUG_ON(!bdev->md_bdev);
+
+ logical_block_size = bdev_logical_block_size(bdev->md_bdev);
+ if (logical_block_size == 0)
+ logical_block_size = MD_SECTOR_SIZE;
+
+ /* in case logical_block_size != 512 [ s390 only? ] */
+ if (logical_block_size != MD_SECTOR_SIZE) {
+ mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
+ D_ASSERT(mask == 1 || mask == 3 || mask == 7);
+ D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
+ offset = sector & mask;
+ sector = sector & ~mask;
+ iop = mdev->md_io_tmpp;
+
+ if (rw & WRITE) {
+ /* these are GFP_KERNEL pages, pre-allocated
+ * on device initialization */
+ void *p = page_address(mdev->md_io_page);
+ void *hp = page_address(mdev->md_io_tmpp);
+
+ ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
+ READ, logical_block_size);
+
+ if (unlikely(!ok)) {
+ dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
+ "READ [logical_block_size!=512]) failed!\n",
+ (unsigned long long)sector);
+ return 0;
+ }
+
+ memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
+ }
+ }
+
+ if (sector < drbd_md_first_sector(bdev) ||
+ sector > drbd_md_last_sector(bdev))
+ dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+
+ ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
+ if (unlikely(!ok)) {
+ dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+ return 0;
+ }
+
+ if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
+ void *p = page_address(mdev->md_io_page);
+ void *hp = page_address(mdev->md_io_tmpp);
+
+ memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
+ }
+
+ return ok;
+}
+
+static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+{
+ struct lc_element *al_ext;
+ struct lc_element *tmp;
+ unsigned long al_flags = 0;
+
+ spin_lock_irq(&mdev->al_lock);
+ tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+ if (unlikely(tmp != NULL)) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ spin_unlock_irq(&mdev->al_lock);
+ return NULL;
+ }
+ }
+ al_ext = lc_get(mdev->act_log, enr);
+ al_flags = mdev->act_log->flags;
+ spin_unlock_irq(&mdev->al_lock);
+
+ /*
+ if (!al_ext) {
+ if (al_flags & LC_STARVING)
+ dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
+ if (al_flags & LC_DIRTY)
+ dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
+ }
+ */
+
+ return al_ext;
+}
+
+void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+ unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+ struct lc_element *al_ext;
+ struct update_al_work al_work;
+
+ D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+ trace_drbd_actlog(mdev, sector, "al_begin_io");
+
+ wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
+
+ if (al_ext->lc_number != enr) {
+ /* drbd_al_write_transaction(mdev,al_ext,enr);
+ * recurses into generic_make_request(), which
+ * disallows recursion, bios being serialized on the
+ * current->bio_tail list now.
+ * we have to delegate updates to the activity log
+ * to the worker thread. */
+ init_completion(&al_work.event);
+ al_work.al_ext = al_ext;
+ al_work.enr = enr;
+ al_work.old_enr = al_ext->lc_number;
+ al_work.w.cb = w_al_write_transaction;
+ drbd_queue_work_front(&mdev->data.work, &al_work.w);
+ wait_for_completion(&al_work.event);
+
+ mdev->al_writ_cnt++;
+
+ spin_lock_irq(&mdev->al_lock);
+ lc_changed(mdev->act_log, al_ext);
+ spin_unlock_irq(&mdev->al_lock);
+ wake_up(&mdev->al_wait);
+ }
+}
+
+void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+ unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+ struct lc_element *extent;
+ unsigned long flags;
+
+ trace_drbd_actlog(mdev, sector, "al_complete_io");
+
+ spin_lock_irqsave(&mdev->al_lock, flags);
+
+ extent = lc_find(mdev->act_log, enr);
+
+ if (!extent) {
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+ dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
+ return;
+ }
+
+ if (lc_put(mdev->act_log, extent) == 0)
+ wake_up(&mdev->al_wait);
+
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+int
+w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ struct update_al_work *aw = container_of(w, struct update_al_work, w);
+ struct lc_element *updated = aw->al_ext;
+ const unsigned int new_enr = aw->enr;
+ const unsigned int evicted = aw->old_enr;
+ struct al_transaction *buffer;
+ sector_t sector;
+ int i, n, mx;
+ unsigned int extent_nr;
+ u32 xor_sum = 0;
+
+ if (!get_ldev(mdev)) {
+ dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+ complete(&((struct update_al_work *)w)->event);
+ return 1;
+ }
+ /* do we have to do a bitmap write, first?
+ * TODO reduce maximum latency:
+ * submit both bios, then wait for both,
+ * instead of doing two synchronous sector writes. */
+ if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
+ drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
+
+ mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+ buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+
+ buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
+ buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
+
+ n = lc_index_of(mdev->act_log, updated);
+
+ buffer->updates[0].pos = cpu_to_be32(n);
+ buffer->updates[0].extent = cpu_to_be32(new_enr);
+
+ xor_sum ^= new_enr;
+
+ mx = min_t(int, AL_EXTENTS_PT,
+ mdev->act_log->nr_elements - mdev->al_tr_cycle);
+ for (i = 0; i < mx; i++) {
+ unsigned idx = mdev->al_tr_cycle + i;
+ extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
+ buffer->updates[i+1].pos = cpu_to_be32(idx);
+ buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
+ xor_sum ^= extent_nr;
+ }
+ for (; i < AL_EXTENTS_PT; i++) {
+ buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
+ buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
+ xor_sum ^= LC_FREE;
+ }
+ mdev->al_tr_cycle += AL_EXTENTS_PT;
+ if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
+ mdev->al_tr_cycle = 0;
+
+ buffer->xor_sum = cpu_to_be32(xor_sum);
+
+ sector = mdev->ldev->md.md_offset
+ + mdev->ldev->md.al_offset + mdev->al_tr_pos;
+
+ if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
+ drbd_chk_io_error(mdev, 1, TRUE);
+
+ if (++mdev->al_tr_pos >
+ div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+ mdev->al_tr_pos = 0;
+
+ D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
+ mdev->al_tr_number++;
+
+ mutex_unlock(&mdev->md_io_mutex);
+
+ complete(&((struct update_al_work *)w)->event);
+ put_ldev(mdev);
+
+ return 1;
+}
+
+/**
+ * drbd_al_read_tr() - Read a single transaction from the on disk activity log
+ * @mdev: DRBD device.
+ * @bdev: Block device to read form.
+ * @b: pointer to an al_transaction.
+ * @index: On disk slot of the transaction to read.
+ *
+ * Returns -1 on IO error, 0 on checksum error and 1 upon success.
+ */
+static int drbd_al_read_tr(struct drbd_conf *mdev,
+ struct drbd_backing_dev *bdev,
+ struct al_transaction *b,
+ int index)
+{
+ sector_t sector;
+ int rv, i;
+ u32 xor_sum = 0;
+
+ sector = bdev->md.md_offset + bdev->md.al_offset + index;
+
+ /* Dont process error normally,
+ * as this is done before disk is attached! */
+ if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
+ return -1;
+
+ rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
+
+ for (i = 0; i < AL_EXTENTS_PT + 1; i++)
+ xor_sum ^= be32_to_cpu(b->updates[i].extent);
+ rv &= (xor_sum == be32_to_cpu(b->xor_sum));
+
+ return rv;
+}
+
+/**
+ * drbd_al_read_log() - Restores the activity log from its on disk representation.
+ * @mdev: DRBD device.
+ * @bdev: Block device to read form.
+ *
+ * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
+ */
+int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+ struct al_transaction *buffer;
+ int i;
+ int rv;
+ int mx;
+ int active_extents = 0;
+ int transactions = 0;
+ int found_valid = 0;
+ int from = 0;
+ int to = 0;
+ u32 from_tnr = 0;
+ u32 to_tnr = 0;
+ u32 cnr;
+
+ mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
+
+ /* lock out all other meta data io for now,
+ * and make sure the page is mapped.
+ */
+ mutex_lock(&mdev->md_io_mutex);
+ buffer = page_address(mdev->md_io_page);
+
+ /* Find the valid transaction in the log */
+ for (i = 0; i <= mx; i++) {
+ rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+ if (rv == 0)
+ continue;
+ if (rv == -1) {
+ mutex_unlock(&mdev->md_io_mutex);
+ return 0;
+ }
+ cnr = be32_to_cpu(buffer->tr_number);
+
+ if (++found_valid == 1) {
+ from = i;
+ to = i;
+ from_tnr = cnr;
+ to_tnr = cnr;
+ continue;
+ }
+ if ((int)cnr - (int)from_tnr < 0) {
+ D_ASSERT(from_tnr - cnr + i - from == mx+1);
+ from = i;
+ from_tnr = cnr;
+ }
+ if ((int)cnr - (int)to_tnr > 0) {
+ D_ASSERT(cnr - to_tnr == i - to);
+ to = i;
+ to_tnr = cnr;
+ }
+ }
+
+ if (!found_valid) {
+ dev_warn(DEV, "No usable activity log found.\n");
+ mutex_unlock(&mdev->md_io_mutex);
+ return 1;
+ }
+
+ /* Read the valid transactions.
+ * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
+ i = from;
+ while (1) {
+ int j, pos;
+ unsigned int extent_nr;
+ unsigned int trn;
+
+ rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+ ERR_IF(rv == 0) goto cancel;
+ if (rv == -1) {
+ mutex_unlock(&mdev->md_io_mutex);
+ return 0;
+ }
+
+ trn = be32_to_cpu(buffer->tr_number);
+
+ spin_lock_irq(&mdev->al_lock);
+
+ /* This loop runs backwards because in the cyclic
+ elements there might be an old version of the
+ updated element (in slot 0). So the element in slot 0
+ can overwrite old versions. */
+ for (j = AL_EXTENTS_PT; j >= 0; j--) {
+ pos = be32_to_cpu(buffer->updates[j].pos);
+ extent_nr = be32_to_cpu(buffer->updates[j].extent);
+
+ if (extent_nr == LC_FREE)
+ continue;
+
+ lc_set(mdev->act_log, extent_nr, pos);
+ active_extents++;
+ }
+ spin_unlock_irq(&mdev->al_lock);
+
+ transactions++;
+
+cancel:
+ if (i == to)
+ break;
+ i++;
+ if (i > mx)
+ i = 0;
+ }
+
+ mdev->al_tr_number = to_tnr+1;
+ mdev->al_tr_pos = to;
+ if (++mdev->al_tr_pos >
+ div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+ mdev->al_tr_pos = 0;
+
+ /* ok, we are done with it */
+ mutex_unlock(&mdev->md_io_mutex);
+
+ dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
+ transactions, active_extents);
+
+ return 1;
+}
+
+static void atodb_endio(struct bio *bio, int error)
+{
+ struct drbd_atodb_wait *wc = bio->bi_private;
+ struct drbd_conf *mdev = wc->mdev;
+ struct page *page;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ if (!error && !uptodate)
+ error = -EIO;
+
+ drbd_chk_io_error(mdev, error, TRUE);
+ if (error && wc->error == 0)
+ wc->error = error;
+
+ if (atomic_dec_and_test(&wc->count))
+ complete(&wc->io_done);
+
+ page = bio->bi_io_vec[0].bv_page;
+ put_page(page);
+ bio_put(bio);
+ mdev->bm_writ_cnt++;
+ put_ldev(mdev);
+}
+
+#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* activity log to on disk bitmap -- prepare bio unless that sector
+ * is already covered by previously prepared bios */
+static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
+ struct bio **bios,
+ unsigned int enr,
+ struct drbd_atodb_wait *wc) __must_hold(local)
+{
+ struct bio *bio;
+ struct page *page;
+ sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+ + mdev->ldev->md.bm_offset;
+ unsigned int page_offset = PAGE_SIZE;
+ int offset;
+ int i = 0;
+ int err = -ENOMEM;
+
+ /* Check if that enr is already covered by an already created bio.
+ * Caution, bios[] is not NULL terminated,
+ * but only initialized to all NULL.
+ * For completely scattered activity log,
+ * the last invocation iterates over all bios,
+ * and finds the last NULL entry.
+ */
+ while ((bio = bios[i])) {
+ if (bio->bi_sector == on_disk_sector)
+ return 0;
+ i++;
+ }
+ /* bios[i] == NULL, the next not yet used slot */
+
+ /* GFP_KERNEL, we are not in the write-out path */
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if (bio == NULL)
+ return -ENOMEM;
+
+ if (i > 0) {
+ const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
+ page_offset = prev_bv->bv_offset + prev_bv->bv_len;
+ page = prev_bv->bv_page;
+ }
+ if (page_offset == PAGE_SIZE) {
+ page = alloc_page(__GFP_HIGHMEM);
+ if (page == NULL)
+ goto out_bio_put;
+ page_offset = 0;
+ } else {
+ get_page(page);
+ }
+
+ offset = S2W(enr);
+ drbd_bm_get_lel(mdev, offset,
+ min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
+ kmap(page) + page_offset);
+ kunmap(page);
+
+ bio->bi_private = wc;
+ bio->bi_end_io = atodb_endio;
+ bio->bi_bdev = mdev->ldev->md_bdev;
+ bio->bi_sector = on_disk_sector;
+
+ if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE)
+ goto out_put_page;
+
+ atomic_inc(&wc->count);
+ /* we already know that we may do this...
+ * get_ldev_if_state(mdev,D_ATTACHING);
+ * just get the extra reference, so that the local_cnt reflects
+ * the number of pending IO requests DRBD at its backing device.
+ */
+ atomic_inc(&mdev->local_cnt);
+
+ bios[i] = bio;
+
+ return 0;
+
+out_put_page:
+ err = -EINVAL;
+ put_page(page);
+out_bio_put:
+ bio_put(bio);
+ return err;
+}
+
+/**
+ * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
+ * @mdev: DRBD device.
+ *
+ * Called when we detach (unconfigure) local storage,
+ * or when we go from R_PRIMARY to R_SECONDARY role.
+ */
+void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
+{
+ int i, nr_elements;
+ unsigned int enr;
+ struct bio **bios;
+ struct drbd_atodb_wait wc;
+
+ ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
+ return; /* sorry, I don't have any act_log etc... */
+
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+ nr_elements = mdev->act_log->nr_elements;
+
+ /* GFP_KERNEL, we are not in anyone's write-out path */
+ bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
+ if (!bios)
+ goto submit_one_by_one;
+
+ atomic_set(&wc.count, 0);
+ init_completion(&wc.io_done);
+ wc.mdev = mdev;
+ wc.error = 0;
+
+ for (i = 0; i < nr_elements; i++) {
+ enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+ if (enr == LC_FREE)
+ continue;
+ /* next statement also does atomic_inc wc.count and local_cnt */
+ if (atodb_prepare_unless_covered(mdev, bios,
+ enr/AL_EXT_PER_BM_SECT,
+ &wc))
+ goto free_bios_submit_one_by_one;
+ }
+
+ /* unnecessary optimization? */
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+
+ /* all prepared, submit them */
+ for (i = 0; i < nr_elements; i++) {
+ if (bios[i] == NULL)
+ break;
+ if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
+ bios[i]->bi_rw = WRITE;
+ bio_endio(bios[i], -EIO);
+ } else {
+ submit_bio(WRITE, bios[i]);
+ }
+ }
+
+ drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
+
+ /* always (try to) flush bitmap to stable storage */
+ drbd_md_flush(mdev);
+
+ /* In case we did not submit a single IO do not wait for
+ * them to complete. ( Because we would wait forever here. )
+ *
+ * In case we had IOs and they are already complete, there
+ * is not point in waiting anyways.
+ * Therefore this if () ... */
+ if (atomic_read(&wc.count))
+ wait_for_completion(&wc.io_done);
+
+ put_ldev(mdev);
+
+ kfree(bios);
+ return;
+
+ free_bios_submit_one_by_one:
+ /* free everything by calling the endio callback directly. */
+ for (i = 0; i < nr_elements && bios[i]; i++)
+ bio_endio(bios[i], 0);
+
+ kfree(bios);
+
+ submit_one_by_one:
+ dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
+
+ for (i = 0; i < mdev->act_log->nr_elements; i++) {
+ enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+ if (enr == LC_FREE)
+ continue;
+ /* Really slow: if we have al-extents 16..19 active,
+ * sector 4 will be written four times! Synchronous! */
+ drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
+ }
+
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+ put_ldev(mdev);
+}
+
+/**
+ * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
+ * @mdev: DRBD device.
+ */
+void drbd_al_apply_to_bm(struct drbd_conf *mdev)
+{
+ unsigned int enr;
+ unsigned long add = 0;
+ char ppb[10];
+ int i;
+
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+ for (i = 0; i < mdev->act_log->nr_elements; i++) {
+ enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+ if (enr == LC_FREE)
+ continue;
+ add += drbd_bm_ALe_set_all(mdev, enr);
+ }
+
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+
+ dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
+ ppsize(ppb, Bit2KB(add)));
+}
+
+static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
+{
+ int rv;
+
+ spin_lock_irq(&mdev->al_lock);
+ rv = (al_ext->refcnt == 0);
+ if (likely(rv))
+ lc_del(mdev->act_log, al_ext);
+ spin_unlock_irq(&mdev->al_lock);
+
+ return rv;
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents form the activity log
+ * @mdev: DRBD device.
+ *
+ * Removes all active extents form the activity log, waiting until
+ * the reference count of each entry dropped to 0 first, of course.
+ *
+ * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_conf *mdev)
+{
+ struct lc_element *al_ext;
+ int i;
+
+ D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
+
+ for (i = 0; i < mdev->act_log->nr_elements; i++) {
+ al_ext = lc_element_by_index(mdev->act_log, i);
+ if (al_ext->lc_number == LC_FREE)
+ continue;
+ wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
+ }
+
+ wake_up(&mdev->al_wait);
+}
+
+static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+
+ if (!get_ldev(mdev)) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
+ kfree(udw);
+ return 1;
+ }
+
+ drbd_bm_write_sect(mdev, udw->enr);
+ put_ldev(mdev);
+
+ kfree(udw);
+
+ if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
+ switch (mdev->state.conn) {
+ case C_SYNC_SOURCE: case C_SYNC_TARGET:
+ case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
+ drbd_resync_finished(mdev);
+ default:
+ /* nothing to do */
+ break;
+ }
+ }
+ drbd_bcast_sync_progress(mdev);
+
+ return 1;
+}
+
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold an get_ldev() reference.
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
+ int count, int success)
+{
+ struct lc_element *e;
+ struct update_odbm_work *udw;
+
+ unsigned int enr;
+
+ D_ASSERT(atomic_read(&mdev->local_cnt));
+
+ /* I simply assume that a sector/size pair never crosses
+ * a 16 MB extent border. (Currently this is true...) */
+ enr = BM_SECT_TO_EXT(sector);
+
+ e = lc_get(mdev->resync, enr);
+ if (e) {
+ struct bm_extent *ext = lc_entry(e, struct bm_extent,