aboutsummaryrefslogtreecommitdiff
path: root/drivers/block/drbd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r--drivers/block/drbd/Kconfig10
-rw-r--r--drivers/block/drbd/Makefile2
-rw-r--r--drivers/block/drbd/drbd_actlog.c1570
-rw-r--r--drivers/block/drbd/drbd_bitmap.c1293
-rw-r--r--drivers/block/drbd/drbd_int.h2488
-rw-r--r--drivers/block/drbd/drbd_interval.c207
-rw-r--r--drivers/block/drbd/drbd_interval.h40
-rw-r--r--drivers/block/drbd/drbd_main.c4936
-rw-r--r--drivers/block/drbd/drbd_nl.c4362
-rw-r--r--drivers/block/drbd/drbd_nla.c54
-rw-r--r--drivers/block/drbd/drbd_nla.h8
-rw-r--r--drivers/block/drbd/drbd_proc.c248
-rw-r--r--drivers/block/drbd/drbd_protocol.h307
-rw-r--r--drivers/block/drbd/drbd_receiver.c5212
-rw-r--r--drivers/block/drbd/drbd_req.c1843
-rw-r--r--drivers/block/drbd/drbd_req.h248
-rw-r--r--drivers/block/drbd/drbd_state.c1884
-rw-r--r--drivers/block/drbd/drbd_state.h161
-rw-r--r--drivers/block/drbd/drbd_strings.c9
-rw-r--r--drivers/block/drbd/drbd_strings.h9
-rw-r--r--drivers/block/drbd/drbd_vli.h2
-rw-r--r--drivers/block/drbd/drbd_worker.c1901
-rw-r--r--drivers/block/drbd/drbd_wrappers.h77
23 files changed, 16097 insertions, 10774 deletions
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
index df098378739..7845bd6ee41 100644
--- a/drivers/block/drbd/Kconfig
+++ b/drivers/block/drbd/Kconfig
@@ -2,13 +2,14 @@
# DRBD device driver configuration
#
-comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
- depends on PROC_FS='n' || INET='n' || CONNECTOR='n'
+comment "DRBD disabled because PROC_FS or INET not selected"
+ depends on PROC_FS='n' || INET='n'
config BLK_DEV_DRBD
tristate "DRBD Distributed Replicated Block Device support"
- depends on PROC_FS && INET && CONNECTOR
+ depends on PROC_FS && INET
select LRU_CACHE
+ select LIBCRC32C
default n
help
@@ -58,7 +59,8 @@ config DRBD_FAULT_INJECTION
32 data read
64 read ahead
128 kmalloc of bitmap
- 256 allocation of EE (epoch_entries)
+ 256 allocation of peer_requests
+ 512 insert data corruption on receiving side
fault_devs: bitmask of minor numbers
fault_rate: frequency in percent
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 0d3f337ff5f..8b450338075 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -1,5 +1,7 @@
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += drbd_interval.o drbd_state.o
+drbd-y += drbd_nla.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index ba95cba192b..05a1780ffa8 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -24,811 +24,695 @@
*/
#include <linux/slab.h>
+#include <linux/crc32c.h>
#include <linux/drbd.h>
+#include <linux/drbd_limits.h>
+#include <linux/dynamic_debug.h>
#include "drbd_int.h"
-#include "drbd_wrappers.h"
-/* We maintain a trivial check sum in our on disk activity log.
- * With that we can ensure correct operation even when the storage
- * device might do a partial (last) sector write while loosing power.
- */
-struct __packed al_transaction {
- u32 magic;
- u32 tr_number;
- struct __packed {
- u32 pos;
- u32 extent; } updates[1 + AL_EXTENTS_PT];
- u32 xor_sum;
+
+enum al_transaction_types {
+ AL_TR_UPDATE = 0,
+ AL_TR_INITIALIZED = 0xffff
+};
+/* all fields on disc in big endian */
+struct __packed al_transaction_on_disk {
+ /* don't we all like magic */
+ __be32 magic;
+
+ /* to identify the most recent transaction block
+ * in the on disk ring buffer */
+ __be32 tr_number;
+
+ /* checksum on the full 4k block, with this field set to 0. */
+ __be32 crc32c;
+
+ /* type of transaction, special transaction types like:
+ * purge-all, set-all-idle, set-all-active, ... to-be-defined
+ * see also enum al_transaction_types */
+ __be16 transaction_type;
+
+ /* we currently allow only a few thousand extents,
+ * so 16bit will be enough for the slot number. */
+
+ /* how many updates in this transaction */
+ __be16 n_updates;
+
+ /* maximum slot number, "al-extents" in drbd.conf speak.
+ * Having this in each transaction should make reconfiguration
+ * of that parameter easier. */
+ __be16 context_size;
+
+ /* slot number the context starts with */
+ __be16 context_start_slot_nr;
+
+ /* Some reserved bytes. Expected usage is a 64bit counter of
+ * sectors-written since device creation, and other data generation tag
+ * supporting usage */
+ __be32 __reserved[4];
+
+ /* --- 36 byte used --- */
+
+ /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
+ * in one transaction, then use the remaining byte in the 4k block for
+ * context information. "Flexible" number of updates per transaction
+ * does not help, as we have to account for the case when all update
+ * slots are used anyways, so it would only complicate code without
+ * additional benefit.
+ */
+ __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* but the extent number is 32bit, which at an extent size of 4 MiB
+ * allows to cover device sizes of up to 2**54 Byte (16 PiB) */
+ __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* --- 420 bytes used (36 + 64*6) --- */
+
+ /* 4096 - 420 = 3676 = 919 * 4 */
+ __be32 context[AL_CONTEXT_PER_TRANSACTION];
};
struct update_odbm_work {
struct drbd_work w;
+ struct drbd_device *device;
unsigned int enr;
};
struct update_al_work {
struct drbd_work w;
- struct lc_element *al_ext;
+ struct drbd_device *device;
struct completion event;
- unsigned int enr;
- /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
- unsigned int old_enr;
+ int err;
};
-struct drbd_atodb_wait {
- atomic_t count;
- struct completion io_done;
- struct drbd_conf *mdev;
- int error;
-};
+void *drbd_md_get_buffer(struct drbd_device *device)
+{
+ int r;
-int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+ wait_event(device->misc_wait,
+ (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 ||
+ device->state.disk <= D_FAILED);
-static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
+ return r ? NULL : page_address(device->md_io_page);
+}
+
+void drbd_md_put_buffer(struct drbd_device *device)
+{
+ if (atomic_dec_and_test(&device->md_io_in_use))
+ wake_up(&device->misc_wait);
+}
+
+void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ unsigned int *done)
+{
+ long dt;
+
+ rcu_read_lock();
+ dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
+ rcu_read_unlock();
+ dt = dt * HZ / 10;
+ if (dt == 0)
+ dt = MAX_SCHEDULE_TIMEOUT;
+
+ dt = wait_event_timeout(device->misc_wait,
+ *done || test_bit(FORCE_DETACH, &device->flags), dt);
+ if (dt == 0) {
+ drbd_err(device, "meta-data IO operation timed out\n");
+ drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH);
+ }
+}
+
+static int _drbd_md_sync_page_io(struct drbd_device *device,
struct drbd_backing_dev *bdev,
struct page *page, sector_t sector,
int rw, int size)
{
struct bio *bio;
- struct drbd_md_io md_io;
- int ok;
+ int err;
- md_io.mdev = mdev;
- init_completion(&md_io.event);
- md_io.error = 0;
+ device->md_io.done = 0;
+ device->md_io.error = -ENODEV;
- if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
- rw |= REQ_FUA;
- rw |= REQ_UNPLUG | REQ_SYNC;
+ if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
+ rw |= REQ_FUA | REQ_FLUSH;
+ rw |= REQ_SYNC;
- bio = bio_alloc(GFP_NOIO, 1);
+ bio = bio_alloc_drbd(GFP_NOIO);
bio->bi_bdev = bdev->md_bdev;
- bio->bi_sector = sector;
- ok = (bio_add_page(bio, page, size, 0) == size);
- if (!ok)
+ bio->bi_iter.bi_sector = sector;
+ err = -EIO;
+ if (bio_add_page(bio, page, size, 0) != size)
goto out;
- bio->bi_private = &md_io;
+ bio->bi_private = &device->md_io;
bio->bi_end_io = drbd_md_io_complete;
bio->bi_rw = rw;
- if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+ if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL)
+ /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+ ;
+ else if (!get_ldev_if_state(device, D_ATTACHING)) {
+ /* Corresponding put_ldev in drbd_md_io_complete() */
+ drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+ err = -ENODEV;
+ goto out;
+ }
+
+ bio_get(bio); /* one bio_put() is in the completion handler */
+ atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
+ if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
bio_endio(bio, -EIO);
else
submit_bio(rw, bio);
- wait_for_completion(&md_io.event);
- ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+ wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
+ if (bio_flagged(bio, BIO_UPTODATE))
+ err = device->md_io.error;
out:
bio_put(bio);
- return ok;
+ return err;
}
-int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev,
sector_t sector, int rw)
{
- int logical_block_size, mask, ok;
- int offset = 0;
- struct page *iop = mdev->md_io_page;
+ int err;
+ struct page *iop = device->md_io_page;
- D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+ D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1);
BUG_ON(!bdev->md_bdev);
- logical_block_size = bdev_logical_block_size(bdev->md_bdev);
- if (logical_block_size == 0)
- logical_block_size = MD_SECTOR_SIZE;
-
- /* in case logical_block_size != 512 [ s390 only? ] */
- if (logical_block_size != MD_SECTOR_SIZE) {
- mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
- D_ASSERT(mask == 1 || mask == 3 || mask == 7);
- D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
- offset = sector & mask;
- sector = sector & ~mask;
- iop = mdev->md_io_tmpp;
-
- if (rw & WRITE) {
- /* these are GFP_KERNEL pages, pre-allocated
- * on device initialization */
- void *p = page_address(mdev->md_io_page);
- void *hp = page_address(mdev->md_io_tmpp);
-
- ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
- READ, logical_block_size);
-
- if (unlikely(!ok)) {
- dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
- "READ [logical_block_size!=512]) failed!\n",
- (unsigned long long)sector);
- return 0;
- }
-
- memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
- }
- }
+ dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+ (void*)_RET_IP_ );
if (sector < drbd_md_first_sector(bdev) ||
- sector > drbd_md_last_sector(bdev))
- dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+ sector + 7 > drbd_md_last_sector(bdev))
+ drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
current->comm, current->pid, __func__,
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
- ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
- if (unlikely(!ok)) {
- dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
- (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
- return 0;
- }
-
- if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
- void *p = page_address(mdev->md_io_page);
- void *hp = page_address(mdev->md_io_tmpp);
-
- memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
+ /* we do all our meta data IO in aligned 4k blocks. */
+ err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096);
+ if (err) {
+ drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
}
-
- return ok;
+ return err;
}
-static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr)
{
- struct lc_element *al_ext;
struct lc_element *tmp;
- unsigned long al_flags = 0;
-
- spin_lock_irq(&mdev->al_lock);
- tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+ tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
if (unlikely(tmp != NULL)) {
struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
- if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
- spin_unlock_irq(&mdev->al_lock);
- return NULL;
- }
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags))
+ return bm_ext;
}
- al_ext = lc_get(mdev->act_log, enr);
- al_flags = mdev->act_log->flags;
- spin_unlock_irq(&mdev->al_lock);
-
- /*
- if (!al_ext) {
- if (al_flags & LC_STARVING)
- dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
- if (al_flags & LC_DIRTY)
- dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
- }
- */
-
- return al_ext;
+ return NULL;
}
-void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
+static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock)
{
- unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
struct lc_element *al_ext;
- struct update_al_work al_work;
-
- D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
-
- wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
-
- if (al_ext->lc_number != enr) {
- /* drbd_al_write_transaction(mdev,al_ext,enr);
- * recurses into generic_make_request(), which
- * disallows recursion, bios being serialized on the
- * current->bio_tail list now.
- * we have to delegate updates to the activity log
- * to the worker thread. */
- init_completion(&al_work.event);
- al_work.al_ext = al_ext;
- al_work.enr = enr;
- al_work.old_enr = al_ext->lc_number;
- al_work.w.cb = w_al_write_transaction;
- drbd_queue_work_front(&mdev->data.work, &al_work.w);
- wait_for_completion(&al_work.event);
-
- mdev->al_writ_cnt++;
+ struct bm_extent *bm_ext;
+ int wake;
- spin_lock_irq(&mdev->al_lock);
- lc_changed(mdev->act_log, al_ext);
- spin_unlock_irq(&mdev->al_lock);
- wake_up(&mdev->al_wait);
+ spin_lock_irq(&device->al_lock);
+ bm_ext = find_active_resync_extent(device, enr);
+ if (bm_ext) {
+ wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
+ spin_unlock_irq(&device->al_lock);
+ if (wake)
+ wake_up(&device->al_wait);
+ return NULL;
}
+ if (nonblock)
+ al_ext = lc_try_get(device->act_log, enr);
+ else
+ al_ext = lc_get(device->act_log, enr);
+ spin_unlock_irq(&device->al_lock);
+ return al_ext;
}
-void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
+bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i)
{
- unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
- struct lc_element *extent;
- unsigned long flags;
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
- spin_lock_irqsave(&mdev->al_lock, flags);
+ D_ASSERT(device, (unsigned)(last - first) <= 1);
+ D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
- extent = lc_find(mdev->act_log, enr);
+ /* FIXME figure out a fast path for bios crossing AL extent boundaries */
+ if (first != last)
+ return false;
- if (!extent) {
- spin_unlock_irqrestore(&mdev->al_lock, flags);
- dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
- return;
- }
-
- if (lc_put(mdev->act_log, extent) == 0)
- wake_up(&mdev->al_wait);
-
- spin_unlock_irqrestore(&mdev->al_lock, flags);
+ return _al_get(device, first, true);
}
-int
-w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
{
- struct update_al_work *aw = container_of(w, struct update_al_work, w);
- struct lc_element *updated = aw->al_ext;
- const unsigned int new_enr = aw->enr;
- const unsigned int evicted = aw->old_enr;
- struct al_transaction *buffer;
- sector_t sector;
- int i, n, mx;
- unsigned int extent_nr;
- u32 xor_sum = 0;
-
- if (!get_ldev(mdev)) {
- dev_err(DEV,
- "disk is %s, cannot start al transaction (-%d +%d)\n",
- drbd_disk_str(mdev->state.disk), evicted, new_enr);
- complete(&((struct update_al_work *)w)->event);
- return 1;
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ bool need_transaction = false;
+
+ D_ASSERT(device, first <= last);
+ D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
+
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ wait_event(device->al_wait,
+ (al_ext = _al_get(device, enr, false)) != NULL);
+ if (al_ext->lc_number != enr)
+ need_transaction = true;
}
- /* do we have to do a bitmap write, first?
- * TODO reduce maximum latency:
- * submit both bios, then wait for both,
- * instead of doing two synchronous sector writes.
- * For now, we must not write the transaction,
- * if we cannot write out the bitmap of the evicted extent. */
- if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
- drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
-
- /* The bitmap write may have failed, causing a state change. */
- if (mdev->state.disk < D_INCONSISTENT) {
- dev_err(DEV,
- "disk is %s, cannot write al transaction (-%d +%d)\n",
- drbd_disk_str(mdev->state.disk), evicted, new_enr);
- complete(&((struct update_al_work *)w)->event);
- put_ldev(mdev);
- return 1;
- }
-
- mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
- buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+ return need_transaction;
+}
- buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
- buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
+static int al_write_transaction(struct drbd_device *device, bool delegate);
- n = lc_index_of(mdev->act_log, updated);
+/* When called through generic_make_request(), we must delegate
+ * activity log I/O to the worker thread: a further request
+ * submitted via generic_make_request() within the same task
+ * would be queued on current->bio_list, and would only start
+ * after this function returns (see generic_make_request()).
+ *
+ * However, if we *are* the worker, we must not delegate to ourselves.
+ */
- buffer->updates[0].pos = cpu_to_be32(n);
- buffer->updates[0].extent = cpu_to_be32(new_enr);
+/*
+ * @delegate: delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
+{
+ bool locked = false;
- xor_sum ^= new_enr;
+ BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
- mx = min_t(int, AL_EXTENTS_PT,
- mdev->act_log->nr_elements - mdev->al_tr_cycle);
- for (i = 0; i < mx; i++) {
- unsigned idx = mdev->al_tr_cycle + i;
- extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
- buffer->updates[i+1].pos = cpu_to_be32(idx);
- buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
- xor_sum ^= extent_nr;
- }
- for (; i < AL_EXTENTS_PT; i++) {
- buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
- buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
- xor_sum ^= LC_FREE;
+ /* Serialize multiple transactions.
+ * This uses test_and_set_bit, memory barrier is implicit.
+ */
+ wait_event(device->al_wait,
+ device->act_log->pending_changes == 0 ||
+ (locked = lc_try_lock_for_transaction(device->act_log)));
+
+ if (locked) {
+ /* Double check: it may have been committed by someone else,
+ * while we have been waiting for the lock. */
+ if (device->act_log->pending_changes) {
+ bool write_al_updates;
+
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+
+ if (write_al_updates)
+ al_write_transaction(device, delegate);
+ spin_lock_irq(&device->al_lock);
+ /* FIXME
+ if (err)
+ we need an "lc_cancel" here;
+ */
+ lc_committed(device->act_log);
+ spin_unlock_irq(&device->al_lock);
+ }
+ lc_unlock(device->act_log);
+ wake_up(&device->al_wait);
}
- mdev->al_tr_cycle += AL_EXTENTS_PT;
- if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
- mdev->al_tr_cycle = 0;
-
- buffer->xor_sum = cpu_to_be32(xor_sum);
-
- sector = mdev->ldev->md.md_offset
- + mdev->ldev->md.al_offset + mdev->al_tr_pos;
-
- if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
- drbd_chk_io_error(mdev, 1, TRUE);
-
- if (++mdev->al_tr_pos >
- div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
- mdev->al_tr_pos = 0;
-
- D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
- mdev->al_tr_number++;
-
- mutex_unlock(&mdev->md_io_mutex);
-
- complete(&((struct update_al_work *)w)->event);
- put_ldev(mdev);
-
- return 1;
}
-/**
- * drbd_al_read_tr() - Read a single transaction from the on disk activity log
- * @mdev: DRBD device.
- * @bdev: Block device to read form.
- * @b: pointer to an al_transaction.
- * @index: On disk slot of the transaction to read.
- *
- * Returns -1 on IO error, 0 on checksum error and 1 upon success.
+/*
+ * @delegate: delegate activity log I/O to the worker thread
*/
-static int drbd_al_read_tr(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev,
- struct al_transaction *b,
- int index)
+void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate)
{
- sector_t sector;
- int rv, i;
- u32 xor_sum = 0;
-
- sector = bdev->md.md_offset + bdev->md.al_offset + index;
-
- /* Dont process error normally,
- * as this is done before disk is attached! */
- if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
- return -1;
-
- rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
+ BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
- for (i = 0; i < AL_EXTENTS_PT + 1; i++)
- xor_sum ^= be32_to_cpu(b->updates[i].extent);
- rv &= (xor_sum == be32_to_cpu(b->xor_sum));
-
- return rv;
+ if (drbd_al_begin_io_prepare(device, i))
+ drbd_al_begin_io_commit(device, delegate);
}
-/**
- * drbd_al_read_log() - Restores the activity log from its on disk representation.
- * @mdev: DRBD device.
- * @bdev: Block device to read form.
- *
- * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
- */
-int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
{
- struct al_transaction *buffer;
- int i;
- int rv;
- int mx;
- int active_extents = 0;
- int transactions = 0;
- int found_valid = 0;
- int from = 0;
- int to = 0;
- u32 from_tnr = 0;
- u32 to_tnr = 0;
- u32 cnr;
-
- mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
-
- /* lock out all other meta data io for now,
- * and make sure the page is mapped.
- */
- mutex_lock(&mdev->md_io_mutex);
- buffer = page_address(mdev->md_io_page);
-
- /* Find the valid transaction in the log */
- for (i = 0; i <= mx; i++) {
- rv = drbd_al_read_tr(mdev, bdev, buffer, i);
- if (rv == 0)
- continue;
- if (rv == -1) {
- mutex_unlock(&mdev->md_io_mutex);
- return 0;
- }
- cnr = be32_to_cpu(buffer->tr_number);
-
- if (++found_valid == 1) {
- from = i;
- to = i;
- from_tnr = cnr;
- to_tnr = cnr;
- continue;
- }
- if ((int)cnr - (int)from_tnr < 0) {
- D_ASSERT(from_tnr - cnr + i - from == mx+1);
- from = i;
- from_tnr = cnr;
- }
- if ((int)cnr - (int)to_tnr > 0) {
- D_ASSERT(cnr - to_tnr == i - to);
- to = i;
- to_tnr = cnr;
+ struct lru_cache *al = device->act_log;
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned nr_al_extents;
+ unsigned available_update_slots;
+ unsigned enr;
+
+ D_ASSERT(device, first <= last);
+
+ nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */
+ available_update_slots = min(al->nr_elements - al->used,
+ al->max_pending_changes - al->pending_changes);
+
+ /* We want all necessary updates for a given request within the same transaction
+ * We could first check how many updates are *actually* needed,
+ * and use that instead of the worst-case nr_al_extents */
+ if (available_update_slots < nr_al_extents)
+ return -EWOULDBLOCK;
+
+ /* Is resync active in this area? */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *tmp;
+ tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
+ if (unlikely(tmp != NULL)) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
+ return -EBUSY;
+ return -EWOULDBLOCK;
+ }
}
}
- if (!found_valid) {
- dev_warn(DEV, "No usable activity log found.\n");
- mutex_unlock(&mdev->md_io_mutex);
- return 1;
+ /* Checkout the refcounts.
+ * Given that we checked for available elements and update slots above,
+ * this has to be successful. */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ al_ext = lc_get_cumulative(device->act_log, enr);
+ if (!al_ext)
+ drbd_info(device, "LOGIC BUG for enr=%u\n", enr);
}
+ return 0;
+}
- /* Read the valid transactions.
- * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
- i = from;
- while (1) {
- int j, pos;
- unsigned int extent_nr;
- unsigned int trn;
-
- rv = drbd_al_read_tr(mdev, bdev, buffer, i);
- ERR_IF(rv == 0) goto cancel;
- if (rv == -1) {
- mutex_unlock(&mdev->md_io_mutex);
- return 0;
- }
-
- trn = be32_to_cpu(buffer->tr_number);
-
- spin_lock_irq(&mdev->al_lock);
-
- /* This loop runs backwards because in the cyclic
- elements there might be an old version of the
- updated element (in slot 0). So the element in slot 0
- can overwrite old versions. */
- for (j = AL_EXTENTS_PT; j >= 0; j--) {
- pos = be32_to_cpu(buffer->updates[j].pos);
- extent_nr = be32_to_cpu(buffer->updates[j].extent);
+void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
+{
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ struct lc_element *extent;
+ unsigned long flags;
- if (extent_nr == LC_FREE)
- continue;
+ D_ASSERT(device, first <= last);
+ spin_lock_irqsave(&device->al_lock, flags);
- lc_set(mdev->act_log, extent_nr, pos);
- active_extents++;
+ for (enr = first; enr <= last; enr++) {
+ extent = lc_find(device->act_log, enr);
+ if (!extent) {
+ drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr);
+ continue;
}
- spin_unlock_irq(&mdev->al_lock);
-
- transactions++;
-
-cancel:
- if (i == to)
- break;
- i++;
- if (i > mx)
- i = 0;
+ lc_put(device->act_log, extent);
}
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ wake_up(&device->al_wait);
+}
- mdev->al_tr_number = to_tnr+1;
- mdev->al_tr_pos = to;
- if (++mdev->al_tr_pos >
- div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
- mdev->al_tr_pos = 0;
-
- /* ok, we are done with it */
- mutex_unlock(&mdev->md_io_mutex);
-
- dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
- transactions, active_extents);
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
- return 1;
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+ return al_enr >>
+ /* bit to page */
+ ((PAGE_SHIFT + 3) -
+ /* al extent number to bit */
+ (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}
-static void atodb_endio(struct bio *bio, int error)
+static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
{
- struct drbd_atodb_wait *wc = bio->bi_private;
- struct drbd_conf *mdev = wc->mdev;
- struct page *page;
- int uptodate = bio_flagged(bio, BIO_UPTODATE);
-
- /* strange behavior of some lower level drivers...
- * fail the request by clearing the uptodate flag,
- * but do not return any error?! */
- if (!error && !uptodate)
- error = -EIO;
-
- drbd_chk_io_error(mdev, error, TRUE);
- if (error && wc->error == 0)
- wc->error = error;
-
- if (atomic_dec_and_test(&wc->count))
- complete(&wc->io_done);
-
- page = bio->bi_io_vec[0].bv_page;
- put_page(page);
- bio_put(bio);
- mdev->bm_writ_cnt++;
- put_ldev(mdev);
+ return rs_enr >>
+ /* bit to page */
+ ((PAGE_SHIFT + 3) -
+ /* resync extent number to bit */
+ (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}
-/* sector to word */
-#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
-
-/* activity log to on disk bitmap -- prepare bio unless that sector
- * is already covered by previously prepared bios */
-static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
- struct bio **bios,
- unsigned int enr,
- struct drbd_atodb_wait *wc) __must_hold(local)
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
{
- struct bio *bio;
- struct page *page;
- sector_t on_disk_sector;
- unsigned int page_offset = PAGE_SIZE;
- int offset;
- int i = 0;
- int err = -ENOMEM;
-
- /* We always write aligned, full 4k blocks,
- * so we can ignore the logical_block_size (for now) */
- enr &= ~7U;
- on_disk_sector = enr + mdev->ldev->md.md_offset
- + mdev->ldev->md.bm_offset;
-
- D_ASSERT(!(on_disk_sector & 7U));
-
- /* Check if that enr is already covered by an already created bio.
- * Caution, bios[] is not NULL terminated,
- * but only initialized to all NULL.
- * For completely scattered activity log,
- * the last invocation iterates over all bios,
- * and finds the last NULL entry.
- */
- while ((bio = bios[i])) {
- if (bio->bi_sector == on_disk_sector)
- return 0;
- i++;
- }
- /* bios[i] == NULL, the next not yet used slot */
-
- /* GFP_KERNEL, we are not in the write-out path */
- bio = bio_alloc(GFP_KERNEL, 1);
- if (bio == NULL)
- return -ENOMEM;
-
- if (i > 0) {
- const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
- page_offset = prev_bv->bv_offset + prev_bv->bv_len;
- page = prev_bv->bv_page;
- }
- if (page_offset == PAGE_SIZE) {
- page = alloc_page(__GFP_HIGHMEM);
- if (page == NULL)
- goto out_bio_put;
- page_offset = 0;
- } else {
- get_page(page);
- }
+ const unsigned int stripes = device->ldev->md.al_stripes;
+ const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
- offset = S2W(enr);
- drbd_bm_get_lel(mdev, offset,
- min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset),
- kmap(page) + page_offset);
- kunmap(page);
-
- bio->bi_private = wc;
- bio->bi_end_io = atodb_endio;
- bio->bi_bdev = mdev->ldev->md_bdev;
- bio->bi_sector = on_disk_sector;
-
- if (bio_add_page(bio, page, 4096, page_offset) != 4096)
- goto out_put_page;
-
- atomic_inc(&wc->count);
- /* we already know that we may do this...
- * get_ldev_if_state(mdev,D_ATTACHING);
- * just get the extra reference, so that the local_cnt reflects
- * the number of pending IO requests DRBD at its backing device.
- */
- atomic_inc(&mdev->local_cnt);
+ /* transaction number, modulo on-disk ring buffer wrap around */
+ unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
- bios[i] = bio;
+ /* ... to aligned 4k on disk block */
+ t = ((t % stripes) * stripe_size_4kB) + t/stripes;
- return 0;
+ /* ... to 512 byte sector in activity log */
+ t *= 8;
-out_put_page:
- err = -EINVAL;
- put_page(page);
-out_bio_put:
- bio_put(bio);
- return err;
+ /* ... plus offset to the on disk position */
+ return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
}
-/**
- * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
- * @mdev: DRBD device.
- *
- * Called when we detach (unconfigure) local storage,
- * or when we go from R_PRIMARY to R_SECONDARY role.
- */
-void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
+static int
+_al_write_transaction(struct drbd_device *device)
{
- int i, nr_elements;
- unsigned int enr;
- struct bio **bios;
- struct drbd_atodb_wait wc;
-
- ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
- return; /* sorry, I don't have any act_log etc... */
-
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
-
- nr_elements = mdev->act_log->nr_elements;
-
- /* GFP_KERNEL, we are not in anyone's write-out path */
- bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
- if (!bios)
- goto submit_one_by_one;
+ struct al_transaction_on_disk *buffer;
+ struct lc_element *e;
+ sector_t sector;
+ int i, mx;
+ unsigned extent_nr;
+ unsigned crc = 0;
+ int err = 0;
+
+ if (!get_ldev(device)) {
+ drbd_err(device, "disk is %s, cannot start al transaction\n",
+ drbd_disk_str(device->state.disk));
+ return -EIO;
+ }
- atomic_set(&wc.count, 0);
- init_completion(&wc.io_done);
- wc.mdev = mdev;
- wc.error = 0;
+ /* The bitmap write may have failed, causing a state change. */
+ if (device->state.disk < D_INCONSISTENT) {
+ drbd_err(device,
+ "disk is %s, cannot write al transaction\n",
+ drbd_disk_str(device->state.disk));
+ put_ldev(device);
+ return -EIO;
+ }
- for (i = 0; i < nr_elements; i++) {
- enr = lc_element_by_index(mdev->act_log, i)->lc_number;
- if (enr == LC_FREE)
- continue;
- /* next statement also does atomic_inc wc.count and local_cnt */
- if (atodb_prepare_unless_covered(mdev, bios,
- enr/AL_EXT_PER_BM_SECT,
- &wc))
- goto free_bios_submit_one_by_one;
+ buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */
+ if (!buffer) {
+ drbd_err(device, "disk failed while waiting for md_io buffer\n");
+ put_ldev(device);
+ return -ENODEV;
}
- /* unnecessary optimization? */
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
+ memset(buffer, 0, sizeof(*buffer));
+ buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
+ buffer->tr_number = cpu_to_be32(device->al_tr_number);
+
+ i = 0;
- /* all prepared, submit them */
- for (i = 0; i < nr_elements; i++) {
- if (bios[i] == NULL)
+ /* Even though no one can start to change this list
+ * once we set the LC_LOCKED -- from drbd_al_begin_io(),
+ * lc_try_lock_for_transaction() --, someone may still
+ * be in the process of changing it. */
+ spin_lock_irq(&device->al_lock);
+ list_for_each_entry(e, &device->act_log->to_be_changed, list) {
+ if (i == AL_UPDATES_PER_TRANSACTION) {
+ i++;
break;
- if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
- bios[i]->bi_rw = WRITE;
- bio_endio(bios[i], -EIO);
- } else {
- submit_bio(WRITE, bios[i]);
}
+ buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
+ buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
+ if (e->lc_number != LC_FREE)
+ drbd_bm_mark_for_writeout(device,
+ al_extent_to_bm_page(e->lc_number));
+ i++;
}
+ spin_unlock_irq(&device->al_lock);
+ BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
- drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
-
- /* always (try to) flush bitmap to stable storage */
- drbd_md_flush(mdev);
-
- /* In case we did not submit a single IO do not wait for
- * them to complete. ( Because we would wait forever here. )
- *
- * In case we had IOs and they are already complete, there
- * is not point in waiting anyways.
- * Therefore this if () ... */
- if (atomic_read(&wc.count))
- wait_for_completion(&wc.io_done);
+ buffer->n_updates = cpu_to_be16(i);
+ for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
+ buffer->update_slot_nr[i] = cpu_to_be16(-1);
+ buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
+ }
- put_ldev(mdev);
+ buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
+ buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
- kfree(bios);
- return;
+ mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
+ device->act_log->nr_elements - device->al_tr_cycle);
+ for (i = 0; i < mx; i++) {
+ unsigned idx = device->al_tr_cycle + i;
+ extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
+ buffer->context[i] = cpu_to_be32(extent_nr);
+ }
+ for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
+ buffer->context[i] = cpu_to_be32(LC_FREE);
- free_bios_submit_one_by_one:
- /* free everything by calling the endio callback directly. */
- for (i = 0; i < nr_elements && bios[i]; i++)
- bio_endio(bios[i], 0);
+ device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
+ if (device->al_tr_cycle >= device->act_log->nr_elements)
+ device->al_tr_cycle = 0;
- kfree(bios);
+ sector = al_tr_number_to_on_disk_sector(device);
- submit_one_by_one:
- dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
+ crc = crc32c(0, buffer, 4096);
+ buffer->crc32c = cpu_to_be32(crc);
- for (i = 0; i < mdev->act_log->nr_elements; i++) {
- enr = lc_element_by_index(mdev->act_log, i)->lc_number;
- if (enr == LC_FREE)
- continue;
- /* Really slow: if we have al-extents 16..19 active,
- * sector 4 will be written four times! Synchronous! */
- drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
+ if (drbd_bm_write_hinted(device))
+ err = -EIO;
+ else {
+ bool write_al_updates;
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+ if (write_al_updates) {
+ if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+ err = -EIO;
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ } else {
+ device->al_tr_number++;
+ device->al_writ_cnt++;
+ }
+ }
}
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
- put_ldev(mdev);
+ drbd_md_put_buffer(device);
+ put_ldev(device);
+
+ return err;
}
-/**
- * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
- * @mdev: DRBD device.
- */
-void drbd_al_apply_to_bm(struct drbd_conf *mdev)
-{
- unsigned int enr;
- unsigned long add = 0;
- char ppb[10];
- int i, tmp;
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+static int w_al_write_transaction(struct drbd_work *w, int unused)
+{
+ struct update_al_work *aw = container_of(w, struct update_al_work, w);
+ struct drbd_device *device = aw->device;
+ int err;
- for (i = 0; i < mdev->act_log->nr_elements; i++) {
- enr = lc_element_by_index(mdev->act_log, i)->lc_number;
- if (enr == LC_FREE)
- continue;
- tmp = drbd_bm_ALe_set_all(mdev, enr);
- dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
- add += tmp;
- }
+ err = _al_write_transaction(device);
+ aw->err = err;
+ complete(&aw->event);
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
+ return err != -EIO ? err : 0;
+}
- dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
- ppsize(ppb, Bit2KB(add)));
+/* Calls from worker context (see w_restart_disk_io()) need to write the
+ transaction directly. Others came through generic_make_request(),
+ those need to delegate it to the worker. */
+static int al_write_transaction(struct drbd_device *device, bool delegate)
+{
+ if (delegate) {
+ struct update_al_work al_work;
+ init_completion(&al_work.event);
+ al_work.w.cb = w_al_write_transaction;
+ al_work.device = device;
+ drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
+ &al_work.w);
+ wait_for_completion(&al_work.event);
+ return al_work.err;
+ } else
+ return _al_write_transaction(device);
}
-static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
+static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
{
int rv;
- spin_lock_irq(&mdev->al_lock);
+ spin_lock_irq(&device->al_lock);
rv = (al_ext->refcnt == 0);
if (likely(rv))
- lc_del(mdev->act_log, al_ext);
- spin_unlock_irq(&mdev->al_lock);
+ lc_del(device->act_log, al_ext);
+ spin_unlock_irq(&device->al_lock);
return rv;
}
/**
* drbd_al_shrink() - Removes all active extents form the activity log
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Removes all active extents form the activity log, waiting until
* the reference count of each entry dropped to 0 first, of course.
*
- * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
+ * You need to lock device->act_log with lc_try_lock() / lc_unlock()
*/
-void drbd_al_shrink(struct drbd_conf *mdev)
+void drbd_al_shrink(struct drbd_device *device)
{
struct lc_element *al_ext;
int i;
- D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
+ D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags));
- for (i = 0; i < mdev->act_log->nr_elements; i++) {
- al_ext = lc_element_by_index(mdev->act_log, i);
+ for (i = 0; i < device->act_log->nr_elements; i++) {
+ al_ext = lc_element_by_index(device->act_log, i);
if (al_ext->lc_number == LC_FREE)
continue;
- wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
+ wait_event(device->al_wait, _try_lc_del(device, al_ext));
}
- wake_up(&mdev->al_wait);
+ wake_up(&device->al_wait);
}
-static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+int drbd_initialize_al(struct drbd_device *device, void *buffer)
+{
+ struct al_transaction_on_disk *al = buffer;
+ struct drbd_md *md = &device->ldev->md;
+ sector_t al_base = md->md_offset + md->al_offset;
+ int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
+ int i;
+
+ memset(al, 0, 4096);
+ al->magic = cpu_to_be32(DRBD_AL_MAGIC);
+ al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
+ al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
+
+ for (i = 0; i < al_size_4k; i++) {
+ int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int w_update_odbm(struct drbd_work *w, int unused)
{
struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+ struct drbd_device *device = udw->device;
+ struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
- if (!get_ldev(mdev)) {
+ if (!get_ldev(device)) {
if (__ratelimit(&drbd_ratelimit_state))
- dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
+ drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n");
kfree(udw);
- return 1;
+ return 0;
}
- drbd_bm_write_sect(mdev, udw->enr);
- put_ldev(mdev);
+ drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr));
+ put_ldev(device);
kfree(udw);
- if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
- switch (mdev->state.conn) {
+ if (drbd_bm_total_weight(device) <= device->rs_failed) {
+ switch (device->state.conn) {
case C_SYNC_SOURCE: case C_SYNC_TARGET:
case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
- drbd_resync_finished(mdev);
+ drbd_resync_finished(device);
default:
/* nothing to do */
break;
}
}
- drbd_bcast_sync_progress(mdev);
+ drbd_bcast_event(device, &sib);
- return 1;
+ return 0;
}
@@ -838,7 +722,7 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
*
* TODO will be obsoleted once we have a caching lru of the on disk bitmap
*/
-static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
+static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector,
int count, int success)
{
struct lc_element *e;
@@ -846,13 +730,13 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
unsigned int enr;
- D_ASSERT(atomic_read(&mdev->local_cnt));
+ D_ASSERT(device, atomic_read(&device->local_cnt));
/* I simply assume that a sector/size pair never crosses
* a 16 MB extent border. (Currently this is true...) */
enr = BM_SECT_TO_EXT(sector);
- e = lc_get(mdev->resync, enr);
+ e = lc_get(device->resync, enr);
if (e) {
struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
if (ext->lce.lc_number == enr) {
@@ -861,16 +745,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
else
ext->rs_failed += count;
if (ext->rs_left < ext->rs_failed) {
- dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
- "rs_failed=%d count=%d\n",
+ drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d "
+ "rs_failed=%d count=%d cstate=%s\n",
(unsigned long long)sector,
ext->lce.lc_number, ext->rs_left,
- ext->rs_failed, count);
- dump_stack();
-
- lc_put(mdev->resync, &ext->lce);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return;
+ ext->rs_failed, count,
+ drbd_conn_str(device->state.conn));
+
+ /* We don't expect to be able to clear more bits
+ * than have been set when we originally counted
+ * the set bits to cache that value in ext->rs_left.
+ * Whatever the reason (disconnect during resync,
+ * delayed local completion of an application write),
+ * try to fix it up by recounting here. */
+ ext->rs_left = drbd_bm_e_weight(device, enr);
}
} else {
/* Normally this element should be in the cache,
@@ -879,25 +767,26 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
* But maybe an application write finished, and we set
* something outside the resync lru_cache in sync.
*/
- int rs_left = drbd_bm_e_weight(mdev, enr);
+ int rs_left = drbd_bm_e_weight(device, enr);
if (ext->flags != 0) {
- dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
+ drbd_warn(device, "changing resync lce: %d[%u;%02lx]"
" -> %d[%u;00]\n",
ext->lce.lc_number, ext->rs_left,
ext->flags, enr, rs_left);
ext->flags = 0;
}
if (ext->rs_failed) {
- dev_warn(DEV, "Kicking resync_lru element enr=%u "
+ drbd_warn(device, "Kicking resync_lru element enr=%u "
"out with rs_failed=%d\n",
ext->lce.lc_number, ext->rs_failed);
- set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
}
ext->rs_left = rs_left;
ext->rs_failed = success ? 0 : count;
- lc_changed(mdev->resync, &ext->lce);
+ /* we don't keep a persistent log of the resync lru,
+ * we can commit any change right away. */
+ lc_committed(device->resync);
}
- lc_put(mdev->resync, &ext->lce);
+ lc_put(device->resync, &ext->lce);
/* no race, we are within the al_lock! */
if (ext->rs_left == ext->rs_failed) {
@@ -907,17 +796,34 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
if (udw) {
udw->enr = ext->lce.lc_number;
udw->w.cb = w_update_odbm;
- drbd_queue_work_front(&mdev->data.work, &udw->w);
+ udw->device = device;
+ drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
+ &udw->w);
} else {
- dev_warn(DEV, "Could not kmalloc an udw\n");
- set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+ drbd_warn(device, "Could not kmalloc an udw\n");
}
}
} else {
- dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
- mdev->resync_locked,
- mdev->resync->nr_elements,
- mdev->resync->flags);
+ drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
+ device->resync_locked,
+ device->resync->nr_elements,
+ device->resync->flags);
+ }
+}
+
+void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
+{
+ unsigned long now = jiffies;
+ unsigned long last = device->rs_mark_time[device->rs_last_mark];
+ int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+ if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+ if (device->rs_mark_left[device->rs_last_mark] != still_to_go &&
+ device->state.conn != C_PAUSED_SYNC_T &&
+ device->state.conn != C_PAUSED_SYNC_S) {
+ device->rs_mark_time[next] = now;
+ device->rs_mark_left[next] = still_to_go;
+ device->rs_last_mark = next;
+ }
}
}
@@ -928,7 +834,7 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
* called by worker on C_SYNC_TARGET and receiver on SyncSource.
*
*/
-void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
+void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
const char *file, const unsigned int line)
{
/* Is called from worker and receiver context _only_ */
@@ -938,16 +844,22 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
int wake_up = 0;
unsigned long flags;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
- dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+ drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
}
- nr_sectors = drbd_get_capacity(mdev->this_bdev);
+
+ if (!get_ldev(device))
+ return; /* no disk, no metadata, no bitmap to clear bits in */
+
+ nr_sectors = drbd_get_capacity(device->this_bdev);
esector = sector + (size >> 9) - 1;
- ERR_IF(sector >= nr_sectors) return;
- ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+ if (!expect(sector < nr_sectors))
+ goto out;
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
@@ -955,7 +867,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
* round up start sector, round down end sector. we make sure we only
* clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
if (unlikely(esector < BM_SECT_PER_BIT-1))
- return;
+ goto out;
if (unlikely(esector == (nr_sectors-1)))
ebnr = lbnr;
else
@@ -963,74 +875,65 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
if (sbnr > ebnr)
- return;
+ goto out;
/*
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
- count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
- if (count && get_ldev(mdev)) {
- unsigned long now = jiffies;
- unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
- int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
- if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
- unsigned long tw = drbd_bm_total_weight(mdev);
- if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
- mdev->state.conn != C_PAUSED_SYNC_T &&
- mdev->state.conn != C_PAUSED_SYNC_S) {
- mdev->rs_mark_time[next] = now;
- mdev->rs_mark_left[next] = tw;
- mdev->rs_last_mark = next;
- }
- }
- spin_lock_irqsave(&mdev->al_lock, flags);
- drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
- spin_unlock_irqrestore(&mdev->al_lock, flags);
+ count = drbd_bm_clear_bits(device, sbnr, ebnr);
+ if (count) {
+ drbd_advance_rs_marks(device, drbd_bm_total_weight(device));
+ spin_lock_irqsave(&device->al_lock, flags);
+ drbd_try_clear_on_disk_bm(device, sector, count, true);
+ spin_unlock_irqrestore(&device->al_lock, flags);
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
- put_ldev(mdev);
}
+out:
+ put_ldev(device);
if (wake_up)
- wake_up(&mdev->al_wait);
+ wake_up(&device->al_wait);
}
/*
* this is intended to set one request worth of data out of sync.
* affects at least 1 bit,
- * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
+ * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
*
* called by tl_clear and drbd_send_dblock (==drbd_make_request).
* so this can be _any_ process.
*/
-void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
+int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size,
const char *file, const unsigned int line)
{
- unsigned long sbnr, ebnr, lbnr, flags;
+ unsigned long sbnr, ebnr, flags;
sector_t esector, nr_sectors;
- unsigned int enr, count;
+ unsigned int enr, count = 0;
struct lc_element *e;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
- dev_err(DEV, "sector: %llus, size: %d\n",
+ /* this should be an empty REQ_FLUSH */
+ if (size == 0)
+ return 0;
+
+ if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+ drbd_err(device, "sector: %llus, size: %d\n",
(unsigned long long)sector, size);
- return;
+ return 0;
}
- if (!get_ldev(mdev))
- return; /* no disk, no metadata, no bitmap to set bits in */
+ if (!get_ldev(device))
+ return 0; /* no disk, no metadata, no bitmap to set bits in */
- nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ nr_sectors = drbd_get_capacity(device->this_bdev);
esector = sector + (size >> 9) - 1;
- ERR_IF(sector >= nr_sectors)
+ if (!expect(sector < nr_sectors))
goto out;
- ERR_IF(esector >= nr_sectors)
- esector = (nr_sectors-1);
-
- lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
/* we set it out of sync,
* we do not need to round anything here */
@@ -1039,118 +942,117 @@ void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
/* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors. */
- spin_lock_irqsave(&mdev->al_lock, flags);
- count = drbd_bm_set_bits(mdev, sbnr, ebnr);
+ spin_lock_irqsave(&device->al_lock, flags);
+ count = drbd_bm_set_bits(device, sbnr, ebnr);
enr = BM_SECT_TO_EXT(sector);
- e = lc_find(mdev->resync, enr);
+ e = lc_find(device->resync, enr);
if (e)
lc_entry(e, struct bm_extent, lce)->rs_left += count;
- spin_unlock_irqrestore(&mdev->al_lock, flags);
+ spin_unlock_irqrestore(&device->al_lock, flags);
out:
- put_ldev(mdev);
+ put_ldev(device);
+
+ return count;
}
static
-struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
+struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr)
{
struct lc_element *e;
struct bm_extent *bm_ext;
int wakeup = 0;
unsigned long rs_flags;
- spin_lock_irq(&mdev->al_lock);
- if (mdev->resync_locked > mdev->resync->nr_elements/2) {
- spin_unlock_irq(&mdev->al_lock);
+ spin_lock_irq(&device->al_lock);
+ if (device->resync_locked > device->resync->nr_elements/2) {
+ spin_unlock_irq(&device->al_lock);
return NULL;
}
- e = lc_get(mdev->resync, enr);
+ e = lc_get(device->resync, enr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (bm_ext) {
if (bm_ext->lce.lc_number != enr) {
- bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+ bm_ext->rs_left = drbd_bm_e_weight(device, enr);
bm_ext->rs_failed = 0;
- lc_changed(mdev->resync, &bm_ext->lce);
+ lc_committed(device->resync);
wakeup = 1;
}
if (bm_ext->lce.refcnt == 1)
- mdev->resync_locked++;
+ device->resync_locked++;
set_bit(BME_NO_WRITES, &bm_ext->flags);
}
- rs_flags = mdev->resync->flags;
- spin_unlock_irq(&mdev->al_lock);
+ rs_flags = device->resync->flags;
+ spin_unlock_irq(&device->al_lock);
if (wakeup)
- wake_up(&mdev->al_wait);
+ wake_up(&device->al_wait);
if (!bm_ext) {
if (rs_flags & LC_STARVING)
- dev_warn(DEV, "Have to wait for element"
+ drbd_warn(device, "Have to wait for element"
" (resync LRU too small?)\n");
- BUG_ON(rs_flags & LC_DIRTY);
+ BUG_ON(rs_flags & LC_LOCKED);
}
return bm_ext;
}
-static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
+static int _is_in_al(struct drbd_device *device, unsigned int enr)
{
- struct lc_element *al_ext;
- int rv = 0;
+ int rv;
- spin_lock_irq(&mdev->al_lock);
- if (unlikely(enr == mdev->act_log->new_number))
- rv = 1;
- else {
- al_ext = lc_find(mdev->act_log, enr);
- if (al_ext) {
- if (al_ext->refcnt)
- rv = 1;
- }
- }
- spin_unlock_irq(&mdev->al_lock);
+ spin_lock_irq(&device->al_lock);
+ rv = lc_is_used(device->act_log, enr);
+ spin_unlock_irq(&device->al_lock);
- /*
- if (unlikely(rv)) {
- dev_info(DEV, "Delaying sync read until app's write is done\n");
- }
- */
return rv;
}
/**
* drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @sector: The sector number.
*
* This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
*/
-int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
{
unsigned int enr = BM_SECT_TO_EXT(sector);
struct bm_extent *bm_ext;
int i, sig;
+ bool sa;
- sig = wait_event_interruptible(mdev->al_wait,
- (bm_ext = _bme_get(mdev, enr)));
+retry:
+ sig = wait_event_interruptible(device->al_wait,
+ (bm_ext = _bme_get(device, enr)));
if (sig)
return -EINTR;
if (test_bit(BME_LOCKED, &bm_ext->flags))
return 0;
+ /* step aside only while we are above c-min-rate; unless disabled. */
+ sa = drbd_rs_c_min_rate_throttle(device);
+
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
- sig = wait_event_interruptible(mdev->al_wait,
- !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
- if (sig) {
- spin_lock_irq(&mdev->al_lock);
- if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
- clear_bit(BME_NO_WRITES, &bm_ext->flags);
- mdev->resync_locked--;
- wake_up(&mdev->al_wait);
+ sig = wait_event_interruptible(device->al_wait,
+ !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) ||
+ (sa && test_bit(BME_PRIORITY, &bm_ext->flags)));
+
+ if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) {
+ spin_lock_irq(&device->al_lock);
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
+ device->resync_locked--;
+ wake_up(&device->al_wait);
}
- spin_unlock_irq(&mdev->al_lock);
- return -EINTR;
+ spin_unlock_irq(&device->al_lock);
+ if (sig)
+ return -EINTR;
+ if (schedule_timeout_interruptible(HZ/10))
+ return -EINTR;
+ goto retry;
}
}
set_bit(BME_LOCKED, &bm_ext->flags);
@@ -1159,14 +1061,14 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
/**
* drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @sector: The sector number.
*
* Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
* tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
* if there is still application IO going on in this area.
*/
-int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
{
unsigned int enr = BM_SECT_TO_EXT(sector);
const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
@@ -1174,8 +1076,8 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
struct bm_extent *bm_ext;
int i;
- spin_lock_irq(&mdev->al_lock);
- if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
+ spin_lock_irq(&device->al_lock);
+ if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
/* in case you have very heavy scattered io, it may
* stall the syncer undefined if we give up the ref count
* when we try again and requeue.
@@ -1189,195 +1091,193 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
* the lc_put here...
* we also have to wake_up
*/
- e = lc_find(mdev->resync, mdev->resync_wenr);
+ e = lc_find(device->resync, device->resync_wenr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (bm_ext) {
- D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
- D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
clear_bit(BME_NO_WRITES, &bm_ext->flags);
- mdev->resync_wenr = LC_FREE;
- if (lc_put(mdev->resync, &bm_ext->lce) == 0)
- mdev->resync_locked--;
- wake_up(&mdev->al_wait);
+ device->resync_wenr = LC_FREE;
+ if (lc_put(device->resync, &bm_ext->lce) == 0)
+ device->resync_locked--;
+ wake_up(&device->al_wait);
} else {
- dev_alert(DEV, "LOGIC BUG\n");
+ drbd_alert(device, "LOGIC BUG\n");
}
}
/* TRY. */
- e = lc_try_get(mdev->resync, enr);
+ e = lc_try_get(device->resync, enr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (bm_ext) {
if (test_bit(BME_LOCKED, &bm_ext->flags))
goto proceed;
if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
- mdev->resync_locked++;
+ device->resync_locked++;
} else {
/* we did set the BME_NO_WRITES,
* but then could not set BME_LOCKED,
* so we tried again.
* drop the extra reference. */
bm_ext->lce.refcnt--;
- D_ASSERT(bm_ext->lce.refcnt > 0);
+ D_ASSERT(device, bm_ext->lce.refcnt > 0);
}
goto check_al;
} else {
/* do we rather want to try later? */
- if (mdev->resync_locked > mdev->resync->nr_elements-3)
+ if (device->resync_locked > device->resync->nr_elements-3)
goto try_again;
/* Do or do not. There is no try. -- Yoda */
- e = lc_get(mdev->resync, enr);
+ e = lc_get(device->resync, enr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (!bm_ext) {
- const unsigned long rs_flags = mdev->resync->flags;
+ const unsigned long rs_flags = device->resync->flags;
if (rs_flags & LC_STARVING)
- dev_warn(DEV, "Have to wait for element"
+ drbd_warn(device, "Have to wait for element"
" (resync LRU too small?)\n");
- BUG_ON(rs_flags & LC_DIRTY);
+ BUG_ON(rs_flags & LC_LOCKED);
goto try_again;
}
if (bm_ext->lce.lc_number != enr) {
- bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+ bm_ext->rs_left = drbd_bm_e_weight(device, enr);
bm_ext->rs_failed = 0;
- lc_changed(mdev->resync, &bm_ext->lce);
- wake_up(&mdev->al_wait);
- D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+ lc_committed(device->resync);
+ wake_up(&device->al_wait);
+ D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0);
}
set_bit(BME_NO_WRITES, &bm_ext->flags);
- D_ASSERT(bm_ext->lce.refcnt == 1);
- mdev->resync_locked++;
+ D_ASSERT(device, bm_ext->lce.refcnt == 1);
+ device->resync_locked++;
goto check_al;
}
check_al:
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
- if (unlikely(al_enr+i == mdev->act_log->new_number))
- goto try_again;
- if (lc_is_used(mdev->act_log, al_enr+i))
+ if (lc_is_used(device->act_log, al_enr+i))
goto try_again;
}
set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
- mdev->resync_wenr = LC_FREE;
- spin_unlock_irq(&mdev->al_lock);
+ device->resync_wenr = LC_FREE;
+ spin_unlock_irq(&device->al_lock);
return 0;
try_again:
if (bm_ext)
- mdev->resync_wenr = enr;
- spin_unlock_irq(&mdev->al_lock);
+ device->resync_wenr = enr;
+ spin_unlock_irq(&device->al_lock);
return -EAGAIN;
}
-void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
+void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
{
unsigned int enr = BM_SECT_TO_EXT(sector);
struct lc_element *e;
struct bm_extent *bm_ext;
unsigned long flags;
- spin_lock_irqsave(&mdev->al_lock, flags);
- e = lc_find(mdev->resync, enr);
+ spin_lock_irqsave(&device->al_lock, flags);
+ e = lc_find(device->resync, enr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (!bm_ext) {
- spin_unlock_irqrestore(&mdev->al_lock, flags);
+ spin_unlock_irqrestore(&device->al_lock, flags);
if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
+ drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
return;
}
if (bm_ext->lce.refcnt == 0) {
- spin_unlock_irqrestore(&mdev->al_lock, flags);
- dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, "
"but refcnt is 0!?\n",
(unsigned long long)sector, enr);
return;
}
- if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
- clear_bit(BME_LOCKED, &bm_ext->flags);
- clear_bit(BME_NO_WRITES, &bm_ext->flags);
- mdev->resync_locked--;
- wake_up(&mdev->al_wait);
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
+ device->resync_locked--;
+ wake_up(&device->al_wait);
}
- spin_unlock_irqrestore(&mdev->al_lock, flags);
+ spin_unlock_irqrestore(&device->al_lock, flags);
}
/**
* drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
- * @mdev: DRBD device.
+ * @device: DRBD device.
*/
-void drbd_rs_cancel_all(struct drbd_conf *mdev)
+void drbd_rs_cancel_all(struct drbd_device *device)
{
- spin_lock_irq(&mdev->al_lock);
+ spin_lock_irq(&device->al_lock);
- if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
- lc_reset(mdev->resync);
- put_ldev(mdev);
+ if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */
+ lc_reset(device->resync);
+ put_ldev(device);
}
- mdev->resync_locked = 0;
- mdev->resync_wenr = LC_FREE;
- spin_unlock_irq(&mdev->al_lock);
- wake_up(&mdev->al_wait);
+ device->resync_locked = 0;
+ device->resync_wenr = LC_FREE;
+ spin_unlock_irq(&device->al_lock);
+ wake_up(&device->al_wait);
}
/**
* drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Returns 0 upon success, -EAGAIN if at least one reference count was
* not zero.
*/
-int drbd_rs_del_all(struct drbd_conf *mdev)
+int drbd_rs_del_all(struct drbd_device *device)
{
struct lc_element *e;
struct bm_extent *bm_ext;
int i;
- spin_lock_irq(&mdev->al_lock);
+ spin_lock_irq(&device->al_lock);
- if (get_ldev_if_state(mdev, D_FAILED)) {
+ if (get_ldev_if_state(device, D_FAILED)) {
/* ok, ->resync is there. */
- for (i = 0; i < mdev->resync->nr_elements; i++) {
- e = lc_element_by_index(mdev->resync, i);
+ for (i = 0; i < device->resync->nr_elements; i++) {
+ e = lc_element_by_index(device->resync, i);
bm_ext = lc_entry(e, struct bm_extent, lce);
if (bm_ext->lce.lc_number == LC_FREE)
continue;
- if (bm_ext->lce.lc_number == mdev->resync_wenr) {
- dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
+ if (bm_ext->lce.lc_number == device->resync_wenr) {
+ drbd_info(device, "dropping %u in drbd_rs_del_all, apparently"
" got 'synced' by application io\n",
- mdev->resync_wenr);
- D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
- D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+ device->resync_wenr);
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
clear_bit(BME_NO_WRITES, &bm_ext->flags);
- mdev->resync_wenr = LC_FREE;
- lc_put(mdev->resync, &bm_ext->lce);
+ device->resync_wenr = LC_FREE;
+ lc_put(device->resync, &bm_ext->lce);
}
if (bm_ext->lce.refcnt != 0) {
- dev_info(DEV, "Retrying drbd_rs_del_all() later. "
+ drbd_info(device, "Retrying drbd_rs_del_all() later. "
"refcnt=%d\n", bm_ext->lce.refcnt);
- put_ldev(mdev);
- spin_unlock_irq(&mdev->al_lock);
+ put_ldev(device);
+ spin_unlock_irq(&device->al_lock);
return -EAGAIN;
}
- D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
- D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
- lc_del(mdev->resync, &bm_ext->lce);
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags));
+ lc_del(device->resync, &bm_ext->lce);
}
- D_ASSERT(mdev->resync->used == 0);
- put_ldev(mdev);
+ D_ASSERT(device, device->resync->used == 0);
+ put_ldev(device);
}
- spin_unlock_irq(&mdev->al_lock);
+ spin_unlock_irq(&device->al_lock);
+ wake_up(&device->al_wait);
return 0;
}
/**
* drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @sector: The sector number.
* @size: Size of failed IO operation, in byte.
*/
-void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
+void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
{
/* Is called from worker and receiver context _only_ */
unsigned long sbnr, ebnr, lbnr;
@@ -1385,16 +1285,18 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
sector_t esector, nr_sectors;
int wake_up = 0;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
- dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+ drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
}
- nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ nr_sectors = drbd_get_capacity(device->this_bdev);
esector = sector + (size >> 9) - 1;
- ERR_IF(sector >= nr_sectors) return;
- ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+ if (!expect(sector < nr_sectors))
+ return;
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
@@ -1416,21 +1318,21 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
- spin_lock_irq(&mdev->al_lock);
- count = drbd_bm_count_bits(mdev, sbnr, ebnr);
+ spin_lock_irq(&device->al_lock);
+ count = drbd_bm_count_bits(device, sbnr, ebnr);
if (count) {
- mdev->rs_failed += count;
+ device->rs_failed += count;
- if (get_ldev(mdev)) {
- drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
- put_ldev(mdev);
+ if (get_ldev(device)) {
+ drbd_try_clear_on_disk_bm(device, sector, count, false);
+ put_ldev(device);
}
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
}
- spin_unlock_irq(&mdev->al_lock);
+ spin_unlock_irq(&device->al_lock);
if (wake_up)
- wake_up(&mdev->al_wait);
+ wake_up(&device->al_wait);
}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index fd42832f785..1aa29f8fdfe 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -28,18 +28,56 @@
#include <linux/drbd.h>
#include <linux/slab.h>
#include <asm/kmap_types.h>
+
#include "drbd_int.h"
+
/* OPAQUE outside this file!
* interface defined in drbd_int.h
* convention:
* function name drbd_bm_... => used elsewhere, "public".
* function name bm_... => internal to implementation, "private".
+ */
+
+
+/*
+ * LIMITATIONS:
+ * We want to support >= peta byte of backend storage, while for now still using
+ * a granularity of one bit per 4KiB of storage.
+ * 1 << 50 bytes backend storage (1 PiB)
+ * 1 << (50 - 12) bits needed
+ * 38 --> we need u64 to index and count bits
+ * 1 << (38 - 3) bitmap bytes needed
+ * 35 --> we still need u64 to index and count bytes
+ * (that's 32 GiB of bitmap for 1 PiB storage)
+ * 1 << (35 - 2) 32bit longs needed
+ * 33 --> we'd even need u64 to index and count 32bit long words.
+ * 1 << (35 - 3) 64bit longs needed
+ * 32 --> we could get away with a 32bit unsigned int to index and count
+ * 64bit long words, but I rather stay with unsigned long for now.
+ * We probably should neither count nor point to bytes or long words
+ * directly, but either by bitnumber, or by page index and offset.
+ * 1 << (35 - 12)
+ * 22 --> we need that much 4KiB pages of bitmap.
+ * 1 << (22 + 3) --> on a 64bit arch,
+ * we need 32 MiB to store the array of page pointers.
+ *
+ * Because I'm lazy, and because the resulting patch was too large, too ugly
+ * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
+ * (1 << 32) bits * 4k storage.
+ *
- * Note that since find_first_bit returns int, at the current granularity of
- * the bitmap (4KB per byte), this implementation "only" supports up to
- * 1<<(32+12) == 16 TB...
+ * bitmap storage and IO:
+ * Bitmap is stored little endian on disk, and is kept little endian in
+ * core memory. Currently we still hold the full bitmap in core as long
+ * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
+ * seems excessive.
+ *
+ * We plan to reduce the amount of in-core bitmap pages by paging them in
+ * and out against their on-disk location as necessary, but need to make
+ * sure we don't cause too much meta data IO, and must not deadlock in
+ * tight memory situations. This needs some more work.
*/
/*
@@ -55,13 +93,9 @@
struct drbd_bitmap {
struct page **bm_pages;
spinlock_t bm_lock;
- /* WARNING unsigned long bm_*:
- * 32bit number of bit offset is just enough for 512 MB bitmap.
- * it will blow up if we make the bitmap bigger...
- * not that it makes much sense to have a bitmap that large,
- * rather change the granularity to 16k or 64k or something.
- * (that implies other problems, however...)
- */
+
+ /* see LIMITATIONS: above */
+
unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
unsigned long bm_bits;
size_t bm_words;
@@ -69,118 +103,226 @@ struct drbd_bitmap {
sector_t bm_dev_capacity;
struct mutex bm_change; /* serializes resize operations */
- atomic_t bm_async_io;
- wait_queue_head_t bm_io_wait;
+ wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
- unsigned long bm_flags;
+ enum bm_flag bm_flags;
/* debugging aid, in case we are still racy somewhere */
char *bm_why;
struct task_struct *bm_task;
};
-/* definition of bits in bm_flags */
-#define BM_LOCKED 0
-#define BM_MD_IO_ERROR 1
-#define BM_P_VMALLOCED 2
-
-static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
- unsigned long e, int val, const enum km_type km);
-
-static int bm_is_locked(struct drbd_bitmap *b)
-{
- return test_bit(BM_LOCKED, &b->bm_flags);
-}
-
#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
-static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
+static void __bm_print_lock_info(struct drbd_device *device, const char *func)
{
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
if (!__ratelimit(&drbd_ratelimit_state))
return;
- dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
- current == mdev->receiver.task ? "receiver" :
- current == mdev->asender.task ? "asender" :
- current == mdev->worker.task ? "worker" : current->comm,
- func, b->bm_why ?: "?",
- b->bm_task == mdev->receiver.task ? "receiver" :
- b->bm_task == mdev->asender.task ? "asender" :
- b->bm_task == mdev->worker.task ? "worker" : "?");
+ drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
+ current->comm, task_pid_nr(current),
+ func, b->bm_why ?: "?",
+ b->bm_task->comm, task_pid_nr(b->bm_task));
}
-void drbd_bm_lock(struct drbd_conf *mdev, char *why)
+void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags)
{
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
int trylock_failed;
if (!b) {
- dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
+ drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n");
return;
}
trylock_failed = !mutex_trylock(&b->bm_change);
if (trylock_failed) {
- dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
- current == mdev->receiver.task ? "receiver" :
- current == mdev->asender.task ? "asender" :
- current == mdev->worker.task ? "worker" : current->comm,
- why, b->bm_why ?: "?",
- b->bm_task == mdev->receiver.task ? "receiver" :
- b->bm_task == mdev->asender.task ? "asender" :
- b->bm_task == mdev->worker.task ? "worker" : "?");
+ drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n",
+ current->comm, task_pid_nr(current),
+ why, b->bm_why ?: "?",
+ b->bm_task->comm, task_pid_nr(b->bm_task));
mutex_lock(&b->bm_change);
}
- if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
- dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
+ if (BM_LOCKED_MASK & b->bm_flags)
+ drbd_err(device, "FIXME bitmap already locked in bm_lock\n");
+ b->bm_flags |= flags & BM_LOCKED_MASK;
b->bm_why = why;
b->bm_task = current;
}
-void drbd_bm_unlock(struct drbd_conf *mdev)
+void drbd_bm_unlock(struct drbd_device *device)
{
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
if (!b) {
- dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
+ drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n");
return;
}
- if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
- dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
+ if (!(BM_LOCKED_MASK & device->bitmap->bm_flags))
+ drbd_err(device, "FIXME bitmap not locked in bm_unlock\n");
+ b->bm_flags &= ~BM_LOCKED_MASK;
b->bm_why = NULL;
b->bm_task = NULL;
mutex_unlock(&b->bm_change);
}
-/* word offset to long pointer */
-static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
+/* we store some "meta" info about our pages in page->private */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ * 1<<38 bits,
+ * 1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK ((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK 31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR 30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT 29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * we if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT 28
+/* pages marked with this "HINT" will be considered for writeout
+ * on activity log transactions */
+#define BM_PAGE_HINT_WRITEOUT 27
+
+/* store_page_idx uses non-atomic assignment. It is only used directly after
+ * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
+ * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
+ * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
+ * requires it all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
+{
+ BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
+ set_page_private(page, idx);
+}
+
+static unsigned long bm_page_to_idx(struct page *page)
+{
+ return page_private(page) & BM_PAGE_IDX_MASK;
+}
+
+/* As is very unlikely that the same page is under IO from more than one
+ * context, we can get away with a bit per page and one wait queue per bitmap.
+ */
+static void bm_page_lock_io(struct drbd_device *device, int page_nr)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ void *addr = &page_private(b->bm_pages[page_nr]);
+ wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
+}
+
+static void bm_page_unlock_io(struct drbd_device *device, int page_nr)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ void *addr = &page_private(b->bm_pages[page_nr]);
+ clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
+ wake_up(&device->bitmap->bm_io_wait);
+}
+
+/* set _before_ submit_io, so it may be reset due to being changed
+ * while this page is in flight... will get submitted later again */
+static void bm_set_page_unchanged(struct page *page)
+{
+ /* use cmpxchg? */
+ clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+ clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static void bm_set_page_need_writeout(struct page *page)
+{
+ set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+}
+
+/**
+ * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
+ * @device: DRBD device.
+ * @page_nr: the bitmap page to mark with the "hint" flag
+ *
+ * From within an activity log transaction, we mark a few pages with these
+ * hints, then call drbd_bm_write_hinted(), which will only write out changed
+ * pages which are flagged with this mark.
+ */
+void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
{
struct page *page;
- unsigned long page_nr;
+ if (page_nr >= device->bitmap->bm_number_of_pages) {
+ drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
+ page_nr, (int)device->bitmap->bm_number_of_pages);
+ return;
+ }
+ page = device->bitmap->bm_pages[page_nr];
+ set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_unchanged(struct page *page)
+{
+ volatile const unsigned long *addr = &page_private(page);
+ return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
+}
+
+static void bm_set_page_io_err(struct page *page)
+{
+ set_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_clear_page_io_err(struct page *page)
+{
+ clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+ set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+ return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+/* on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
- page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+ unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
+ BUG_ON(page_nr >= b->bm_number_of_pages);
+ return page_nr;
+}
+
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+ /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
+ unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
BUG_ON(page_nr >= b->bm_number_of_pages);
- page = b->bm_pages[page_nr];
+ return page_nr;
+}
- return (unsigned long *) kmap_atomic(page, km);
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+ struct page *page = b->bm_pages[idx];
+ return (unsigned long *) kmap_atomic(page);
}
-static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
+static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
{
- return __bm_map_paddr(b, offset, KM_IRQ1);
+ return __bm_map_pidx(b, idx);
}
-static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
+static void __bm_unmap(unsigned long *p_addr)
{
- kunmap_atomic(p_addr, km);
+ kunmap_atomic(p_addr);
};
static void bm_unmap(unsigned long *p_addr)
{
- return __bm_unmap(p_addr, KM_IRQ1);
+ return __bm_unmap(p_addr);
}
/* long word offset of _bitmap_ sector */
@@ -188,7 +330,7 @@ static void bm_unmap(unsigned long *p_addr)
/* word offset from start of bitmap to word number _in_page_
* modulo longs per page
#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
- hm, well, Philipp thinks gcc might not optimze the % into & (... - 1)
+ hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
so do it explicitly:
*/
#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
@@ -198,10 +340,11 @@ static void bm_unmap(unsigned long *p_addr)
/*
* actually most functions herein should take a struct drbd_bitmap*, not a
- * struct drbd_conf*, but for the debug macros I like to have the mdev around
+ * struct drbd_device*, but for the debug macros I like to have the device around
* to be able to report device specific.
*/
+
static void bm_free_pages(struct page **pages, unsigned long number)
{
unsigned long i;
@@ -245,30 +388,34 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
return old_pages;
/* Trying kmalloc first, falling back to vmalloc.
- * GFP_KERNEL is ok, as this is done when a lower level disk is
- * "attached" to the drbd. Context is receiver thread or cqueue
- * thread. As we have no disk yet, we are not in the IO path,
- * not even the IO path of the peer. */
+ * GFP_NOIO, as this is called while drbd IO is "suspended",
+ * and during resize or attach on diskless Primary,
+ * we must not block on IO to ourselves.
+ * Context is receiver thread or dmsetup. */
bytes = sizeof(struct page *)*want;
- new_pages = kmalloc(bytes, GFP_KERNEL);
+ new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
if (!new_pages) {
- new_pages = vmalloc(bytes);
+ new_pages = __vmalloc(bytes,
+ GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
if (!new_pages)
return NULL;
vmalloced = 1;
}
- memset(new_pages, 0, bytes);
if (want >= have) {
for (i = 0; i < have; i++)
new_pages[i] = old_pages[i];
for (; i < want; i++) {
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
if (!page) {
bm_free_pages(new_pages + have, i - have);
bm_vk_free(new_pages, vmalloced);
return NULL;
}
+ /* we want to know which page it is
+ * from the endio handlers */
+ bm_store_page_idx(page, i);
new_pages[i] = page;
}
} else {
@@ -280,20 +427,20 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
}
if (vmalloced)
- set_bit(BM_P_VMALLOCED, &b->bm_flags);
+ b->bm_flags |= BM_P_VMALLOCED;
else
- clear_bit(BM_P_VMALLOCED, &b->bm_flags);
+ b->bm_flags &= ~BM_P_VMALLOCED;
return new_pages;
}
/*
* called on driver init only. TODO call when a device is created.
- * allocates the drbd_bitmap, and stores it in mdev->bitmap.
+ * allocates the drbd_bitmap, and stores it in device->bitmap.
*/
-int drbd_bm_init(struct drbd_conf *mdev)
+int drbd_bm_init(struct drbd_device *device)
{
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
WARN_ON(b != NULL);
b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
if (!b)
@@ -302,26 +449,28 @@ int drbd_bm_init(struct drbd_conf *mdev)
mutex_init(&b->bm_change);
init_waitqueue_head(&b->bm_io_wait);
- mdev->bitmap = b;
+ device->bitmap = b;
return 0;
}
-sector_t drbd_bm_capacity(struct drbd_conf *mdev)
+sector_t drbd_bm_capacity(struct drbd_device *device)
{
- ERR_IF(!mdev->bitmap) return 0;
- return mdev->bitmap->bm_dev_capacity;
+ if (!expect(device->bitmap))
+ return 0;
+ return device->bitmap->bm_dev_capacity;
}
/* called on driver unload. TODO: call when a device is destroyed.
*/
-void drbd_bm_cleanup(struct drbd_conf *mdev)
+void drbd_bm_cleanup(struct drbd_device *device)
{
- ERR_IF (!mdev->bitmap) return;
- bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
- bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
- kfree(mdev->bitmap);
- mdev->bitmap = NULL;
+ if (!expect(device->bitmap))
+ return;
+ bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
+ bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags));
+ kfree(device->bitmap);
+ device->bitmap = NULL;
}
/*
@@ -329,22 +478,39 @@ void drbd_bm_cleanup(struct drbd_conf *mdev)
* this masks out the remaining bits.
* Returns the number of bits cleared.
*/
+#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
+#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
static int bm_clear_surplus(struct drbd_bitmap *b)
{
- const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
- size_t w = b->bm_bits >> LN2_BPL;
- int cleared = 0;
+ unsigned long mask;
unsigned long *p_addr, *bm;
+ int tmp;
+ int cleared = 0;
- p_addr = bm_map_paddr(b, w);
- bm = p_addr + MLPP(w);
- if (w < b->bm_words) {
+ /* number of bits modulo bits per page */
+ tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+ /* mask the used bits of the word containing the last bit */
+ mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+ /* bitmap is always stored little endian,
+ * on disk and in core memory alike */
+ mask = cpu_to_lel(mask);
+
+ p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+ bm = p_addr + (tmp/BITS_PER_LONG);
+ if (mask) {
+ /* If mask != 0, we are not exactly aligned, so bm now points
+ * to the long containing the last bit.
+ * If mask == 0, bm already points to the word immediately
+ * after the last (long word aligned) bit. */
cleared = hweight_long(*bm & ~mask);
*bm &= mask;
- w++; bm++;
+ bm++;
}
- if (w < b->bm_words) {
+ if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+ /* on a 32bit arch, we may need to zero out
+ * a padding long to align with a 64bit remote */
cleared += hweight_long(*bm);
*bm = 0;
}
@@ -354,66 +520,75 @@ static int bm_clear_surplus(struct drbd_bitmap *b)
static void bm_set_surplus(struct drbd_bitmap *b)
{
- const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
- size_t w = b->bm_bits >> LN2_BPL;
+ unsigned long mask;
unsigned long *p_addr, *bm;
-
- p_addr = bm_map_paddr(b, w);
- bm = p_addr + MLPP(w);
- if (w < b->bm_words) {
+ int tmp;
+
+ /* number of bits modulo bits per page */
+ tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+ /* mask the used bits of the word containing the last bit */
+ mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+ /* bitmap is always stored little endian,
+ * on disk and in core memory alike */
+ mask = cpu_to_lel(mask);
+
+ p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+ bm = p_addr + (tmp/BITS_PER_LONG);
+ if (mask) {
+ /* If mask != 0, we are not exactly aligned, so bm now points
+ * to the long containing the last bit.
+ * If mask == 0, bm already points to the word immediately
+ * after the last (long word aligned) bit. */
*bm |= ~mask;
- bm++; w++;
+ bm++;
}
- if (w < b->bm_words) {
- *bm = ~(0UL);
+ if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+ /* on a 32bit arch, we may need to zero out
+ * a padding long to align with a 64bit remote */
+ *bm = ~0UL;
}
bm_unmap(p_addr);
}
-static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
+/* you better not modify the bitmap while this is running,
+ * or its results will be stale */
+static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
- unsigned long *p_addr, *bm, offset = 0;
+ unsigned long *p_addr;
unsigned long bits = 0;
- unsigned long i, do_now;
-
- while (offset < b->bm_words) {
- i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
- p_addr = __bm_map_paddr(b, offset, KM_USER0);
- bm = p_addr + MLPP(offset);
- while (i--) {
-#ifndef __LITTLE_ENDIAN
- if (swap_endian)
- *bm = lel_to_cpu(*bm);
-#endif
- bits += hweight_long(*bm++);
- }
- __bm_unmap(p_addr, KM_USER0);
- offset += do_now;
+ unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
+ int idx, i, last_word;
+
+ /* all but last page */
+ for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
+ p_addr = __bm_map_pidx(b, idx);
+ for (i = 0; i < LWPP; i++)
+ bits += hweight_long(p_addr[i]);
+ __bm_unmap(p_addr);
cond_resched();
}
-
+ /* last (or only) page */
+ last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
+ p_addr = __bm_map_pidx(b, idx);
+ for (i = 0; i < last_word; i++)
+ bits += hweight_long(p_addr[i]);
+ p_addr[last_word] &= cpu_to_lel(mask);
+ bits += hweight_long(p_addr[last_word]);
+ /* 32bit arch, may have an unused padding long */
+ if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
+ p_addr[last_word+1] = 0;
+ __bm_unmap(p_addr);
return bits;
}
-static unsigned long bm_count_bits(struct drbd_bitmap *b)
-{
- return __bm_count_bits(b, 0);
-}
-
-static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
-{
- return __bm_count_bits(b, 1);
-}
-
/* offset and len in long words.*/
static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
{
unsigned long *p_addr, *bm;
+ unsigned int idx;
size_t do_now, end;
-#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
-
end = offset + len;
if (end > b->bm_words) {
@@ -423,19 +598,31 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
while (offset < end) {
do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
- p_addr = bm_map_paddr(b, offset);
+ idx = bm_word_to_page_idx(b, offset);
+ p_addr = bm_map_pidx(b, idx);
bm = p_addr + MLPP(offset);
if (bm+do_now > p_addr + LWPP) {
printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
p_addr, bm, (int)do_now);
- break; /* breaks to after catch_oob_access_end() only! */
- }
- memset(bm, c, do_now * sizeof(long));
+ } else
+ memset(bm, c, do_now * sizeof(long));
bm_unmap(p_addr);
+ bm_set_page_need_writeout(b->bm_pages[idx]);
offset += do_now;
}
}
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
+{
+ u64 bitmap_sectors;
+ if (ldev->md.al_offset == 8)
+ bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
+ else
+ bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
+ return bitmap_sectors << (9 + 3);
+}
+
/*
* make sure the bitmap has enough room for the attached storage,
* if necessary, resize.
@@ -444,26 +631,27 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
* In case this is actually a resize, we copy the old bitmap into the new one.
* Otherwise, the bitmap is initialized to all bits set.
*/
-int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
+int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits)
{
- struct drbd_bitmap *b = mdev->bitmap;
- unsigned long bits, words, owords, obits, *p_addr, *bm;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long bits, words, owords, obits;
unsigned long want, have, onpages; /* number of pages */
struct page **npages, **opages = NULL;
int err = 0, growing;
int opages_vmalloced;
- ERR_IF(!b) return -ENOMEM;
+ if (!expect(b))
+ return -ENOMEM;
- drbd_bm_lock(mdev, "resize");
+ drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
- dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
+ drbd_info(device, "drbd_bm_resize called with capacity == %llu\n",
(unsigned long long)capacity);
if (capacity == b->bm_dev_capacity)
goto out;
- opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
+ opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
if (capacity == 0) {
spin_lock_irq(&b->bm_lock);
@@ -490,19 +678,24 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
*/
words = ALIGN(bits, 64) >> LN2_BPL;
- if (get_ldev(mdev)) {
- D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
- put_ldev(mdev);
+ if (get_ldev(device)) {
+ u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev);
+ put_ldev(device);
+ if (bits > bits_on_disk) {
+ drbd_info(device, "bits = %lu\n", bits);
+ drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk);
+ err = -ENOSPC;
+ goto out;
+ }
}
- /* one extra long to catch off by one errors */
- want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+ want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
have = b->bm_number_of_pages;
if (want == have) {
- D_ASSERT(b->bm_pages != NULL);
+ D_ASSERT(device, b->bm_pages != NULL);
npages = b->bm_pages;
} else {
- if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
+ if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC))
npages = NULL;
else
npages = bm_realloc_pages(b, want);
@@ -542,11 +735,6 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
bm_free_pages(opages + want, have - want);
}
- p_addr = bm_map_paddr(b, words);
- bm = p_addr + MLPP(words);
- *bm = DRBD_MAGIC;
- bm_unmap(p_addr);
-
(void)bm_clear_surplus(b);
spin_unlock_irq(&b->bm_lock);
@@ -554,10 +742,10 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
bm_vk_free(opages, opages_vmalloced);
if (!growing)
b->bm_set = bm_count_bits(b);
- dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
+ drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
out:
- drbd_bm_unlock(mdev);
+ drbd_bm_unlock(device);
return err;
}
@@ -569,14 +757,16 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
*
* maybe bm_set should be atomic_t ?
*/
-unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
+unsigned long _drbd_bm_total_weight(struct drbd_device *device)
{
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
unsigned long s;
unsigned long flags;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
s = b->bm_set;
@@ -585,30 +775,33 @@ unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
return s;
}
-unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
+unsigned long drbd_bm_total_weight(struct drbd_device *device)
{
unsigned long s;
/* if I don't have a disk, I don't know about out-of-sync status */
- if (!get_ldev_if_state(mdev, D_NEGOTIATING))
+ if (!get_ldev_if_state(device, D_NEGOTIATING))
return 0;
- s = _drbd_bm_total_weight(mdev);
- put_ldev(mdev);
+ s = _drbd_bm_total_weight(device);
+ put_ldev(device);
return s;
}
-size_t drbd_bm_words(struct drbd_conf *mdev)
+size_t drbd_bm_words(struct drbd_device *device)
{
- struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
return b->bm_words;
}
-unsigned long drbd_bm_bits(struct drbd_conf *mdev)
+unsigned long drbd_bm_bits(struct drbd_device *device)
{
- struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return 0;
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return 0;
return b->bm_bits;
}
@@ -618,18 +811,21 @@ unsigned long drbd_bm_bits(struct drbd_conf *mdev)
* bitmap must be locked by drbd_bm_lock.
* currently only used from receive_bitmap.
*/
-void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
unsigned long *buffer)
{
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
unsigned long *p_addr, *bm;
unsigned long word, bits;
+ unsigned int idx;
size_t end, do_now;
end = offset + number;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
if (number == 0)
return;
WARN_ON(offset >= b->bm_words);
@@ -638,16 +834,18 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
spin_lock_irq(&b->bm_lock);
while (offset < end) {
do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
- p_addr = bm_map_paddr(b, offset);
+ idx = bm_word_to_page_idx(b, offset);
+ p_addr = bm_map_pidx(b, idx);
bm = p_addr + MLPP(offset);
offset += do_now;
while (do_now--) {
bits = hweight_long(*bm);
- word = *bm | lel_to_cpu(*buffer++);
+ word = *bm | *buffer++;
*bm++ = word;
b->bm_set += hweight_long(word) - bits;
}
bm_unmap(p_addr);
+ bm_set_page_need_writeout(b->bm_pages[idx]);
}
/* with 32bit <-> 64bit cross-platform connect
* this is only correct for current usage,
@@ -656,41 +854,42 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
*/
if (end == b->bm_words)
b->bm_set -= bm_clear_surplus(b);
-
spin_unlock_irq(&b->bm_lock);
}
/* copy number words from the bitmap starting at offset into the buffer.
* buffer[i] will be little endian unsigned long.
*/
-void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
unsigned long *buffer)
{
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
unsigned long *p_addr, *bm;
size_t end, do_now;
end = offset + number;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
spin_lock_irq(&b->bm_lock);
if ((offset >= b->bm_words) ||
(end > b->bm_words) ||
(number <= 0))
- dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
+ drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n",
(unsigned long) offset,
(unsigned long) number,
(unsigned long) b->bm_words);
else {
while (offset < end) {
do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
- p_addr = bm_map_paddr(b, offset);
+ p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
bm = p_addr + MLPP(offset);
offset += do_now;
while (do_now--)
- *buffer++ = cpu_to_lel(*bm++);
+ *buffer++ = *bm++;
bm_unmap(p_addr);
}
}
@@ -698,11 +897,13 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
}
/* set all bits in the bitmap */
-void drbd_bm_set_all(struct drbd_conf *mdev)
+void drbd_bm_set_all(struct drbd_device *device)
{
- struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0xff, b->bm_words);
@@ -712,11 +913,13 @@ void drbd_bm_set_all(struct drbd_conf *mdev)
}
/* clear all bits in the bitmap */
-void drbd_bm_clear_all(struct drbd_conf *mdev)
+void drbd_bm_clear_all(struct drbd_device *device)
{
- struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0, b->bm_words);
@@ -724,9 +927,33 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
spin_unlock_irq(&b->bm_lock);
}
+struct bm_aio_ctx {
+ struct drbd_device *device;
+ atomic_t in_flight;
+ unsigned int done;
+ unsigned flags;
+#define BM_AIO_COPY_PAGES 1
+#define BM_AIO_WRITE_HINTED 2
+#define BM_WRITE_ALL_PAGES 4
+ int error;
+ struct kref kref;
+};
+
+static void bm_aio_ctx_destroy(struct kref *kref)
+{
+ struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+
+ put_ldev(ctx->device);
+ kfree(ctx);
+}
+
+/* bv_page may be a copy, or may be the original */
static void bm_async_io_complete(struct bio *bio, int error)
{
- struct drbd_bitmap *b = bio->bi_private;
+ struct bm_aio_ctx *ctx = bio->bi_private;
+ struct drbd_device *device = ctx->device;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
int uptodate = bio_flagged(bio, BIO_UPTODATE);
@@ -737,283 +964,416 @@ static void bm_async_io_complete(struct bio *bio, int error)
if (!error && !uptodate)
error = -EIO;
+ if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
+ !bm_test_page_unchanged(b->bm_pages[idx]))
+ drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
+
if (error) {
- /* doh. what now?
- * for now, set all bits, and flag MD_IO_ERROR */
- __set_bit(BM_MD_IO_ERROR, &b->bm_flags);
+ /* ctx error will hold the completed-last non-zero error code,
+ * in case error codes differ. */
+ ctx->error = error;
+ bm_set_page_io_err(b->bm_pages[idx]);
+ /* Not identical to on disk version of it.
+ * Is BM_PAGE_IO_ERROR enough? */
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
+ error, idx);
+ } else {
+ bm_clear_page_io_err(b->bm_pages[idx]);
+ dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
}
- if (atomic_dec_and_test(&b->bm_async_io))
- wake_up(&b->bm_io_wait);
+
+ bm_page_unlock_io(device, idx);
+
+ if (ctx->flags & BM_AIO_COPY_PAGES)
+ mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
bio_put(bio);
+
+ if (atomic_dec_and_test(&ctx->in_flight)) {
+ ctx->done = 1;
+ wake_up(&device->misc_wait);
+ kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+ }
}
-static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
{
- /* we are process context. we always get a bio */
- struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+ struct bio *bio = bio_alloc_drbd(GFP_NOIO);
+ struct drbd_device *device = ctx->device;
+ struct drbd_bitmap *b = device->bitmap;
+ struct page *page;
unsigned int len;
+
sector_t on_disk_sector =
- mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
+ device->ldev->md.md_offset + device->ldev->md.bm_offset;
on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
/* this might happen with very small
- * flexible external meta data device */
+ * flexible external meta data device,
+ * or with PAGE_SIZE > 4k */
len = min_t(unsigned int, PAGE_SIZE,
- (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
-
- bio->bi_bdev = mdev->ldev->md_bdev;
- bio->bi_sector = on_disk_sector;
- bio_add_page(bio, b->bm_pages[page_nr], len, 0);
- bio->bi_private = b;
+ (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
+
+ /* serialize IO on this page */
+ bm_page_lock_io(device, page_nr);
+ /* before memcpy and submit,
+ * so it can be redirtied any time */
+ bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+ if (ctx->flags & BM_AIO_COPY_PAGES) {
+ page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+ copy_highpage(page, b->bm_pages[page_nr]);
+ bm_store_page_idx(page, page_nr);
+ } else
+ page = b->bm_pages[page_nr];
+ bio->bi_bdev = device->ldev->md_bdev;
+ bio->bi_iter.bi_sector = on_disk_sector;
+ /* bio_add_page of a single page to an empty bio will always succeed,
+ * according to api. Do we want to assert that? */
+ bio_add_page(bio, page, len, 0);
+ bio->bi_private = ctx;
bio->bi_end_io = bm_async_io_complete;
- if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+ if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
bio->bi_rw |= rw;
bio_endio(bio, -EIO);
} else {
submit_bio(rw, bio);
+ /* this should not count as user activity and cause the
+ * resync to throttle -- see drbd_rs_should_slow_down(). */
+ atomic_add(len >> 9, &device->rs_sect_ev);
}
}
-# if defined(__LITTLE_ENDIAN)
- /* nothing to do, on disk == in memory */
-# define bm_cpu_to_lel(x) ((void)0)
-# else
-static void bm_cpu_to_lel(struct drbd_bitmap *b)
-{
- /* need to cpu_to_lel all the pages ...
- * this may be optimized by using
- * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
- * the following is still not optimal, but better than nothing */
- unsigned int i;
- unsigned long *p_addr, *bm;
- if (b->bm_set == 0) {
- /* no page at all; avoid swap if all is 0 */
- i = b->bm_number_of_pages;
- } else if (b->bm_set == b->bm_bits) {
- /* only the last page */
- i = b->bm_number_of_pages - 1;
- } else {
- /* all pages */
- i = 0;
- }
- for (; i < b->bm_number_of_pages; i++) {
- p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
- for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
- *bm = cpu_to_lel(*bm);
- kunmap_atomic(p_addr, KM_USER0);
- }
-}
-# endif
-/* lel_to_cpu == cpu_to_lel */
-# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
-
/*
* bm_rw: read/write the whole bitmap from/to its on disk location.
*/
-static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
+static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
{
- struct drbd_bitmap *b = mdev->bitmap;
- /* sector_t sector; */
- int bm_words, num_pages, i;
+ struct bm_aio_ctx *ctx;
+ struct drbd_bitmap *b = device->bitmap;
+ int num_pages, i, count = 0;
unsigned long now;
char ppb[10];
int err = 0;
- WARN_ON(!bm_is_locked(b));
+ /*
+ * We are protected against bitmap disappearing/resizing by holding an
+ * ldev reference (caller must have called get_ldev()).
+ * For read/write, we are protected against changes to the bitmap by
+ * the bitmap lock (see drbd_bitmap_io).
+ * For lazy writeout, we don't care for ongoing changes to the bitmap,
+ * as we submit copies of pages anyways.
+ */
- /* no spinlock here, the drbd_bm_lock should be enough! */
+ ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+ if (!ctx)
+ return -ENOMEM;
- bm_words = drbd_bm_words(mdev);
- num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
+ *ctx = (struct bm_aio_ctx) {
+ .device = device,
+ .in_flight = ATOMIC_INIT(1),
+ .done = 0,
+ .flags = flags,
+ .error = 0,
+ .kref = { ATOMIC_INIT(2) },
+ };
+
+ if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
+ drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+ kfree(ctx);
+ return -ENODEV;
+ }
- /* on disk bitmap is little endian */
- if (rw == WRITE)
- bm_cpu_to_lel(b);
+ if (!ctx->flags)
+ WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
+
+ num_pages = b->bm_number_of_pages;
now = jiffies;
- atomic_set(&b->bm_async_io, num_pages);
- __clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
/* let the layers below us try to merge these bios... */
- for (i = 0; i < num_pages; i++)
- bm_page_io_async(mdev, b, i, rw);
-
- drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
- wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
+ for (i = 0; i < num_pages; i++) {
+ /* ignore completely unchanged pages */
+ if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+ break;
+ if (rw & WRITE) {
+ if ((flags & BM_AIO_WRITE_HINTED) &&
+ !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+ &page_private(b->bm_pages[i])))
+ continue;
+
+ if (!(flags & BM_WRITE_ALL_PAGES) &&
+ bm_test_page_unchanged(b->bm_pages[i])) {
+ dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
+ continue;
+ }
+ /* during lazy writeout,
+ * ignore those pages not marked for lazy writeout. */
+ if (lazy_writeout_upper_idx &&
+ !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+ dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
+ continue;
+ }
+ }
+ atomic_inc(&ctx->in_flight);
+ bm_page_io_async(ctx, i, rw);
+ ++count;
+ cond_resched();
+ }
- if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
- dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
- drbd_chk_io_error(mdev, 1, TRUE);
- err = -EIO;
+ /*
+ * We initialize ctx->in_flight to one to make sure bm_async_io_complete
+ * will not set ctx->done early, and decrement / test it here. If there
+ * are still some bios in flight, we need to wait for them here.
+ * If all IO is done already (or nothing had been submitted), there is
+ * no need to wait. Still, we need to put the kref associated with the
+ * "in_flight reached zero, all done" event.
+ */
+ if (!atomic_dec_and_test(&ctx->in_flight))
+ wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
+ else
+ kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+
+ /* summary for global bitmap IO */
+ if (flags == 0)
+ drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
+ rw == WRITE ? "WRITE" : "READ",
+ count, jiffies - now);
+
+ if (ctx->error) {
+ drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ err = -EIO; /* ctx->error ? */
}
+ if (atomic_read(&ctx->in_flight))
+ err = -EIO; /* Disk timeout/force-detach during IO... */
+
now = jiffies;
if (rw == WRITE) {
- /* swap back endianness */
- bm_lel_to_cpu(b);
- /* flush bitmap to stable storage */
- drbd_md_flush(mdev);
+ drbd_md_flush(device);
} else /* rw == READ */ {
- /* just read, if necessary adjust endianness */
- b->bm_set = bm_count_bits_swap_endian(b);
- dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
+ b->bm_set = bm_count_bits(b);
+ drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
jiffies - now);
}
now = b->bm_set;
- dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
- ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+ if (flags == 0)
+ drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+ ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+ kref_put(&ctx->kref, &bm_aio_ctx_destroy);
return err;
}
/**
* drbd_bm_read() - Read the whole bitmap from its on disk location.
- * @mdev: DRBD device.
+ * @device: DRBD device.
*/
-int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
+int drbd_bm_read(struct drbd_device *device) __must_hold(local)
{
- return bm_rw(mdev, READ);
+ return bm_rw(device, READ, 0, 0);
}
/**
* drbd_bm_write() - Write the whole bitmap to its on disk location.
- * @mdev: DRBD device.
+ * @device: DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
*/
-int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
+int drbd_bm_write(struct drbd_device *device) __must_hold(local)
{
- return bm_rw(mdev, WRITE);
+ return bm_rw(device, WRITE, 0, 0);
}
/**
- * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
- * @mdev: DRBD device.
- * @enr: Extent number in the resync lru (happens to be sector offset)
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @device: DRBD device.
*
- * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
- * by a single sector write. Therefore enr == sector offset from the
- * start of the bitmap.
+ * Will write all pages.
*/
-int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
+int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
{
- sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
- + mdev->ldev->md.bm_offset;
- int bm_words, num_words, offset;
- int err = 0;
+ return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @device: DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
+ * @device: DRBD device.
+ */
+int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
+ * @device: DRBD device.
+ * @idx: bitmap page index
+ *
+ * We don't want to special case on logical_block_size of the backend device,
+ * so we submit PAGE_SIZE aligned pieces.
+ * Note that on "most" systems, PAGE_SIZE is 4k.
+ *
+ * In case this becomes an issue on systems with larger PAGE_SIZE,
+ * we may want to change this again to write 4k aligned 4k pieces.
+ */
+int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local)
+{
+ struct bm_aio_ctx *ctx;
+ int err;
+
+ if (bm_test_page_unchanged(device->bitmap->bm_pages[idx])) {
+ dynamic_drbd_dbg(device, "skipped bm page write for idx %u\n", idx);
+ return 0;
+ }
+
+ ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+ if (!ctx)
+ return -ENOMEM;
- mutex_lock(&mdev->md_io_mutex);
- bm_words = drbd_bm_words(mdev);
- offset = S2W(enr); /* word offset into bitmap */
- num_words = min(S2W(1), bm_words - offset);
- if (num_words < S2W(1))
- memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
- drbd_bm_get_lel(mdev, offset, num_words,
- page_address(mdev->md_io_page));
- if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
- int i;
- err = -EIO;
- dev_err(DEV, "IO ERROR writing bitmap sector %lu "
- "(meta-disk sector %llus)\n",
- enr, (unsigned long long)on_disk_sector);
- drbd_chk_io_error(mdev, 1, TRUE);
- for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
- drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
+ *ctx = (struct bm_aio_ctx) {
+ .device = device,
+ .in_flight = ATOMIC_INIT(1),
+ .done = 0,
+ .flags = BM_AIO_COPY_PAGES,
+ .error = 0,
+ .kref = { ATOMIC_INIT(2) },
+ };
+
+ if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
+ drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
+ kfree(ctx);
+ return -ENODEV;
}
- mdev->bm_writ_cnt++;
- mutex_unlock(&mdev->md_io_mutex);
+
+ bm_page_io_async(ctx, idx, WRITE_SYNC);
+ wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
+
+ if (ctx->error)
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ /* that causes us to detach, so the in memory bitmap will be
+ * gone in a moment as well. */
+
+ device->bm_writ_cnt++;
+ err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
+ kref_put(&ctx->kref, &bm_aio_ctx_destroy);
return err;
}
/* NOTE
* find_first_bit returns int, we return unsigned long.
- * should not make much difference anyways, but ...
+ * For this to work on 32bit arch with bitnumbers > (1<<32),
+ * we'd need to return u64, and get a whole lot of other places
+ * fixed where we still use unsigned long.
*
* this returns a bit number, NOT a sector!
*/
-#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
-static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
- const int find_zero_bit, const enum km_type km)
+static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo,
+ const int find_zero_bit)
{
- struct drbd_bitmap *b = mdev->bitmap;
- unsigned long i = -1UL;
+ struct drbd_bitmap *b = device->bitmap;
unsigned long *p_addr;
- unsigned long bit_offset; /* bit offset of the mapped page. */
+ unsigned long bit_offset;
+ unsigned i;
+
if (bm_fo > b->bm_bits) {
- dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+ drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+ bm_fo = DRBD_END_OF_BITMAP;
} else {
while (bm_fo < b->bm_bits) {
- unsigned long offset;
- bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
- offset = bit_offset >> LN2_BPL; /* word offset of the page */
- p_addr = __bm_map_paddr(b, offset, km);
+ /* bit offset of the first bit in the page */
+ bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
+ p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
if (find_zero_bit)
- i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+ i = find_next_zero_bit_le(p_addr,
+ PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
else
- i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+ i = find_next_bit_le(p_addr,
+ PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
- __bm_unmap(p_addr, km);
+ __bm_unmap(p_addr);
if (i < PAGE_SIZE*8) {
- i = bit_offset + i;
- if (i >= b->bm_bits)
+ bm_fo = bit_offset + i;
+ if (bm_fo >= b->bm_bits)
break;
goto found;
}
bm_fo = bit_offset + PAGE_SIZE*8;
}
- i = -1UL;
+ bm_fo = DRBD_END_OF_BITMAP;
}
found:
- return i;
+ return bm_fo;
}
-static unsigned long bm_find_next(struct drbd_conf *mdev,
+static unsigned long bm_find_next(struct drbd_device *device,
unsigned long bm_fo, const int find_zero_bit)
{
- struct drbd_bitmap *b = mdev->bitmap;
- unsigned long i = -1UL;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long i = DRBD_END_OF_BITMAP;
- ERR_IF(!b) return i;
- ERR_IF(!b->bm_pages) return i;
+ if (!expect(b))
+ return i;
+ if (!expect(b->bm_pages))
+ return i;
spin_lock_irq(&b->bm_lock);
- if (bm_is_locked(b))
- bm_print_lock_info(mdev);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
- i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
+ i = __bm_find_next(device, bm_fo, find_zero_bit);
spin_unlock_irq(&b->bm_lock);
return i;
}
-unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
{
- return bm_find_next(mdev, bm_fo, 0);
+ return bm_find_next(device, bm_fo, 0);
}
#if 0
/* not yet needed for anything. */
-unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
{
- return bm_find_next(mdev, bm_fo, 1);
+ return bm_find_next(device, bm_fo, 1);
}
#endif
/* does not spin_lock_irqsave.
* you must take drbd_bm_lock() first */
-unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
{
- /* WARN_ON(!bm_is_locked(mdev)); */
- return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
+ /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+ return __bm_find_next(device, bm_fo, 0);
}
-unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
{
- /* WARN_ON(!bm_is_locked(mdev)); */
- return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
+ /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+ return __bm_find_next(device, bm_fo, 1);
}
/* returns number of bits actually changed.
@@ -1022,74 +1382,87 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f
* wants bitnr, not sector.
* expected to be called for only a few bits (e - s about BITS_PER_LONG).
* Must hold bitmap lock already. */
-static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
- unsigned long e, int val, const enum km_type km)
+static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s,
+ unsigned long e, int val)
{
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
unsigned long *p_addr = NULL;
unsigned long bitnr;
- unsigned long last_page_nr = -1UL;
+ unsigned int last_page_nr = -1U;
int c = 0;
+ int changed_total = 0;
if (e >= b->bm_bits) {
- dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
+ drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
s, e, b->bm_bits);
e = b->bm_bits ? b->bm_bits -1 : 0;
}
for (bitnr = s; bitnr <= e; bitnr++) {
- unsigned long offset = bitnr>>LN2_BPL;
- unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+ unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
if (page_nr != last_page_nr) {
if (p_addr)
- __bm_unmap(p_addr, km);
- p_addr = __bm_map_paddr(b, offset, km);
+ __bm_unmap(p_addr);
+ if (c < 0)
+ bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+ else if (c > 0)
+ bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+ changed_total += c;
+ c = 0;
+ p_addr = __bm_map_pidx(b, page_nr);
last_page_nr = page_nr;
}
if (val)
- c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
+ c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
else
- c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
+ c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
}
if (p_addr)
- __bm_unmap(p_addr, km);
- b->bm_set += c;
- return c;
+ __bm_unmap(p_addr);
+ if (c < 0)
+ bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+ else if (c > 0)
+ bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+ changed_total += c;
+ b->bm_set += changed_total;
+ return changed_total;
}
/* returns number of bits actually changed.
* for val != 0, we change 0 -> 1, return code positive
* for val == 0, we change 1 -> 0, return code negative
* wants bitnr, not sector */
-static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
const unsigned long e, int val)
{
unsigned long flags;
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
int c = 0;
- ERR_IF(!b) return 1;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
- if (bm_is_locked(b))
- bm_print_lock_info(mdev);
+ if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
+ bm_print_lock_info(device);
- c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
+ c = __bm_change_bits_to(device, s, e, val);
spin_unlock_irqrestore(&b->bm_lock, flags);
return c;
}
/* returns number of bits changed 0 -> 1 */
-int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
{
- return bm_change_bits_to(mdev, s, e, 1);
+ return bm_change_bits_to(device, s, e, 1);
}
/* returns number of bits changed 1 -> 0 */
-int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
{
- return -bm_change_bits_to(mdev, s, e, 0);
+ return -bm_change_bits_to(device, s, e, 0);
}
/* sets all bits in full words,
@@ -1099,20 +1472,29 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
{
int i;
int bits;
- unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
+ int changed = 0;
+ unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
- b->bm_set += BITS_PER_LONG - bits;
+ changed += BITS_PER_LONG - bits;
+ }
+ kunmap_atomic(paddr);
+ if (changed) {
+ /* We only need lazy writeout, the information is still in the
+ * remote bitmap as well, and is reconstructed during the next
+ * bitmap exchange, if lost locally due to a crash. */
+ bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+ b->bm_set += changed;
}
- kunmap_atomic(paddr, KM_USER0);
}
-/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
+/* Same thing as drbd_bm_set_bits,
+ * but more efficient for a large bit range.
* You must first drbd_bm_lock().
* Can be called to set the whole bitmap in one go.
* Sets bits from s to e _inclusive_. */
-void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
{
/* First set_bit from the first bit (s)
* up to the next long boundary (sl),
@@ -1122,6 +1504,7 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
* Do not use memset, because we must account for changes,
* so we need to loop over the words with hweight() anyways.
*/
+ struct drbd_bitmap *b = device->bitmap;
unsigned long sl = ALIGN(s,BITS_PER_LONG);
unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
int first_page;
@@ -1132,15 +1515,19 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
if (e - s <= 3*BITS_PER_LONG) {
/* don't bother; el and sl may even be wrong. */
- __bm_change_bits_to(mdev, s, e, 1, KM_USER0);
+ spin_lock_irq(&b->bm_lock);
+ __bm_change_bits_to(device, s, e, 1);
+ spin_unlock_irq(&b->bm_lock);
return;
}
/* difference is large enough that we can trust sl and el */
+ spin_lock_irq(&b->bm_lock);
+
/* bits filling the current long */
if (sl)
- __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
+ __bm_change_bits_to(device, s, sl-1, 1);
first_page = sl >> (3 + PAGE_SHIFT);
last_page = el >> (3 + PAGE_SHIFT);
@@ -1152,14 +1539,23 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
/* first and full pages, unless first page == last page */
for (page_nr = first_page; page_nr < last_page; page_nr++) {
- bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
+ bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word);
+ spin_unlock_irq(&b->bm_lock);
cond_resched();
first_word = 0;
+ spin_lock_irq(&b->bm_lock);
}
-
/* last page (respectively only page, for first page == last page) */
last_word = MLPP(el >> LN2_BPL);
- bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
+
+ /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
+ * ==> e = 32767, el = 32768, last_page = 2,
+ * and now last_word = 0.
+ * We do not want to touch last_page in this case,
+ * as we did not allocate it, it is not present in bitmap->bm_pages.
+ */
+ if (last_word)
+ bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word);
/* possibly trailing bits.
* example: (e & 63) == 63, el will be e+1.
@@ -1167,7 +1563,8 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
* it would trigger an assert in __bm_change_bits_to()
*/
if (el <= e)
- __bm_change_bits_to(mdev, el, e, 1, KM_USER0);
+ __bm_change_bits_to(device, el, e, 1);
+ spin_unlock_irq(&b->bm_lock);
}
/* returns bit state
@@ -1177,28 +1574,29 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
* 0 ... bit not set
* -1 ... first out of bounds access, stop testing for bits!
*/
-int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
+int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
{
unsigned long flags;
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
unsigned long *p_addr;
int i;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
- if (bm_is_locked(b))
- bm_print_lock_info(mdev);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
if (bitnr < b->bm_bits) {
- unsigned long offset = bitnr>>LN2_BPL;
- p_addr = bm_map_paddr(b, offset);
- i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
+ p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
+ i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
bm_unmap(p_addr);
} else if (bitnr == b->bm_bits) {
i = -1;
} else { /* (bitnr > b->bm_bits) */
- dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
+ drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
i = 0;
}
@@ -1207,38 +1605,39 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
}
/* returns number of bits set in the range [s, e] */
-int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
{
unsigned long flags;
- struct drbd_bitmap *b = mdev->bitmap;
- unsigned long *p_addr = NULL, page_nr = -1;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr = NULL;
unsigned long bitnr;
+ unsigned int page_nr = -1U;
int c = 0;
- size_t w;
/* If this is called without a bitmap, that is a bug. But just to be
* robust in case we screwed up elsewhere, in that case pretend there
* was one dirty bit in the requested area, so we won't try to do a
* local read there (no bitmap probably implies no disk) */
- ERR_IF(!b) return 1;
- ERR_IF(!b->bm_pages) return 1;
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 1;
spin_lock_irqsave(&b->bm_lock, flags);
- if (bm_is_locked(b))
- bm_print_lock_info(mdev);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
for (bitnr = s; bitnr <= e; bitnr++) {
- w = bitnr >> LN2_BPL;
- if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
- page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
+ unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+ if (page_nr != idx) {
+ page_nr = idx;
if (p_addr)
bm_unmap(p_addr);
- p_addr = bm_map_paddr(b, w);
- }
- ERR_IF (bitnr >= b->bm_bits) {
- dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
- } else {
- c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+ p_addr = bm_map_pidx(b, idx);
}
+ if (expect(bitnr < b->bm_bits))
+ c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+ else
+ drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
}
if (p_addr)
bm_unmap(p_addr);
@@ -1261,75 +1660,35 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
* reference count of some bitmap extent element from some lru instead...
*
*/
-int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
+int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
{
- struct drbd_bitmap *b = mdev->bitmap;
+ struct drbd_bitmap *b = device->bitmap;
int count, s, e;
unsigned long flags;
unsigned long *p_addr, *bm;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
- if (bm_is_locked(b))
- bm_print_lock_info(mdev);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
s = S2W(enr);
e = min((size_t)S2W(enr+1), b->bm_words);
count = 0;
if (s < b->bm_words) {
int n = e-s;
- p_addr = bm_map_paddr(b, s);
+ p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
bm = p_addr + MLPP(s);
while (n--)
count += hweight_long(*bm++);
bm_unmap(p_addr);
} else {
- dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
+ drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
}
spin_unlock_irqrestore(&b->bm_lock, flags);
return count;
}
-
-/* set all bits covered by the AL-extent al_enr */
-unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
-{
- struct drbd_bitmap *b = mdev->bitmap;
- unsigned long *p_addr, *bm;
- unsigned long weight;
- int count, s, e, i, do_now;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
-
- spin_lock_irq(&b->bm_lock);
- if (bm_is_locked(b))
- bm_print_lock_info(mdev);
- weight = b->bm_set;
-
- s = al_enr * BM_WORDS_PER_AL_EXT;
- e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
- /* assert that s and e are on the same page */
- D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
- == s >> (PAGE_SHIFT - LN2_BPL + 3));
- count = 0;
- if (s < b->bm_words) {
- i = do_now = e-s;
- p_addr = bm_map_paddr(b, s);
- bm = p_addr + MLPP(s);
- while (i--) {
- count += hweight_long(*bm);
- *bm = -1UL;
- bm++;
- }
- bm_unmap(p_addr);
- b->bm_set += do_now*BITS_PER_LONG - count;
- if (e == b->bm_words)
- b->bm_set -= bm_clear_surplus(b);
- } else {
- dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
- }
- weight = b->bm_set - weight;
- spin_unlock_irq(&b->bm_lock);
- return weight;
-}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 1ea1a34e78b..a76ceb344d6 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -28,7 +28,6 @@
#include <linux/compiler.h>
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/bitops.h>
@@ -40,8 +39,15 @@
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
+#include <linux/idr.h>
#include <net/tcp.h>
#include <linux/lru_cache.h>
+#include <linux/prefetch.h>
+#include <linux/drbd_genl_api.h>
+#include <linux/drbd.h>
+#include "drbd_strings.h"
+#include "drbd_state.h"
+#include "drbd_protocol.h"
#ifdef __CHECKER__
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
@@ -59,9 +65,9 @@
/* module parameter, defined in drbd_main.c */
extern unsigned int minor_count;
-extern int disable_sendpage;
-extern int allow_oos;
-extern unsigned int cn_idx;
+extern bool disable_sendpage;
+extern bool allow_oos;
+void tl_abort_disk_io(struct drbd_device *device);
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern int enable_faults;
@@ -72,13 +78,6 @@ extern int fault_devs;
extern char usermode_helper[];
-#ifndef TRUE
-#define TRUE 1
-#endif
-#ifndef FALSE
-#define FALSE 0
-#endif
-
/* I don't remember why XCPU ...
* This is used to wake the asender,
* and to interrupt sending the sending task
@@ -93,33 +92,79 @@ extern char usermode_helper[];
*/
#define DRBD_SIGKILL SIGHUP
-/* All EEs on the free list should have ID_VACANT (== 0)
- * freshly allocated EEs get !ID_VACANT (== 1)
- * so if it says "cannot dereference null pointer at address 0x00000001",
- * it is most likely one of these :( */
-
#define ID_IN_SYNC (4711ULL)
#define ID_OUT_OF_SYNC (4712ULL)
-
#define ID_SYNCER (-1ULL)
-#define ID_VACANT 0
-#define is_syncer_block_id(id) ((id) == ID_SYNCER)
-
-struct drbd_conf;
+#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
+
+struct drbd_device;
+struct drbd_connection;
+
+#define __drbd_printk_device(level, device, fmt, args...) \
+ dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args)
+#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \
+ dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args)
+#define __drbd_printk_resource(level, resource, fmt, args...) \
+ printk(level "drbd %s: " fmt, (resource)->name, ## args)
+#define __drbd_printk_connection(level, connection, fmt, args...) \
+ printk(level "drbd %s: " fmt, (connection)->resource->name, ## args)
+
+void drbd_printk_with_wrong_object_type(void);
+
+#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \
+ (__builtin_types_compatible_p(typeof(obj), type) || \
+ __builtin_types_compatible_p(typeof(obj), const type)), \
+ func(level, (const type)(obj), fmt, ## args)
+
+#define drbd_printk(level, obj, fmt, args...) \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_device *, \
+ __drbd_printk_device, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_resource *, \
+ __drbd_printk_resource, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_connection *, \
+ __drbd_printk_connection, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_peer_device *, \
+ __drbd_printk_peer_device, level, fmt, ## args), \
+ drbd_printk_with_wrong_object_type()))))
+
+#define drbd_dbg(obj, fmt, args...) \
+ drbd_printk(KERN_DEBUG, obj, fmt, ## args)
+#define drbd_alert(obj, fmt, args...) \
+ drbd_printk(KERN_ALERT, obj, fmt, ## args)
+#define drbd_err(obj, fmt, args...) \
+ drbd_printk(KERN_ERR, obj, fmt, ## args)
+#define drbd_warn(obj, fmt, args...) \
+ drbd_printk(KERN_WARNING, obj, fmt, ## args)
+#define drbd_info(obj, fmt, args...) \
+ drbd_printk(KERN_INFO, obj, fmt, ## args)
+#define drbd_emerg(obj, fmt, args...) \
+ drbd_printk(KERN_EMERG, obj, fmt, ## args)
+
+#define dynamic_drbd_dbg(device, fmt, args...) \
+ dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args)
+
+#define D_ASSERT(device, exp) do { \
+ if (!(exp)) \
+ drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \
+ } while (0)
-/* to shorten dev_warn(DEV, "msg"); and relatives statements */
-#define DEV (disk_to_dev(mdev->vdisk))
-
-#define D_ASSERT(exp) if (!(exp)) \
- dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
-
-#define ERR_IF(exp) if (({ \
- int _b = (exp) != 0; \
- if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \
- __func__, #exp, __FILE__, __LINE__); \
- _b; \
- }))
+/**
+ * expect - Make an assertion
+ *
+ * Unlike the assert macro, this macro returns a boolean result.
+ */
+#define expect(exp) ({ \
+ bool _bool = (exp); \
+ if (!_bool) \
+ drbd_err(device, "ASSERTION %s FAILED in %s\n", \
+ #exp, __func__); \
+ _bool; \
+ })
/* Defines to control fault insertion */
enum {
@@ -137,151 +182,30 @@ enum {
DRBD_FAULT_MAX,
};
-#ifdef CONFIG_DRBD_FAULT_INJECTION
extern unsigned int
-_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
+_drbd_insert_fault(struct drbd_device *device, unsigned int type);
+
static inline int
-drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
+drbd_insert_fault(struct drbd_device *device, unsigned int type) {
+#ifdef CONFIG_DRBD_FAULT_INJECTION
return fault_rate &&
(enable_faults & (1<<type)) &&
- _drbd_insert_fault(mdev, type);
-}
-#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
-
+ _drbd_insert_fault(device, type);
#else
-#define FAULT_ACTIVE(_m, _t) (0)
+ return 0;
#endif
+}
/* integer division, round _UP_ to the next integer */
#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
/* usual integer division */
#define div_floor(A, B) ((A)/(B))
-/* drbd_meta-data.c (still in drbd_main.c) */
-/* 4th incarnation of the disk layout. */
-#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
-
-extern struct drbd_conf **minor_table;
extern struct ratelimit_state drbd_ratelimit_state;
+extern struct idr drbd_devices; /* RCU, updates: genl_lock() */
+extern struct list_head drbd_resources; /* RCU, updates: genl_lock() */
-/* on the wire */
-enum drbd_packets {
- /* receiver (data socket) */
- P_DATA = 0x00,
- P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
- P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */
- P_BARRIER = 0x03,
- P_BITMAP = 0x04,
- P_BECOME_SYNC_TARGET = 0x05,
- P_BECOME_SYNC_SOURCE = 0x06,
- P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */
- P_DATA_REQUEST = 0x08, /* Used to ask for a data block */
- P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */
- P_SYNC_PARAM = 0x0a,
- P_PROTOCOL = 0x0b,
- P_UUIDS = 0x0c,
- P_SIZES = 0x0d,
- P_STATE = 0x0e,
- P_SYNC_UUID = 0x0f,
- P_AUTH_CHALLENGE = 0x10,
- P_AUTH_RESPONSE = 0x11,
- P_STATE_CHG_REQ = 0x12,
-
- /* asender (meta socket */
- P_PING = 0x13,
- P_PING_ACK = 0x14,
- P_RECV_ACK = 0x15, /* Used in protocol B */
- P_WRITE_ACK = 0x16, /* Used in protocol C */
- P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
- P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */
- P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
- P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
- P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
- P_BARRIER_ACK = 0x1c,
- P_STATE_CHG_REPLY = 0x1d,
-
- /* "new" commands, no longer fitting into the ordering scheme above */
-
- P_OV_REQUEST = 0x1e, /* data socket */
- P_OV_REPLY = 0x1f,
- P_OV_RESULT = 0x20, /* meta socket */
- P_CSUM_RS_REQUEST = 0x21, /* data socket */
- P_RS_IS_IN_SYNC = 0x22, /* meta socket */
- P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
- P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
- /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
- /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
- P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
-
- P_MAX_CMD = 0x28,
- P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
- P_MAX_OPT_CMD = 0x101,
-
- /* special command ids for handshake */
-
- P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */
- P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */
-
- P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */
-};
-
-static inline const char *cmdname(enum drbd_packets cmd)
-{
- /* THINK may need to become several global tables
- * when we want to support more than
- * one PRO_VERSION */
- static const char *cmdnames[] = {
- [P_DATA] = "Data",
- [P_DATA_REPLY] = "DataReply",
- [P_RS_DATA_REPLY] = "RSDataReply",
- [P_BARRIER] = "Barrier",
- [P_BITMAP] = "ReportBitMap",
- [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
- [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
- [P_UNPLUG_REMOTE] = "UnplugRemote",
- [P_DATA_REQUEST] = "DataRequest",
- [P_RS_DATA_REQUEST] = "RSDataRequest",
- [P_SYNC_PARAM] = "SyncParam",
- [P_SYNC_PARAM89] = "SyncParam89",
- [P_PROTOCOL] = "ReportProtocol",
- [P_UUIDS] = "ReportUUIDs",
- [P_SIZES] = "ReportSizes",
- [P_STATE] = "ReportState",
- [P_SYNC_UUID] = "ReportSyncUUID",
- [P_AUTH_CHALLENGE] = "AuthChallenge",
- [P_AUTH_RESPONSE] = "AuthResponse",
- [P_PING] = "Ping",
- [P_PING_ACK] = "PingAck",
- [P_RECV_ACK] = "RecvAck",
- [P_WRITE_ACK] = "WriteAck",
- [P_RS_WRITE_ACK] = "RSWriteAck",
- [P_DISCARD_ACK] = "DiscardAck",
- [P_NEG_ACK] = "NegAck",
- [P_NEG_DREPLY] = "NegDReply",
- [P_NEG_RS_DREPLY] = "NegRSDReply",
- [P_BARRIER_ACK] = "BarrierAck",
- [P_STATE_CHG_REQ] = "StateChgRequest",
- [P_STATE_CHG_REPLY] = "StateChgReply",
- [P_OV_REQUEST] = "OVRequest",
- [P_OV_REPLY] = "OVReply",
- [P_OV_RESULT] = "OVResult",
- [P_CSUM_RS_REQUEST] = "CsumRSRequest",
- [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
- [P_COMPRESSED_BITMAP] = "CBitmap",
- [P_DELAY_PROBE] = "DelayProbe",
- [P_MAX_CMD] = NULL,
- };
-
- if (cmd == P_HAND_SHAKE_M)
- return "HandShakeM";
- if (cmd == P_HAND_SHAKE_S)
- return "HandShakeS";
- if (cmd == P_HAND_SHAKE)
- return "HandShake";
- if (cmd >= P_MAX_CMD)
- return "Unknown";
- return cmdnames[cmd];
-}
+extern const char *cmdname(enum drbd_packet cmd);
/* for sending/receiving the bitmap,
* possibly in some encoding scheme */
@@ -301,7 +225,7 @@ struct bm_xfer_ctx {
unsigned bytes[2];
};
-extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+extern void INFO_bm_xfer_stats(struct drbd_device *device,
const char *direction, struct bm_xfer_ctx *c);
static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
@@ -323,338 +247,14 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
#endif
}
-#ifndef __packed
-#define __packed __attribute__((packed))
-#endif
-
-/* This is the layout for a packet on the wire.
- * The byteorder is the network byte order.
- * (except block_id and barrier fields.
- * these are pointers to local structs
- * and have no relevance for the partner,
- * which just echoes them as received.)
- *
- * NOTE that the payload starts at a long aligned offset,
- * regardless of 32 or 64 bit arch!
- */
-struct p_header80 {
- u32 magic;
- u16 command;
- u16 length; /* bytes of data after this header */
- u8 payload[0];
-} __packed;
-
-/* Header for big packets, Used for data packets exceeding 64kB */
-struct p_header95 {
- u16 magic; /* use DRBD_MAGIC_BIG here */
- u16 command;
- u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */
- u8 payload[0];
-} __packed;
-
-union p_header {
- struct p_header80 h80;
- struct p_header95 h95;
-};
-
-/*
- * short commands, packets without payload, plain p_header:
- * P_PING
- * P_PING_ACK
- * P_BECOME_SYNC_TARGET
- * P_BECOME_SYNC_SOURCE
- * P_UNPLUG_REMOTE
- */
-
-/*
- * commands with out-of-struct payload:
- * P_BITMAP (no additional fields)
- * P_DATA, P_DATA_REPLY (see p_data)
- * P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
- */
-
-/* these defines must not be changed without changing the protocol version */
-#define DP_HARDBARRIER 1 /* depricated */
-#define DP_RW_SYNC 2 /* equals REQ_SYNC */
-#define DP_MAY_SET_IN_SYNC 4
-#define DP_UNPLUG 8 /* equals REQ_UNPLUG */
-#define DP_FUA 16 /* equals REQ_FUA */
-#define DP_FLUSH 32 /* equals REQ_FLUSH */
-#define DP_DISCARD 64 /* equals REQ_DISCARD */
-
-struct p_data {
- union p_header head;
- u64 sector; /* 64 bits sector number */
- u64 block_id; /* to identify the request in protocol B&C */
- u32 seq_num;
- u32 dp_flags;
-} __packed;
-
-/*
- * commands which share a struct:
- * p_block_ack:
- * P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
- * P_DISCARD_ACK (proto C, two-primaries conflict detection)
- * p_block_req:
- * P_DATA_REQUEST, P_RS_DATA_REQUEST
- */
-struct p_block_ack {
- struct p_header80 head;
- u64 sector;
- u64 block_id;
- u32 blksize;
- u32 seq_num;
-} __packed;
-
-
-struct p_block_req {
- struct p_header80 head;
- u64 sector;
- u64 block_id;
- u32 blksize;
- u32 pad; /* to multiple of 8 Byte */
-} __packed;
-
-/*
- * commands with their own struct for additional fields:
- * P_HAND_SHAKE
- * P_BARRIER
- * P_BARRIER_ACK
- * P_SYNC_PARAM
- * ReportParams
- */
-
-struct p_handshake {
- struct p_header80 head; /* 8 bytes */
- u32 protocol_min;
- u32 feature_flags;
- u32 protocol_max;
-
- /* should be more than enough for future enhancements
- * for now, feature_flags and the reserverd array shall be zero.
- */
-
- u32 _pad;
- u64 reserverd[7];
-} __packed;
-/* 80 bytes, FIXED for the next century */
-
-struct p_barrier {
- struct p_header80 head;
- u32 barrier; /* barrier number _handle_ only */
- u32 pad; /* to multiple of 8 Byte */
-} __packed;
-
-struct p_barrier_ack {
- struct p_header80 head;
- u32 barrier;
- u32 set_size;
-} __packed;
-
-struct p_rs_param {
- struct p_header80 head;
- u32 rate;
-
- /* Since protocol version 88 and higher. */
- char verify_alg[0];
-} __packed;
-
-struct p_rs_param_89 {
- struct p_header80 head;
- u32 rate;
- /* protocol version 89: */
- char verify_alg[SHARED_SECRET_MAX];
- char csums_alg[SHARED_SECRET_MAX];
-} __packed;
-
-struct p_rs_param_95 {
- struct p_header80 head;
- u32 rate;
- char verify_alg[SHARED_SECRET_MAX];
- char csums_alg[SHARED_SECRET_MAX];
- u32 c_plan_ahead;
- u32 c_delay_target;
- u32 c_fill_target;
- u32 c_max_rate;
-} __packed;
-
-enum drbd_conn_flags {
- CF_WANT_LOSE = 1,
- CF_DRY_RUN = 2,
-};
-
-struct p_protocol {
- struct p_header80 head;
- u32 protocol;
- u32 after_sb_0p;
- u32 after_sb_1p;
- u32 after_sb_2p;
- u32 conn_flags;
- u32 two_primaries;
-
- /* Since protocol version 87 and higher. */
- char integrity_alg[0];
-
-} __packed;
-
-struct p_uuids {
- struct p_header80 head;
- u64 uuid[UI_EXTENDED_SIZE];
-} __packed;
-
-struct p_rs_uuid {
- struct p_header80 head;
- u64 uuid;
-} __packed;
-
-struct p_sizes {
- struct p_header80 head;
- u64 d_size; /* size of disk */
- u64 u_size; /* user requested size */
- u64 c_size; /* current exported size */
- u32 max_segment_size; /* Maximal size of a BIO */
- u16 queue_order_type; /* not yet implemented in DRBD*/
- u16 dds_flags; /* use enum dds_flags here. */
-} __packed;
-
-struct p_state {
- struct p_header80 head;
- u32 state;
-} __packed;
-
-struct p_req_state {
- struct p_header80 head;
- u32 mask;
- u32 val;
-} __packed;
-
-struct p_req_state_reply {
- struct p_header80 head;
- u32 retcode;
-} __packed;
-
-struct p_drbd06_param {
- u64 size;
- u32 state;
- u32 blksize;
- u32 protocol;
- u32 version;
- u32 gen_cnt[5];
- u32 bit_map_gen[5];
-} __packed;
-
-struct p_discard {
- struct p_header80 head;
- u64 block_id;
- u32 seq_num;
- u32 pad;
-} __packed;
-
-/* Valid values for the encoding field.
- * Bump proto version when changing this. */
-enum drbd_bitmap_code {
- /* RLE_VLI_Bytes = 0,
- * and other bit variants had been defined during
- * algorithm evaluation. */
- RLE_VLI_Bits = 2,
-};
-
-struct p_compressed_bm {
- struct p_header80 head;
- /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
- * (encoding & 0x80): polarity (set/unset) of first runlength
- * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
- * used to pad up to head.length bytes
- */
- u8 encoding;
-
- u8 code[0];
-} __packed;
-
-struct p_delay_probe93 {
- struct p_header80 head;
- u32 seq_num; /* sequence number to match the two probe packets */
- u32 offset; /* usecs the probe got sent after the reference time point */
-} __packed;
-
-/* DCBP: Drbd Compressed Bitmap Packet ... */
-static inline enum drbd_bitmap_code
-DCBP_get_code(struct p_compressed_bm *p)
-{
- return (enum drbd_bitmap_code)(p->encoding & 0x0f);
-}
-
-static inline void
-DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
-{
- BUG_ON(code & ~0xf);
- p->encoding = (p->encoding & ~0xf) | code;
-}
-
-static inline int
-DCBP_get_start(struct p_compressed_bm *p)
-{
- return (p->encoding & 0x80) != 0;
-}
-
-static inline void
-DCBP_set_start(struct p_compressed_bm *p, int set)
-{
- p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
-}
-
-static inline int
-DCBP_get_pad_bits(struct p_compressed_bm *p)
-{
- return (p->encoding >> 4) & 0x7;
-}
-
-static inline void
-DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
-{
- BUG_ON(n & ~0x7);
- p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
-}
-
-/* one bitmap packet, including the p_header,
- * should fit within one _architecture independend_ page.
- * so we need to use the fixed size 4KiB page size
- * most architechtures have used for a long time.
- */
-#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
-#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
-#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
-#if (PAGE_SIZE < 4096)
-/* drbd_send_bitmap / receive_bitmap would break horribly */
-#error "PAGE_SIZE too small"
-#endif
-
-union p_polymorph {
- union p_header header;
- struct p_handshake handshake;
- struct p_data data;
- struct p_block_ack block_ack;
- struct p_barrier barrier;
- struct p_barrier_ack barrier_ack;
- struct p_rs_param_89 rs_param_89;
- struct p_rs_param_95 rs_param_95;
- struct p_protocol protocol;
- struct p_sizes sizes;
- struct p_uuids uuids;
- struct p_state state;
- struct p_req_state req_state;
- struct p_req_state_reply req_state_reply;
- struct p_block_req block_req;
- struct p_delay_probe93 delay_probe93;
- struct p_rs_uuid rs_uuid;
-} __packed;
+extern unsigned int drbd_header_size(struct drbd_connection *connection);
/**********************************************************************/
enum drbd_thread_state {
- None,
- Running,
- Exiting,
- Restarting
+ NONE,
+ RUNNING,
+ EXITING,
+ RESTARTING
};
struct drbd_thread {
@@ -663,8 +263,10 @@ struct drbd_thread {
struct completion stop;
enum drbd_thread_state t_state;
int (*function) (struct drbd_thread *);
- struct drbd_conf *mdev;
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
int reset_cpu_mask;
+ const char *name;
};
static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
@@ -677,69 +279,56 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
return thi->t_state;
}
-
-/*
- * Having this as the first member of a struct provides sort of "inheritance".
- * "derived" structs can be "drbd_queue_work()"ed.
- * The callback should know and cast back to the descendant struct.
- * drbd_request and drbd_epoch_entry are descendants of drbd_work.
- */
-struct drbd_work;
-typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
struct drbd_work {
struct list_head list;
- drbd_work_cb cb;
+ int (*cb)(struct drbd_work *, int cancel);
+};
+
+struct drbd_device_work {
+ struct drbd_work w;
+ struct drbd_device *device;
};
-struct drbd_tl_epoch;
+#include "drbd_interval.h"
+
+extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
+
struct drbd_request {
struct drbd_work w;
- struct drbd_conf *mdev;
+ struct drbd_device *device;
/* if local IO is not allowed, will be NULL.
* if local IO _is_ allowed, holds the locally submitted bio clone,
* or, after local IO completion, the ERR_PTR(error).
- * see drbd_endio_pri(). */
+ * see drbd_request_endio(). */
struct bio *private_bio;
- struct hlist_node colision;
- sector_t sector;
- unsigned int size;
- unsigned int epoch; /* barrier_nr */
+ struct drbd_interval i;
- /* barrier_nr: used to check on "completion" whether this req was in
+ /* epoch: used to check on "completion" whether this req was in
* the current epoch, and we therefore have to close it,
- * starting a new epoch...
+ * causing a p_barrier packet to be send, starting a new epoch.
+ *
+ * This corresponds to "barrier" in struct p_barrier[_ack],
+ * and to "barrier_nr" in struct drbd_epoch (and various
+ * comments/function parameters/local variable names).
*/
-
- /* up to here, the struct layout is identical to drbd_epoch_entry;
- * we might be able to use that to our advantage... */
+ unsigned int epoch;
struct list_head tl_requests; /* ring list in the transfer log */
struct bio *master_bio; /* master bio pointer */
- unsigned long rq_state; /* see comments above _req_mod() */
- int seq_num;
unsigned long start_time;
-};
-struct drbd_tl_epoch {
- struct drbd_work w;
- struct list_head requests; /* requests before */
- struct drbd_tl_epoch *next; /* pointer to the next barrier */
- unsigned int br_number; /* the barriers identifier. */
- int n_writes; /* number of requests attached before this barrier */
-};
-
-struct drbd_request;
+ /* once it hits 0, we may complete the master_bio */
+ atomic_t completion_ref;
+ /* once it hits 0, we may destroy this drbd_request object */
+ struct kref kref;
-/* These Tl_epoch_entries may be in one of 6 lists:
- active_ee .. data packet being written
- sync_ee .. syncer block being written
- done_ee .. block written, need to send P_WRITE_ACK
- read_ee .. [RS]P_DATA_REQUEST being read
-*/
+ unsigned rq_state; /* see comments above _req_mod() */
+};
struct drbd_epoch {
+ struct drbd_connection *connection;
struct list_head list;
unsigned int barrier_nr;
atomic_t epoch_size; /* increased on every request added. */
@@ -747,6 +336,10 @@ struct drbd_epoch {
unsigned long flags;
};
+/* Prototype declaration of function defined in drbd_receiver.c */
+int drbdd_init(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
/* drbd_epoch flag bits */
enum {
DE_HAVE_BARRIER_NUMBER,
@@ -759,27 +352,20 @@ enum epoch_event {
EV_CLEANUP = 32, /* used as flag */
};
-struct drbd_wq_barrier {
- struct drbd_work w;
- struct completion done;
-};
-
struct digest_info {
int digest_size;
void *digest;
};
-struct drbd_epoch_entry {
+struct drbd_peer_request {
struct drbd_work w;
- struct hlist_node colision;
+ struct drbd_peer_device *peer_device;
struct drbd_epoch *epoch; /* for writes */
- struct drbd_conf *mdev;
struct page *pages;
atomic_t pending_bios;
- unsigned int size;
+ struct drbd_interval i;
/* see comments on ee flag bits below */
unsigned long flags;
- sector_t sector;
union {
u64 block_id;
struct digest_info *digest;
@@ -796,42 +382,54 @@ enum {
__EE_CALL_AL_COMPLETE_IO,
__EE_MAY_SET_IN_SYNC,
+ /* is this a TRIM aka REQ_DISCARD? */
+ __EE_IS_TRIM,
+ /* our lower level cannot handle trim,
+ * and we want to fall back to zeroout instead */
+ __EE_IS_TRIM_USE_ZEROOUT,
+
/* In case a barrier failed,
* we need to resubmit without the barrier flag. */
__EE_RESUBMITTED,
- /* we may have several bios per epoch entry.
+ /* we may have several bios per peer request.
* if any of those fail, we set this flag atomically
* from the endio callback */
__EE_WAS_ERROR,
/* This ee has a pointer to a digest instead of a block id */
__EE_HAS_DIGEST,
+
+ /* Conflicting local requests need to be restarted after this request */
+ __EE_RESTART_REQUESTS,
+
+ /* The peer wants a write ACK for this (wire proto C) */
+ __EE_SEND_WRITE_ACK,
+
+ /* Is set when net_conf had two_primaries set while creating this peer_req */
+ __EE_IN_INTERVAL_TREE,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
-#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
+#define EE_IS_TRIM (1<<__EE_IS_TRIM)
+#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
+#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
+#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
+#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
+#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
-/* global flag bits */
+/* flag bits per device */
enum {
- CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */
- SIGNAL_ASENDER, /* whether asender wants to be interrupted */
- SEND_PING, /* whether asender should send a ping asap */
-
- UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
- DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
- CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */
CL_ST_CHG_SUCCESS,
CL_ST_CHG_FAIL,
CRASHED_PRIMARY, /* This node was a crashed primary.
* Gets cleared when the state.conn
* goes into C_CONNECTED state. */
- WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */
CONSIDER_RESYNC,
MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
@@ -840,95 +438,109 @@ enum {
once no more io in flight, start bitmap io */
BITMAP_IO_QUEUED, /* Started bitmap IO */
GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
- WAS_IO_ERROR, /* Local disk failed returned IO error */
+ WAS_IO_ERROR, /* Local disk failed, returned IO error */
+ WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
+ FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
- NET_CONGESTED, /* The data socket is congested */
-
- CONFIG_PENDING, /* serialization of (re)configuration requests.
- * if set, also prevents the device from dying */
- DEVICE_DYING, /* device became unconfigured,
- * but worker thread is still handling the cleanup.
- * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed,
- * while this is set. */
RESIZE_PENDING, /* Size change detected locally, waiting for the response from
* the peer, if it changed there as well. */
- CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
- GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
NEW_CUR_UUID, /* Create new current UUID when thawing IO */
AL_SUSPENDED, /* Activity logging is currently suspended. */
+ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
+ B_RS_H_DONE, /* Before resync handler done (already executed) */
+ DISCARD_MY_DATA, /* discard_my_data flag per volume */
+ READ_BALANCE_RR,
};
-struct drbd_bitmap; /* opaque for drbd_conf */
+struct drbd_bitmap; /* opaque for drbd_device */
-/* TODO sort members for performance
- * MAYBE group them further */
+/* definition of bits in bm_flags to be used in drbd_bm_lock
+ * and drbd_bitmap_io and friends. */
+enum bm_flag {
+ /* do we need to kfree, or vfree bm_pages? */
+ BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
+
+ /* currently locked for bulk operation */
+ BM_LOCKED_MASK = 0xf,
+
+ /* in detail, that is: */
+ BM_DONT_CLEAR = 0x1,
+ BM_DONT_SET = 0x2,
+ BM_DONT_TEST = 0x4,
+
+ /* so we can mark it locked for bulk operation,
+ * and still allow all non-bulk operations */
+ BM_IS_LOCKED = 0x8,
+
+ /* (test bit, count bit) allowed (common case) */
+ BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
+
+ /* testing bits, as well as setting new bits allowed, but clearing bits
+ * would be unexpected. Used during bitmap receive. Setting new bits
+ * requires sending of "out-of-sync" information, though. */
+ BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
+
+ /* for drbd_bm_write_copy_pages, everything is allowed,
+ * only concurrent bulk operations are locked out. */
+ BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
+};
-/* THINK maybe we actually want to use the default "event/%s" worker threads
- * or similar in linux 2.6, which uses per cpu data and threads.
- */
struct drbd_work_queue {
struct list_head q;
- struct semaphore s; /* producers up it, worker down()s it */
spinlock_t q_lock; /* to protect the list. */
+ wait_queue_head_t q_wait;
};
struct drbd_socket {
- struct drbd_work_queue work;
struct mutex mutex;
struct socket *socket;
/* this way we get our
* send/receive buffers off the stack */
- union p_polymorph sbuf;
- union p_polymorph rbuf;
+ void *sbuf;
+ void *rbuf;
};
struct drbd_md {
u64 md_offset; /* sector offset to 'super' block */
u64 la_size_sect; /* last agreed size, unit sectors */
+ spinlock_t uuid_lock;
u64 uuid[UI_SIZE];
u64 device_uuid;
u32 flags;
u32 md_size_sect;
- s32 al_offset; /* signed relative sector offset to al area */
+ s32 al_offset; /* signed relative sector offset to activity log */
s32 bm_offset; /* signed relative sector offset to bitmap */
- /* u32 al_nr_extents; important for restoring the AL
- * is stored into sync_conf.al_extents, which in turn
- * gets applied to act_log->nr_elements
- */
-};
+ /* cached value of bdev->disk_conf->meta_dev_idx (see below) */
+ s32 meta_dev_idx;
-/* for sync_conf and other types... */
-#define NL_PACKET(name, number, fields) struct name { fields };
-#define NL_INTEGER(pn,pr,member) int member;
-#define NL_INT64(pn,pr,member) __u64 member;
-#define NL_BIT(pn,pr,member) unsigned member:1;
-#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
-#include "linux/drbd_nl.h"
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+ u32 al_size_4k; /* cached product of the above */
+};
struct drbd_backing_dev {
struct block_device *backing_bdev;
struct block_device *md_bdev;
- struct file *lo_file;
- struct file *md_file;
struct drbd_md md;
- struct disk_conf dc; /* The user provided config... */
+ struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */
sector_t known_size; /* last known size of that backing device */
};
struct drbd_md_io {
- struct drbd_conf *mdev;
- struct completion event;
+ unsigned int done;
int error;
};
struct bm_io_work {
struct drbd_work w;
char *why;
- int (*io_fn)(struct drbd_conf *mdev);
- void (*done)(struct drbd_conf *mdev, int rv);
+ enum bm_flag flags;
+ int (*io_fn)(struct drbd_device *device);
+ void (*done)(struct drbd_device *device, int rv);
};
enum write_ordering_e {
@@ -938,18 +550,143 @@ enum write_ordering_e {
};
struct fifo_buffer {
- int *values;
unsigned int head_index;
unsigned int size;
+ int total; /* sum of all values */
+ int values[0];
+};
+extern struct fifo_buffer *fifo_alloc(int fifo_size);
+
+/* flag bits per connection */
+enum {
+ NET_CONGESTED, /* The data socket is congested */
+ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
+ SEND_PING, /* whether asender should send a ping asap */
+ SIGNAL_ASENDER, /* whether asender wants to be interrupted */
+ GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
+ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
+ CONN_WD_ST_CHG_OKAY,
+ CONN_WD_ST_CHG_FAIL,
+ CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
+ CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
+ STATE_SENT, /* Do not change state/UUIDs while this is set */
+ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+ * pending, from drbd worker context.
+ * If set, bdi_write_congested() returns true,
+ * so shrink_page_list() would not recurse into,
+ * and potentially deadlock on, this drbd worker.
+ */
+ DISCONNECT_SENT,
+};
+
+struct drbd_resource {
+ char *name;
+ struct kref kref;
+ struct idr devices; /* volume number to device mapping */
+ struct list_head connections;
+ struct list_head resources;
+ struct res_opts res_opts;
+ struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */
+ struct mutex adm_mutex; /* mutex to serialize administrative requests */
+ spinlock_t req_lock;
+
+ unsigned susp:1; /* IO suspended by user */
+ unsigned susp_nod:1; /* IO suspended because no data */
+ unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
+
+ cpumask_var_t cpu_mask;
+};
+
+struct drbd_connection {
+ struct list_head connections;
+ struct drbd_resource *resource;
+ struct kref kref;
+ struct idr peer_devices; /* volume number to peer device mapping */
+ enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
+ struct mutex cstate_mutex; /* Protects graceful disconnects */
+ unsigned int connect_cnt; /* Inc each time a connection is established */
+
+ unsigned long flags;
+ struct net_conf *net_conf; /* content protected by rcu */
+ wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */
+
+ struct sockaddr_storage my_addr;
+ int my_addr_len;
+ struct sockaddr_storage peer_addr;
+ int peer_addr_len;
+
+ struct drbd_socket data; /* data/barrier/cstate/parameter packets */
+ struct drbd_socket meta; /* ping/ack (metadata) packets */
+ int agreed_pro_version; /* actually used protocol version */
+ u32 agreed_features;
+ unsigned long last_received; /* in jiffies, either socket */
+ unsigned int ko_count;
+
+ struct list_head transfer_log; /* all requests not yet fully processed */
+
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by connection->data->mutex */
+ struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *verify_tfm;
+ void *int_dig_in;
+ void *int_dig_vv;
+
+ /* receiver side */
+ struct drbd_epoch *current_epoch;
+ spinlock_t epoch_lock;
+ unsigned int epochs;
+ enum write_ordering_e write_ordering;
+ atomic_t current_tle_nr; /* transfer log epoch number */
+ unsigned current_tle_writes; /* writes seen within this tl epoch */
+
+ unsigned long last_reconnect_jif;
+ struct drbd_thread receiver;
+ struct drbd_thread worker;
+ struct drbd_thread asender;
+
+ /* sender side */
+ struct drbd_work_queue sender_work;
+
+ struct {
+ /* whether this sender thread
+ * has processed a single write yet. */
+ bool seen_any_write_yet;
+
+ /* Which barrier number to send with the next P_BARRIER */
+ int current_epoch_nr;
+
+ /* how many write requests have been sent
+ * with req->epoch == current_epoch_nr.
+ * If none, no P_BARRIER will be sent. */
+ unsigned current_epoch_writes;
+ } send;
+};
+
+struct submit_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head writes;
+};
+
+struct drbd_peer_device {
+ struct list_head peer_devices;
+ struct drbd_device *device;
+ struct drbd_connection *connection;
};
-struct drbd_conf {
+struct drbd_device {
+ struct drbd_resource *resource;
+ struct list_head peer_devices;
+ int vnr; /* volume number within the connection */
+ struct kref kref;
+
/* things that are stored as / read from meta data on disk */
unsigned long flags;
/* configured by drbdsetup */
- struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
- struct syncer_conf sync_conf;
struct drbd_backing_dev *ldev __protected_by(local);
sector_t p_size; /* partner's disk size */
@@ -957,17 +694,16 @@ struct drbd_conf {
struct block_device *this_bdev;
struct gendisk *vdisk;
- struct drbd_socket data; /* data/barrier/cstate/parameter packets */
- struct drbd_socket meta; /* ping/ack (metadata) packets */
- int agreed_pro_version; /* actually used protocol version */
- unsigned long last_received; /* in jiffies, either socket */
- unsigned int ko_count;
- struct drbd_work resync_work,
- unplug_work,
- go_diskless,
- md_sync_work;
+ unsigned long last_reattach_jif;
+ struct drbd_work resync_work;
+ struct drbd_work unplug_work;
+ struct drbd_work go_diskless;
+ struct drbd_work md_sync_work;
+ struct drbd_work start_resync_work;
struct timer_list resync_timer;
struct timer_list md_sync_timer;
+ struct timer_list start_resync_timer;
+ struct timer_list request_timer;
#ifdef DRBD_DEBUG_MD_SYNC
struct {
unsigned int line;
@@ -978,10 +714,9 @@ struct drbd_conf {
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
- union drbd_state state;
+ union drbd_dev_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
- wait_queue_head_t net_cnt_wait;
unsigned int send_cnt;
unsigned int recv_cnt;
unsigned int read_cnt;
@@ -991,20 +726,16 @@ struct drbd_conf {
atomic_t ap_bio_cnt; /* Requests we need to complete */
atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
- atomic_t unacked_cnt; /* Need to send replys for */
+ atomic_t unacked_cnt; /* Need to send replies for */
atomic_t local_cnt; /* Waiting for local completion */
- atomic_t net_cnt; /* Users of net_conf */
- spinlock_t req_lock;
- struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
- struct drbd_tl_epoch *newest_tle;
- struct drbd_tl_epoch *oldest_tle;
- struct list_head out_of_sequence_requests;
- struct hlist_head *tl_hash;
- unsigned int tl_hash_s;
-
- /* blocks to sync in this run [unit BM_BLOCK_SIZE] */
+
+ /* Interval tree of pending local requests */
+ struct rb_root read_requests;
+ struct rb_root write_requests;
+
+ /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
unsigned long rs_total;
- /* number of sync IOs that failed in this run */
+ /* number of resync blocks that failed in this run */
unsigned long rs_failed;
/* Syncer's start time [unit jiffies] */
unsigned long rs_start;
@@ -1020,9 +751,11 @@ struct drbd_conf {
unsigned long rs_mark_time[DRBD_SYNC_MARKS];
/* current index into rs_mark_{left,time} */
int rs_last_mark;
+ unsigned long rs_last_bcast; /* [unit jiffies] */
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
+ sector_t ov_stop_sector;
/* where are we now? (sector) */
sector_t ov_position;
/* Start sector of out of sync range (to merge printk reporting). */
@@ -1030,12 +763,7 @@ struct drbd_conf {
/* size of out-of-sync range in sectors. */
sector_t ov_last_oos_size;
unsigned long ov_left; /* in bits */
- struct crypto_hash *csums_tfm;
- struct crypto_hash *verify_tfm;
- struct drbd_thread receiver;
- struct drbd_thread worker;
- struct drbd_thread asender;
struct drbd_bitmap *bitmap;
unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
@@ -1048,52 +776,36 @@ struct drbd_conf {
int open_cnt;
u64 *p_uuid;
- struct drbd_epoch *current_epoch;
- spinlock_t epoch_lock;
- unsigned int epochs;
- enum write_ordering_e write_ordering;
+
struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
- struct list_head done_ee; /* send ack */
- struct list_head read_ee; /* IO in progress (any read) */
+ struct list_head done_ee; /* need to send P_WRITE_ACK */
+ struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
struct list_head net_ee; /* zero-copy network send in progress */
- struct hlist_head *ee_hash; /* is proteced by req_lock! */
- unsigned int ee_hash_s;
-
- /* this one is protected by ee_lock, single thread */
- struct drbd_epoch_entry *last_write_w_barrier;
int next_barrier_nr;
- struct hlist_head *app_reads_hash; /* is proteced by req_lock */
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
- struct page *md_io_tmpp; /* for logical_block_size != 512 */
- struct mutex md_io_mutex; /* protects the md_io_buffer */
+ struct drbd_md_io md_io;
+ atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */
spinlock_t al_lock;
wait_queue_head_t al_wait;
struct lru_cache *act_log; /* activity log */
unsigned int al_tr_number;
int al_tr_cycle;
- int al_tr_pos; /* position of the next transaction in the journal */
- struct crypto_hash *cram_hmac_tfm;
- struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
- struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
- void *int_dig_out;
- void *int_dig_in;
- void *int_dig_vv;
wait_queue_head_t seq_wait;
atomic_t packet_seq;
unsigned int peer_seq;
spinlock_t peer_seq_lock;
unsigned int minor;
unsigned long comm_bm_set; /* communicated number of set bits. */
- cpumask_var_t cpu_mask;
struct bm_io_work bm_io_work;
u64 ed_uuid; /* UUID of the exposed data */
- struct mutex state_mutex;
+ struct mutex own_state_mutex;
+ struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */
char congestion_reason; /* Why we where congested... */
atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
atomic_t rs_sect_ev; /* for submitted resync data rate, both */
@@ -1101,48 +813,79 @@ struct drbd_conf {
int rs_last_events; /* counter of read or write "events" (unit sectors)
* on the lower level device when we last looked. */
int c_sync_rate; /* current resync rate after syncer throttle magic */
- struct fifo_buffer rs_plan_s; /* correction values of resync planer */
+ struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, connection->conn_update) */
int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
- int rs_planed; /* resync sectors already planed */
-};
-
-static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
-{
- struct drbd_conf *mdev;
+ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
+ unsigned int peer_max_bio_size;
+ unsigned int local_max_bio_size;
- mdev = minor < minor_count ? minor_table[minor] : NULL;
+ /* any requests that would block in drbd_make_request()
+ * are deferred to this single-threaded work queue */
+ struct submit_worker submit;
+};
- return mdev;
-}
+struct drbd_config_context {
+ /* assigned from drbd_genlmsghdr */
+ unsigned int minor;
+ /* assigned from request attributes, if present */
+ unsigned int volume;
+#define VOLUME_UNSPECIFIED (-1U)
+ /* pointer into the request skb,
+ * limited lifetime! */
+ char *resource_name;
+ struct nlattr *my_addr;
+ struct nlattr *peer_addr;
+
+ /* reply buffer */
+ struct sk_buff *reply_skb;
+ /* pointer into reply buffer */
+ struct drbd_genlmsghdr *reply_dh;
+ /* resolved from attributes, if possible */
+ struct drbd_device *device;
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+};
-static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
+static inline struct drbd_device *minor_to_device(unsigned int minor)
{
- return mdev->minor;
+ return (struct drbd_device *)idr_find(&drbd_devices, minor);
}
-/* returns 1 if it was successfull,
- * returns 0 if there was no data socket.
- * so wherever you are going to use the data.socket, e.g. do
- * if (!drbd_get_data_sock(mdev))
- * return 0;
- * CODE();
- * drbd_put_data_sock(mdev);
- */
-static inline int drbd_get_data_sock(struct drbd_conf *mdev)
+static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
{
- mutex_lock(&mdev->data.mutex);
- /* drbd_disconnect() could have called drbd_free_sock()
- * while we were waiting in down()... */
- if (unlikely(mdev->data.socket == NULL)) {
- mutex_unlock(&mdev->data.mutex);
- return 0;
- }
- return 1;
+ return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
}
-static inline void drbd_put_data_sock(struct drbd_conf *mdev)
+#define for_each_resource(resource, _resources) \
+ list_for_each_entry(resource, _resources, resources)
+
+#define for_each_resource_rcu(resource, _resources) \
+ list_for_each_entry_rcu(resource, _resources, resources)
+
+#define for_each_resource_safe(resource, tmp, _resources) \
+ list_for_each_entry_safe(resource, tmp, _resources, resources)
+
+#define for_each_connection(connection, resource) \
+ list_for_each_entry(connection, &resource->connections, connections)
+
+#define for_each_connection_rcu(connection, resource) \
+ list_for_each_entry_rcu(connection, &resource->connections, connections)
+
+#define for_each_connection_safe(connection, tmp, resource) \
+ list_for_each_entry_safe(connection, tmp, &resource->connections, connections)
+
+#define for_each_peer_device(peer_device, device) \
+ list_for_each_entry(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_rcu(peer_device, device) \
+ list_for_each_entry_rcu(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_safe(peer_device, tmp, device) \
+ list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices)
+
+static inline unsigned int device_to_minor(struct drbd_device *device)
{
- mutex_unlock(&mdev->data.mutex);
+ return device->minor;
}
/*
@@ -1151,147 +894,157 @@ static inline void drbd_put_data_sock(struct drbd_conf *mdev)
/* drbd_main.c */
-enum chg_state_flags {
- CS_HARD = 1,
- CS_VERBOSE = 2,
- CS_WAIT_COMPLETE = 4,
- CS_SERIALIZE = 8,
- CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
-};
-
enum dds_flags {
DDSF_FORCED = 1,
DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
};
-extern void drbd_init_set_defaults(struct drbd_conf *mdev);
-extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
- union drbd_state mask, union drbd_state val);
-extern void drbd_force_state(struct drbd_conf *, union drbd_state,
- union drbd_state);
-extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
- union drbd_state, enum chg_state_flags);
-extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
- enum chg_state_flags, struct completion *done);
-extern void print_st_err(struct drbd_conf *, union drbd_state,
- union drbd_state, int);
+extern void drbd_init_set_defaults(struct drbd_device *device);
extern int drbd_thread_start(struct drbd_thread *thi);
extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
#ifdef CONFIG_SMP
-extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
-extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
+extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
#else
#define drbd_thread_current_set_cpu(A) ({})
-#define drbd_calc_cpu_mask(A) ({})
#endif
-extern void drbd_free_resources(struct drbd_conf *mdev);
-extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+extern void tl_release(struct drbd_connection *, unsigned int barrier_nr,
unsigned int set_size);
-extern void tl_clear(struct drbd_conf *mdev);
-enum drbd_req_event;
-extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
-extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
-extern void drbd_free_sock(struct drbd_conf *mdev);
-extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
- void *buf, size_t size, unsigned msg_flags);
-extern int drbd_send_protocol(struct drbd_conf *mdev);
-extern int drbd_send_uuids(struct drbd_conf *mdev);
-extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
-extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
-extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
-extern int _drbd_send_state(struct drbd_conf *mdev);
-extern int drbd_send_state(struct drbd_conf *mdev);
-extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header80 *h,
- size_t size, unsigned msg_flags);
-#define USE_DATA_SOCKET 1
-#define USE_META_SOCKET 0
-extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header80 *h,
- size_t size);
-extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
- char *data, size_t size);
-extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
-extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
- u32 set_size);
-extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct drbd_epoch_entry *e);
-extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_block_req *rp);
-extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp, int data_size);
-extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+extern void tl_clear(struct drbd_connection *);
+extern void drbd_free_sock(struct drbd_connection *connection);
+extern int drbd_send(struct drbd_connection *connection, struct socket *sock,
+ void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_all(struct drbd_connection *, struct socket *, void *, size_t,
+ unsigned);
+
+extern int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd);
+extern int drbd_send_protocol(struct drbd_connection *connection);
+extern int drbd_send_uuids(struct drbd_peer_device *);
+extern int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *);
+extern void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *);
+extern int drbd_send_sizes(struct drbd_peer_device *, int trigger_reply, enum dds_flags flags);
+extern int drbd_send_state(struct drbd_peer_device *, union drbd_state s);
+extern int drbd_send_current_state(struct drbd_peer_device *);
+extern int drbd_send_sync_param(struct drbd_peer_device *);
+extern void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr,
+ u32 set_size);
+extern int drbd_send_ack(struct drbd_peer_device *, enum drbd_packet,
+ struct drbd_peer_request *);
+extern void drbd_send_ack_rp(struct drbd_peer_device *, enum drbd_packet,
+ struct p_block_req *rp);
+extern void drbd_send_ack_dp(struct drbd_peer_device *, enum drbd_packet,
+ struct p_data *dp, int data_size);
+extern int drbd_send_ack_ex(struct drbd_peer_device *, enum drbd_packet,
sector_t sector, int blksize, u64 block_id);
-extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct drbd_epoch_entry *e);
-extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
-extern int _drbd_send_barrier(struct drbd_conf *mdev,
- struct drbd_tl_epoch *barrier);
-extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+extern int drbd_send_out_of_sync(struct drbd_peer_device *, struct drbd_request *);
+extern int drbd_send_block(struct drbd_peer_device *, enum drbd_packet,
+ struct drbd_peer_request *);
+extern int drbd_send_dblock(struct drbd_peer_device *, struct drbd_request *req);
+extern int drbd_send_drequest(struct drbd_peer_device *, int cmd,
sector_t sector, int size, u64 block_id);
-extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
- sector_t sector,int size,
- void *digest, int digest_size,
- enum drbd_packets cmd);
-extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
-
-extern int drbd_send_bitmap(struct drbd_conf *mdev);
-extern int _drbd_send_bitmap(struct drbd_conf *mdev);
-extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
+extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector,
+ int size, void *digest, int digest_size,
+ enum drbd_packet cmd);
+extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size);
+
+extern int drbd_send_bitmap(struct drbd_device *device);
+extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
+extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
extern void drbd_free_bc(struct drbd_backing_dev *ldev);
-extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
-
-/* drbd_meta-data.c (still in drbd_main.c) */
-extern void drbd_md_sync(struct drbd_conf *mdev);
-extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
-/* maybe define them below as inline? */
-extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
-extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
-extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
-extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
-extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
-extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
-extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
+extern void drbd_device_cleanup(struct drbd_device *device);
+void drbd_print_uuids(struct drbd_device *device, const char *text);
+
+extern void conn_md_sync(struct drbd_connection *connection);
+extern void drbd_md_write(struct drbd_device *device, void *buffer);
+extern void drbd_md_sync(struct drbd_device *device);
+extern int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local);
+extern void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local);
+extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
#ifndef DRBD_DEBUG_MD_SYNC
-extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
+extern void drbd_md_mark_dirty(struct drbd_device *device);
#else
#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
-extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
+extern void drbd_md_mark_dirty_(struct drbd_device *device,
unsigned int line, const char *func);
#endif
-extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
- int (*io_fn)(struct drbd_conf *),
- void (*done)(struct drbd_conf *, int),
- char *why);
-extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
-extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
-extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
-extern void drbd_go_diskless(struct drbd_conf *mdev);
-extern void drbd_ldev_destroy(struct drbd_conf *mdev);
-
+extern void drbd_queue_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ void (*done)(struct drbd_device *, int),
+ char *why, enum bm_flag flags);
+extern int drbd_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags);
+extern int drbd_bmio_set_n_write(struct drbd_device *device);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device);
+extern void drbd_ldev_destroy(struct drbd_device *device);
/* Meta data layout
- We reserve a 128MB Block (4k aligned)
- * either at the end of the backing device
- * or on a separate meta data device. */
-
-#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
-/* The following numbers are sectors */
-#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
-#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
-/* Allows up to about 3.8TB */
-#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
-
-/* Since the smalles IO unit is usually 512 byte */
-#define MD_SECTOR_SHIFT 9
-#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT)
-
-/* activity log */
-#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
-#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * Variants:
+ * old, indexed fixed size meta data:
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ * end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ * The activity log consists of 4k transaction blocks,
+ * which are written in a ring-buffer, or striped ring-buffer like fashion,
+ * which are writtensize used to be fixed 32kB,
+ * but is about to become configurable.
+ */
+
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
+ * you need to use the "flexible" meta data format. */
+#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
+#define MD_4kB_SECT 8
+#define MD_32kB_SECT 64
+
+/* One activity log extent represents 4M of storage */
+#define AL_EXTENT_SHIFT 22
#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ * This many changes to the active set can be logged with one transaction.
+ * This number is arbitrary.
+ * context per transaction:
+ * This many context extent numbers are logged with each transaction.
+ * This number is resulting from the transaction block size (4k), the layout
+ * of the transaction header, and the number of updates per transaction.
+ * See drbd_actlog.c:struct al_transaction_on_disk
+ * */
+#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary
+#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4
+
#if BITS_PER_LONG == 32
#define LN2_BPL 5
#define cpu_to_lel(A) cpu_to_le32(A)
@@ -1315,6 +1068,7 @@ struct bm_extent {
#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
+#define BME_PRIORITY 2 /* finish resync IO on this extent ASAP! App IO waiting! */
/* drbd_bitmap.c */
/*
@@ -1326,11 +1080,14 @@ struct bm_extent {
#define SLEEP_TIME (HZ/10)
-#define BM_BLOCK_SHIFT 12 /* 4k per bit */
+/* We do bitmap IO in units of 4k blocks.
+ * We also still have a hardcoded 4k per bit relation. */
+#define BM_BLOCK_SHIFT 12 /* 4k per bit */
#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
-/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
- * per sector of on disk bitmap */
-#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
+/* mostly arbitrarily set the represented size of one bitmap extent,
+ * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
+ * at 4k per bit resolution) */
+#define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */
#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
@@ -1355,7 +1112,6 @@ struct bm_extent {
/* in one sector of the bitmap, we have this many activity_log extents. */
#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
-#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
@@ -1375,16 +1131,18 @@ struct bm_extent {
*/
#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
-#define DRBD_MAX_SECTORS_BM \
- ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
-#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
-#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
-#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
-#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
+/* we have a certain meta data variant that has a fixed on-disk size of 128
+ * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
+ * log, leaving this many sectors for the bitmap.
+ */
+
+#define DRBD_MAX_SECTORS_FIXED_BM \
+ ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
+#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
#else
-#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
/* 16 TB in units of sectors */
#if BITS_PER_LONG == 32
/* adjust by one page worth of bitmap,
@@ -1392,217 +1150,335 @@ struct bm_extent {
* you should use 64bit OS for that much storage, anyways. */
#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
#else
-#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32)
+/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
+#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
+/* corresponds to (1UL << 38) bits right now. */
#endif
#endif
-/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
- * With a value of 8 all IO in one 128K block make it to the same slot of the
- * hash table. */
-#define HT_SHIFT 8
-#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
-
-#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
-
-/* Number of elements in the app_reads_hash */
-#define APP_R_HSIZE 15
-
-extern int drbd_bm_init(struct drbd_conf *mdev);
-extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
-extern void drbd_bm_cleanup(struct drbd_conf *mdev);
-extern void drbd_bm_set_all(struct drbd_conf *mdev);
-extern void drbd_bm_clear_all(struct drbd_conf *mdev);
+/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+ * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte.
+ * Since we may live in a mixed-platform cluster,
+ * we limit us to a platform agnostic constant here for now.
+ * A followup commit may allow even bigger BIO sizes,
+ * once we thought that through. */
+#define DRBD_MAX_BIO_SIZE (1U << 20)
+#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#endif
+#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
+
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
+#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
+
+/* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE
+#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+
+extern int drbd_bm_init(struct drbd_device *device);
+extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
+extern void drbd_bm_cleanup(struct drbd_device *device);
+extern void drbd_bm_set_all(struct drbd_device *device);
+extern void drbd_bm_clear_all(struct drbd_device *device);
+/* set/clear/test only a few bits at a time */
extern int drbd_bm_set_bits(
- struct drbd_conf *mdev, unsigned long s, unsigned long e);
+ struct drbd_device *device, unsigned long s, unsigned long e);
extern int drbd_bm_clear_bits(
- struct drbd_conf *mdev, unsigned long s, unsigned long e);
-/* bm_set_bits variant for use while holding drbd_bm_lock */
-extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
+ struct drbd_device *device, unsigned long s, unsigned long e);
+extern int drbd_bm_count_bits(
+ struct drbd_device *device, const unsigned long s, const unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock,
+ * may process the whole bitmap in one go */
+extern void _drbd_bm_set_bits(struct drbd_device *device,
const unsigned long s, const unsigned long e);
-extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
-extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
-extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
-extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
-extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
-extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
- unsigned long al_enr);
-extern size_t drbd_bm_words(struct drbd_conf *mdev);
-extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
-extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
-extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
+extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
+extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
+extern int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local);
+extern int drbd_bm_read(struct drbd_device *device) __must_hold(local);
+extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
+extern int drbd_bm_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
+extern size_t drbd_bm_words(struct drbd_device *device);
+extern unsigned long drbd_bm_bits(struct drbd_device *device);
+extern sector_t drbd_bm_capacity(struct drbd_device *device);
+
+#define DRBD_END_OF_BITMAP (~(unsigned long)0)
+extern unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
/* bm_find_next variants for use while you hold drbd_bm_lock() */
-extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
-extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
-extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
-extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
-extern int drbd_bm_rs_done(struct drbd_conf *mdev);
+extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
+extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
+extern int drbd_bm_rs_done(struct drbd_device *device);
/* for receive_bitmap */
-extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
+extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
size_t number, unsigned long *buffer);
-/* for _drbd_send_bitmap and drbd_bm_write_sect */
-extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
+/* for _drbd_send_bitmap */
+extern void drbd_bm_get_lel(struct drbd_device *device, size_t offset,
size_t number, unsigned long *buffer);
-extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
-extern void drbd_bm_unlock(struct drbd_conf *mdev);
-
-extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
+extern void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags);
+extern void drbd_bm_unlock(struct drbd_device *device);
/* drbd_main.c */
extern struct kmem_cache *drbd_request_cache;
-extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
+extern struct kmem_cache *drbd_ee_cache; /* peer requests */
extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t *drbd_request_mempool;
extern mempool_t *drbd_ee_mempool;
-extern struct page *drbd_pp_pool; /* drbd's page pool */
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * When allocating from this pool, it first takes pages from the pool.
+ * Only if the pool is depleted will try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
extern spinlock_t drbd_pp_lock;
extern int drbd_pp_vacant;
extern wait_queue_head_t drbd_pp_wait;
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES 128
+extern mempool_t *drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set *drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
+
extern rwlock_t global_state_lock;
-extern struct drbd_conf *drbd_new_device(unsigned int minor);
-extern void drbd_free_mdev(struct drbd_conf *mdev);
+extern int conn_lowest_minor(struct drbd_connection *connection);
+extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
+extern void drbd_destroy_device(struct kref *kref);
+extern void drbd_delete_device(struct drbd_device *device);
+
+extern struct drbd_resource *drbd_create_resource(const char *name);
+extern void drbd_free_resource(struct drbd_resource *resource);
+
+extern int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts);
+extern struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts);
+extern void drbd_destroy_connection(struct kref *kref);
+extern struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len);
+extern struct drbd_resource *drbd_find_resource(const char *name);
+extern void drbd_destroy_resource(struct kref *kref);
+extern void conn_free_crypto(struct drbd_connection *connection);
extern int proc_details;
/* drbd_req */
-extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
-extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
+extern void do_submit(struct work_struct *ws);
+extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long);
+extern void drbd_make_request(struct request_queue *q, struct bio *bio);
+extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
extern int is_valid_ar_handle(struct drbd_request *, sector_t);
/* drbd_nl.c */
-extern void drbd_suspend_io(struct drbd_conf *mdev);
-extern void drbd_resume_io(struct drbd_conf *mdev);
+extern int drbd_msg_put_info(struct sk_buff *skb, const char *info);
+extern void drbd_suspend_io(struct drbd_device *device);
+extern void drbd_resume_io(struct drbd_device *device);
extern char *ppsize(char *buf, unsigned long long size);
-extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
-enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
-extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
-extern void resync_after_online_grow(struct drbd_conf *);
-extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
-extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
- int force);
-extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
-extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
-extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
+extern sector_t drbd_new_dev_size(struct drbd_device *, struct drbd_backing_dev *, sector_t, int);
+enum determine_dev_size {
+ DS_ERROR_SHRINK = -3,
+ DS_ERROR_SPACE_MD = -2,
+ DS_ERROR = -1,
+ DS_UNCHANGED = 0,
+ DS_SHRUNK = 1,
+ DS_GREW = 2,
+ DS_GREW_FROM_ZERO = 3,
+};
+extern enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
+extern void resync_after_online_grow(struct drbd_device *);
+extern void drbd_reconsider_max_bio_size(struct drbd_device *device);
+extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
+ enum drbd_role new_role,
+ int force);
+extern bool conn_try_outdate_peer(struct drbd_connection *connection);
+extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
+extern int drbd_khelper(struct drbd_device *device, char *cmd);
/* drbd_worker.c */
+/* bi_end_io handlers */
+extern void drbd_md_io_complete(struct bio *bio, int error);
+extern void drbd_peer_request_endio(struct bio *bio, int error);
+extern void drbd_request_endio(struct bio *bio, int error);
extern int drbd_worker(struct drbd_thread *thi);
-extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
-extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
-extern void resume_next_sg(struct drbd_conf *mdev);
-extern void suspend_other_sg(struct drbd_conf *mdev);
-extern int drbd_resync_finished(struct drbd_conf *mdev);
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
+void drbd_resync_after_changed(struct drbd_device *device);
+extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side);
+extern void resume_next_sg(struct drbd_device *device);
+extern void suspend_other_sg(struct drbd_device *device);
+extern int drbd_resync_finished(struct drbd_device *device);
/* maybe rather drbd_main.c ? */
-extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
+extern void *drbd_md_get_buffer(struct drbd_device *device);
+extern void drbd_md_put_buffer(struct drbd_device *device);
+extern int drbd_md_sync_page_io(struct drbd_device *device,
struct drbd_backing_dev *bdev, sector_t sector, int rw);
-extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
+extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int);
+extern void wait_until_done_or_force_detached(struct drbd_device *device,
+ struct drbd_backing_dev *bdev, unsigned int *done);
+extern void drbd_rs_controller_reset(struct drbd_device *device);
-static inline void ov_oos_print(struct drbd_conf *mdev)
+static inline void ov_out_of_sync_print(struct drbd_device *device)
{
- if (mdev->ov_last_oos_size) {
- dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
- (unsigned long long)mdev->ov_last_oos_start,
- (unsigned long)mdev->ov_last_oos_size);
+ if (device->ov_last_oos_size) {
+ drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n",
+ (unsigned long long)device->ov_last_oos_start,
+ (unsigned long)device->ov_last_oos_size);
}
- mdev->ov_last_oos_size=0;
+ device->ov_last_oos_size = 0;
}
-extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
-extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
+extern void drbd_csum_bio(struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_ee(struct crypto_hash *, struct drbd_peer_request *, void *);
/* worker callbacks */
-extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
-extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
-extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
-extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
-extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
-extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_data_req(struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_work *, int);
+extern int w_resync_timer(struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_work *, int);
+extern int w_send_out_of_sync(struct drbd_work *, int);
+extern int w_start_resync(struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
+extern void start_resync_timer_fn(unsigned long data);
+
+extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
/* drbd_receiver.c */
-extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
-extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
- const unsigned rw, const int fault_type);
-extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
-extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
- u64 id,
- sector_t sector,
- unsigned int data_size,
- gfp_t gfp_mask) __must_hold(local);
-extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
- int is_net);
-#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
-#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
-extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
- struct list_head *head);
-extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
- struct list_head *head);
-extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
-extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
-extern void drbd_flush_workqueue(struct drbd_conf *mdev);
-extern void drbd_free_tl_hash(struct drbd_conf *mdev);
-
-/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
- * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
+extern int drbd_receiver(struct drbd_thread *thi);
+extern int drbd_asender(struct drbd_thread *thi);
+extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
+extern int drbd_submit_peer_request(struct drbd_device *,
+ struct drbd_peer_request *, const unsigned,
+ const int);
+extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
+extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
+ sector_t, unsigned int,
+ bool,
+ gfp_t) __must_hold(local);
+extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
+ int);
+#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
+#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
+extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
+extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
+extern int drbd_connected(struct drbd_peer_device *);
+
+/* Yes, there is kernel_setsockopt, but only since 2.6.18.
+ * So we have our own copy of it here. */
static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ char *optval, int optlen)
{
+ mm_segment_t oldfs = get_fs();
+ char __user *uoptval;
int err;
+
+ uoptval = (char __user __force *)optval;
+
+ set_fs(KERNEL_DS);
if (level == SOL_SOCKET)
- err = sock_setsockopt(sock, level, optname, optval, optlen);
+ err = sock_setsockopt(sock, level, optname, uoptval, optlen);
else
- err = sock->ops->setsockopt(sock, level, optname, optval,
+ err = sock->ops->setsockopt(sock, level, optname, uoptval,
optlen);
+ set_fs(oldfs);
return err;
}
static inline void drbd_tcp_cork(struct socket *sock)
{
- int __user val = 1;
+ int val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
static inline void drbd_tcp_uncork(struct socket *sock)
{
- int __user val = 0;
+ int val = 0;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
static inline void drbd_tcp_nodelay(struct socket *sock)
{
- int __user val = 1;
+ int val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
static inline void drbd_tcp_quickack(struct socket *sock)
{
- int __user val = 2;
+ int val = 2;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
-void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
+/* sets the number of 512 byte sectors of our virtual device */
+static inline void drbd_set_my_capacity(struct drbd_device *device,
+ sector_t size)
+{
+ /* set_capacity(device->this_bdev->bd_disk, size); */
+ set_capacity(device->vdisk, size);
+ device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
+}
+
+/*
+ * used to submit our private bio
+ */
+static inline void drbd_generic_make_request(struct drbd_device *device,
+ int fault_type, struct bio *bio)
+{
+ __release(local);
+ if (!bio->bi_bdev) {
+ printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
+ "bio->bi_bdev == NULL\n",
+ device_to_minor(device));
+ dump_stack();
+ bio_endio(bio, -ENODEV);
+ return;
+ }
+
+ if (drbd_insert_fault(device, fault_type))
+ bio_endio(bio, -EIO);
+ else
+ generic_make_request(bio);
+}
+
+void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo);
/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
@@ -1611,92 +1487,47 @@ extern const char *drbd_conn_str(enum drbd_conns s);
extern const char *drbd_role_str(enum drbd_role s);
/* drbd_actlog.c */
-extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
-extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
-extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
-extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
-extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
-extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
-extern int drbd_rs_del_all(struct drbd_conf *mdev);
-extern void drbd_rs_failed_io(struct drbd_conf *mdev,
+extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
+extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate);
+extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate);
+extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
+extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern void drbd_rs_cancel_all(struct drbd_device *device);
+extern int drbd_rs_del_all(struct drbd_device *device);
+extern void drbd_rs_failed_io(struct drbd_device *device,
sector_t sector, int size);
-extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
-extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
+extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
+extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector,
int size, const char *file, const unsigned int line);
-#define drbd_set_in_sync(mdev, sector, size) \
- __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
-extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
+#define drbd_set_in_sync(device, sector, size) \
+ __drbd_set_in_sync(device, sector, size, __FILE__, __LINE__)
+extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector,
int size, const char *file, const unsigned int line);
-#define drbd_set_out_of_sync(mdev, sector, size) \
- __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
-extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
-extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
-extern void drbd_al_shrink(struct drbd_conf *mdev);
-
+#define drbd_set_out_of_sync(device, sector, size) \
+ __drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__)
+extern void drbd_al_shrink(struct drbd_device *device);
+extern int drbd_initialize_al(struct drbd_device *, void *);
/* drbd_nl.c */
-
-void drbd_nl_cleanup(void);
-int __init drbd_nl_init(void);
-void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
-void drbd_bcast_sync_progress(struct drbd_conf *mdev);
-void drbd_bcast_ee(struct drbd_conf *mdev,
- const char *reason, const int dgs,
- const char* seen_hash, const char* calc_hash,
- const struct drbd_epoch_entry* e);
-
-
-/**
- * DOC: DRBD State macros
- *
- * These macros are used to express state changes in easily readable form.
- *
- * The NS macros expand to a mask and a value, that can be bit ored onto the
- * current state as soon as the spinlock (req_lock) was taken.
- *
- * The _NS macros are used for state functions that get called with the
- * spinlock. These macros expand directly to the new state value.
- *
- * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
- * to express state changes that affect more than one aspect of the state.
- *
- * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
- * Means that the network connection was established and that the peer
- * is in secondary role.
- */
-#define role_MASK R_MASK
-#define peer_MASK R_MASK
-#define disk_MASK D_MASK
-#define pdsk_MASK D_MASK
-#define conn_MASK C_MASK
-#define susp_MASK 1
-#define user_isp_MASK 1
-#define aftr_isp_MASK 1
-#define susp_nod_MASK 1
-#define susp_fen_MASK 1
-
-#define NS(T, S) \
- ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T = (S); val; })
-#define NS2(T1, S1, T2, S2) \
- ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
- mask.T2 = T2##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
- val.T2 = (S2); val; })
-#define NS3(T1, S1, T2, S2, T3, S3) \
- ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
- mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
- val.T2 = (S2); val.T3 = (S3); val; })
-
-#define _NS(D, T, S) \
- D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
-#define _NS2(D, T1, S1, T2, S2) \
- D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
- __ns.T2 = (S2); __ns; })
-#define _NS3(D, T1, S1, T2, S2, T3, S3) \
- D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
- __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+/* state info broadcast */
+struct sib_info {
+ enum drbd_state_info_bcast_reason sib_reason;
+ union {
+ struct {
+ char *helper_name;
+ unsigned helper_exit_code;
+ };
+ struct {
+ union drbd_state os;
+ union drbd_state ns;
+ };
+ };
+};
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
/*
* inline helper functions
@@ -1713,22 +1544,10 @@ static inline struct page *page_chain_next(struct page *page)
#define page_chain_for_each_safe(page, n) \
for (; page && ({ n = page_chain_next(page); 1; }); page = n)
-static inline int drbd_bio_has_active_page(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- __bio_for_each_segment(bvec, bio, i, 0) {
- if (page_count(bvec->bv_page) > 1)
- return 1;
- }
- return 0;
-}
-
-static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
+static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
{
- struct page *page = e->pages;
+ struct page *page = peer_req->pages;
page_chain_for_each(page) {
if (page_count(page) > 1)
return 1;
@@ -1736,66 +1555,89 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
return 0;
}
-
-static inline void drbd_state_lock(struct drbd_conf *mdev)
-{
- wait_event(mdev->misc_wait,
- !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
-}
-
-static inline void drbd_state_unlock(struct drbd_conf *mdev)
+static inline enum drbd_state_rv
+_drbd_set_state(struct drbd_device *device, union drbd_state ns,
+ enum chg_state_flags flags, struct completion *done)
{
- clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
- wake_up(&mdev->misc_wait);
-}
-
-static inline int _drbd_set_state(struct drbd_conf *mdev,
- union drbd_state ns, enum chg_state_flags flags,
- struct completion *done)
-{
- int rv;
+ enum drbd_state_rv rv;
read_lock(&global_state_lock);
- rv = __drbd_set_state(mdev, ns, flags, done);
+ rv = __drbd_set_state(device, ns, flags, done);
read_unlock(&global_state_lock);
return rv;
}
-/**
- * drbd_request_state() - Reqest a state change
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- *
- * This is the most graceful way of requesting a state change. It is verbose
- * quite verbose in case the state change is not possible, and all those
- * state changes are globally serialized.
- */
-static inline int drbd_request_state(struct drbd_conf *mdev,
- union drbd_state mask,
- union drbd_state val)
+static inline union drbd_state drbd_read_state(struct drbd_device *device)
{
- return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
+ struct drbd_resource *resource = device->resource;
+ union drbd_state rv;
+
+ rv.i = device->state.i;
+ rv.susp = resource->susp;
+ rv.susp_nod = resource->susp_nod;
+ rv.susp_fen = resource->susp_fen;
+
+ return rv;
}
+enum drbd_force_detach_flags {
+ DRBD_READ_ERROR,
+ DRBD_WRITE_ERROR,
+ DRBD_META_IO_ERROR,
+ DRBD_FORCE_DETACH,
+};
+
#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
-static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
-{
- switch (mdev->ldev->dc.on_io_error) {
- case EP_PASS_ON:
- if (!forcedetach) {
+static inline void __drbd_chk_io_error_(struct drbd_device *device,
+ enum drbd_force_detach_flags df,
+ const char *where)
+{
+ enum drbd_io_error_p ep;
+
+ rcu_read_lock();
+ ep = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+ rcu_read_unlock();
+ switch (ep) {
+ case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
+ if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Local IO failed in %s.\n", where);
+ drbd_err(device, "Local IO failed in %s.\n", where);
+ if (device->state.disk > D_INCONSISTENT)
+ _drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);
break;
}
- /* NOTE fall through to detach case if forcedetach set */
+ /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */
case EP_DETACH:
case EP_CALL_HELPER:
- set_bit(WAS_IO_ERROR, &mdev->flags);
- if (mdev->state.disk > D_FAILED) {
- _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
- dev_err(DEV,
+ /* Remember whether we saw a READ or WRITE error.
+ *
+ * Recovery of the affected area for WRITE failure is covered
+ * by the activity log.
+ * READ errors may fall outside that area though. Certain READ
+ * errors can be "healed" by writing good data to the affected
+ * blocks, which triggers block re-allocation in lower layers.
+ *
+ * If we can not write the bitmap after a READ error,
+ * we may need to trigger a full sync (see w_go_diskless()).
+ *
+ * Force-detach is not really an IO error, but rather a
+ * desperate measure to try to deal with a completely
+ * unresponsive lower level IO stack.
+ * Still it should be treated as a WRITE error.
+ *
+ * Meta IO error is always WRITE error:
+ * we read meta data only once during attach,
+ * which will fail in case of errors.
+ */
+ set_bit(WAS_IO_ERROR, &device->flags);
+ if (df == DRBD_READ_ERROR)
+ set_bit(WAS_READ_ERROR, &device->flags);
+ if (df == DRBD_FORCE_DETACH)
+ set_bit(FORCE_DETACH, &device->flags);
+ if (device->state.disk > D_FAILED) {
+ _drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL);
+ drbd_err(device,
"Local IO failed in %s. Detaching...\n", where);
}
break;
@@ -1804,21 +1646,21 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
/**
* drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @error: Error code passed to the IO completion callback
* @forcedetach: Force detach. I.e. the error happened while accessing the meta data
*
* See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
*/
#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
-static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
- int error, int forcedetach, const char *where)
+static inline void drbd_chk_io_error_(struct drbd_device *device,
+ int error, enum drbd_force_detach_flags forcedetach, const char *where)
{
if (error) {
unsigned long flags;
- spin_lock_irqsave(&mdev->req_lock, flags);
- __drbd_chk_io_error_(mdev, forcedetach, where);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ __drbd_chk_io_error_(device, forcedetach, where);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
}
}
@@ -1832,7 +1674,7 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
*/
static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
{
- switch (bdev->dc.meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + bdev->md.bm_offset;
@@ -1848,13 +1690,13 @@ static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
*/
static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
{
- switch (bdev->dc.meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
- return bdev->md.md_offset + MD_AL_OFFSET - 1;
+ return bdev->md.md_offset + MD_4kB_SECT -1;
case DRBD_MD_INDEX_FLEX_EXT:
default:
- return bdev->md.md_offset + bdev->md.md_size_sect;
+ return bdev->md.md_offset + bdev->md.md_size_sect -1;
}
}
@@ -1876,12 +1718,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev)
static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
{
sector_t s;
- switch (bdev->dc.meta_dev_idx) {
+
+ switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
s = drbd_get_capacity(bdev->backing_bdev)
? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
- drbd_md_first_sector(bdev))
+ drbd_md_first_sector(bdev))
: 0;
break;
case DRBD_MD_INDEX_FLEX_EXT:
@@ -1900,33 +1743,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
}
/**
- * drbd_md_ss__() - Return the sector number of our meta data super block
- * @mdev: DRBD device.
+ * drbd_md_ss() - Return the sector number of our meta data super block
* @bdev: Meta data block device.
*/
-static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev)
+static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
{
- switch (bdev->dc.meta_dev_idx) {
- default: /* external, some index */
- return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
- case DRBD_MD_INDEX_INTERNAL:
- /* with drbd08, internal meta data is always "flexible" */
- case DRBD_MD_INDEX_FLEX_INT:
- /* sizeof(struct md_on_disk_07) == 4k
- * position: last 4k aligned block of 4k size */
- if (!bdev->backing_bdev) {
- if (__ratelimit(&drbd_ratelimit_state)) {
- dev_err(DEV, "bdev->backing_bdev==NULL\n");
- dump_stack();
- }
- return 0;
- }
- return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
- - MD_AL_OFFSET;
- case DRBD_MD_INDEX_FLEX_EXT:
+ const int meta_dev_idx = bdev->md.meta_dev_idx;
+
+ if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
return 0;
- }
+
+ /* Since drbd08, internal meta data is always "flexible".
+ * position: last 4k aligned block of 4k size */
+ if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
+ return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
+
+ /* external, some index; this is the old fixed size layout */
+ return MD_128MB_SECT * bdev->md.meta_dev_idx;
}
static inline void
@@ -1935,9 +1769,8 @@ drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add(&w->list, &q->q);
- up(&q->s); /* within the spinlock,
- see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
}
static inline void
@@ -1946,55 +1779,51 @@ drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add_tail(&w->list, &q->q);
- up(&q->s); /* within the spinlock,
- see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
}
-static inline void wake_asender(struct drbd_conf *mdev)
-{
- if (test_bit(SIGNAL_ASENDER, &mdev->flags))
- force_sig(DRBD_SIG, mdev->asender.task);
-}
+extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
-static inline void request_ping(struct drbd_conf *mdev)
+static inline void wake_asender(struct drbd_connection *connection)
{
- set_bit(SEND_PING, &mdev->flags);
- wake_asender(mdev);
+ if (test_bit(SIGNAL_ASENDER, &connection->flags))
+ force_sig(DRBD_SIG, connection->asender.task);
}
-static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
- enum drbd_packets cmd)
+static inline void request_ping(struct drbd_connection *connection)
{
- struct p_header80 h;
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
+ set_bit(SEND_PING, &connection->flags);
+ wake_asender(connection);
}
-static inline int drbd_send_ping(struct drbd_conf *mdev)
-{
- struct p_header80 h;
- return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
-}
+extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
+extern void *drbd_prepare_command(struct drbd_peer_device *, struct drbd_socket *);
+extern int conn_send_command(struct drbd_connection *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
+extern int drbd_send_command(struct drbd_peer_device *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
-static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
-{
- struct p_header80 h;
- return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
-}
+extern int drbd_send_ping(struct drbd_connection *connection);
+extern int drbd_send_ping_ack(struct drbd_connection *connection);
+extern int drbd_send_state_req(struct drbd_peer_device *, union drbd_state, union drbd_state);
+extern int conn_send_state_req(struct drbd_connection *, union drbd_state, union drbd_state);
static inline void drbd_thread_stop(struct drbd_thread *thi)
{
- _drbd_thread_stop(thi, FALSE, TRUE);
+ _drbd_thread_stop(thi, false, true);
}
static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
{
- _drbd_thread_stop(thi, FALSE, FALSE);
+ _drbd_thread_stop(thi, false, false);
}
static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
{
- _drbd_thread_stop(thi, TRUE, FALSE);
+ _drbd_thread_stop(thi, true, false);
}
/* counts how many answer packets packets we expect from our peer,
@@ -2002,55 +1831,59 @@ static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
* or implicit barrier packets as necessary.
* increased:
* w_send_barrier
- * _req_mod(req, queue_for_net_write or queue_for_net_read);
+ * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
* it is much easier and equally valid to count what we queue for the
* worker, even before it actually was queued or send.
* (drbd_make_request_common; recovery path on read io-error)
* decreased:
* got_BarrierAck (respective tl_clear, tl_clear_barrier)
- * _req_mod(req, data_received)
+ * _req_mod(req, DATA_RECEIVED)
* [from receive_DataReply]
- * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
+ * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
* [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
* for some reason it is NOT decreased in got_NegAck,
* but in the resulting cleanup code from report_params.
* we should try to remember the reason for that...
- * _req_mod(req, send_failed or send_canceled)
- * _req_mod(req, connection_lost_while_pending)
+ * _req_mod(req, SEND_FAILED or SEND_CANCELED)
+ * _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
* [from tl_clear_barrier]
*/
-static inline void inc_ap_pending(struct drbd_conf *mdev)
+static inline void inc_ap_pending(struct drbd_device *device)
{
- atomic_inc(&mdev->ap_pending_cnt);
+ atomic_inc(&device->ap_pending_cnt);
}
-#define ERR_IF_CNT_IS_NEGATIVE(which) \
- if (atomic_read(&mdev->which) < 0) \
- dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
- __func__ , __LINE__ , \
- atomic_read(&mdev->which))
+#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \
+ if (atomic_read(&device->which) < 0) \
+ drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n", \
+ func, line, \
+ atomic_read(&device->which))
-#define dec_ap_pending(mdev) do { \
- typecheck(struct drbd_conf *, mdev); \
- if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \
- wake_up(&mdev->misc_wait); \
- ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
+#define dec_ap_pending(device) _dec_ap_pending(device, __FUNCTION__, __LINE__)
+static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
+{
+ if (atomic_dec_and_test(&device->ap_pending_cnt))
+ wake_up(&device->misc_wait);
+ ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);
+}
/* counts how many resync-related answers we still expect from the peer
* increase decrease
* C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
- * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER)
+ * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER)
* (or P_NEG_ACK with ID_SYNCER)
*/
-static inline void inc_rs_pending(struct drbd_conf *mdev)
+static inline void inc_rs_pending(struct drbd_device *device)
{
- atomic_inc(&mdev->rs_pending_cnt);
+ atomic_inc(&device->rs_pending_cnt);
}
-#define dec_rs_pending(mdev) do { \
- typecheck(struct drbd_conf *, mdev); \
- atomic_dec(&mdev->rs_pending_cnt); \
- ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
+#define dec_rs_pending(device) _dec_rs_pending(device, __FUNCTION__, __LINE__)
+static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
+{
+ atomic_dec(&device->rs_pending_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);
+}
/* counts how many answers we still need to send to the peer.
* increased on
@@ -2061,122 +1894,119 @@ static inline void inc_rs_pending(struct drbd_conf *mdev)
* receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
* receive_Barrier_* we need to send a P_BARRIER_ACK
*/
-static inline void inc_unacked(struct drbd_conf *mdev)
+static inline void inc_unacked(struct drbd_device *device)
{
- atomic_inc(&mdev->unacked_cnt);
+ atomic_inc(&device->unacked_cnt);
}
-#define dec_unacked(mdev) do { \
- typecheck(struct drbd_conf *, mdev); \
- atomic_dec(&mdev->unacked_cnt); \
- ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
-
-#define sub_unacked(mdev, n) do { \
- typecheck(struct drbd_conf *, mdev); \
- atomic_sub(n, &mdev->unacked_cnt); \
- ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
-
-
-static inline void put_net_conf(struct drbd_conf *mdev)
+#define dec_unacked(device) _dec_unacked(device, __FUNCTION__, __LINE__)
+static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
{
- if (atomic_dec_and_test(&mdev->net_cnt))
- wake_up(&mdev->net_cnt_wait);
+ atomic_dec(&device->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
-/**
- * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
- * @mdev: DRBD device.
- *
- * You have to call put_net_conf() when finished working with mdev->net_conf.
- */
-static inline int get_net_conf(struct drbd_conf *mdev)
+#define sub_unacked(device, n) _sub_unacked(device, n, __FUNCTION__, __LINE__)
+static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
{
- int have_net_conf;
-
- atomic_inc(&mdev->net_cnt);
- have_net_conf = mdev->state.conn >= C_UNCONNECTED;
- if (!have_net_conf)
- put_net_conf(mdev);
- return have_net_conf;
+ atomic_sub(n, &device->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
/**
- * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
+ * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
* @M: DRBD device.
*
- * You have to call put_ldev() when finished working with mdev->ldev.
+ * You have to call put_ldev() when finished working with device->ldev.
*/
#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT))
#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))
-static inline void put_ldev(struct drbd_conf *mdev)
+static inline void put_ldev(struct drbd_device *device)
{
- int i = atomic_dec_return(&mdev->local_cnt);
+ int i = atomic_dec_return(&device->local_cnt);
+
+ /* This may be called from some endio handler,
+ * so we must not sleep here. */
+
__release(local);
- D_ASSERT(i >= 0);
+ D_ASSERT(device, i >= 0);
if (i == 0) {
- if (mdev->state.disk == D_DISKLESS)
+ if (device->state.disk == D_DISKLESS)
/* even internal references gone, safe to destroy */
- drbd_ldev_destroy(mdev);
- if (mdev->state.disk == D_FAILED)
+ drbd_ldev_destroy(device);
+ if (device->state.disk == D_FAILED) {
/* all application IO references gone. */
- drbd_go_diskless(mdev);
- wake_up(&mdev->misc_wait);
+ if (!test_and_set_bit(GO_DISKLESS, &device->flags))
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &device->go_diskless);
+ }
+ wake_up(&device->misc_wait);
}
}
#ifndef __CHECKER__
-static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
{
int io_allowed;
/* never get a reference while D_DISKLESS */
- if (mdev->state.disk == D_DISKLESS)
+ if (device->state.disk == D_DISKLESS)
return 0;
- atomic_inc(&mdev->local_cnt);
- io_allowed = (mdev->state.disk >= mins);
+ atomic_inc(&device->local_cnt);
+ io_allowed = (device->state.disk >= mins);
if (!io_allowed)
- put_ldev(mdev);
+ put_ldev(device);
return io_allowed;
}
#else
-extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
+extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
#endif
/* you must have an "get_ldev" reference */
-static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
+static inline void drbd_get_syncer_progress(struct drbd_device *device,
unsigned long *bits_left, unsigned int *per_mil_done)
{
- /*
- * this is to break it at compile time when we change that
- * (we may feel 4TB maximum storage per drbd is not enough)
- */
- typecheck(unsigned long, mdev->rs_total);
+ /* this is to break it at compile time when we change that, in case we
+ * want to support more than (1<<32) bits on a 32bit arch. */
+ typecheck(unsigned long, device->rs_total);
/* note: both rs_total and rs_left are in bits, i.e. in
* units of BM_BLOCK_SIZE.
* for the percentage, we don't care. */
- *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+ if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+ *bits_left = device->ov_left;
+ else
+ *bits_left = drbd_bm_total_weight(device) - device->rs_failed;
/* >> 10 to prevent overflow,
* +1 to prevent division by zero */
- if (*bits_left > mdev->rs_total) {
+ if (*bits_left > device->rs_total) {
/* doh. maybe a logic bug somewhere.
* may also be just a race condition
* between this and a disconnect during sync.
* for now, just prevent in-kernel buffer overflow.
*/
smp_rmb();
- dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
- drbd_conn_str(mdev->state.conn),
- *bits_left, mdev->rs_total, mdev->rs_failed);
+ drbd_warn(device, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
+ drbd_conn_str(device->state.conn),
+ *bits_left, device->rs_total, device->rs_failed);
*per_mil_done = 0;
} else {
- /* make sure the calculation happens in long context */
- unsigned long tmp = 1000UL -
- (*bits_left >> 10)*1000UL
- / ((mdev->rs_total >> 10) + 1UL);
+ /* Make sure the division happens in long context.
+ * We allow up to one petabyte storage right now,
+ * at a granularity of 4k per bit that is 2**38 bits.
+ * After shift right and multiplication by 1000,
+ * this should still fit easily into a 32bit long,
+ * so we don't need a 64bit division on 32bit arch.
+ * Note: currently we don't support such large bitmaps on 32bit
+ * arch anyways, but no harm done to be prepared for it here.
+ */
+ unsigned int shift = device->rs_total > UINT_MAX ? 16 : 10;
+ unsigned long left = *bits_left >> shift;
+ unsigned long total = 1UL + (device->rs_total >> shift);
+ unsigned long tmp = 1000UL - left * 1000UL/total;
*per_mil_done = tmp;
}
}
@@ -2185,18 +2015,22 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
/* this throttles on-the-fly application requests
* according to max_buffers settings;
* maybe re-implement using semaphores? */
-static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
+static inline int drbd_get_max_buffers(struct drbd_device *device)
{
- int mxb = 1000000; /* arbitrary limit on open requests */
- if (get_net_conf(mdev)) {
- mxb = mdev->net_conf->max_buffers;
- put_net_conf(mdev);
- }
+ struct net_conf *nc;
+ int mxb;
+
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */
+ rcu_read_unlock();
+
return mxb;
}
-static inline int drbd_state_is_stable(union drbd_state s)
+static inline int drbd_state_is_stable(struct drbd_device *device)
{
+ union drbd_dev_state s = device->state;
/* DO NOT add a default clause, we want the compiler to warn us
* for any newly introduced state we may have forgotten to add here */
@@ -2213,11 +2047,9 @@ static inline int drbd_state_is_stable(union drbd_state s)
case C_VERIFY_T:
case C_PAUSED_SYNC_S:
case C_PAUSED_SYNC_T:
- /* maybe stable, look at the disk state */
- break;
-
- /* no new io accepted during tansitional states
- * like handshake or teardown */
+ case C_AHEAD:
+ case C_BEHIND:
+ /* transitional states, IO allowed */
case C_DISCONNECTING:
case C_UNCONNECTED:
case C_TIMEOUT:
@@ -2228,7 +2060,15 @@ static inline int drbd_state_is_stable(union drbd_state s)
case C_WF_REPORT_PARAMS:
case C_STARTING_SYNC_S:
case C_STARTING_SYNC_T:
+ break;
+
+ /* Allow IO in BM exchange states with new protocols */
case C_WF_BITMAP_S:
+ if (first_peer_device(device)->connection->agreed_pro_version < 96)
+ return 0;
+ break;
+
+ /* no new io accepted in these states */
case C_WF_BITMAP_T:
case C_WF_SYNC_UUID:
case C_MASK:
@@ -2242,12 +2082,12 @@ static inline int drbd_state_is_stable(union drbd_state s)
case D_OUTDATED:
case D_CONSISTENT:
case D_UP_TO_DATE:
+ case D_FAILED:
/* disk state is stable as well. */
break;
- /* no new io accepted during tansitional states */
+ /* no new io accepted during transitional states */
case D_ATTACHING:
- case D_FAILED:
case D_NEGOTIATING:
case D_UNKNOWN:
case D_MASK:
@@ -2258,123 +2098,100 @@ static inline int drbd_state_is_stable(union drbd_state s)
return 1;
}
-static inline int is_susp(union drbd_state s)
+static inline int drbd_suspended(struct drbd_device *device)
{
- return s.susp || s.susp_nod || s.susp_fen;
+ struct drbd_resource *resource = device->resource;
+
+ return resource->susp || resource->susp_fen || resource->susp_nod;
}
-static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
+static inline bool may_inc_ap_bio(struct drbd_device *device)
{
- int mxb = drbd_get_max_buffers(mdev);
+ int mxb = drbd_get_max_buffers(device);
- if (is_susp(mdev->state))
- return 0;
- if (test_bit(SUSPEND_IO, &mdev->flags))
- return 0;
+ if (drbd_suspended(device))
+ return false;
+ if (test_bit(SUSPEND_IO, &device->flags))
+ return false;
/* to avoid potential deadlock or bitmap corruption,
* in various places, we only allow new application io
* to start during "stable" states. */
/* no new io accepted when attaching or detaching the disk */
- if (!drbd_state_is_stable(mdev->state))
- return 0;
+ if (!drbd_state_is_stable(device))
+ return false;
/* since some older kernels don't have atomic_add_unless,
* and we are within the spinlock anyways, we have this workaround. */
- if (atomic_read(&mdev->ap_bio_cnt) > mxb)
- return 0;
- if (test_bit(BITMAP_IO, &mdev->flags))
- return 0;
- return 1;
+ if (atomic_read(&device->ap_bio_cnt) > mxb)
+ return false;
+ if (test_bit(BITMAP_IO, &device->flags))
+ return false;
+ return true;
}
-/* I'd like to use wait_event_lock_irq,
- * but I'm not sure when it got introduced,
- * and not sure when it has 3 or 4 arguments */
-static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
+static inline bool inc_ap_bio_cond(struct drbd_device *device)
{
- /* compare with after_state_ch,
- * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
- DEFINE_WAIT(wait);
+ bool rv = false;
+
+ spin_lock_irq(&device->resource->req_lock);
+ rv = may_inc_ap_bio(device);
+ if (rv)
+ atomic_inc(&device->ap_bio_cnt);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ return rv;
+}
+static inline void inc_ap_bio(struct drbd_device *device)
+{
/* we wait here
* as long as the device is suspended
* until the bitmap is no longer on the fly during connection
- * handshake as long as we would exeed the max_buffer limit.
+ * handshake as long as we would exceed the max_buffer limit.
*
* to avoid races with the reconnect code,
* we need to atomic_inc within the spinlock. */
- spin_lock_irq(&mdev->req_lock);
- while (!__inc_ap_bio_cond(mdev)) {
- prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&mdev->req_lock);
- schedule();
- finish_wait(&mdev->misc_wait, &wait);
- spin_lock_irq(&mdev->req_lock);
- }
- atomic_add(count, &mdev->ap_bio_cnt);
- spin_unlock_irq(&mdev->req_lock);
+ wait_event(device->misc_wait, inc_ap_bio_cond(device));
}
-static inline void dec_ap_bio(struct drbd_conf *mdev)
+static inline void dec_ap_bio(struct drbd_device *device)
{
- int mxb = drbd_get_max_buffers(mdev);
- int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
+ int mxb = drbd_get_max_buffers(device);
+ int ap_bio = atomic_dec_return(&device->ap_bio_cnt);
+
+ D_ASSERT(device, ap_bio >= 0);
+
+ if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
+ drbd_queue_work(&first_peer_device(device)->
+ connection->sender_work,
+ &device->bm_io_work.w);
+ }
- D_ASSERT(ap_bio >= 0);
/* this currently does wake_up for every dec_ap_bio!
* maybe rather introduce some type of hysteresis?
* e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
if (ap_bio < mxb)
- wake_up(&mdev->misc_wait);
- if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
- if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
- drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
- }
+ wake_up(&device->misc_wait);
}
-static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
-{
- mdev->ed_uuid = val;
-}
-
-static inline int seq_cmp(u32 a, u32 b)
-{
- /* we assume wrap around at 32bit.
- * for wrap around at 24bit (old atomic_t),
- * we'd have to
- * a <<= 8; b <<= 8;
- */
- return (s32)(a) - (s32)(b);
-}
-#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
-#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
-#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
-#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
-/* CAUTION: please no side effects in arguments! */
-#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
-
-static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
+static inline bool verify_can_do_stop_sector(struct drbd_device *device)
{
- unsigned int m;
- spin_lock(&mdev->peer_seq_lock);
- m = seq_max(mdev->peer_seq, new_seq);
- mdev->peer_seq = m;
- spin_unlock(&mdev->peer_seq_lock);
- if (m == new_seq)
- wake_up(&mdev->seq_wait);
+ return first_peer_device(device)->connection->agreed_pro_version >= 97 &&
+ first_peer_device(device)->connection->agreed_pro_version != 100;
}
-static inline void drbd_update_congested(struct drbd_conf *mdev)
+static inline int drbd_set_ed_uuid(struct drbd_device *device, u64 val)
{
- struct sock *sk = mdev->data.socket->sk;
- if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
- set_bit(NET_CONGESTED, &mdev->flags);
+ int changed = device->ed_uuid != val;
+ device->ed_uuid = val;
+ return changed;
}
-static inline int drbd_queue_order_type(struct drbd_conf *mdev)
+static inline int drbd_queue_order_type(struct drbd_device *device)
{
/* sorry, we currently have no working implementation
* of distributed TCQ stuff */
@@ -2384,32 +2201,29 @@ static inline int drbd_queue_order_type(struct drbd_conf *mdev)
return QUEUE_ORDERED_NONE;
}
-static inline void drbd_blk_run_queue(struct request_queue *q)
+static inline void drbd_md_flush(struct drbd_device *device)
{
- if (q && q->unplug_fn)
- q->unplug_fn(q);
-}
+ int r;
-static inline void drbd_kick_lo(struct drbd_conf *mdev)
-{
- if (get_ldev(mdev)) {
- drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
- put_ldev(mdev);
+ if (device->ldev == NULL) {
+ drbd_warn(device, "device->ldev == NULL in drbd_md_flush\n");
+ return;
}
-}
-static inline void drbd_md_flush(struct drbd_conf *mdev)
-{
- int r;
-
- if (test_bit(MD_NO_FUA, &mdev->flags))
+ if (test_bit(MD_NO_FUA, &device->flags))
return;
- r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
+ r = blkdev_issue_flush(device->ldev->md_bdev, GFP_NOIO, NULL);
if (r) {
- set_bit(MD_NO_FUA, &mdev->flags);
- dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
+ set_bit(MD_NO_FUA, &device->flags);
+ drbd_err(device, "meta data flush failed with status %d, disabling md-flushes\n", r);
}
}
+static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
+{
+ return list_first_entry_or_null(&resource->connections,
+ struct drbd_connection, connections);
+}
+
#endif
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
new file mode 100644
index 00000000000..89c497c630b
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.c
@@ -0,0 +1,207 @@
+#include <asm/bug.h>
+#include <linux/rbtree_augmented.h>
+#include "drbd_interval.h"
+
+/**
+ * interval_end - return end of @node
+ */
+static inline
+sector_t interval_end(struct rb_node *node)
+{
+ struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
+ return this->end;
+}
+
+/**
+ * compute_subtree_last - compute end of @node
+ *
+ * The end of an interval is the highest (start + (size >> 9)) value of this
+ * node and of its children. Called for @node and its parents whenever the end
+ * may have changed.
+ */
+static inline sector_t
+compute_subtree_last(struct drbd_interval *node)
+{
+ sector_t max = node->sector + (node->size >> 9);
+
+ if (node->rb.rb_left) {
+ sector_t left = interval_end(node->rb.rb_left);
+ if (left > max)
+ max = left;
+ }
+ if (node->rb.rb_right) {
+ sector_t right = interval_end(node->rb.rb_right);
+ if (right > max)
+ max = right;
+ }
+ return max;
+}
+
+static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
+{
+ while (rb != stop) {
+ struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb);
+ sector_t subtree_last = compute_subtree_last(node);
+ if (node->end == subtree_last)
+ break;
+ node->end = subtree_last;
+ rb = rb_parent(&node->rb);
+ }
+}
+
+static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
+ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
+
+ new->end = old->end;
+}
+
+static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
+ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
+
+ new->end = old->end;
+ old->end = compute_subtree_last(old);
+}
+
+static const struct rb_augment_callbacks augment_callbacks = {
+ augment_propagate,
+ augment_copy,
+ augment_rotate,
+};
+
+/**
+ * drbd_insert_interval - insert a new interval into a tree
+ */
+bool
+drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+
+ BUG_ON(!IS_ALIGNED(this->size, 512));
+
+ while (*new) {
+ struct drbd_interval *here =
+ rb_entry(*new, struct drbd_interval, rb);
+
+ parent = *new;
+ if (this->sector < here->sector)
+ new = &(*new)->rb_left;
+ else if (this->sector > here->sector)
+ new = &(*new)->rb_right;
+ else if (this < here)
+ new = &(*new)->rb_left;
+ else if (this > here)
+ new = &(*new)->rb_right;
+ else
+ return false;
+ }
+
+ rb_link_node(&this->rb, parent, new);
+ rb_insert_augmented(&this->rb, root, &augment_callbacks);
+ return true;
+}
+
+/**
+ * drbd_contains_interval - check if a tree contains a given interval
+ * @sector: start sector of @interval
+ * @interval: may not be a valid pointer
+ *
+ * Returns if the tree contains the node @interval with start sector @start.
+ * Does not dereference @interval until @interval is known to be a valid object
+ * in @tree. Returns %false if @interval is in the tree but with a different
+ * sector number.
+ */
+bool
+drbd_contains_interval(struct rb_root *root, sector_t sector,
+ struct drbd_interval *interval)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (sector < here->sector)
+ node = node->rb_left;
+ else if (sector > here->sector)
+ node = node->rb_right;
+ else if (interval < here)
+ node = node->rb_left;
+ else if (interval > here)
+ node = node->rb_right;
+ else
+ return true;
+ }
+ return false;
+}
+
+/**
+ * drbd_remove_interval - remove an interval from a tree
+ */
+void
+drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ rb_erase_augmented(&this->rb, root, &augment_callbacks);
+}
+
+/**
+ * drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
+ * @sector: start sector
+ * @size: size, aligned to 512 bytes
+ *
+ * Returns an interval overlapping with [sector, sector + size), or NULL if
+ * there is none. When there is more than one overlapping interval in the
+ * tree, the interval with the lowest start sector is returned, and all other
+ * overlapping intervals will be on the right side of the tree, reachable with
+ * rb_next().
+ */
+struct drbd_interval *
+drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+{
+ struct rb_node *node = root->rb_node;
+ struct drbd_interval *overlap = NULL;
+ sector_t end = sector + (size >> 9);
+
+ BUG_ON(!IS_ALIGNED(size, 512));
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (node->rb_left &&
+ sector < interval_end(node->rb_left)) {
+ /* Overlap if any must be on left side */
+ node = node->rb_left;
+ } else if (here->sector < end &&
+ sector < here->sector + (here->size >> 9)) {
+ overlap = here;
+ break;
+ } else if (sector >= here->sector) {
+ /* Overlap if any must be on right side */
+ node = node->rb_right;
+ } else
+ break;
+ }
+ return overlap;
+}
+
+struct drbd_interval *
+drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+{
+ sector_t end = sector + (size >> 9);
+ struct rb_node *node;
+
+ for (;;) {
+ node = rb_next(&i->rb);
+ if (!node)
+ return NULL;
+ i = rb_entry(node, struct drbd_interval, rb);
+ if (i->sector >= end)
+ return NULL;
+ if (sector < i->sector + (i->size >> 9))
+ return i;
+ }
+}
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
new file mode 100644
index 00000000000..f38fcb00c10
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.h
@@ -0,0 +1,40 @@
+#ifndef __DRBD_INTERVAL_H
+#define __DRBD_INTERVAL_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct drbd_interval {
+ struct rb_node rb;
+ sector_t sector; /* start sector of the interval */
+ unsigned int size; /* size in bytes */
+ sector_t end; /* highest interval end in subtree */
+ int local:1 /* local or remote request? */;
+ int waiting:1;
+};
+
+static inline void drbd_clear_interval(struct drbd_interval *i)
+{
+ RB_CLEAR_NODE(&i->rb);
+}
+
+static inline bool drbd_interval_empty(struct drbd_interval *i)
+{
+ return RB_EMPTY_NODE(&i->rb);
+}
+
+extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
+extern bool drbd_contains_interval(struct rb_root *, sector_t,
+ struct drbd_interval *);
+extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
+extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
+ unsigned int);
+extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
+ unsigned int);
+
+#define drbd_for_each_overlap(i, root, sector, size) \
+ for (i = drbd_find_overlap(root, sector, size); \
+ i; \
+ i = drbd_next_overlap(i, sector, size))
+
+#endif /* __DRBD_INTERVAL_H */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 6be5401d0e8..960645c26e6 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -45,47 +45,33 @@
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
-
+#include <linux/workqueue.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>
#include <linux/drbd_limits.h>
#include "drbd_int.h"
+#include "drbd_protocol.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
#include "drbd_vli.h"
-struct after_state_chg_work {
- struct drbd_work w;
- union drbd_state os;
- union drbd_state ns;
- enum chg_state_flags flags;
- struct completion *done;
-};
-
static DEFINE_MUTEX(drbd_main_mutex);
-int drbdd_init(struct drbd_thread *);
-int drbd_worker(struct drbd_thread *);
-int drbd_asender(struct drbd_thread *);
-
-int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
-static int drbd_release(struct gendisk *gd, fmode_t mode);
-static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum chg_state_flags flags);
-static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void drbd_release(struct gendisk *gd, fmode_t mode);
+static int w_md_sync(struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
-static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static int w_bitmap_io(struct drbd_work *w, int unused);
+static int w_go_diskless(struct drbd_work *w, int unused);
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
"Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
-MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
+MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
+ __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
#include <linux/moduleparam.h>
@@ -96,7 +82,6 @@ MODULE_PARM_DESC(allow_oos, "DONT USE!");
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
-module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
#ifdef CONFIG_DRBD_FAULT_INJECTION
@@ -115,10 +100,9 @@ module_param(fault_devs, int, 0644);
#endif
/* module parameter, defined */
-unsigned int minor_count = 32;
-int disable_sendpage;
-int allow_oos;
-unsigned int cn_idx = CN_IDX_DRBD;
+unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
+bool disable_sendpage;
+bool allow_oos;
int proc_details; /* Detail level in proc drbd*/
/* Module parameter for setting the user mode helper program
@@ -130,14 +114,17 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0
/* in 2.6.x, our device mapping and config info contains our virtual gendisks
* as member "struct gendisk *vdisk;"
*/
-struct drbd_conf **minor_table;
+struct idr drbd_devices;
+struct list_head drbd_resources;
struct kmem_cache *drbd_request_cache;
-struct kmem_cache *drbd_ee_cache; /* epoch entries */
+struct kmem_cache *drbd_ee_cache; /* peer requests */
struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
/* I do not use a standard mempool, because:
1) I want to hand out the pre-allocated objects first.
@@ -158,21 +145,32 @@ static const struct block_device_operations drbd_ops = {
.release = drbd_release,
};
-#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+ struct bio *bio;
+
+ if (!drbd_md_io_bio_set)
+ return bio_alloc(gfp_mask, 1);
+
+ bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ if (!bio)
+ return NULL;
+ return bio;
+}
#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
give tons of false positives. When this is a real functions sparse works.
*/
-int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
{
int io_allowed;
- atomic_inc(&mdev->local_cnt);
- io_allowed = (mdev->state.disk >= mins);
+ atomic_inc(&device->local_cnt);
+ io_allowed = (device->state.disk >= mins);
if (!io_allowed) {
- if (atomic_dec_and_test(&mdev->local_cnt))
- wake_up(&mdev->misc_wait);
+ if (atomic_dec_and_test(&device->local_cnt))
+ wake_up(&device->misc_wait);
}
return io_allowed;
}
@@ -180,1423 +178,272 @@ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
#endif
/**
- * DOC: The transfer log
- *
- * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
- * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
- * of the list. There is always at least one &struct drbd_tl_epoch object.
- *
- * Each &struct drbd_tl_epoch has a circular double linked list of requests
- * attached.
- */
-static int tl_init(struct drbd_conf *mdev)
-{
- struct drbd_tl_epoch *b;
-
- /* during device minor initialization, we may well use GFP_KERNEL */
- b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
- if (!b)
- return 0;
- INIT_LIST_HEAD(&b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->next = NULL;
- b->br_number = 4711;
- b->n_writes = 0;
- b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
-
- mdev->oldest_tle = b;
- mdev->newest_tle = b;
- INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
-
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
-
- return 1;
-}
-
-static void tl_cleanup(struct drbd_conf *mdev)
-{
- D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
- D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
- kfree(mdev->oldest_tle);
- mdev->oldest_tle = NULL;
- kfree(mdev->unused_spare_tle);
- mdev->unused_spare_tle = NULL;
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
-}
-
-/**
- * _tl_add_barrier() - Adds a barrier to the transfer log
- * @mdev: DRBD device.
- * @new: Barrier to be added before the current head of the TL.
- *
- * The caller must hold the req_lock.
- */
-void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
-{
- struct drbd_tl_epoch *newest_before;
-
- INIT_LIST_HEAD(&new->requests);
- INIT_LIST_HEAD(&new->w.list);
- new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
- new->next = NULL;
- new->n_writes = 0;
-
- newest_before = mdev->newest_tle;
- /* never send a barrier number == 0, because that is special-cased
- * when using TCQ for our write ordering code */
- new->br_number = (newest_before->br_number+1) ?: 1;
- if (mdev->newest_tle != new) {
- mdev->newest_tle->next = new;
- mdev->newest_tle = new;
- }
-}
-
-/**
- * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
- * @mdev: DRBD device.
+ * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+ * @connection: DRBD connection.
* @barrier_nr: Expected identifier of the DRBD write barrier packet.
* @set_size: Expected number of requests before that barrier.
*
* In case the passed barrier_nr or set_size does not match the oldest
- * &struct drbd_tl_epoch objects this function will cause a termination
- * of the connection.
+ * epoch of not yet barrier-acked requests, this function will cause a
+ * termination of the connection.
*/
-void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
- unsigned int set_size)
+void tl_release(struct drbd_connection *connection, unsigned int barrier_nr,
+ unsigned int set_size)
{
- struct drbd_tl_epoch *b, *nob; /* next old barrier */
- struct list_head *le, *tle;
struct drbd_request *r;
-
- spin_lock_irq(&mdev->req_lock);
-
- b = mdev->oldest_tle;
+ struct drbd_request *req = NULL;
+ int expect_epoch = 0;
+ int expect_size = 0;
+
+ spin_lock_irq(&connection->resource->req_lock);
+
+ /* find oldest not yet barrier-acked write request,
+ * count writes in its epoch. */
+ list_for_each_entry(r, &connection->transfer_log, tl_requests) {
+ const unsigned s = r->rq_state;
+ if (!req) {
+ if (!(s & RQ_WRITE))
+ continue;
+ if (!(s & RQ_NET_MASK))
+ continue;
+ if (s & RQ_NET_DONE)
+ continue;
+ req = r;
+ expect_epoch = req->epoch;
+ expect_size ++;
+ } else {
+ if (r->epoch != expect_epoch)
+ break;
+ if (!(s & RQ_WRITE))
+ continue;
+ /* if (s & RQ_DONE): not expected */
+ /* if (!(s & RQ_NET_MASK)): not expected */
+ expect_size++;
+ }
+ }
/* first some paranoia code */
- if (b == NULL) {
- dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
- barrier_nr);
+ if (req == NULL) {
+ drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+ barrier_nr);
goto bail;
}
- if (b->br_number != barrier_nr) {
- dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
- barrier_nr, b->br_number);
+ if (expect_epoch != barrier_nr) {
+ drbd_err(connection, "BAD! BarrierAck #%u received, expected #%u!\n",
+ barrier_nr, expect_epoch);
goto bail;
}
- if (b->n_writes != set_size) {
- dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
- barrier_nr, set_size, b->n_writes);
+
+ if (expect_size != set_size) {
+ drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+ barrier_nr, set_size, expect_size);
goto bail;
}
- /* Clean up list of requests processed during current epoch */
- list_for_each_safe(le, tle, &b->requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- _req_mod(r, barrier_acked);
- }
- /* There could be requests on the list waiting for completion
- of the write to the local disk. To avoid corruptions of
- slab's data structures we have to remove the lists head.
-
- Also there could have been a barrier ack out of sequence, overtaking
- the write acks - which would be a bug and violating write ordering.
- To not deadlock in case we lose connection while such requests are
- still pending, we need some way to find them for the
- _req_mode(connection_lost_while_pending).
-
- These have been list_move'd to the out_of_sequence_requests list in
- _req_mod(, barrier_acked) above.
- */
- list_del_init(&b->requests);
-
- nob = b->next;
- if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
- _tl_add_barrier(mdev, b);
- if (nob)
- mdev->oldest_tle = nob;
- /* if nob == NULL b was the only barrier, and becomes the new
- barrier. Therefore mdev->oldest_tle points already to b */
- } else {
- D_ASSERT(nob != NULL);
- mdev->oldest_tle = nob;
- kfree(b);
+ /* Clean up list of requests processed during current epoch. */
+ /* this extra list walk restart is paranoia,
+ * to catch requests being barrier-acked "unexpectedly".
+ * It usually should find the same req again, or some READ preceding it. */
+ list_for_each_entry(req, &connection->transfer_log, tl_requests)
+ if (req->epoch == expect_epoch)
+ break;
+ list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) {
+ if (req->epoch != expect_epoch)
+ break;
+ _req_mod(req, BARRIER_ACKED);
}
-
- spin_unlock_irq(&mdev->req_lock);
- dec_ap_pending(mdev);
+ spin_unlock_irq(&connection->resource->req_lock);
return;
bail:
- spin_unlock_irq(&mdev->req_lock);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ spin_unlock_irq(&connection->resource->req_lock);
+ conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}
+
/**
* _tl_restart() - Walks the transfer log, and applies an action to all requests
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @what: The action/event to perform with all request objects
*
- * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
- * restart_frozen_disk_io.
- */
-static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
-{
- struct drbd_tl_epoch *b, *tmp, **pn;
- struct list_head *le, *tle, carry_reads;
- struct drbd_request *req;
- int rv, n_writes, n_reads;
-
- b = mdev->oldest_tle;
- pn = &mdev->oldest_tle;
- while (b) {
- n_writes = 0;
- n_reads = 0;
- INIT_LIST_HEAD(&carry_reads);
- list_for_each_safe(le, tle, &b->requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- rv = _req_mod(req, what);
-
- n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
- n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
- }
- tmp = b->next;
-
- if (n_writes) {
- if (what == resend) {
- b->n_writes = n_writes;
- if (b->w.cb == NULL) {
- b->w.cb = w_send_barrier;
- inc_ap_pending(mdev);
- set_bit(CREATE_BARRIER, &mdev->flags);
- }
-
- drbd_queue_work(&mdev->data.work, &b->w);
- }
- pn = &b->next;
- } else {
- if (n_reads)
- list_add(&carry_reads, &b->requests);
- /* there could still be requests on that ring list,
- * in case local io is still pending */
- list_del(&b->requests);
-
- /* dec_ap_pending corresponding to queue_barrier.
- * the newest barrier may not have been queued yet,
- * in which case w.cb is still NULL. */
- if (b->w.cb != NULL)
- dec_ap_pending(mdev);
-
- if (b == mdev->newest_tle) {
- /* recycle, but reinit! */
- D_ASSERT(tmp == NULL);
- INIT_LIST_HEAD(&b->requests);
- list_splice(&carry_reads, &b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->w.cb = NULL;
- b->br_number = net_random();
- b->n_writes = 0;
-
- *pn = b;
- break;
- }
- *pn = tmp;
- kfree(b);
- }
- b = tmp;
- list_splice(&carry_reads, &b->requests);
- }
-}
-
-
-/**
- * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
- * @mdev: DRBD device.
- *
- * This is called after the connection to the peer was lost. The storage covered
- * by the requests on the transfer gets marked as our of sync. Called from the
- * receiver thread and the worker thread.
+ * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+ * RESTART_FROZEN_DISK_IO.
*/
-void tl_clear(struct drbd_conf *mdev)
+/* must hold resource->req_lock */
+void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
{
- struct list_head *le, *tle;
- struct drbd_request *r;
-
- spin_lock_irq(&mdev->req_lock);
-
- _tl_restart(mdev, connection_lost_while_pending);
-
- /* we expect this list to be empty. */
- D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
-
- /* but just in case, clean it up anyways! */
- list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(r, connection_lost_while_pending);
- }
-
- /* ensure bit indicating barrier is required is clear */
- clear_bit(CREATE_BARRIER, &mdev->flags);
-
- memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
-
- spin_unlock_irq(&mdev->req_lock);
-}
-
-void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
-{
- spin_lock_irq(&mdev->req_lock);
- _tl_restart(mdev, what);
- spin_unlock_irq(&mdev->req_lock);
-}
-
-/**
- * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
- * @mdev: DRBD device.
- * @os: old (current) state.
- * @ns: new (wanted) state.
- */
-static int cl_wide_st_chg(struct drbd_conf *mdev,
- union drbd_state os, union drbd_state ns)
-{
- return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
- ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
- (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
- (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
- (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
- (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
- (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
-}
-
-int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
- union drbd_state mask, union drbd_state val)
-{
- unsigned long flags;
- union drbd_state os, ns;
- int rv;
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- rv = _drbd_set_state(mdev, ns, f, NULL);
- ns = mdev->state;
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- return rv;
-}
-
-/**
- * drbd_force_state() - Impose a change which happens outside our control on our state
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- */
-void drbd_force_state(struct drbd_conf *mdev,
- union drbd_state mask, union drbd_state val)
-{
- drbd_change_state(mdev, CS_HARD, mask, val);
-}
-
-static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
-static int is_valid_state_transition(struct drbd_conf *,
- union drbd_state, union drbd_state);
-static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, const char **warn_sync_abort);
-int drbd_send_state_req(struct drbd_conf *,
- union drbd_state, union drbd_state);
-
-static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
- union drbd_state mask, union drbd_state val)
-{
- union drbd_state os, ns;
- unsigned long flags;
- int rv;
-
- if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
- return SS_CW_SUCCESS;
-
- if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
- return SS_CW_FAILED_BY_PEER;
-
- rv = 0;
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- ns = sanitize_state(mdev, os, ns, NULL);
-
- if (!cl_wide_st_chg(mdev, os, ns))
- rv = SS_CW_NO_NEED;
- if (!rv) {
- rv = is_valid_state(mdev, ns);
- if (rv == SS_SUCCESS) {
- rv = is_valid_state_transition(mdev, ns, os);
- if (rv == SS_SUCCESS)
- rv = 0; /* cont waiting, otherwise fail. */
- }
- }
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- return rv;
-}
-
-/**
- * drbd_req_state() - Perform an eventually cluster wide state change
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- * @f: flags
- *
- * Should not be called directly, use drbd_request_state() or
- * _drbd_request_state().
- */
-static int drbd_req_state(struct drbd_conf *mdev,
- union drbd_state mask, union drbd_state val,
- enum chg_state_flags f)
-{
- struct completion done;
- unsigned long flags;
- union drbd_state os, ns;
- int rv;
-
- init_completion(&done);
-
- if (f & CS_SERIALIZE)
- mutex_lock(&mdev->state_mutex);
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- ns = sanitize_state(mdev, os, ns, NULL);
-
- if (cl_wide_st_chg(mdev, os, ns)) {
- rv = is_valid_state(mdev, ns);
- if (rv == SS_SUCCESS)
- rv = is_valid_state_transition(mdev, ns, os);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (rv < SS_SUCCESS) {
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
-
- drbd_state_lock(mdev);
- if (!drbd_send_state_req(mdev, mask, val)) {
- drbd_state_unlock(mdev);
- rv = SS_CW_FAILED_BY_PEER;
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
-
- wait_event(mdev->state_wait,
- (rv = _req_st_cond(mdev, mask, val)));
-
- if (rv < SS_SUCCESS) {
- drbd_state_unlock(mdev);
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- rv = _drbd_set_state(mdev, ns, f, &done);
- drbd_state_unlock(mdev);
- } else {
- rv = _drbd_set_state(mdev, ns, f, &done);
- }
-
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
- D_ASSERT(current != mdev->worker.task);
- wait_for_completion(&done);
- }
-
-abort:
- if (f & CS_SERIALIZE)
- mutex_unlock(&mdev->state_mutex);
-
- return rv;
-}
-
-/**
- * _drbd_request_state() - Request a state change (with flags)
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- * @f: flags
- *
- * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
- * flag, or when logging of failed state change requests is not desired.
- */
-int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val, enum chg_state_flags f)
-{
- int rv;
-
- wait_event(mdev->state_wait,
- (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
-
- return rv;
-}
-
-static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
-{
- dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
- name,
- drbd_conn_str(ns.conn),
- drbd_role_str(ns.role),
- drbd_role_str(ns.peer),
- drbd_disk_str(ns.disk),
- drbd_disk_str(ns.pdsk),
- is_susp(ns) ? 's' : 'r',
- ns.aftr_isp ? 'a' : '-',
- ns.peer_isp ? 'p' : '-',
- ns.user_isp ? 'u' : '-'
- );
-}
-
-void print_st_err(struct drbd_conf *mdev,
- union drbd_state os, union drbd_state ns, int err)
-{
- if (err == SS_IN_TRANSIENT_STATE)
- return;
- dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
- print_st(mdev, " state", os);
- print_st(mdev, "wanted", ns);
-}
-
-
-#define drbd_peer_str drbd_role_str
-#define drbd_pdsk_str drbd_disk_str
-
-#define drbd_susp_str(A) ((A) ? "1" : "0")
-#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
-#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
-#define drbd_user_isp_str(A) ((A) ? "1" : "0")
-
-#define PSC(A) \
- ({ if (ns.A != os.A) { \
- pbp += sprintf(pbp, #A "( %s -> %s ) ", \
- drbd_##A##_str(os.A), \
- drbd_##A##_str(ns.A)); \
- } })
-
-/**
- * is_valid_state() - Returns an SS_ error code if ns is not valid
- * @mdev: DRBD device.
- * @ns: State to consider.
- */
-static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
-{
- /* See drbd_state_sw_errors in drbd_strings.c */
-
- enum drbd_fencing_p fp;
- int rv = SS_SUCCESS;
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- if (get_net_conf(mdev)) {
- if (!mdev->net_conf->two_primaries &&
- ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
- rv = SS_TWO_PRIMARIES;
- put_net_conf(mdev);
- }
-
- if (rv <= 0)
- /* already found a reason to abort */;
- else if (ns.role == R_SECONDARY && mdev->open_cnt)
- rv = SS_DEVICE_IN_USE;
-
- else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if (fp >= FP_RESOURCE &&
- ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
- rv = SS_PRIMARY_NOP;
-
- else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
- rv = SS_NO_LOCAL_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
- rv = SS_NO_REMOTE_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if ((ns.conn == C_CONNECTED ||
- ns.conn == C_WF_BITMAP_S ||
- ns.conn == C_SYNC_SOURCE ||
- ns.conn == C_PAUSED_SYNC_S) &&
- ns.disk == D_OUTDATED)
- rv = SS_CONNECTED_OUTDATES;
-
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- (mdev->sync_conf.verify_alg[0] == 0))
- rv = SS_NO_VERIFY_ALG;
-
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- mdev->agreed_pro_version < 88)
- rv = SS_NOT_SUPPORTED;
-
- return rv;
-}
-
-/**
- * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
- * @mdev: DRBD device.
- * @ns: new state.
- * @os: old state.
- */
-static int is_valid_state_transition(struct drbd_conf *mdev,
- union drbd_state ns, union drbd_state os)
-{
- int rv = SS_SUCCESS;
-
- if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
- os.conn > C_CONNECTED)
- rv = SS_RESYNC_RUNNING;
-
- if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
- rv = SS_ALREADY_STANDALONE;
-
- if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
- rv = SS_IS_DISKLESS;
-
- if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
- rv = SS_NO_NET_CONFIG;
-
- if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
- rv = SS_LOWER_THAN_OUTDATED;
-
- if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
- rv = SS_IN_TRANSIENT_STATE;
-
- if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
- rv = SS_IN_TRANSIENT_STATE;
-
- if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
- rv = SS_NEED_CONNECTION;
-
- if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- ns.conn != os.conn && os.conn > C_CONNECTED)
- rv = SS_RESYNC_RUNNING;
-
- if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
- os.conn < C_CONNECTED)
- rv = SS_NEED_CONNECTION;
-
- return rv;
-}
-
-/**
- * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
- * @mdev: DRBD device.
- * @os: old state.
- * @ns: new state.
- * @warn_sync_abort:
- *
- * When we loose connection, we have to set the state of the peers disk (pdsk)
- * to D_UNKNOWN. This rule and many more along those lines are in this function.
- */
-static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, const char **warn_sync_abort)
-{
- enum drbd_fencing_p fp;
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- /* Disallow Network errors to configure a device's network part */
- if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
- os.conn <= C_DISCONNECTING)
- ns.conn = os.conn;
-
- /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
- * If you try to go into some Sync* state, that shall fail (elsewhere). */
- if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
- ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
- ns.conn = os.conn;
-
- /* we cannot fail (again) if we already detached */
- if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
- ns.disk = D_DISKLESS;
-
- /* if we are only D_ATTACHING yet,
- * we can (and should) go directly to D_DISKLESS. */
- if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
- ns.disk = D_DISKLESS;
-
- /* After C_DISCONNECTING only C_STANDALONE may follow */
- if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
- ns.conn = os.conn;
-
- if (ns.conn < C_CONNECTED) {
- ns.peer_isp = 0;
- ns.peer = R_UNKNOWN;
- if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
- ns.pdsk = D_UNKNOWN;
- }
-
- /* Clear the aftr_isp when becoming unconfigured */
- if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
- ns.aftr_isp = 0;
-
- /* Abort resync if a disk fails/detaches */
- if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
- (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
- if (warn_sync_abort)
- *warn_sync_abort =
- os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
- "Online-verify" : "Resync";
- ns.conn = C_CONNECTED;
- }
-
- if (ns.conn >= C_CONNECTED &&
- ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
- (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
- switch (ns.conn) {
- case C_WF_BITMAP_T:
- case C_PAUSED_SYNC_T:
- ns.disk = D_OUTDATED;
- break;
- case C_CONNECTED:
- case C_WF_BITMAP_S:
- case C_SYNC_SOURCE:
- case C_PAUSED_SYNC_S:
- ns.disk = D_UP_TO_DATE;
- break;
- case C_SYNC_TARGET:
- ns.disk = D_INCONSISTENT;
- dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
- break;
- }
- if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
- dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
- }
-
- if (ns.conn >= C_CONNECTED &&
- (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
- switch (ns.conn) {
- case C_CONNECTED:
- case C_WF_BITMAP_T:
- case C_PAUSED_SYNC_T:
- case C_SYNC_TARGET:
- ns.pdsk = D_UP_TO_DATE;
- break;
- case C_WF_BITMAP_S:
- case C_PAUSED_SYNC_S:
- /* remap any consistent state to D_OUTDATED,
- * but disallow "upgrade" of not even consistent states.
- */
- ns.pdsk =
- (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
- ? os.pdsk : D_OUTDATED;
- break;
- case C_SYNC_SOURCE:
- ns.pdsk = D_INCONSISTENT;
- dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
- break;
- }
- if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
- dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
- }
-
- /* Connection breaks down before we finished "Negotiating" */
- if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
- get_ldev_if_state(mdev, D_NEGOTIATING)) {
- if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
- ns.disk = mdev->new_state_tmp.disk;
- ns.pdsk = mdev->new_state_tmp.pdsk;
- } else {
- dev_alert(DEV, "Connection lost while negotiating, no data!\n");
- ns.disk = D_DISKLESS;
- ns.pdsk = D_UNKNOWN;
- }
- put_ldev(mdev);
- }
-
- if (fp == FP_STONITH &&
- (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
- !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
- ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
-
- if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
- (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
- !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
- ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
-
- if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
- if (ns.conn == C_SYNC_SOURCE)
- ns.conn = C_PAUSED_SYNC_S;
- if (ns.conn == C_SYNC_TARGET)
- ns.conn = C_PAUSED_SYNC_T;
- } else {
- if (ns.conn == C_PAUSED_SYNC_S)
- ns.conn = C_SYNC_SOURCE;
- if (ns.conn == C_PAUSED_SYNC_T)
- ns.conn = C_SYNC_TARGET;
- }
+ struct drbd_request *req, *r;
- return ns;
+ list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests)
+ _req_mod(req, what);
}
-/* helper for __drbd_set_state */
-static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
+void tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
{
- if (cs == C_VERIFY_T) {
- /* starting online verify from an arbitrary position
- * does not fit well into the existing protocol.
- * on C_VERIFY_T, we initialize ov_left and friends
- * implicitly in receive_DataRequest once the
- * first P_OV_REQUEST is received */
- mdev->ov_start_sector = ~(sector_t)0;
- } else {
- unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
- if (bit >= mdev->rs_total)
- mdev->ov_start_sector =
- BM_BIT_TO_SECT(mdev->rs_total - 1);
- mdev->ov_position = mdev->ov_start_sector;
- }
-}
-
-static void drbd_resume_al(struct drbd_conf *mdev)
-{
- if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
- dev_info(DEV, "Resumed AL updates\n");
+ spin_lock_irq(&connection->resource->req_lock);
+ _tl_restart(connection, what);
+ spin_unlock_irq(&connection->resource->req_lock);
}
/**
- * __drbd_set_state() - Set a new DRBD state
- * @mdev: DRBD device.
- * @ns: new state.
- * @flags: Flags
- * @done: Optional completion, that will get completed after the after_state_ch() finished
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @device: DRBD device.
*
- * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer gets marked as our of sync. Called from the
+ * receiver thread and the worker thread.
*/
-int __drbd_set_state(struct drbd_conf *mdev,
- union drbd_state ns, enum chg_state_flags flags,
- struct completion *done)
+void tl_clear(struct drbd_connection *connection)
{
- union drbd_state os;
- int rv = SS_SUCCESS;
- const char *warn_sync_abort = NULL;
- struct after_state_chg_work *ascw;
-
- os = mdev->state;
-
- ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
-
- if (ns.i == os.i)
- return SS_NOTHING_TO_DO;
-
- if (!(flags & CS_HARD)) {
- /* pre-state-change checks ; only look at ns */
- /* See drbd_state_sw_errors in drbd_strings.c */
-
- rv = is_valid_state(mdev, ns);
- if (rv < SS_SUCCESS) {
- /* If the old state was illegal as well, then let
- this happen...*/
-
- if (is_valid_state(mdev, os) == rv)
- rv = is_valid_state_transition(mdev, ns, os);
- } else
- rv = is_valid_state_transition(mdev, ns, os);
- }
-
- if (rv < SS_SUCCESS) {
- if (flags & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- return rv;
- }
-
- if (warn_sync_abort)
- dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
-
- {
- char *pbp, pb[300];
- pbp = pb;
- *pbp = 0;
- PSC(role);
- PSC(peer);
- PSC(conn);
- PSC(disk);
- PSC(pdsk);
- if (is_susp(ns) != is_susp(os))
- pbp += sprintf(pbp, "susp( %s -> %s ) ",
- drbd_susp_str(is_susp(os)),
- drbd_susp_str(is_susp(ns)));
- PSC(aftr_isp);
- PSC(peer_isp);
- PSC(user_isp);
- dev_info(DEV, "%s\n", pb);
- }
-
- /* solve the race between becoming unconfigured,
- * worker doing the cleanup, and
- * admin reconfiguring us:
- * on (re)configure, first set CONFIG_PENDING,
- * then wait for a potentially exiting worker,
- * start the worker, and schedule one no_op.
- * then proceed with configuration.
- */
- if (ns.disk == D_DISKLESS &&
- ns.conn == C_STANDALONE &&
- ns.role == R_SECONDARY &&
- !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
- set_bit(DEVICE_DYING, &mdev->flags);
-
- /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
- * on the ldev here, to be sure the transition -> D_DISKLESS resp.
- * drbd_ldev_destroy() won't happen before our corresponding
- * after_state_ch works run, where we put_ldev again. */
- if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
- (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
- atomic_inc(&mdev->local_cnt);
-
- mdev->state = ns;
- wake_up(&mdev->misc_wait);
- wake_up(&mdev->state_wait);
-
- /* aborted verify run. log the last position */
- if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
- ns.conn < C_CONNECTED) {
- mdev->ov_start_sector =
- BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
- dev_info(DEV, "Online Verify reached sector %llu\n",
- (unsigned long long)mdev->ov_start_sector);
- }
-
- if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
- (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
- dev_info(DEV, "Syncer continues.\n");
- mdev->rs_paused += (long)jiffies
- -(long)mdev->rs_mark_time[mdev->rs_last_mark];
- if (ns.conn == C_SYNC_TARGET)
- mod_timer(&mdev->resync_timer, jiffies);
- }
-
- if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
- (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
- dev_info(DEV, "Resync suspended\n");
- mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
- }
-
- if (os.conn == C_CONNECTED &&
- (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
- unsigned long now = jiffies;
- int i;
-
- mdev->ov_position = 0;
- mdev->rs_total = drbd_bm_bits(mdev);
- if (mdev->agreed_pro_version >= 90)
- set_ov_position(mdev, ns.conn);
- else
- mdev->ov_start_sector = 0;
- mdev->ov_left = mdev->rs_total
- - BM_SECT_TO_BIT(mdev->ov_position);
- mdev->rs_start = now;
- mdev->rs_last_events = 0;
- mdev->rs_last_sect_ev = 0;
- mdev->ov_last_oos_size = 0;
- mdev->ov_last_oos_start = 0;
-
- for (i = 0; i < DRBD_SYNC_MARKS; i++) {
- mdev->rs_mark_left[i] = mdev->rs_total;
- mdev->rs_mark_time[i] = now;
- }
-
- if (ns.conn == C_VERIFY_S) {
- dev_info(DEV, "Starting Online Verify from sector %llu\n",
- (unsigned long long)mdev->ov_position);
- mod_timer(&mdev->resync_timer, jiffies);
- }
- }
-
- if (get_ldev(mdev)) {
- u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
- MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
- MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
-
- if (test_bit(CRASHED_PRIMARY, &mdev->flags))
- mdf |= MDF_CRASHED_PRIMARY;
- if (mdev->state.role == R_PRIMARY ||
- (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
- mdf |= MDF_PRIMARY_IND;
- if (mdev->state.conn > C_WF_REPORT_PARAMS)
- mdf |= MDF_CONNECTED_IND;
- if (mdev->state.disk > D_INCONSISTENT)
- mdf |= MDF_CONSISTENT;
- if (mdev->state.disk > D_OUTDATED)
- mdf |= MDF_WAS_UP_TO_DATE;
- if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
- mdf |= MDF_PEER_OUT_DATED;
- if (mdf != mdev->ldev->md.flags) {
- mdev->ldev->md.flags = mdf;
- drbd_md_mark_dirty(mdev);
- }
- if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
- drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
- put_ldev(mdev);
- }
-
- /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
- if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
- os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
- set_bit(CONSIDER_RESYNC, &mdev->flags);
-
- /* Receiver should clean up itself */
- if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
- drbd_thread_stop_nowait(&mdev->receiver);
-
- /* Now the receiver finished cleaning up itself, it should die */
- if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
- drbd_thread_stop_nowait(&mdev->receiver);
-
- /* Upon network failure, we need to restart the receiver. */
- if (os.conn > C_TEAR_DOWN &&
- ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
- drbd_thread_restart_nowait(&mdev->receiver);
-
- /* Resume AL writing if we get a connection */
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
- drbd_resume_al(mdev);
-
- ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
- if (ascw) {
- ascw->os = os;
- ascw->ns = ns;
- ascw->flags = flags;
- ascw->w.cb = w_after_state_ch;
- ascw->done = done;
- drbd_queue_work(&mdev->data.work, &ascw->w);
- } else {
- dev_warn(DEV, "Could not kmalloc an ascw\n");
- }
-
- return rv;
-}
-
-static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
-{
- struct after_state_chg_work *ascw =
- container_of(w, struct after_state_chg_work, w);
- after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
- if (ascw->flags & CS_WAIT_COMPLETE) {
- D_ASSERT(ascw->done != NULL);
- complete(ascw->done);
- }
- kfree(ascw);
-
- return 1;
-}
-
-static void abw_start_sync(struct drbd_conf *mdev, int rv)
-{
- if (rv) {
- dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
- _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
- return;
- }
-
- switch (mdev->state.conn) {
- case C_STARTING_SYNC_T:
- _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
- break;
- case C_STARTING_SYNC_S:
- drbd_start_resync(mdev, C_SYNC_SOURCE);
- break;
- }
+ tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
}
/**
- * after_state_ch() - Perform after state change actions that may sleep
- * @mdev: DRBD device.
- * @os: old state.
- * @ns: new state.
- * @flags: Flags
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL
+ * @device: DRBD device.
*/
-static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum chg_state_flags flags)
+void tl_abort_disk_io(struct drbd_device *device)
{
- enum drbd_fencing_p fp;
- enum drbd_req_event what = nothing;
- union drbd_state nsm = (union drbd_state){ .i = -1 };
-
- if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
- clear_bit(CRASHED_PRIMARY, &mdev->flags);
- if (mdev->p_uuid)
- mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
- }
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- /* Inform userspace about the change... */
- drbd_bcast_state(mdev, ns);
-
- if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
- (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
- drbd_khelper(mdev, "pri-on-incon-degr");
-
- /* Here we have the actions that are performed after a
- state change. This function might sleep */
-
- nsm.i = -1;
- if (ns.susp_nod) {
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
- if (ns.conn == C_CONNECTED)
- what = resend, nsm.susp_nod = 0;
- else /* ns.conn > C_CONNECTED */
- dev_err(DEV, "Unexpected Resynd going on!\n");
- }
-
- if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
- what = restart_frozen_disk_io, nsm.susp_nod = 0;
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct drbd_request *req, *r;
+ spin_lock_irq(&connection->resource->req_lock);
+ list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) {
+ if (!(req->rq_state & RQ_LOCAL_PENDING))
+ continue;
+ if (req->device != device)
+ continue;
+ _req_mod(req, ABORT_DISK_IO);
}
-
- if (ns.susp_fen) {
- /* case1: The outdate peer handler is successful: */
- if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
- tl_clear(mdev);
- if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
- drbd_uuid_new_current(mdev);
- clear_bit(NEW_CUR_UUID, &mdev->flags);
- }
- spin_lock_irq(&mdev->req_lock);
- _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
- }
- /* case2: The connection was established again: */
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
- clear_bit(NEW_CUR_UUID, &mdev->flags);
- what = resend;
- nsm.susp_fen = 0;
- }
- }
-
- if (what != nothing) {
- spin_lock_irq(&mdev->req_lock);
- _tl_restart(mdev, what);
- nsm.i &= mdev->state.i;
- _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
- }
-
- /* Do not change the order of the if above and the two below... */
- if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
- drbd_send_uuids(mdev);
- drbd_send_state(mdev);
- }
- if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
- drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
-
- /* Lost contact to peer's copy of the data */
- if ((os.pdsk >= D_INCONSISTENT &&
- os.pdsk != D_UNKNOWN &&
- os.pdsk != D_OUTDATED)
- && (ns.pdsk < D_INCONSISTENT ||
- ns.pdsk == D_UNKNOWN ||
- ns.pdsk == D_OUTDATED)) {
- if (get_ldev(mdev)) {
- if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
- mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- if (is_susp(mdev->state)) {
- set_bit(NEW_CUR_UUID, &mdev->flags);
- } else {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
- }
- }
- put_ldev(mdev);
- }
- }
-
- if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
- if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
- }
-
- /* D_DISKLESS Peer becomes secondary */
- if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
- drbd_al_to_on_disk_bm(mdev);
- put_ldev(mdev);
- }
-
- /* Last part of the attaching process ... */
- if (ns.conn >= C_CONNECTED &&
- os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
- drbd_send_sizes(mdev, 0, 0); /* to start sync... */
- drbd_send_uuids(mdev);
- drbd_send_state(mdev);
- }
-
- /* We want to pause/continue resync, tell peer. */
- if (ns.conn >= C_CONNECTED &&
- ((os.aftr_isp != ns.aftr_isp) ||
- (os.user_isp != ns.user_isp)))
- drbd_send_state(mdev);
-
- /* In case one of the isp bits got set, suspend other devices. */
- if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
- (ns.aftr_isp || ns.peer_isp || ns.user_isp))
- suspend_other_sg(mdev);
-
- /* Make sure the peer gets informed about eventual state
- changes (ISP bits) while we were in WFReportParams. */
- if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
- drbd_send_state(mdev);
-
- /* We are in the progress to start a full sync... */
- if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
- (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
- drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
-
- /* We are invalidating our self... */
- if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
- os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
- drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
-
- /* first half of local IO error, failure to attach,
- * or administrative detach */
- if (os.disk != D_FAILED && ns.disk == D_FAILED) {
- enum drbd_io_error_p eh;
- int was_io_error;
- /* corresponding get_ldev was in __drbd_set_state, to serialize
- * our cleanup here with the transition to D_DISKLESS,
- * so it is safe to dreference ldev here. */
- eh = mdev->ldev->dc.on_io_error;
- was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
-
- /* current state still has to be D_FAILED,
- * there is only one way out: to D_DISKLESS,
- * and that may only happen after our put_ldev below. */
- if (mdev->state.disk != D_FAILED)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s during detach\n",
- drbd_disk_str(mdev->state.disk));
-
- if (drbd_send_state(mdev))
- dev_warn(DEV, "Notified peer that I am detaching my disk\n");
- else
- dev_err(DEV, "Sending state for detaching disk failed\n");
-
- drbd_rs_cancel_all(mdev);
-
- /* In case we want to get something to stable storage still,
- * this may be the last chance.
- * Following put_ldev may transition to D_DISKLESS. */
- drbd_md_sync(mdev);
- put_ldev(mdev);
-
- if (was_io_error && eh == EP_CALL_HELPER)
- drbd_khelper(mdev, "local-io-error");
- }
-
- /* second half of local IO error, failure to attach,
- * or administrative detach,
- * after local_cnt references have reached zero again */
- if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
- /* We must still be diskless,
- * re-attach has to be serialized with this! */
- if (mdev->state.disk != D_DISKLESS)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s while going diskless\n",
- drbd_disk_str(mdev->state.disk));
-
- mdev->rs_total = 0;
- mdev->rs_failed = 0;
- atomic_set(&mdev->rs_pending_cnt, 0);
-
- if (drbd_send_state(mdev))
- dev_warn(DEV, "Notified peer that I'm now diskless.\n");
- else
- dev_err(DEV, "Sending state for being diskless failed\n");
- /* corresponding get_ldev in __drbd_set_state
- * this may finaly trigger drbd_ldev_destroy. */
- put_ldev(mdev);
- }
-
- /* Disks got bigger while they were detached */
- if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
- test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
- if (ns.conn == C_CONNECTED)
- resync_after_online_grow(mdev);
- }
-
- /* A resync finished or aborted, wake paused devices... */
- if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
- (os.peer_isp && !ns.peer_isp) ||
- (os.user_isp && !ns.user_isp))
- resume_next_sg(mdev);
-
- /* sync target done with resync. Explicitly notify peer, even though
- * it should (at least for non-empty resyncs) already know itself. */
- if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
- drbd_send_state(mdev);
-
- /* free tl_hash if we Got thawed and are C_STANDALONE */
- if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
- drbd_free_tl_hash(mdev);
-
- /* Upon network connection, we need to start the receiver */
- if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
- drbd_thread_start(&mdev->receiver);
-
- /* Terminate worker thread if we are unconfigured - it will be
- restarted as needed... */
- if (ns.disk == D_DISKLESS &&
- ns.conn == C_STANDALONE &&
- ns.role == R_SECONDARY) {
- if (os.aftr_isp != ns.aftr_isp)
- resume_next_sg(mdev);
- /* set in __drbd_set_state, unless CONFIG_PENDING was set */
- if (test_bit(DEVICE_DYING, &mdev->flags))
- drbd_thread_stop_nowait(&mdev->worker);
- }
-
- drbd_md_sync(mdev);
+ spin_unlock_irq(&connection->resource->req_lock);
}
-
static int drbd_thread_setup(void *arg)
{
struct drbd_thread *thi = (struct drbd_thread *) arg;
- struct drbd_conf *mdev = thi->mdev;
+ struct drbd_resource *resource = thi->resource;
unsigned long flags;
int retval;
+ snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
+ thi->name[0],
+ resource->name);
+
restart:
retval = thi->function(thi);
spin_lock_irqsave(&thi->t_lock, flags);
- /* if the receiver has been "Exiting", the last thing it did
+ /* if the receiver has been "EXITING", the last thing it did
* was set the conn state to "StandAlone",
* if now a re-connect request comes in, conn state goes C_UNCONNECTED,
* and receiver thread will be "started".
- * drbd_thread_start needs to set "Restarting" in that case.
+ * drbd_thread_start needs to set "RESTARTING" in that case.
* t_state check and assignment needs to be within the same spinlock,
- * so either thread_start sees Exiting, and can remap to Restarting,
- * or thread_start see None, and can proceed as normal.
+ * so either thread_start sees EXITING, and can remap to RESTARTING,
+ * or thread_start see NONE, and can proceed as normal.
*/
- if (thi->t_state == Restarting) {
- dev_info(DEV, "Restarting %s\n", current->comm);
- thi->t_state = Running;
+ if (thi->t_state == RESTARTING) {
+ drbd_info(resource, "Restarting %s thread\n", thi->name);
+ thi->t_state = RUNNING;
spin_unlock_irqrestore(&thi->t_lock, flags);
goto restart;
}
thi->task = NULL;
- thi->t_state = None;
+ thi->t_state = NONE;
smp_mb();
- complete(&thi->stop);
+ complete_all(&thi->stop);
spin_unlock_irqrestore(&thi->t_lock, flags);
- dev_info(DEV, "Terminating %s\n", current->comm);
+ drbd_info(resource, "Terminating %s\n", current->comm);
/* Release mod reference taken when thread was started */
+
+ if (thi->connection)
+ kref_put(&thi->connection->kref, drbd_destroy_connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
module_put(THIS_MODULE);
return retval;
}
-static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
- int (*func) (struct drbd_thread *))
+static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi,
+ int (*func) (struct drbd_thread *), const char *name)
{
spin_lock_init(&thi->t_lock);
thi->task = NULL;
- thi->t_state = None;
+ thi->t_state = NONE;
thi->function = func;
- thi->mdev = mdev;
+ thi->resource = resource;
+ thi->connection = NULL;
+ thi->name = name;
}
int drbd_thread_start(struct drbd_thread *thi)
{
- struct drbd_conf *mdev = thi->mdev;
+ struct drbd_resource *resource = thi->resource;
struct task_struct *nt;
unsigned long flags;
- const char *me =
- thi == &mdev->receiver ? "receiver" :
- thi == &mdev->asender ? "asender" :
- thi == &mdev->worker ? "worker" : "NONSENSE";
-
/* is used from state engine doing drbd_thread_stop_nowait,
* while holding the req lock irqsave */
spin_lock_irqsave(&thi->t_lock, flags);
switch (thi->t_state) {
- case None:
- dev_info(DEV, "Starting %s thread (from %s [%d])\n",
- me, current->comm, current->pid);
+ case NONE:
+ drbd_info(resource, "Starting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
/* Get ref on module for thread - this is released when thread exits */
if (!try_module_get(THIS_MODULE)) {
- dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
+ drbd_err(resource, "Failed to get module reference in drbd_thread_start\n");
spin_unlock_irqrestore(&thi->t_lock, flags);
- return FALSE;
+ return false;
}
+ kref_get(&resource->kref);
+ if (thi->connection)
+ kref_get(&thi->connection->kref);
+
init_completion(&thi->stop);
- D_ASSERT(thi->task == NULL);
thi->reset_cpu_mask = 1;
- thi->t_state = Running;
+ thi->t_state = RUNNING;
spin_unlock_irqrestore(&thi->t_lock, flags);
flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
nt = kthread_create(drbd_thread_setup, (void *) thi,
- "drbd%d_%s", mdev_to_minor(mdev), me);
+ "drbd_%c_%s", thi->name[0], thi->resource->name);
if (IS_ERR(nt)) {
- dev_err(DEV, "Couldn't start thread\n");
+ drbd_err(resource, "Couldn't start thread\n");
+ if (thi->connection)
+ kref_put(&thi->connection->kref, drbd_destroy_connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
module_put(THIS_MODULE);
- return FALSE;
+ return false;
}
spin_lock_irqsave(&thi->t_lock, flags);
thi->task = nt;
- thi->t_state = Running;
+ thi->t_state = RUNNING;
spin_unlock_irqrestore(&thi->t_lock, flags);
wake_up_process(nt);
break;
- case Exiting:
- thi->t_state = Restarting;
- dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
- me, current->comm, current->pid);
+ case EXITING:
+ thi->t_state = RESTARTING;
+ drbd_info(resource, "Restarting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
/* fall through */
- case Running:
- case Restarting:
+ case RUNNING:
+ case RESTARTING:
default:
spin_unlock_irqrestore(&thi->t_lock, flags);
break;
}
- return TRUE;
+ return true;
}
@@ -1604,12 +451,12 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
{
unsigned long flags;
- enum drbd_thread_state ns = restart ? Restarting : Exiting;
+ enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
/* may be called from state engine, holding the req lock irqsave */
spin_lock_irqsave(&thi->t_lock, flags);
- if (thi->t_state == None) {
+ if (thi->t_state == NONE) {
spin_unlock_irqrestore(&thi->t_lock, flags);
if (restart)
drbd_thread_start(thi);
@@ -1627,7 +474,6 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
init_completion(&thi->stop);
if (thi->task != current)
force_sig(DRBD_SIGKILL, thi->task);
-
}
spin_unlock_irqrestore(&thi->t_lock, flags);
@@ -1636,355 +482,600 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
wait_for_completion(&thi->stop);
}
+int conn_lowest_minor(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr = 0, minor = -1;
+
+ rcu_read_lock();
+ peer_device = idr_get_next(&connection->peer_devices, &vnr);
+ if (peer_device)
+ minor = device_to_minor(peer_device->device);
+ rcu_read_unlock();
+
+ return minor;
+}
+
#ifdef CONFIG_SMP
/**
* drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
- * @mdev: DRBD device.
*
- * Forces all threads of a device onto the same CPU. This is beneficial for
+ * Forces all threads of a resource onto the same CPU. This is beneficial for
* DRBD's performance. May be overwritten by user's configuration.
*/
-void drbd_calc_cpu_mask(struct drbd_conf *mdev)
+static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask)
{
- int ord, cpu;
+ unsigned int *resources_per_cpu, min_index = ~0;
- /* user override. */
- if (cpumask_weight(mdev->cpu_mask))
- return;
+ resources_per_cpu = kzalloc(nr_cpu_ids * sizeof(*resources_per_cpu), GFP_KERNEL);
+ if (resources_per_cpu) {
+ struct drbd_resource *resource;
+ unsigned int cpu, min = ~0;
- ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
- for_each_online_cpu(cpu) {
- if (ord-- == 0) {
- cpumask_set_cpu(cpu, mdev->cpu_mask);
- return;
+ rcu_read_lock();
+ for_each_resource_rcu(resource, &drbd_resources) {
+ for_each_cpu(cpu, resource->cpu_mask)
+ resources_per_cpu[cpu]++;
+ }
+ rcu_read_unlock();
+ for_each_online_cpu(cpu) {
+ if (resources_per_cpu[cpu] < min) {
+ min = resources_per_cpu[cpu];
+ min_index = cpu;
+ }
}
+ kfree(resources_per_cpu);
}
- /* should not be reached */
- cpumask_setall(mdev->cpu_mask);
+ if (min_index == ~0) {
+ cpumask_setall(*cpu_mask);
+ return;
+ }
+ cpumask_set_cpu(min_index, *cpu_mask);
}
/**
* drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
- * @mdev: DRBD device.
+ * @device: DRBD device.
+ * @thi: drbd_thread object
*
* call in the "main loop" of _all_ threads, no need for any mutex, current won't die
* prematurely.
*/
-void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
+void drbd_thread_current_set_cpu(struct drbd_thread *thi)
{
+ struct drbd_resource *resource = thi->resource;
struct task_struct *p = current;
- struct drbd_thread *thi =
- p == mdev->asender.task ? &mdev->asender :
- p == mdev->receiver.task ? &mdev->receiver :
- p == mdev->worker.task ? &mdev->worker :
- NULL;
- ERR_IF(thi == NULL)
- return;
+
if (!thi->reset_cpu_mask)
return;
thi->reset_cpu_mask = 0;
- set_cpus_allowed_ptr(p, mdev->cpu_mask);
+ set_cpus_allowed_ptr(p, resource->cpu_mask);
}
+#else
+#define drbd_calc_cpu_mask(A) ({})
#endif
-/* the appropriate socket mutex must be held already */
-int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header80 *h,
- size_t size, unsigned msg_flags)
+/**
+ * drbd_header_size - size of a packet header
+ *
+ * The header size is a multiple of 8, so any payload following the header is
+ * word aligned on 64-bit architectures. (The bitmap send and receive code
+ * relies on this.)
+ */
+unsigned int drbd_header_size(struct drbd_connection *connection)
{
- int sent, ok;
+ if (connection->agreed_pro_version >= 100) {
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
+ return sizeof(struct p_header100);
+ } else {
+ BUILD_BUG_ON(sizeof(struct p_header80) !=
+ sizeof(struct p_header95));
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
+ return sizeof(struct p_header80);
+ }
+}
- ERR_IF(!h) return FALSE;
- ERR_IF(!size) return FALSE;
+static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be16(size);
+ return sizeof(struct p_header80);
+}
- h->magic = BE_DRBD_MAGIC;
+static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
h->command = cpu_to_be16(cmd);
- h->length = cpu_to_be16(size-sizeof(struct p_header80));
+ h->length = cpu_to_be32(size);
+ return sizeof(struct p_header95);
+}
- sent = drbd_send(mdev, sock, h, size, msg_flags);
+static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
+ int size, int vnr)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC_100);
+ h->volume = cpu_to_be16(vnr);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be32(size);
+ h->pad = 0;
+ return sizeof(struct p_header100);
+}
- ok = (sent == size);
- if (!ok)
- dev_err(DEV, "short sent %s size=%d sent=%d\n",
- cmdname(cmd), (int)size, sent);
- return ok;
+static unsigned int prepare_header(struct drbd_connection *connection, int vnr,
+ void *buffer, enum drbd_packet cmd, int size)
+{
+ if (connection->agreed_pro_version >= 100)
+ return prepare_header100(buffer, cmd, size, vnr);
+ else if (connection->agreed_pro_version >= 95 &&
+ size > DRBD_MAX_SIZE_H80_PACKET)
+ return prepare_header95(buffer, cmd, size);
+ else
+ return prepare_header80(buffer, cmd, size);
}
-/* don't pass the socket. we may only look at it
- * when we hold the appropriate socket mutex.
- */
-int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header80 *h, size_t size)
+static void *__conn_prepare_command(struct drbd_connection *connection,
+ struct drbd_socket *sock)
{
- int ok = 0;
- struct socket *sock;
+ if (!sock->socket)
+ return NULL;
+ return sock->sbuf + drbd_header_size(connection);
+}
- if (use_data_socket) {
- mutex_lock(&mdev->data.mutex);
- sock = mdev->data.socket;
- } else {
- mutex_lock(&mdev->meta.mutex);
- sock = mdev->meta.socket;
- }
+void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock)
+{
+ void *p;
- /* drbd_disconnect() could have called drbd_free_sock()
- * while we were waiting in down()... */
- if (likely(sock != NULL))
- ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
+ mutex_lock(&sock->mutex);
+ p = __conn_prepare_command(connection, sock);
+ if (!p)
+ mutex_unlock(&sock->mutex);
- if (use_data_socket)
- mutex_unlock(&mdev->data.mutex);
- else
- mutex_unlock(&mdev->meta.mutex);
- return ok;
+ return p;
}
-int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
- size_t size)
+void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock)
{
- struct p_header80 h;
- int ok;
+ return conn_prepare_command(peer_device->connection, sock);
+}
- h.magic = BE_DRBD_MAGIC;
- h.command = cpu_to_be16(cmd);
- h.length = cpu_to_be16(size);
+static int __send_command(struct drbd_connection *connection, int vnr,
+ struct drbd_socket *sock, enum drbd_packet cmd,
+ unsigned int header_size, void *data,
+ unsigned int size)
+{
+ int msg_flags;
+ int err;
- if (!drbd_get_data_sock(mdev))
- return 0;
+ /*
+ * Called with @data == NULL and the size of the data blocks in @size
+ * for commands that send data blocks. For those commands, omit the
+ * MSG_MORE flag: this will increase the likelihood that data blocks
+ * which are page aligned on the sender will end up page aligned on the
+ * receiver.
+ */
+ msg_flags = data ? MSG_MORE : 0;
+
+ header_size += prepare_header(connection, vnr, sock->sbuf, cmd,
+ header_size + size);
+ err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size,
+ msg_flags);
+ if (data && !err)
+ err = drbd_send_all(connection, sock->socket, data, size, 0);
+ return err;
+}
+
+static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ return __send_command(connection, 0, sock, cmd, header_size, data, size);
+}
+
+int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ int err;
- ok = (sizeof(h) ==
- drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
- ok = ok && (size ==
- drbd_send(mdev, mdev->data.socket, data, size, 0));
+ err = __conn_send_command(connection, sock, cmd, header_size, data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
- drbd_put_data_sock(mdev);
+int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ int err;
- return ok;
+ err = __send_command(peer_device->connection, peer_device->device->vnr,
+ sock, cmd, header_size, data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
}
-int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
+int drbd_send_ping(struct drbd_connection *connection)
{
+ struct drbd_socket *sock;
+
+ sock = &connection->meta;
+ if (!conn_prepare_command(connection, sock))
+ return -EIO;
+ return conn_send_command(connection, sock, P_PING, 0, NULL, 0);
+}
+
+int drbd_send_ping_ack(struct drbd_connection *connection)
+{
+ struct drbd_socket *sock;
+
+ sock = &connection->meta;
+ if (!conn_prepare_command(connection, sock))
+ return -EIO;
+ return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0);
+}
+
+int drbd_send_sync_param(struct drbd_peer_device *peer_device)
+{
+ struct drbd_socket *sock;
struct p_rs_param_95 *p;
- struct socket *sock;
- int size, rv;
- const int apv = mdev->agreed_pro_version;
+ int size;
+ const int apv = peer_device->connection->agreed_pro_version;
+ enum drbd_packet cmd;
+ struct net_conf *nc;
+ struct disk_conf *dc;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
size = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
- + strlen(mdev->sync_conf.verify_alg) + 1
+ + strlen(nc->verify_alg) + 1
: apv <= 94 ? sizeof(struct p_rs_param_89)
: /* apv >= 95 */ sizeof(struct p_rs_param_95);
- /* used from admin command context and receiver/worker context.
- * to avoid kmalloc, grab the socket right here,
- * then use the pre-allocated sbuf there */
- mutex_lock(&mdev->data.mutex);
- sock = mdev->data.socket;
-
- if (likely(sock != NULL)) {
- enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
-
- p = &mdev->data.sbuf.rs_param_95;
-
- /* initialize verify_alg and csums_alg */
- memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+ cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
- p->rate = cpu_to_be32(sc->rate);
- p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
- p->c_delay_target = cpu_to_be32(sc->c_delay_target);
- p->c_fill_target = cpu_to_be32(sc->c_fill_target);
- p->c_max_rate = cpu_to_be32(sc->c_max_rate);
+ /* initialize verify_alg and csums_alg */
+ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
- if (apv >= 88)
- strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
- if (apv >= 89)
- strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
-
- rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
- } else
- rv = 0; /* not ok */
+ if (get_ldev(peer_device->device)) {
+ dc = rcu_dereference(peer_device->device->ldev->disk_conf);
+ p->resync_rate = cpu_to_be32(dc->resync_rate);
+ p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
+ p->c_delay_target = cpu_to_be32(dc->c_delay_target);
+ p->c_fill_target = cpu_to_be32(dc->c_fill_target);
+ p->c_max_rate = cpu_to_be32(dc->c_max_rate);
+ put_ldev(peer_device->device);
+ } else {
+ p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
+ p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
+ p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
+ p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
+ p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
+ }
- mutex_unlock(&mdev->data.mutex);
+ if (apv >= 88)
+ strcpy(p->verify_alg, nc->verify_alg);
+ if (apv >= 89)
+ strcpy(p->csums_alg, nc->csums_alg);
+ rcu_read_unlock();
- return rv;
+ return drbd_send_command(peer_device, sock, cmd, size, NULL, 0);
}
-int drbd_send_protocol(struct drbd_conf *mdev)
+int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd)
{
+ struct drbd_socket *sock;
struct p_protocol *p;
- int size, cf, rv;
+ struct net_conf *nc;
+ int size, cf;
- size = sizeof(struct p_protocol);
+ sock = &connection->data;
+ p = __conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
- if (mdev->agreed_pro_version >= 87)
- size += strlen(mdev->net_conf->integrity_alg) + 1;
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
- /* we must not recurse into our own queue,
- * as that is blocked during handshake */
- p = kmalloc(size, GFP_NOIO);
- if (p == NULL)
- return 0;
+ if (nc->tentative && connection->agreed_pro_version < 92) {
+ rcu_read_unlock();
+ mutex_unlock(&sock->mutex);
+ drbd_err(connection, "--dry-run is not supported by peer");
+ return -EOPNOTSUPP;
+ }
- p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
- p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
- p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
- p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
- p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
+ size = sizeof(*p);
+ if (connection->agreed_pro_version >= 87)
+ size += strlen(nc->integrity_alg) + 1;
+ p->protocol = cpu_to_be32(nc->wire_protocol);
+ p->after_sb_0p = cpu_to_be32(nc->after_sb_0p);
+ p->after_sb_1p = cpu_to_be32(nc->after_sb_1p);
+ p->after_sb_2p = cpu_to_be32(nc->after_sb_2p);
+ p->two_primaries = cpu_to_be32(nc->two_primaries);
cf = 0;
- if (mdev->net_conf->want_lose)
- cf |= CF_WANT_LOSE;
- if (mdev->net_conf->dry_run) {
- if (mdev->agreed_pro_version >= 92)
- cf |= CF_DRY_RUN;
- else {
- dev_err(DEV, "--dry-run is not supported by peer");
- kfree(p);
- return 0;
- }
- }
+ if (nc->discard_my_data)
+ cf |= CF_DISCARD_MY_DATA;
+ if (nc->tentative)
+ cf |= CF_DRY_RUN;
p->conn_flags = cpu_to_be32(cf);
- if (mdev->agreed_pro_version >= 87)
- strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
+ if (connection->agreed_pro_version >= 87)
+ strcpy(p->integrity_alg, nc->integrity_alg);
+ rcu_read_unlock();
- rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
- (struct p_header80 *)p, size);
- kfree(p);
- return rv;
+ return __conn_send_command(connection, sock, cmd, size, NULL, 0);
}
-int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
+int drbd_send_protocol(struct drbd_connection *connection)
{
- struct p_uuids p;
+ int err;
+
+ mutex_lock(&connection->data.mutex);
+ err = __drbd_send_protocol(connection, P_PROTOCOL);
+ mutex_unlock(&connection->data.mutex);
+
+ return err;
+}
+
+static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_uuids *p;
int i;
- if (!get_ldev_if_state(mdev, D_NEGOTIATING))
- return 1;
+ if (!get_ldev_if_state(device, D_NEGOTIATING))
+ return 0;
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p) {
+ put_ldev(device);
+ return -EIO;
+ }
+ spin_lock_irq(&device->ldev->md.uuid_lock);
for (i = UI_CURRENT; i < UI_SIZE; i++)
- p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
-
- mdev->comm_bm_set = drbd_bm_total_weight(mdev);
- p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
- uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
- uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
- uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
- p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+ p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
- put_ldev(mdev);
+ device->comm_bm_set = drbd_bm_total_weight(device);
+ p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set);
+ rcu_read_lock();
+ uuid_flags |= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? 1 : 0;
+ rcu_read_unlock();
+ uuid_flags |= test_bit(CRASHED_PRIMARY, &device->flags) ? 2 : 0;
+ uuid_flags |= device->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
+ p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
- (struct p_header80 *)&p, sizeof(p));
+ put_ldev(device);
+ return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0);
}
-int drbd_send_uuids(struct drbd_conf *mdev)
+int drbd_send_uuids(struct drbd_peer_device *peer_device)
{
- return _drbd_send_uuids(mdev, 0);
+ return _drbd_send_uuids(peer_device, 0);
}
-int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
+int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device)
{
- return _drbd_send_uuids(mdev, 8);
+ return _drbd_send_uuids(peer_device, 8);
}
+void drbd_print_uuids(struct drbd_device *device, const char *text)
+{
+ if (get_ldev_if_state(device, D_NEGOTIATING)) {
+ u64 *uuid = device->ldev->md.uuid;
+ drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n",
+ text,
+ (unsigned long long)uuid[UI_CURRENT],
+ (unsigned long long)uuid[UI_BITMAP],
+ (unsigned long long)uuid[UI_HISTORY_START],
+ (unsigned long long)uuid[UI_HISTORY_END]);
+ put_ldev(device);
+ } else {
+ drbd_info(device, "%s effective data uuid: %016llX\n",
+ text,
+ (unsigned long long)device->ed_uuid);
+ }
+}
-int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
+void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
{
- struct p_rs_uuid p;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_rs_uuid *p;
+ u64 uuid;
- p.uuid = cpu_to_be64(val);
+ D_ASSERT(device, device->state.disk == D_UP_TO_DATE);
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
- (struct p_header80 *)&p, sizeof(p));
+ uuid = device->ldev->md.uuid[UI_BITMAP];
+ if (uuid && uuid != UUID_JUST_CREATED)
+ uuid = uuid + UUID_NEW_BM_OFFSET;
+ else
+ get_random_bytes(&uuid, sizeof(u64));
+ drbd_uuid_set(device, UI_BITMAP, uuid);
+ drbd_print_uuids(device, "updated sync UUID");
+ drbd_md_sync(device);
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (p) {
+ p->uuid = cpu_to_be64(uuid);
+ drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
+ }
}
-int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
+int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
{
- struct p_sizes p;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_sizes *p;
sector_t d_size, u_size;
int q_order_type;
- int ok;
-
- if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
- D_ASSERT(mdev->ldev->backing_bdev);
- d_size = drbd_get_max_capacity(mdev->ldev);
- u_size = mdev->ldev->dc.disk_size;
- q_order_type = drbd_queue_order_type(mdev);
- put_ldev(mdev);
+ unsigned int max_bio_size;
+
+ if (get_ldev_if_state(device, D_NEGOTIATING)) {
+ D_ASSERT(device, device->ldev->backing_bdev);
+ d_size = drbd_get_max_capacity(device->ldev);
+ rcu_read_lock();
+ u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ q_order_type = drbd_queue_order_type(device);
+ max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
+ put_ldev(device);
} else {
d_size = 0;
u_size = 0;
q_order_type = QUEUE_ORDERED_NONE;
+ max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
}
- p.d_size = cpu_to_be64(d_size);
- p.u_size = cpu_to_be64(u_size);
- p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
- p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
- p.queue_order_type = cpu_to_be16(q_order_type);
- p.dds_flags = cpu_to_be16(flags);
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+
+ if (peer_device->connection->agreed_pro_version <= 94)
+ max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ else if (peer_device->connection->agreed_pro_version < 100)
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ p->d_size = cpu_to_be64(d_size);
+ p->u_size = cpu_to_be64(u_size);
+ p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(device->this_bdev));
+ p->max_bio_size = cpu_to_be32(max_bio_size);
+ p->queue_order_type = cpu_to_be16(q_order_type);
+ p->dds_flags = cpu_to_be16(flags);
+ return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
}
/**
- * drbd_send_state() - Sends the drbd state to the peer
- * @mdev: DRBD device.
+ * drbd_send_current_state() - Sends the drbd state to the peer
+ * @peer_device: DRBD peer device.
*/
-int drbd_send_state(struct drbd_conf *mdev)
+int drbd_send_current_state(struct drbd_peer_device *peer_device)
{
- struct socket *sock;
- struct p_state p;
- int ok = 0;
+ struct drbd_socket *sock;
+ struct p_state *p;
- /* Grab state lock so we wont send state if we're in the middle
- * of a cluster wide state change on another thread */
- drbd_state_lock(mdev);
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(peer_device->device->state.i); /* Within the send mutex */
+ return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
+
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @peer_device: DRBD peer device.
+ * @state: the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state)
+{
+ struct drbd_socket *sock;
+ struct p_state *p;
- mutex_lock(&mdev->data.mutex);
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(state.i); /* Within the send mutex */
+ return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
- p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
- sock = mdev->data.socket;
+int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val)
+{
+ struct drbd_socket *sock;
+ struct p_req_state *p;
- if (likely(sock != NULL)) {
- ok = _drbd_send_cmd(mdev, sock, P_STATE,
- (struct p_header80 *)&p, sizeof(p), 0);
- }
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
+}
- mutex_unlock(&mdev->data.mutex);
+int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+ enum drbd_packet cmd;
+ struct drbd_socket *sock;
+ struct p_req_state *p;
- drbd_state_unlock(mdev);
- return ok;
+ cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
+ sock = &connection->data;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
}
-int drbd_send_state_req(struct drbd_conf *mdev,
- union drbd_state mask, union drbd_state val)
+void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode)
{
- struct p_req_state p;
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
+
+ sock = &peer_device->connection->meta;
+ p = drbd_prepare_command(peer_device, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
+ }
+}
- p.mask = cpu_to_be32(mask.i);
- p.val = cpu_to_be32(val.i);
+void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode)
+{
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
+ enum drbd_packet cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
- (struct p_header80 *)&p, sizeof(p));
+ sock = &connection->meta;
+ p = conn_prepare_command(connection, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
+ }
}
-int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
+static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
{
- struct p_req_state_reply p;
+ BUG_ON(code & ~0xf);
+ p->encoding = (p->encoding & ~0xf) | code;
+}
- p.retcode = cpu_to_be32(retcode);
+static void dcbp_set_start(struct p_compressed_bm *p, int set)
+{
+ p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
- return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
- (struct p_header80 *)&p, sizeof(p));
+static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+ BUG_ON(n & ~0x7);
+ p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
}
-int fill_bitmap_rle_bits(struct drbd_conf *mdev,
- struct p_compressed_bm *p,
- struct bm_xfer_ctx *c)
+static int fill_bitmap_rle_bits(struct drbd_device *device,
+ struct p_compressed_bm *p,
+ unsigned int size,
+ struct bm_xfer_ctx *c)
{
struct bitstream bs;
unsigned long plain_bits;
@@ -1992,19 +1083,21 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
unsigned long rl;
unsigned len;
unsigned toggle;
- int bits;
+ int bits, use_rle;
/* may we use this feature? */
- if ((mdev->sync_conf.use_rle == 0) ||
- (mdev->agreed_pro_version < 90))
- return 0;
+ rcu_read_lock();
+ use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle;
+ rcu_read_unlock();
+ if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90)
+ return 0;
if (c->bit_offset >= c->bm_bits)
return 0; /* nothing to do. */
/* use at most thus many bytes */
- bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
- memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
+ bitstream_init(&bs, p->code, size, 0);
+ memset(p->code, 0, size);
/* plain bits covered in this code string */
plain_bits = 0;
@@ -2016,8 +1109,8 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
/* see how much plain bits we can stuff into one packet
* using RLE and VLI. */
do {
- tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
- : _drbd_bm_find_next(mdev, c->bit_offset);
+ tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset)
+ : _drbd_bm_find_next(device, c->bit_offset);
if (tmp == -1UL)
tmp = c->bm_bits;
rl = tmp - c->bit_offset;
@@ -2026,18 +1119,18 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
if (rl == 0) {
/* the first checked bit was set,
* store start value, */
- DCBP_set_start(p, 1);
+ dcbp_set_start(p, 1);
/* but skip encoding of zero run length */
toggle = !toggle;
continue;
}
- DCBP_set_start(p, 0);
+ dcbp_set_start(p, 0);
}
/* paranoia: catch zero runlength.
* can only happen if bitmap is modified while we scan it. */
if (rl == 0) {
- dev_err(DEV, "unexpected zero runlength while encoding bitmap "
+ drbd_err(device, "unexpected zero runlength while encoding bitmap "
"t:%u bo:%lu\n", toggle, c->bit_offset);
return -1;
}
@@ -2046,7 +1139,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
if (bits == -ENOBUFS) /* buffer full */
break;
if (bits <= 0) {
- dev_err(DEV, "error while encoding bitmap: %d\n", bits);
+ drbd_err(device, "error while encoding bitmap: %d\n", bits);
return 0;
}
@@ -2071,285 +1164,293 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
bm_xfer_ctx_bit_to_word_offset(c);
/* store pad_bits */
- DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+ dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
return len;
}
-enum { OK, FAILED, DONE }
-send_bitmap_rle_or_plain(struct drbd_conf *mdev,
- struct p_header80 *h, struct bm_xfer_ctx *c)
+/**
+ * send_bitmap_rle_or_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
{
- struct p_compressed_bm *p = (void*)h;
- unsigned long num_words;
- int len;
- int ok;
-
- len = fill_bitmap_rle_bits(mdev, p, c);
+ struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+ unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+ struct p_compressed_bm *p = sock->sbuf + header_size;
+ int len, err;
+ len = fill_bitmap_rle_bits(device, p,
+ DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
if (len < 0)
- return FAILED;
+ return -EIO;
if (len) {
- DCBP_set_code(p, RLE_VLI_Bits);
- ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
- sizeof(*p) + len, 0);
-
+ dcbp_set_code(p, RLE_VLI_Bits);
+ err = __send_command(first_peer_device(device)->connection, device->vnr, sock,
+ P_COMPRESSED_BITMAP, sizeof(*p) + len,
+ NULL, 0);
c->packets[0]++;
- c->bytes[0] += sizeof(*p) + len;
+ c->bytes[0] += header_size + sizeof(*p) + len;
if (c->bit_offset >= c->bm_bits)
len = 0; /* DONE */
} else {
/* was not compressible.
* send a buffer full of plain text bits instead. */
- num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
- len = num_words * sizeof(long);
+ unsigned int data_size;
+ unsigned long num_words;
+ unsigned long *p = sock->sbuf + header_size;
+
+ data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ len = num_words * sizeof(*p);
if (len)
- drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
- ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
- h, sizeof(struct p_header80) + len, 0);
+ drbd_bm_get_lel(device, c->word_offset, num_words, p);
+ err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0);
c->word_offset += num_words;
c->bit_offset = c->word_offset * BITS_PER_LONG;
c->packets[1]++;
- c->bytes[1] += sizeof(struct p_header80) + len;
+ c->bytes[1] += header_size + len;
if (c->bit_offset > c->bm_bits)
c->bit_offset = c->bm_bits;
}
- ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
-
- if (ok == DONE)
- INFO_bm_xfer_stats(mdev, "send", c);
- return ok;
+ if (!err) {
+ if (len == 0) {
+ INFO_bm_xfer_stats(device, "send", c);
+ return 0;
+ } else
+ return 1;
+ }
+ return -EIO;
}
/* See the comment at receive_bitmap() */
-int _drbd_send_bitmap(struct drbd_conf *mdev)
+static int _drbd_send_bitmap(struct drbd_device *device)
{
struct bm_xfer_ctx c;
- struct p_header80 *p;
- int ret;
+ int err;
- ERR_IF(!mdev->bitmap) return FALSE;
+ if (!expect(device->bitmap))
+ return false;
- /* maybe we should use some per thread scratch page,
- * and allocate that during initial device creation? */
- p = (struct p_header80 *) __get_free_page(GFP_NOIO);
- if (!p) {
- dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
- return FALSE;
- }
-
- if (get_ldev(mdev)) {
- if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
- dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
- drbd_bm_set_all(mdev);
- if (drbd_bm_write(mdev)) {
+ if (get_ldev(device)) {
+ if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) {
+ drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n");
+ drbd_bm_set_all(device);
+ if (drbd_bm_write(device)) {
/* write_bm did fail! Leave full sync flag set in Meta P_DATA
* but otherwise process as per normal - need to tell other
* side that a full resync is required! */
- dev_err(DEV, "Failed to write bitmap to disk!\n");
+ drbd_err(device, "Failed to write bitmap to disk!\n");
} else {
- drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
- drbd_md_sync(mdev);
+ drbd_md_clear_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
}
}
- put_ldev(mdev);
+ put_ldev(device);
}
c = (struct bm_xfer_ctx) {
- .bm_bits = drbd_bm_bits(mdev),
- .bm_words = drbd_bm_words(mdev),
+ .bm_bits = drbd_bm_bits(device),
+ .bm_words = drbd_bm_words(device),
};
do {
- ret = send_bitmap_rle_or_plain(mdev, p, &c);
- } while (ret == OK);
+ err = send_bitmap_rle_or_plain(device, &c);
+ } while (err > 0);
- free_page((unsigned long) p);
- return (ret == DONE);
+ return err == 0;
}
-int drbd_send_bitmap(struct drbd_conf *mdev)
+int drbd_send_bitmap(struct drbd_device *device)
{
- int err;
+ struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+ int err = -1;
- if (!drbd_get_data_sock(mdev))
- return -1;
- err = !_drbd_send_bitmap(mdev);
- drbd_put_data_sock(mdev);
+ mutex_lock(&sock->mutex);
+ if (sock->socket)
+ err = !_drbd_send_bitmap(device);
+ mutex_unlock(&sock->mutex);
return err;
}
-int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
+void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size)
{
- int ok;
- struct p_barrier_ack p;
+ struct drbd_socket *sock;
+ struct p_barrier_ack *p;
- p.barrier = barrier_nr;
- p.set_size = cpu_to_be32(set_size);
+ if (connection->cstate < C_WF_REPORT_PARAMS)
+ return;
- if (mdev->state.conn < C_CONNECTED)
- return FALSE;
- ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &connection->meta;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return;
+ p->barrier = barrier_nr;
+ p->set_size = cpu_to_be32(set_size);
+ conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
}
/**
* _drbd_send_ack() - Sends an ack packet
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @cmd: Packet command code.
* @sector: sector, needs to be in big endian byte order
* @blksize: size in byte, needs to be in big endian byte order
* @block_id: Id, big endian byte order
*/
-static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
- u64 sector,
- u32 blksize,
- u64 block_id)
+static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ u64 sector, u32 blksize, u64 block_id)
{
- int ok;
- struct p_block_ack p;
+ struct drbd_socket *sock;
+ struct p_block_ack *p;
- p.sector = sector;
- p.block_id = block_id;
- p.blksize = blksize;
- p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+ if (peer_device->device->state.conn < C_CONNECTED)
+ return -EIO;
- if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
- return FALSE;
- ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &peer_device->connection->meta;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = sector;
+ p->block_id = block_id;
+ p->blksize = blksize;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq));
+ return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
}
/* dp->sector and dp->block_id already/still in network byte order,
* data_size is payload size according to dp->head,
* and may need to be corrected for digest size. */
-int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp, int data_size)
+void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct p_data *dp, int data_size)
{
- data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
- crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
- return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
- dp->block_id);
+ if (peer_device->connection->peer_integrity_tfm)
+ data_size -= crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+ _drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size),
+ dp->block_id);
}
-int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_block_req *rp)
+void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct p_block_req *rp)
{
- return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
+ _drbd_send_ack(peer_device, cmd, rp->sector, rp->blksize, rp->block_id);
}
/**
* drbd_send_ack() - Sends an ack packet
- * @mdev: DRBD device.
- * @cmd: Packet command code.
- * @e: Epoch entry.
+ * @device: DRBD device
+ * @cmd: packet command code
+ * @peer_req: peer request
*/
-int drbd_send_ack(struct drbd_conf *mdev,
- enum drbd_packets cmd, struct drbd_epoch_entry *e)
+int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
{
- return _drbd_send_ack(mdev, cmd,
- cpu_to_be64(e->sector),
- cpu_to_be32(e->size),
- e->block_id);
+ return _drbd_send_ack(peer_device, cmd,
+ cpu_to_be64(peer_req->i.sector),
+ cpu_to_be32(peer_req->i.size),
+ peer_req->block_id);
}
/* This function misuses the block_id field to signal if the blocks
* are is sync or not. */
-int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
sector_t sector, int blksize, u64 block_id)
{
- return _drbd_send_ack(mdev, cmd,
+ return _drbd_send_ack(peer_device, cmd,
cpu_to_be64(sector),
cpu_to_be32(blksize),
cpu_to_be64(block_id));
}
-int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
sector_t sector, int size, u64 block_id)
{
- int ok;
- struct p_block_req p;
-
- p.sector = cpu_to_be64(sector);
- p.block_id = block_id;
- p.blksize = cpu_to_be32(size);
+ struct drbd_socket *sock;
+ struct p_block_req *p;
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = block_id;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
}
-int drbd_send_drequest_csum(struct drbd_conf *mdev,
- sector_t sector, int size,
- void *digest, int digest_size,
- enum drbd_packets cmd)
+int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size,
+ void *digest, int digest_size, enum drbd_packet cmd)
{
- int ok;
- struct p_block_req p;
-
- p.sector = cpu_to_be64(sector);
- p.block_id = BE_DRBD_MAGIC + 0xbeef;
- p.blksize = cpu_to_be32(size);
-
- p.head.magic = BE_DRBD_MAGIC;
- p.head.command = cpu_to_be16(cmd);
- p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
-
- mutex_lock(&mdev->data.mutex);
+ struct drbd_socket *sock;
+ struct p_block_req *p;
- ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
- ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
+ /* FIXME: Put the digest into the preallocated socket buffer. */
- mutex_unlock(&mdev->data.mutex);
-
- return ok;
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(peer_device, sock, cmd, sizeof(*p), digest, digest_size);
}
-int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
+int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size)
{
- int ok;
- struct p_block_req p;
-
- p.sector = cpu_to_be64(sector);
- p.block_id = BE_DRBD_MAGIC + 0xbabe;
- p.blksize = cpu_to_be32(size);
+ struct drbd_socket *sock;
+ struct p_block_req *p;
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(peer_device, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
}
/* called on sndtimeo
- * returns FALSE if we should retry,
- * TRUE if we think connection is dead
+ * returns false if we should retry,
+ * true if we think connection is dead
*/
-static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
+static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock)
{
int drop_it;
- /* long elapsed = (long)(jiffies - mdev->last_received); */
+ /* long elapsed = (long)(jiffies - device->last_received); */
- drop_it = mdev->meta.socket == sock
- || !mdev->asender.task
- || get_t_state(&mdev->asender) != Running
- || mdev->state.conn < C_CONNECTED;
+ drop_it = connection->meta.socket == sock
+ || !connection->asender.task
+ || get_t_state(&connection->asender) != RUNNING
+ || connection->cstate < C_WF_REPORT_PARAMS;
if (drop_it)
- return TRUE;
+ return true;
- drop_it = !--mdev->ko_count;
+ drop_it = !--connection->ko_count;
if (!drop_it) {
- dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
- current->comm, current->pid, mdev->ko_count);
- request_ping(mdev);
+ drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+ current->comm, current->pid, connection->ko_count);
+ request_ping(connection);
}
- return drop_it; /* && (mdev->state == R_PRIMARY) */;
+ return drop_it; /* && (device->state == R_PRIMARY) */;
+}
+
+static void drbd_update_congested(struct drbd_connection *connection)
+{
+ struct sock *sk = connection->data.socket->sk;
+ if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+ set_bit(NET_CONGESTED, &connection->flags);
}
/* The idea of sendpage seems to be to put some kind of reference
@@ -2373,22 +1474,29 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *
* As a workaround, we disable sendpage on pages
* with page_count == 0 or PageSlab.
*/
-static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
- int offset, size_t size, unsigned msg_flags)
+static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page,
+ int offset, size_t size, unsigned msg_flags)
{
- int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
+ struct socket *socket;
+ void *addr;
+ int err;
+
+ socket = peer_device->connection->data.socket;
+ addr = kmap(page) + offset;
+ err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags);
kunmap(page);
- if (sent == size)
- mdev->send_cnt += size>>9;
- return sent == size;
+ if (!err)
+ peer_device->device->send_cnt += size >> 9;
+ return err;
}
-static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
+static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page,
int offset, size_t size, unsigned msg_flags)
{
+ struct socket *socket = peer_device->connection->data.socket;
mm_segment_t oldfs = get_fs();
- int sent, ok;
int len = size;
+ int err = -EIO;
/* e.g. XFS meta- & log-data is in slab pages, which have a
* page_count of 0 and/or have PageSlab() set.
@@ -2397,205 +1505,239 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
* __page_cache_release a page that would actually still be referenced
* by someone, leading to some obscure delayed Oops somewhere else. */
if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
- return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
+ return _drbd_no_send_page(peer_device, page, offset, size, msg_flags);
msg_flags |= MSG_NOSIGNAL;
- drbd_update_congested(mdev);
+ drbd_update_congested(peer_device->connection);
set_fs(KERNEL_DS);
do {
- sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
- offset, len,
- msg_flags);
- if (sent == -EAGAIN) {
- if (we_should_drop_the_connection(mdev,
- mdev->data.socket))
- break;
- else
- continue;
- }
+ int sent;
+
+ sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
if (sent <= 0) {
- dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
+ if (sent == -EAGAIN) {
+ if (we_should_drop_the_connection(peer_device->connection, socket))
+ break;
+ continue;
+ }
+ drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n",
__func__, (int)size, len, sent);
+ if (sent < 0)
+ err = sent;
break;
}
len -= sent;
offset += sent;
- } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
+ } while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/);
set_fs(oldfs);
- clear_bit(NET_CONGESTED, &mdev->flags);
+ clear_bit(NET_CONGESTED, &peer_device->connection->flags);
- ok = (len == 0);
- if (likely(ok))
- mdev->send_cnt += size>>9;
- return ok;
+ if (len == 0) {
+ err = 0;
+ peer_device->device->send_cnt += size >> 9;
+ }
+ return err;
}
-static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
+static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
{
- struct bio_vec *bvec;
- int i;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
/* hint all but last page with MSG_MORE */
- __bio_for_each_segment(bvec, bio, i, 0) {
- if (!_drbd_no_send_page(mdev, bvec->bv_page,
- bvec->bv_offset, bvec->bv_len,
- i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
- return 0;
+ bio_for_each_segment(bvec, bio, iter) {
+ int err;
+
+ err = _drbd_no_send_page(peer_device, bvec.bv_page,
+ bvec.bv_offset, bvec.bv_len,
+ bio_iter_last(bvec, iter)
+ ? 0 : MSG_MORE);
+ if (err)
+ return err;
}
- return 1;
+ return 0;
}
-static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
+static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio)
{
- struct bio_vec *bvec;
- int i;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
/* hint all but last page with MSG_MORE */
- __bio_for_each_segment(bvec, bio, i, 0) {
- if (!_drbd_send_page(mdev, bvec->bv_page,
- bvec->bv_offset, bvec->bv_len,
- i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
- return 0;
+ bio_for_each_segment(bvec, bio, iter) {
+ int err;
+
+ err = _drbd_send_page(peer_device, bvec.bv_page,
+ bvec.bv_offset, bvec.bv_len,
+ bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
+ if (err)
+ return err;
}
- return 1;
+ return 0;
}
-static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
+ struct drbd_peer_request *peer_req)
{
- struct page *page = e->pages;
- unsigned len = e->size;
+ struct page *page = peer_req->pages;
+ unsigned len = peer_req->i.size;
+ int err;
+
/* hint all but last page with MSG_MORE */
page_chain_for_each(page) {
unsigned l = min_t(unsigned, len, PAGE_SIZE);
- if (!_drbd_send_page(mdev, page, 0, l,
- page_chain_next(page) ? MSG_MORE : 0))
- return 0;
+
+ err = _drbd_send_page(peer_device, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+ if (err)
+ return err;
len -= l;
}
- return 1;
+ return 0;
}
-static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
+static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long bi_rw)
{
- if (mdev->agreed_pro_version >= 95)
+ if (connection->agreed_pro_version >= 95)
return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
- (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
(bi_rw & REQ_FUA ? DP_FUA : 0) |
(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
(bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
else
- return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
+ return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
}
-/* Used to send write requests
- * R_PRIMARY -> Peer (P_DATA)
+/* Used to send write or TRIM aka REQ_DISCARD requests
+ * R_PRIMARY -> Peer (P_DATA, P_TRIM)
*/
-int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
+int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
{
- int ok = 1;
- struct p_data p;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_data *p;
unsigned int dp_flags = 0;
- void *dgb;
int dgs;
+ int err;
- if (!drbd_get_data_sock(mdev))
- return 0;
-
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
- crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
-
- if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
- p.head.h80.magic = BE_DRBD_MAGIC;
- p.head.h80.command = cpu_to_be16(P_DATA);
- p.head.h80.length =
- cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
- } else {
- p.head.h95.magic = BE_DRBD_MAGIC_BIG;
- p.head.h95.command = cpu_to_be16(P_DATA);
- p.head.h95.length =
- cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
- }
-
- p.sector = cpu_to_be64(req->sector);
- p.block_id = (unsigned long)req;
- p.seq_num = cpu_to_be32(req->seq_num =
- atomic_add_return(1, &mdev->packet_seq));
-
- dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
-
- if (mdev->state.conn >= C_SYNC_SOURCE &&
- mdev->state.conn <= C_PAUSED_SYNC_T)
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ dgs = peer_device->connection->integrity_tfm ?
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->block_id = (unsigned long)req;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
+ dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio->bi_rw);
+ if (device->state.conn >= C_SYNC_SOURCE &&
+ device->state.conn <= C_PAUSED_SYNC_T)
dp_flags |= DP_MAY_SET_IN_SYNC;
+ if (peer_device->connection->agreed_pro_version >= 100) {
+ if (req->rq_state & RQ_EXP_RECEIVE_ACK)
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ if (req->rq_state & RQ_EXP_WRITE_ACK)
+ dp_flags |= DP_SEND_WRITE_ACK;
+ }
+ p->dp_flags = cpu_to_be32(dp_flags);
+
+ if (dp_flags & DP_DISCARD) {
+ struct p_trim *t = (struct p_trim*)p;
+ t->size = cpu_to_be32(req->i.size);
+ err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
+ goto out;
+ }
- p.dp_flags = cpu_to_be32(dp_flags);
- set_bit(UNPLUG_REMOTE, &mdev->flags);
- ok = (sizeof(p) ==
- drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
- if (ok && dgs) {
- dgb = mdev->int_dig_out;
- drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
- ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
- }
- if (ok) {
- if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
- ok = _drbd_send_bio(mdev, req->master_bio);
+ /* our digest is still only over the payload.
+ * TRIM does not carry any payload. */
+ if (dgs)
+ drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
+ err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
+ if (!err) {
+ /* For protocol A, we have to memcpy the payload into
+ * socket buffers, as we may complete right away
+ * as soon as we handed it over to tcp, at which point the data
+ * pages may become invalid.
+ *
+ * For data-integrity enabled, we copy it as well, so we can be
+ * sure that even if the bio pages may still be modified, it
+ * won't change the data on the wire, thus if the digest checks
+ * out ok after sending on this side, but does not fit on the
+ * receiving side, we sure have detected corruption elsewhere.
+ */
+ if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs)
+ err = _drbd_send_bio(peer_device, req->master_bio);
else
- ok = _drbd_send_zc_bio(mdev, req->master_bio);
+ err = _drbd_send_zc_bio(peer_device, req->master_bio);
+
+ /* double check digest, sometimes buffers have been modified in flight. */
+ if (dgs > 0 && dgs <= 64) {
+ /* 64 byte, 512 bit, is the largest digest size
+ * currently supported in kernel crypto. */
+ unsigned char digest[64];
+ drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest);
+ if (memcmp(p + 1, digest, dgs)) {
+ drbd_warn(device,
+ "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
+ (unsigned long long)req->i.sector, req->i.size);
+ }
+ } /* else if (dgs > 64) {
+ ... Be noisy about digest too large ...
+ } */
}
+out:
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
- drbd_put_data_sock(mdev);
-
- return ok;
+ return err;
}
/* answer packet, used to send data back for read requests:
* Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
* C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
*/
-int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct drbd_epoch_entry *e)
+int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
{
- int ok;
- struct p_data p;
- void *dgb;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_data *p;
+ int err;
int dgs;
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
- crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
-
- if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
- p.head.h80.magic = BE_DRBD_MAGIC;
- p.head.h80.command = cpu_to_be16(cmd);
- p.head.h80.length =
- cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
- } else {
- p.head.h95.magic = BE_DRBD_MAGIC_BIG;
- p.head.h95.command = cpu_to_be16(cmd);
- p.head.h95.length =
- cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
- }
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+
+ dgs = peer_device->connection->integrity_tfm ?
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(peer_req->i.sector);
+ p->block_id = peer_req->block_id;
+ p->seq_num = 0; /* unused */
+ p->dp_flags = 0;
+ if (dgs)
+ drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1);
+ err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size);
+ if (!err)
+ err = _drbd_send_zc_ee(peer_device, peer_req);
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
- p.sector = cpu_to_be64(e->sector);
- p.block_id = e->block_id;
- /* p.seq_num = 0; No sequence numbers here.. */
-
- /* Only called by our kernel thread.
- * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
- * in response to admin command or module unload.
- */
- if (!drbd_get_data_sock(mdev))
- return 0;
-
- ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
- if (ok && dgs) {
- dgb = mdev->int_dig_out;
- drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
- ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
- }
- if (ok)
- ok = _drbd_send_zc_ee(mdev, e);
+ return err;
+}
- drbd_put_data_sock(mdev);
+int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_socket *sock;
+ struct p_block_desc *p;
- return ok;
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->blksize = cpu_to_be32(req->i.size);
+ return drbd_send_command(peer_device, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
}
/*
@@ -2614,7 +1756,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
/*
* you must have down()ed the appropriate [m]sock_mutex elsewhere!
*/
-int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+int drbd_send(struct drbd_connection *connection, struct socket *sock,
void *buf, size_t size, unsigned msg_flags)
{
struct kvec iov;
@@ -2622,7 +1764,7 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
int rv, sent = 0;
if (!sock)
- return -1000;
+ return -EBADR;
/* THINK if (signal_pending) return ... ? */
@@ -2635,9 +1777,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
msg.msg_controllen = 0;
msg.msg_flags = msg_flags | MSG_NOSIGNAL;
- if (sock == mdev->data.socket) {
- mdev->ko_count = mdev->net_conf->ko_count;
- drbd_update_congested(mdev);
+ if (sock == connection->data.socket) {
+ rcu_read_lock();
+ connection->ko_count = rcu_dereference(connection->net_conf)->ko_count;
+ rcu_read_unlock();
+ drbd_update_congested(connection);
}
do {
/* STRANGE
@@ -2651,12 +1795,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
*/
rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
if (rv == -EAGAIN) {
- if (we_should_drop_the_connection(mdev, sock))
+ if (we_should_drop_the_connection(connection, sock))
break;
else
continue;
}
- D_ASSERT(rv != 0);
if (rv == -EINTR) {
flush_signals(current);
rv = 0;
@@ -2668,34 +1811,52 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
iov.iov_len -= rv;
} while (sent < size);
- if (sock == mdev->data.socket)
- clear_bit(NET_CONGESTED, &mdev->flags);
+ if (sock == connection->data.socket)
+ clear_bit(NET_CONGESTED, &connection->flags);
if (rv <= 0) {
if (rv != -EAGAIN) {
- dev_err(DEV, "%s_sendmsg returned %d\n",
- sock == mdev->meta.socket ? "msock" : "sock",
- rv);
- drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+ drbd_err(connection, "%s_sendmsg returned %d\n",
+ sock == connection->meta.socket ? "msock" : "sock",
+ rv);
+ conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
} else
- drbd_force_state(mdev, NS(conn, C_TIMEOUT));
+ conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
}
return sent;
}
+/**
+ * drbd_send_all - Send an entire buffer
+ *
+ * Returns 0 upon success and a negative error value otherwise.
+ */
+int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer,
+ size_t size, unsigned msg_flags)
+{
+ int err;
+
+ err = drbd_send(connection, sock, buffer, size, msg_flags);
+ if (err < 0)
+ return err;
+ if (err != size)
+ return -EIO;
+ return 0;
+}
+
static int drbd_open(struct block_device *bdev, fmode_t mode)
{
- struct drbd_conf *mdev = bdev->bd_disk->private_data;
+ struct drbd_device *device = bdev->bd_disk->private_data;
unsigned long flags;
int rv = 0;
mutex_lock(&drbd_main_mutex);
- spin_lock_irqsave(&mdev->req_lock, flags);
- /* to have a stable mdev->state.role
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ /* to have a stable device->state.role
* and no race with updating open_cnt */
- if (mdev->state.role != R_PRIMARY) {
+ if (device->state.role != R_PRIMARY) {
if (mode & FMODE_WRITE)
rv = -EROFS;
else if (!allow_oos)
@@ -2703,214 +1864,151 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
}
if (!rv)
- mdev->open_cnt++;
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ device->open_cnt++;
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
mutex_unlock(&drbd_main_mutex);
return rv;
}
-static int drbd_release(struct gendisk *gd, fmode_t mode)
+static void drbd_release(struct gendisk *gd, fmode_t mode)
{
- struct drbd_conf *mdev = gd->private_data;
+ struct drbd_device *device = gd->private_data;
mutex_lock(&drbd_main_mutex);
- mdev->open_cnt--;
+ device->open_cnt--;
mutex_unlock(&drbd_main_mutex);
- return 0;
}
-static void drbd_unplug_fn(struct request_queue *q)
+static void drbd_set_defaults(struct drbd_device *device)
{
- struct drbd_conf *mdev = q->queuedata;
-
- /* unplug FIRST */
- spin_lock_irq(q->queue_lock);
- blk_remove_plug(q);
- spin_unlock_irq(q->queue_lock);
-
- /* only if connected */
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
- D_ASSERT(mdev->state.role == R_PRIMARY);
- if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
- /* add to the data.work queue,
- * unless already queued.
- * XXX this might be a good addition to drbd_queue_work
- * anyways, to detect "double queuing" ... */
- if (list_empty(&mdev->unplug_work.list))
- drbd_queue_work(&mdev->data.work,
- &mdev->unplug_work);
- }
- }
- spin_unlock_irq(&mdev->req_lock);
-
- if (mdev->state.disk >= D_INCONSISTENT)
- drbd_kick_lo(mdev);
-}
-
-static void drbd_set_defaults(struct drbd_conf *mdev)
-{
- /* This way we get a compile error when sync_conf grows,
- and we forgot to initialize it here */
- mdev->sync_conf = (struct syncer_conf) {
- /* .rate = */ DRBD_RATE_DEF,
- /* .after = */ DRBD_AFTER_DEF,
- /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
- /* .verify_alg = */ {}, 0,
- /* .cpu_mask = */ {}, 0,
- /* .csums_alg = */ {}, 0,
- /* .use_rle = */ 0,
- /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
- /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
- /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
- /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
- /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
- /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
- };
-
- /* Have to use that way, because the layout differs between
- big endian and little endian */
- mdev->state = (union drbd_state) {
+ /* Beware! The actual layout differs
+ * between big endian and little endian */
+ device->state = (union drbd_dev_state) {
{ .role = R_SECONDARY,
.peer = R_UNKNOWN,
.conn = C_STANDALONE,
.disk = D_DISKLESS,
.pdsk = D_UNKNOWN,
- .susp = 0,
- .susp_nod = 0,
- .susp_fen = 0
} };
}
-void drbd_init_set_defaults(struct drbd_conf *mdev)
+void drbd_init_set_defaults(struct drbd_device *device)
{
/* the memset(,0,) did most of this.
* note: only assignments, no allocation in here */
- drbd_set_defaults(mdev);
-
- atomic_set(&mdev->ap_bio_cnt, 0);
- atomic_set(&mdev->ap_pending_cnt, 0);
- atomic_set(&mdev->rs_pending_cnt, 0);
- atomic_set(&mdev->unacked_cnt, 0);
- atomic_set(&mdev->local_cnt, 0);
- atomic_set(&mdev->net_cnt, 0);
- atomic_set(&mdev->packet_seq, 0);
- atomic_set(&mdev->pp_in_use, 0);
- atomic_set(&mdev->pp_in_use_by_net, 0);
- atomic_set(&mdev->rs_sect_in, 0);
- atomic_set(&mdev->rs_sect_ev, 0);
-
- mutex_init(&mdev->md_io_mutex);
- mutex_init(&mdev->data.mutex);
- mutex_init(&mdev->meta.mutex);
- sema_init(&mdev->data.work.s, 0);
- sema_init(&mdev->meta.work.s, 0);
- mutex_init(&mdev->state_mutex);
-
- spin_lock_init(&mdev->data.work.q_lock);
- spin_lock_init(&mdev->meta.work.q_lock);
-
- spin_lock_init(&mdev->al_lock);
- spin_lock_init(&mdev->req_lock);
- spin_lock_init(&mdev->peer_seq_lock);
- spin_lock_init(&mdev->epoch_lock);
-
- INIT_LIST_HEAD(&mdev->active_ee);
- INIT_LIST_HEAD(&mdev->sync_ee);
- INIT_LIST_HEAD(&mdev->done_ee);
- INIT_LIST_HEAD(&mdev->read_ee);
- INIT_LIST_HEAD(&mdev->net_ee);
- INIT_LIST_HEAD(&mdev->resync_reads);
- INIT_LIST_HEAD(&mdev->data.work.q);
- INIT_LIST_HEAD(&mdev->meta.work.q);
- INIT_LIST_HEAD(&mdev->resync_work.list);
- INIT_LIST_HEAD(&mdev->unplug_work.list);
- INIT_LIST_HEAD(&mdev->go_diskless.list);
- INIT_LIST_HEAD(&mdev->md_sync_work.list);
- INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
-
- mdev->resync_work.cb = w_resync_inactive;
- mdev->unplug_work.cb = w_send_write_hint;
- mdev->go_diskless.cb = w_go_diskless;
- mdev->md_sync_work.cb = w_md_sync;
- mdev->bm_io_work.w.cb = w_bitmap_io;
- init_timer(&mdev->resync_timer);
- init_timer(&mdev->md_sync_timer);
- mdev->resync_timer.function = resync_timer_fn;
- mdev->resync_timer.data = (unsigned long) mdev;
- mdev->md_sync_timer.function = md_sync_timer_fn;
- mdev->md_sync_timer.data = (unsigned long) mdev;
-
- init_waitqueue_head(&mdev->misc_wait);
- init_waitqueue_head(&mdev->state_wait);
- init_waitqueue_head(&mdev->net_cnt_wait);
- init_waitqueue_head(&mdev->ee_wait);
- init_waitqueue_head(&mdev->al_wait);
- init_waitqueue_head(&mdev->seq_wait);
-
- drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
- drbd_thread_init(mdev, &mdev->worker, drbd_worker);
- drbd_thread_init(mdev, &mdev->asender, drbd_asender);
-
- mdev->agreed_pro_version = PRO_VERSION_MAX;
- mdev->write_ordering = WO_bdev_flush;
- mdev->resync_wenr = LC_FREE;
-}
-
-void drbd_mdev_cleanup(struct drbd_conf *mdev)
+ drbd_set_defaults(device);
+
+ atomic_set(&device->ap_bio_cnt, 0);
+ atomic_set(&device->ap_pending_cnt, 0);
+ atomic_set(&device->rs_pending_cnt, 0);
+ atomic_set(&device->unacked_cnt, 0);
+ atomic_set(&device->local_cnt, 0);
+ atomic_set(&device->pp_in_use_by_net, 0);
+ atomic_set(&device->rs_sect_in, 0);
+ atomic_set(&device->rs_sect_ev, 0);
+ atomic_set(&device->ap_in_flight, 0);
+ atomic_set(&device->md_io_in_use, 0);
+
+ mutex_init(&device->own_state_mutex);
+ device->state_mutex = &device->own_state_mutex;
+
+ spin_lock_init(&device->al_lock);
+ spin_lock_init(&device->peer_seq_lock);
+
+ INIT_LIST_HEAD(&device->active_ee);
+ INIT_LIST_HEAD(&device->sync_ee);
+ INIT_LIST_HEAD(&device->done_ee);
+ INIT_LIST_HEAD(&device->read_ee);
+ INIT_LIST_HEAD(&device->net_ee);
+ INIT_LIST_HEAD(&device->resync_reads);
+ INIT_LIST_HEAD(&device->resync_work.list);
+ INIT_LIST_HEAD(&device->unplug_work.list);
+ INIT_LIST_HEAD(&device->go_diskless.list);
+ INIT_LIST_HEAD(&device->md_sync_work.list);
+ INIT_LIST_HEAD(&device->start_resync_work.list);
+ INIT_LIST_HEAD(&device->bm_io_work.w.list);
+
+ device->resync_work.cb = w_resync_timer;
+ device->unplug_work.cb = w_send_write_hint;
+ device->go_diskless.cb = w_go_diskless;
+ device->md_sync_work.cb = w_md_sync;
+ device->bm_io_work.w.cb = w_bitmap_io;
+ device->start_resync_work.cb = w_start_resync;
+
+ init_timer(&device->resync_timer);
+ init_timer(&device->md_sync_timer);
+ init_timer(&device->start_resync_timer);
+ init_timer(&device->request_timer);
+ device->resync_timer.function = resync_timer_fn;
+ device->resync_timer.data = (unsigned long) device;
+ device->md_sync_timer.function = md_sync_timer_fn;
+ device->md_sync_timer.data = (unsigned long) device;
+ device->start_resync_timer.function = start_resync_timer_fn;
+ device->start_resync_timer.data = (unsigned long) device;
+ device->request_timer.function = request_timer_fn;
+ device->request_timer.data = (unsigned long) device;
+
+ init_waitqueue_head(&device->misc_wait);
+ init_waitqueue_head(&device->state_wait);
+ init_waitqueue_head(&device->ee_wait);
+ init_waitqueue_head(&device->al_wait);
+ init_waitqueue_head(&device->seq_wait);
+
+ device->resync_wenr = LC_FREE;
+ device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+ device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+}
+
+void drbd_device_cleanup(struct drbd_device *device)
{
int i;
- if (mdev->receiver.t_state != None)
- dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
- mdev->receiver.t_state);
-
- /* no need to lock it, I'm the only thread alive */
- if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
- dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
- mdev->al_writ_cnt =
- mdev->bm_writ_cnt =
- mdev->read_cnt =
- mdev->recv_cnt =
- mdev->send_cnt =
- mdev->writ_cnt =
- mdev->p_size =
- mdev->rs_start =
- mdev->rs_total =
- mdev->rs_failed = 0;
- mdev->rs_last_events = 0;
- mdev->rs_last_sect_ev = 0;
+ if (first_peer_device(device)->connection->receiver.t_state != NONE)
+ drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
+ first_peer_device(device)->connection->receiver.t_state);
+
+ device->al_writ_cnt =
+ device->bm_writ_cnt =
+ device->read_cnt =
+ device->recv_cnt =
+ device->send_cnt =
+ device->writ_cnt =
+ device->p_size =
+ device->rs_start =
+ device->rs_total =
+ device->rs_failed = 0;
+ device->rs_last_events = 0;
+ device->rs_last_sect_ev = 0;
for (i = 0; i < DRBD_SYNC_MARKS; i++) {
- mdev->rs_mark_left[i] = 0;
- mdev->rs_mark_time[i] = 0;
+ device->rs_mark_left[i] = 0;
+ device->rs_mark_time[i] = 0;
}
- D_ASSERT(mdev->net_conf == NULL);
+ D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL);
- drbd_set_my_capacity(mdev, 0);
- if (mdev->bitmap) {
+ drbd_set_my_capacity(device, 0);
+ if (device->bitmap) {
/* maybe never allocated. */
- drbd_bm_resize(mdev, 0, 1);
- drbd_bm_cleanup(mdev);
+ drbd_bm_resize(device, 0, 1);
+ drbd_bm_cleanup(device);
}
- drbd_free_resources(mdev);
- clear_bit(AL_SUSPENDED, &mdev->flags);
+ drbd_free_bc(device->ldev);
+ device->ldev = NULL;
- /*
- * currently we drbd_init_ee only on module load, so
- * we may do drbd_release_ee only on module unload!
- */
- D_ASSERT(list_empty(&mdev->active_ee));
- D_ASSERT(list_empty(&mdev->sync_ee));
- D_ASSERT(list_empty(&mdev->done_ee));
- D_ASSERT(list_empty(&mdev->read_ee));
- D_ASSERT(list_empty(&mdev->net_ee));
- D_ASSERT(list_empty(&mdev->resync_reads));
- D_ASSERT(list_empty(&mdev->data.work.q));
- D_ASSERT(list_empty(&mdev->meta.work.q));
- D_ASSERT(list_empty(&mdev->resync_work.list));
- D_ASSERT(list_empty(&mdev->unplug_work.list));
- D_ASSERT(list_empty(&mdev->go_diskless.list));
+ clear_bit(AL_SUSPENDED, &device->flags);
+
+ D_ASSERT(device, list_empty(&device->active_ee));
+ D_ASSERT(device, list_empty(&device->sync_ee));
+ D_ASSERT(device, list_empty(&device->done_ee));
+ D_ASSERT(device, list_empty(&device->read_ee));
+ D_ASSERT(device, list_empty(&device->net_ee));
+ D_ASSERT(device, list_empty(&device->resync_reads));
+ D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
+ D_ASSERT(device, list_empty(&device->resync_work.list));
+ D_ASSERT(device, list_empty(&device->unplug_work.list));
+ D_ASSERT(device, list_empty(&device->go_diskless.list));
+
+ drbd_set_defaults(device);
}
@@ -2925,8 +2023,12 @@ static void drbd_destroy_mempools(void)
drbd_pp_vacant--;
}
- /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
+ /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
+ if (drbd_md_io_bio_set)
+ bioset_free(drbd_md_io_bio_set);
+ if (drbd_md_io_page_pool)
+ mempool_destroy(drbd_md_io_page_pool);
if (drbd_ee_mempool)
mempool_destroy(drbd_ee_mempool);
if (drbd_request_mempool)
@@ -2940,6 +2042,8 @@ static void drbd_destroy_mempools(void)
if (drbd_al_ext_cache)
kmem_cache_destroy(drbd_al_ext_cache);
+ drbd_md_io_bio_set = NULL;
+ drbd_md_io_page_pool = NULL;
drbd_ee_mempool = NULL;
drbd_request_mempool = NULL;
drbd_ee_cache = NULL;
@@ -2953,7 +2057,7 @@ static void drbd_destroy_mempools(void)
static int drbd_create_mempools(void)
{
struct page *page;
- const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
+ const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
int i;
/* prepare our caches and mempools */
@@ -2963,6 +2067,8 @@ static int drbd_create_mempools(void)
drbd_bm_ext_cache = NULL;
drbd_al_ext_cache = NULL;
drbd_pp_pool = NULL;
+ drbd_md_io_page_pool = NULL;
+ drbd_md_io_bio_set = NULL;
/* caches */
drbd_request_cache = kmem_cache_create(
@@ -2971,7 +2077,7 @@ static int drbd_create_mempools(void)
goto Enomem;
drbd_ee_cache = kmem_cache_create(
- "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
+ "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
if (drbd_ee_cache == NULL)
goto Enomem;
@@ -2986,6 +2092,14 @@ static int drbd_create_mempools(void)
goto Enomem;
/* mempools */
+ drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_bio_set == NULL)
+ goto Enomem;
+
+ drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_page_pool == NULL)
+ goto Enomem;
+
drbd_request_mempool = mempool_create(number,
mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
if (drbd_request_mempool == NULL)
@@ -3029,289 +2143,758 @@ static struct notifier_block drbd_notifier = {
.notifier_call = drbd_notify_sys,
};
-static void drbd_release_ee_lists(struct drbd_conf *mdev)
+static void drbd_release_all_peer_reqs(struct drbd_device *device)
{
int rr;
- rr = drbd_release_ee(mdev, &mdev->active_ee);
+ rr = drbd_free_peer_reqs(device, &device->active_ee);
if (rr)
- dev_err(DEV, "%d EEs in active list found!\n", rr);
+ drbd_err(device, "%d EEs in active list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->sync_ee);
+ rr = drbd_free_peer_reqs(device, &device->sync_ee);
if (rr)
- dev_err(DEV, "%d EEs in sync list found!\n", rr);
+ drbd_err(device, "%d EEs in sync list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->read_ee);
+ rr = drbd_free_peer_reqs(device, &device->read_ee);
if (rr)
- dev_err(DEV, "%d EEs in read list found!\n", rr);
+ drbd_err(device, "%d EEs in read list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->done_ee);
+ rr = drbd_free_peer_reqs(device, &device->done_ee);
if (rr)
- dev_err(DEV, "%d EEs in done list found!\n", rr);
+ drbd_err(device, "%d EEs in done list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->net_ee);
+ rr = drbd_free_peer_reqs(device, &device->net_ee);
if (rr)
- dev_err(DEV, "%d EEs in net list found!\n", rr);
+ drbd_err(device, "%d EEs in net list found!\n", rr);
}
-/* caution. no locking.
- * currently only used from module cleanup code. */
-static void drbd_delete_device(unsigned int minor)
+/* caution. no locking. */
+void drbd_destroy_device(struct kref *kref)
{
- struct drbd_conf *mdev = minor_to_mdev(minor);
+ struct drbd_device *device = container_of(kref, struct drbd_device, kref);
+ struct drbd_resource *resource = device->resource;
+ struct drbd_connection *connection;
- if (!mdev)
- return;
+ del_timer_sync(&device->request_timer);
/* paranoia asserts */
- if (mdev->open_cnt != 0)
- dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
- __FILE__ , __LINE__);
-
- ERR_IF (!list_empty(&mdev->data.work.q)) {
- struct list_head *lp;
- list_for_each(lp, &mdev->data.work.q) {
- dev_err(DEV, "lp = %p\n", lp);
- }
- };
+ D_ASSERT(device, device->open_cnt == 0);
/* end paranoia asserts */
- del_gendisk(mdev->vdisk);
-
/* cleanup stuff that may have been allocated during
* device (re-)configuration or state changes */
- if (mdev->this_bdev)
- bdput(mdev->this_bdev);
+ if (device->this_bdev)
+ bdput(device->this_bdev);
- drbd_free_resources(mdev);
+ drbd_free_bc(device->ldev);
+ device->ldev = NULL;
- drbd_release_ee_lists(mdev);
+ drbd_release_all_peer_reqs(device);
- /* should be free'd on disconnect? */
- kfree(mdev->ee_hash);
- /*
- mdev->ee_hash_s = 0;
- mdev->ee_hash = NULL;
- */
+ lc_destroy(device->act_log);
+ lc_destroy(device->resync);
- lc_destroy(mdev->act_log);
- lc_destroy(mdev->resync);
+ kfree(device->p_uuid);
+ /* device->p_uuid = NULL; */
- kfree(mdev->p_uuid);
- /* mdev->p_uuid = NULL; */
+ if (device->bitmap) /* should no longer be there. */
+ drbd_bm_cleanup(device);
+ __free_page(device->md_io_page);
+ put_disk(device->vdisk);
+ blk_cleanup_queue(device->rq_queue);
+ kfree(device->rs_plan_s);
+ kfree(first_peer_device(device));
+ kfree(device);
- kfree(mdev->int_dig_out);
- kfree(mdev->int_dig_in);
- kfree(mdev->int_dig_vv);
+ for_each_connection(connection, resource)
+ kref_put(&connection->kref, drbd_destroy_connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
+}
- /* cleanup the rest that has been
- * allocated from drbd_new_device
- * and actually free the mdev itself */
- drbd_free_mdev(mdev);
+/* One global retry thread, if we need to push back some bio and have it
+ * reinserted through our make request function.
+ */
+static struct retry_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head writes;
+} retry;
+
+static void do_retry(struct work_struct *ws)
+{
+ struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
+ LIST_HEAD(writes);
+ struct drbd_request *req, *tmp;
+
+ spin_lock_irq(&retry->lock);
+ list_splice_init(&retry->writes, &writes);
+ spin_unlock_irq(&retry->lock);
+
+ list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
+ struct drbd_device *device = req->device;
+ struct bio *bio = req->master_bio;
+ unsigned long start_time = req->start_time;
+ bool expected;
+
+ expected =
+ expect(atomic_read(&req->completion_ref) == 0) &&
+ expect(req->rq_state & RQ_POSTPONED) &&
+ expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
+ (req->rq_state & RQ_LOCAL_ABORTED) != 0);
+
+ if (!expected)
+ drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n",
+ req, atomic_read(&req->completion_ref),
+ req->rq_state);
+
+ /* We still need to put one kref associated with the
+ * "completion_ref" going zero in the code path that queued it
+ * here. The request object may still be referenced by a
+ * frozen local req->private_bio, in case we force-detached.
+ */
+ kref_put(&req->kref, drbd_req_destroy);
+
+ /* A single suspended or otherwise blocking device may stall
+ * all others as well. Fortunately, this code path is to
+ * recover from a situation that "should not happen":
+ * concurrent writes in multi-primary setup.
+ * In a "normal" lifecycle, this workqueue is supposed to be
+ * destroyed without ever doing anything.
+ * If it turns out to be an issue anyways, we can do per
+ * resource (replication group) or per device (minor) retry
+ * workqueues instead.
+ */
+
+ /* We are not just doing generic_make_request(),
+ * as we want to keep the start_time information. */
+ inc_ap_bio(device);
+ __drbd_make_request(device, bio, start_time);
+ }
+}
+
+void drbd_restart_request(struct drbd_request *req)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&retry.lock, flags);
+ list_move_tail(&req->tl_requests, &retry.writes);
+ spin_unlock_irqrestore(&retry.lock, flags);
+
+ /* Drop the extra reference that would otherwise
+ * have been dropped by complete_master_bio.
+ * do_retry() needs to grab a new one. */
+ dec_ap_bio(req->device);
+
+ queue_work(retry.wq, &retry.worker);
+}
+
+void drbd_destroy_resource(struct kref *kref)
+{
+ struct drbd_resource *resource =
+ container_of(kref, struct drbd_resource, kref);
+
+ idr_destroy(&resource->devices);
+ free_cpumask_var(resource->cpu_mask);
+ kfree(resource->name);
+ kfree(resource);
+}
+
+void drbd_free_resource(struct drbd_resource *resource)
+{
+ struct drbd_connection *connection, *tmp;
+
+ for_each_connection_safe(connection, tmp, resource) {
+ list_del(&connection->connections);
+ kref_put(&connection->kref, drbd_destroy_connection);
+ }
+ kref_put(&resource->kref, drbd_destroy_resource);
}
static void drbd_cleanup(void)
{
unsigned int i;
+ struct drbd_device *device;
+ struct drbd_resource *resource, *tmp;
unregister_reboot_notifier(&drbd_notifier);
- drbd_nl_cleanup();
+ /* first remove proc,
+ * drbdsetup uses it's presence to detect
+ * whether DRBD is loaded.
+ * If we would get stuck in proc removal,
+ * but have netlink already deregistered,
+ * some drbdsetup commands may wait forever
+ * for an answer.
+ */
+ if (drbd_proc)
+ remove_proc_entry("drbd", NULL);
- if (minor_table) {
- if (drbd_proc)
- remove_proc_entry("drbd", NULL);
- i = minor_count;
- while (i--)
- drbd_delete_device(i);
- drbd_destroy_mempools();
- }
+ if (retry.wq)
+ destroy_workqueue(retry.wq);
+
+ drbd_genl_unregister();
+
+ idr_for_each_entry(&drbd_devices, device, i)
+ drbd_delete_device(device);
- kfree(minor_table);
+ /* not _rcu since, no other updater anymore. Genl already unregistered */
+ for_each_resource_safe(resource, tmp, &drbd_resources) {
+ list_del(&resource->resources);
+ drbd_free_resource(resource);
+ }
+ drbd_destroy_mempools();
unregister_blkdev(DRBD_MAJOR, "drbd");
+ idr_destroy(&drbd_devices);
+
printk(KERN_INFO "drbd: module cleanup done.\n");
}
/**
- * drbd_congested() - Callback for pdflush
+ * drbd_congested() - Callback for the flusher thread
* @congested_data: User data
- * @bdi_bits: Bits pdflush is currently interested in
+ * @bdi_bits: Bits the BDI flusher thread is currently interested in
*
* Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
*/
static int drbd_congested(void *congested_data, int bdi_bits)
{
- struct drbd_conf *mdev = congested_data;
+ struct drbd_device *device = congested_data;
struct request_queue *q;
char reason = '-';
int r = 0;
- if (!__inc_ap_bio_cond(mdev)) {
+ if (!may_inc_ap_bio(device)) {
/* DRBD has frozen IO */
r = bdi_bits;
reason = 'd';
goto out;
}
- if (get_ldev(mdev)) {
- q = bdev_get_queue(mdev->ldev->backing_bdev);
+ if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
+ r |= (1 << BDI_async_congested);
+ /* Without good local data, we would need to read from remote,
+ * and that would need the worker thread as well, which is
+ * currently blocked waiting for that usermode helper to
+ * finish.
+ */
+ if (!get_ldev_if_state(device, D_UP_TO_DATE))
+ r |= (1 << BDI_sync_congested);
+ else
+ put_ldev(device);
+ r &= bdi_bits;
+ reason = 'c';
+ goto out;
+ }
+
+ if (get_ldev(device)) {
+ q = bdev_get_queue(device->ldev->backing_bdev);
r = bdi_congested(&q->backing_dev_info, bdi_bits);
- put_ldev(mdev);
+ put_ldev(device);
if (r)
reason = 'b';
}
- if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
+ if (bdi_bits & (1 << BDI_async_congested) &&
+ test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
r |= (1 << BDI_async_congested);
reason = reason == 'b' ? 'a' : 'n';
}
out:
- mdev->congestion_reason = reason;
+ device->congestion_reason = reason;
return r;
}
-struct drbd_conf *drbd_new_device(unsigned int minor)
+static void drbd_init_workqueue(struct drbd_work_queue* wq)
{
- struct drbd_conf *mdev;
+ spin_lock_init(&wq->q_lock);
+ INIT_LIST_HEAD(&wq->q);
+ init_waitqueue_head(&wq->q_wait);
+}
+
+struct completion_work {
+ struct drbd_work w;
+ struct completion done;
+};
+
+static int w_complete(struct drbd_work *w, int cancel)
+{
+ struct completion_work *completion_work =
+ container_of(w, struct completion_work, w);
+
+ complete(&completion_work->done);
+ return 0;
+}
+
+void drbd_flush_workqueue(struct drbd_work_queue *work_queue)
+{
+ struct completion_work completion_work;
+
+ completion_work.w.cb = w_complete;
+ init_completion(&completion_work.done);
+ drbd_queue_work(work_queue, &completion_work.w);
+ wait_for_completion(&completion_work.done);
+}
+
+struct drbd_resource *drbd_find_resource(const char *name)
+{
+ struct drbd_resource *resource;
+
+ if (!name || !name[0])
+ return NULL;
+
+ rcu_read_lock();
+ for_each_resource_rcu(resource, &drbd_resources) {
+ if (!strcmp(resource->name, name)) {
+ kref_get(&resource->kref);
+ goto found;
+ }
+ }
+ resource = NULL;
+found:
+ rcu_read_unlock();
+ return resource;
+}
+
+struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len)
+{
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+
+ rcu_read_lock();
+ for_each_resource_rcu(resource, &drbd_resources) {
+ for_each_connection_rcu(connection, resource) {
+ if (connection->my_addr_len == my_addr_len &&
+ connection->peer_addr_len == peer_addr_len &&
+ !memcmp(&connection->my_addr, my_addr, my_addr_len) &&
+ !memcmp(&connection->peer_addr, peer_addr, peer_addr_len)) {
+ kref_get(&connection->kref);
+ goto found;
+ }
+ }
+ }
+ connection = NULL;
+found:
+ rcu_read_unlock();
+ return connection;
+}
+
+static int drbd_alloc_socket(struct drbd_socket *socket)
+{
+ socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->rbuf)
+ return -ENOMEM;
+ socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->sbuf)
+ return -ENOMEM;
+ return 0;
+}
+
+static void drbd_free_socket(struct drbd_socket *socket)
+{
+ free_page((unsigned long) socket->sbuf);
+ free_page((unsigned long) socket->rbuf);
+}
+
+void conn_free_crypto(struct drbd_connection *connection)
+{
+ drbd_free_sock(connection);
+
+ crypto_free_hash(connection->csums_tfm);
+ crypto_free_hash(connection->verify_tfm);
+ crypto_free_hash(connection->cram_hmac_tfm);
+ crypto_free_hash(connection->integrity_tfm);
+ crypto_free_hash(connection->peer_integrity_tfm);
+ kfree(connection->int_dig_in);
+ kfree(connection->int_dig_vv);
+
+ connection->csums_tfm = NULL;
+ connection->verify_tfm = NULL;
+ connection->cram_hmac_tfm = NULL;
+ connection->integrity_tfm = NULL;
+ connection->peer_integrity_tfm = NULL;
+ connection->int_dig_in = NULL;
+ connection->int_dig_vv = NULL;
+}
+
+int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts)
+{
+ struct drbd_connection *connection;
+ cpumask_var_t new_cpu_mask;
+ int err;
+
+ if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+ /*
+ retcode = ERR_NOMEM;
+ drbd_msg_put_info("unable to allocate cpumask");
+ */
+
+ /* silently ignore cpu mask on UP kernel */
+ if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
+ err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
+ cpumask_bits(new_cpu_mask), nr_cpu_ids);
+ if (err) {
+ drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
+ /* retcode = ERR_CPU_MASK_PARSE; */
+ goto fail;
+ }
+ }
+ resource->res_opts = *res_opts;
+ if (cpumask_empty(new_cpu_mask))
+ drbd_calc_cpu_mask(&new_cpu_mask);
+ if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) {
+ cpumask_copy(resource->cpu_mask, new_cpu_mask);
+ for_each_connection_rcu(connection, resource) {
+ connection->receiver.reset_cpu_mask = 1;
+ connection->asender.reset_cpu_mask = 1;
+ connection->worker.reset_cpu_mask = 1;
+ }
+ }
+ err = 0;
+
+fail:
+ free_cpumask_var(new_cpu_mask);
+ return err;
+
+}
+
+struct drbd_resource *drbd_create_resource(const char *name)
+{
+ struct drbd_resource *resource;
+
+ resource = kzalloc(sizeof(struct drbd_resource), GFP_KERNEL);
+ if (!resource)
+ goto fail;
+ resource->name = kstrdup(name, GFP_KERNEL);
+ if (!resource->name)
+ goto fail_free_resource;
+ if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL))
+ goto fail_free_name;
+ kref_init(&resource->kref);
+ idr_init(&resource->devices);
+ INIT_LIST_HEAD(&resource->connections);
+ list_add_tail_rcu(&resource->resources, &drbd_resources);
+ mutex_init(&resource->conf_update);
+ mutex_init(&resource->adm_mutex);
+ spin_lock_init(&resource->req_lock);
+ return resource;
+
+fail_free_name:
+ kfree(resource->name);
+fail_free_resource:
+ kfree(resource);
+fail:
+ return NULL;
+}
+
+/* caller must be under genl_lock() */
+struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
+{
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+
+ connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL);
+ if (!connection)
+ return NULL;
+
+ if (drbd_alloc_socket(&connection->data))
+ goto fail;
+ if (drbd_alloc_socket(&connection->meta))
+ goto fail;
+
+ connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+ if (!connection->current_epoch)
+ goto fail;
+
+ INIT_LIST_HEAD(&connection->transfer_log);
+
+ INIT_LIST_HEAD(&connection->current_epoch->list);
+ connection->epochs = 1;
+ spin_lock_init(&connection->epoch_lock);
+ connection->write_ordering = WO_bdev_flush;
+
+ connection->send.seen_any_write_yet = false;
+ connection->send.current_epoch_nr = 0;
+ connection->send.current_epoch_writes = 0;
+
+ resource = drbd_create_resource(name);
+ if (!resource)
+ goto fail;
+
+ connection->cstate = C_STANDALONE;
+ mutex_init(&connection->cstate_mutex);
+ init_waitqueue_head(&connection->ping_wait);
+ idr_init(&connection->peer_devices);
+
+ drbd_init_workqueue(&connection->sender_work);
+ mutex_init(&connection->data.mutex);
+ mutex_init(&connection->meta.mutex);
+
+ drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver");
+ connection->receiver.connection = connection;
+ drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
+ connection->worker.connection = connection;
+ drbd_thread_init(resource, &connection->asender, drbd_asender, "asender");
+ connection->asender.connection = connection;
+
+ kref_init(&connection->kref);
+
+ connection->resource = resource;
+
+ if (set_resource_options(resource, res_opts))
+ goto fail_resource;
+
+ kref_get(&resource->kref);
+ list_add_tail_rcu(&connection->connections, &resource->connections);
+ return connection;
+
+fail_resource:
+ list_del(&resource->resources);
+ drbd_free_resource(resource);
+fail:
+ kfree(connection->current_epoch);
+ drbd_free_socket(&connection->meta);
+ drbd_free_socket(&connection->data);
+ kfree(connection);
+ return NULL;
+}
+
+void drbd_destroy_connection(struct kref *kref)
+{
+ struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref);
+ struct drbd_resource *resource = connection->resource;
+
+ if (atomic_read(&connection->current_epoch->epoch_size) != 0)
+ drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size));
+ kfree(connection->current_epoch);
+
+ idr_destroy(&connection->peer_devices);
+
+ drbd_free_socket(&connection->meta);
+ drbd_free_socket(&connection->data);
+ kfree(connection->int_dig_in);
+ kfree(connection->int_dig_vv);
+ kfree(connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+static int init_submitter(struct drbd_device *device)
+{
+ /* opencoded create_singlethread_workqueue(),
+ * to be able to say "drbd%d", ..., minor */
+ device->submit.wq = alloc_workqueue("drbd%u_submit",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor);
+ if (!device->submit.wq)
+ return -ENOMEM;
+
+ INIT_WORK(&device->submit.worker, do_submit);
+ spin_lock_init(&device->submit.lock);
+ INIT_LIST_HEAD(&device->submit.writes);
+ return 0;
+}
+
+enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
+{
+ struct drbd_resource *resource = adm_ctx->resource;
+ struct drbd_connection *connection;
+ struct drbd_device *device;
+ struct drbd_peer_device *peer_device, *tmp_peer_device;
struct gendisk *disk;
struct request_queue *q;
+ int id;
+ int vnr = adm_ctx->volume;
+ enum drbd_ret_code err = ERR_NOMEM;
+
+ device = minor_to_device(minor);
+ if (device)
+ return ERR_MINOR_EXISTS;
/* GFP_KERNEL, we are outside of all write-out paths */
- mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
- if (!mdev)
- return NULL;
- if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
- goto out_no_cpumask;
+ device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL);
+ if (!device)
+ return ERR_NOMEM;
+ kref_init(&device->kref);
- mdev->minor = minor;
+ kref_get(&resource->kref);
+ device->resource = resource;
+ device->minor = minor;
+ device->vnr = vnr;
- drbd_init_set_defaults(mdev);
+ drbd_init_set_defaults(device);
q = blk_alloc_queue(GFP_KERNEL);
if (!q)
goto out_no_q;
- mdev->rq_queue = q;
- q->queuedata = mdev;
+ device->rq_queue = q;
+ q->queuedata = device;
disk = alloc_disk(1);
if (!disk)
goto out_no_disk;
- mdev->vdisk = disk;
+ device->vdisk = disk;
- set_disk_ro(disk, TRUE);
+ set_disk_ro(disk, true);
disk->queue = q;
disk->major = DRBD_MAJOR;
disk->first_minor = minor;
disk->fops = &drbd_ops;
sprintf(disk->disk_name, "drbd%d", minor);
- disk->private_data = mdev;
+ disk->private_data = device;
- mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
+ device->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
/* we have no partitions. we contain only ourselves. */
- mdev->this_bdev->bd_contains = mdev->this_bdev;
+ device->this_bdev->bd_contains = device->this_bdev;
q->backing_dev_info.congested_fn = drbd_congested;
- q->backing_dev_info.congested_data = mdev;
+ q->backing_dev_info.congested_data = device;
- blk_queue_make_request(q, drbd_make_request_26);
- blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
+ blk_queue_make_request(q, drbd_make_request);
+ blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+ /* Setting the max_hw_sectors to an odd value of 8kibyte here
+ This triggers a max_bio_size message upon first attach or connect */
+ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
blk_queue_merge_bvec(q, drbd_merge_bvec);
- q->queue_lock = &mdev->req_lock; /* needed since we use */
- /* plugging on a queue, that actually has no requests! */
- q->unplug_fn = drbd_unplug_fn;
+ q->queue_lock = &resource->req_lock;
- mdev->md_io_page = alloc_page(GFP_KERNEL);
- if (!mdev->md_io_page)
+ device->md_io_page = alloc_page(GFP_KERNEL);
+ if (!device->md_io_page)
goto out_no_io_page;
- if (drbd_bm_init(mdev))
+ if (drbd_bm_init(device))
goto out_no_bitmap;
- /* no need to lock access, we are still initializing this minor device. */
- if (!tl_init(mdev))
- goto out_no_tl;
-
- mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
- if (!mdev->app_reads_hash)
- goto out_no_app_reads;
-
- mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
- if (!mdev->current_epoch)
- goto out_no_epoch;
-
- INIT_LIST_HEAD(&mdev->current_epoch->list);
- mdev->epochs = 1;
-
- return mdev;
-
-/* out_whatever_else:
- kfree(mdev->current_epoch); */
-out_no_epoch:
- kfree(mdev->app_reads_hash);
-out_no_app_reads:
- tl_cleanup(mdev);
-out_no_tl:
- drbd_bm_cleanup(mdev);
+ device->read_requests = RB_ROOT;
+ device->write_requests = RB_ROOT;
+
+ id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL);
+ if (id < 0) {
+ if (id == -ENOSPC) {
+ err = ERR_MINOR_EXISTS;
+ drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
+ }
+ goto out_no_minor_idr;
+ }
+ kref_get(&device->kref);
+
+ id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL);
+ if (id < 0) {
+ if (id == -ENOSPC) {
+ err = ERR_MINOR_EXISTS;
+ drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
+ }
+ goto out_idr_remove_minor;
+ }
+ kref_get(&device->kref);
+
+ INIT_LIST_HEAD(&device->peer_devices);
+ for_each_connection(connection, resource) {
+ peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
+ if (!peer_device)
+ goto out_idr_remove_from_resource;
+ peer_device->connection = connection;
+ peer_device->device = device;
+
+ list_add(&peer_device->peer_devices, &device->peer_devices);
+ kref_get(&device->kref);
+
+ id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL);
+ if (id < 0) {
+ if (id == -ENOSPC) {
+ err = ERR_INVALID_REQUEST;
+ drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already");
+ }
+ goto out_idr_remove_from_resource;
+ }
+ kref_get(&connection->kref);
+ }
+
+ if (init_submitter(device)) {
+ err = ERR_NOMEM;
+ drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue");
+ goto out_idr_remove_vol;
+ }
+
+ add_disk(disk);
+
+ /* inherit the connection state */
+ device->state.conn = first_connection(resource)->cstate;
+ if (device->state.conn == C_WF_REPORT_PARAMS) {
+ for_each_peer_device(peer_device, device)
+ drbd_connected(peer_device);
+ }
+
+ return NO_ERROR;
+
+out_idr_remove_vol:
+ idr_remove(&connection->peer_devices, vnr);
+out_idr_remove_from_resource:
+ for_each_connection(connection, resource) {
+ peer_device = idr_find(&connection->peer_devices, vnr);
+ if (peer_device) {
+ idr_remove(&connection->peer_devices, vnr);
+ kref_put(&connection->kref, drbd_destroy_connection);
+ }
+ }
+ for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+ list_del(&peer_device->peer_devices);
+ kfree(peer_device);
+ }
+ idr_remove(&resource->devices, vnr);
+out_idr_remove_minor:
+ idr_remove(&drbd_devices, minor);
+ synchronize_rcu();
+out_no_minor_idr:
+ drbd_bm_cleanup(device);
out_no_bitmap:
- __free_page(mdev->md_io_page);
+ __free_page(device->md_io_page);
out_no_io_page:
put_disk(disk);
out_no_disk:
blk_cleanup_queue(q);
out_no_q:
- free_cpumask_var(mdev->cpu_mask);
-out_no_cpumask:
- kfree(mdev);
- return NULL;
+ kref_put(&resource->kref, drbd_destroy_resource);
+ kfree(device);
+ return err;
}
-/* counterpart of drbd_new_device.
- * last part of drbd_delete_device. */
-void drbd_free_mdev(struct drbd_conf *mdev)
+void drbd_delete_device(struct drbd_device *device)
{
- kfree(mdev->current_epoch);
- kfree(mdev->app_reads_hash);
- tl_cleanup(mdev);
- if (mdev->bitmap) /* should no longer be there. */
- drbd_bm_cleanup(mdev);
- __free_page(mdev->md_io_page);
- put_disk(mdev->vdisk);
- blk_cleanup_queue(mdev->rq_queue);
- free_cpumask_var(mdev->cpu_mask);
- kfree(mdev);
-}
+ struct drbd_resource *resource = device->resource;
+ struct drbd_connection *connection;
+ int refs = 3;
+ for_each_connection(connection, resource) {
+ idr_remove(&connection->peer_devices, device->vnr);
+ refs++;
+ }
+ idr_remove(&resource->devices, device->vnr);
+ idr_remove(&drbd_devices, device_to_minor(device));
+ del_gendisk(device->vdisk);
+ synchronize_rcu();
+ kref_sub(&device->kref, refs, drbd_destroy_device);
+}
int __init drbd_init(void)
{
int err;
- if (sizeof(struct p_handshake) != 80) {
+ if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
printk(KERN_ERR
- "drbd: never change the size or layout "
- "of the HandShake packet.\n");
- return -EINVAL;
- }
-
- if (1 > minor_count || minor_count > 255) {
- printk(KERN_ERR
- "drbd: invalid minor_count (%d)\n", minor_count);
+ "drbd: invalid minor_count (%d)\n", minor_count);
#ifdef MODULE
return -EINVAL;
#else
- minor_count = 8;
+ minor_count = DRBD_MINOR_COUNT_DEF;
#endif
}
- err = drbd_nl_init();
- if (err)
- return err;
-
err = register_blkdev(DRBD_MAJOR, "drbd");
if (err) {
printk(KERN_ERR
@@ -3325,27 +2908,39 @@ int __init drbd_init(void)
/*
* allocate all necessary structs
*/
- err = -ENOMEM;
-
init_waitqueue_head(&drbd_pp_wait);
drbd_proc = NULL; /* play safe for drbd_cleanup */
- minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
- GFP_KERNEL);
- if (!minor_table)
- goto Enomem;
+ idr_init(&drbd_devices);
+
+ rwlock_init(&global_state_lock);
+ INIT_LIST_HEAD(&drbd_resources);
+
+ err = drbd_genl_register();
+ if (err) {
+ printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+ goto fail;
+ }
err = drbd_create_mempools();
if (err)
- goto Enomem;
+ goto fail;
+ err = -ENOMEM;
drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
if (!drbd_proc) {
printk(KERN_ERR "drbd: unable to register proc file\n");
- goto Enomem;
+ goto fail;
}
- rwlock_init(&global_state_lock);
+ retry.wq = create_singlethread_workqueue("drbd-reissue");
+ if (!retry.wq) {
+ printk(KERN_ERR "drbd: unable to create retry workqueue\n");
+ goto fail;
+ }
+ INIT_WORK(&retry.worker, do_retry);
+ spin_lock_init(&retry.lock);
+ INIT_LIST_HEAD(&retry.writes);
printk(KERN_INFO "drbd: initialized. "
"Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
@@ -3353,14 +2948,12 @@ int __init drbd_init(void)
printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
printk(KERN_INFO "drbd: registered as block device major %d\n",
DRBD_MAJOR);
- printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
return 0; /* Success! */
-Enomem:
+fail:
drbd_cleanup();
if (err == -ENOMEM)
- /* currently always the case */
printk(KERN_ERR "drbd: ran out of memory\n");
else
printk(KERN_ERR "drbd: initialization failure\n");
@@ -3372,58 +2965,54 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
if (ldev == NULL)
return;
- bd_release(ldev->backing_bdev);
- bd_release(ldev->md_bdev);
-
- fput(ldev->lo_file);
- fput(ldev->md_file);
+ blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ kfree(ldev->disk_conf);
kfree(ldev);
}
-void drbd_free_sock(struct drbd_conf *mdev)
+void drbd_free_sock(struct drbd_connection *connection)
{
- if (mdev->data.socket) {
- mutex_lock(&mdev->data.mutex);
- kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
- sock_release(mdev->data.socket);
- mdev->data.socket = NULL;
- mutex_unlock(&mdev->data.mutex);
+ if (connection->data.socket) {
+ mutex_lock(&connection->data.mutex);
+ kernel_sock_shutdown(connection->data.socket, SHUT_RDWR);
+ sock_release(connection->data.socket);
+ connection->data.socket = NULL;
+ mutex_unlock(&connection->data.mutex);
}
- if (mdev->meta.socket) {
- mutex_lock(&mdev->meta.mutex);
- kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
- sock_release(mdev->meta.socket);
- mdev->meta.socket = NULL;
- mutex_unlock(&mdev->meta.mutex);
+ if (connection->meta.socket) {
+ mutex_lock(&connection->meta.mutex);
+ kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR);
+ sock_release(connection->meta.socket);
+ connection->meta.socket = NULL;
+ mutex_unlock(&connection->meta.mutex);
}
}
+/* meta data management */
-void drbd_free_resources(struct drbd_conf *mdev)
+void conn_md_sync(struct drbd_connection *connection)
{
- crypto_free_hash(mdev->csums_tfm);
- mdev->csums_tfm = NULL;
- crypto_free_hash(mdev->verify_tfm);
- mdev->verify_tfm = NULL;
- crypto_free_hash(mdev->cram_hmac_tfm);
- mdev->cram_hmac_tfm = NULL;
- crypto_free_hash(mdev->integrity_w_tfm);
- mdev->integrity_w_tfm = NULL;
- crypto_free_hash(mdev->integrity_r_tfm);
- mdev->integrity_r_tfm = NULL;
+ struct drbd_peer_device *peer_device;
+ int vnr;
- drbd_free_sock(mdev);
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
- __no_warn(local,
- drbd_free_bc(mdev->ldev);
- mdev->ldev = NULL;);
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_md_sync(device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
}
-/* meta data management */
-
+/* aligned 4kByte */
struct meta_data_on_disk {
- u64 la_size; /* last agreed size. */
+ u64 la_size_sect; /* last agreed size. */
u64 uuid[UI_SIZE]; /* UUIDs. */
u64 device_uuid;
u64 reserved_u64_1;
@@ -3431,296 +3020,469 @@ struct meta_data_on_disk {
u32 magic;
u32 md_size_sect;
u32 al_offset; /* offset to this block */
- u32 al_nr_extents; /* important for restoring the AL */
- /* `-- act_log->nr_elements <-- sync_conf.al_extents */
+ u32 al_nr_extents; /* important for restoring the AL (userspace) */
+ /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
u32 bm_offset; /* offset to the bitmap, from here */
u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
- u32 reserved_u32[4];
+ u32 la_peer_max_bio_size; /* last peer max_bio_size */
+
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+ u8 reserved_u8[4096 - (7*8 + 10*4)];
} __packed;
+
+
+void drbd_md_write(struct drbd_device *device, void *b)
+{
+ struct meta_data_on_disk *buffer = b;
+ sector_t sector;
+ int i;
+
+ memset(buffer, 0, sizeof(*buffer));
+
+ buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(device->this_bdev));
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
+ buffer->flags = cpu_to_be32(device->ldev->md.flags);
+ buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
+
+ buffer->md_size_sect = cpu_to_be32(device->ldev->md.md_size_sect);
+ buffer->al_offset = cpu_to_be32(device->ldev->md.al_offset);
+ buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements);
+ buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
+ buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid);
+
+ buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset);
+ buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size);
+
+ buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes);
+ buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k);
+
+ D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset);
+ sector = device->ldev->md.md_offset;
+
+ if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+ /* this was a try anyways ... */
+ drbd_err(device, "meta data update failed!\n");
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ }
+}
+
/**
* drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
- * @mdev: DRBD device.
+ * @device: DRBD device.
*/
-void drbd_md_sync(struct drbd_conf *mdev)
+void drbd_md_sync(struct drbd_device *device)
{
struct meta_data_on_disk *buffer;
- sector_t sector;
- int i;
- del_timer(&mdev->md_sync_timer);
+ /* Don't accidentally change the DRBD meta data layout. */
+ BUILD_BUG_ON(UI_SIZE != 4);
+ BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
+ del_timer(&device->md_sync_timer);
/* timer may be rearmed by drbd_md_mark_dirty() now. */
- if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
+ if (!test_and_clear_bit(MD_DIRTY, &device->flags))
return;
/* We use here D_FAILED and not D_ATTACHING because we try to write
* metadata even if we detach due to a disk failure! */
- if (!get_ldev_if_state(mdev, D_FAILED))
+ if (!get_ldev_if_state(device, D_FAILED))
return;
- mutex_lock(&mdev->md_io_mutex);
- buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
- memset(buffer, 0, 512);
+ buffer = drbd_md_get_buffer(device);
+ if (!buffer)
+ goto out;
- buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
- for (i = UI_CURRENT; i < UI_SIZE; i++)
- buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
- buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
- buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
+ drbd_md_write(device, buffer);
- buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
- buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
- buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
- buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
- buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
+ /* Update device->ldev->md.la_size_sect,
+ * since we updated it on metadata. */
+ device->ldev->md.la_size_sect = drbd_get_capacity(device->this_bdev);
- buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
+ drbd_md_put_buffer(device);
+out:
+ put_ldev(device);
+}
- D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
- sector = mdev->ldev->md.md_offset;
+static int check_activity_log_stripe_size(struct drbd_device *device,
+ struct meta_data_on_disk *on_disk,
+ struct drbd_md *in_core)
+{
+ u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
+ u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
+ u64 al_size_4k;
- if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
- /* this was a try anyways ... */
- dev_err(DEV, "meta data update failed!\n");
- drbd_chk_io_error(mdev, 1, TRUE);
+ /* both not set: default to old fixed size activity log */
+ if (al_stripes == 0 && al_stripe_size_4k == 0) {
+ al_stripes = 1;
+ al_stripe_size_4k = MD_32kB_SECT/8;
}
- /* Update mdev->ldev->md.la_size_sect,
- * since we updated it on metadata. */
- mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
+ /* some paranoia plausibility checks */
+
+ /* we need both values to be set */
+ if (al_stripes == 0 || al_stripe_size_4k == 0)
+ goto err;
+
+ al_size_4k = (u64)al_stripes * al_stripe_size_4k;
+
+ /* Upper limit of activity log area, to avoid potential overflow
+ * problems in al_tr_number_to_on_disk_sector(). As right now, more
+ * than 72 * 4k blocks total only increases the amount of history,
+ * limiting this arbitrarily to 16 GB is not a real limitation ;-) */
+ if (al_size_4k > (16 * 1024 * 1024/4))
+ goto err;
+
+ /* Lower limit: we need at least 8 transaction slots (32kB)
+ * to not break existing setups */
+ if (al_size_4k < MD_32kB_SECT/8)
+ goto err;
- mutex_unlock(&mdev->md_io_mutex);
- put_ldev(mdev);
+ in_core->al_stripe_size_4k = al_stripe_size_4k;
+ in_core->al_stripes = al_stripes;
+ in_core->al_size_4k = al_size_4k;
+
+ return 0;
+err:
+ drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
+ al_stripes, al_stripe_size_4k);
+ return -EINVAL;
+}
+
+static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+ sector_t capacity = drbd_get_capacity(bdev->md_bdev);
+ struct drbd_md *in_core = &bdev->md;
+ s32 on_disk_al_sect;
+ s32 on_disk_bm_sect;
+
+ /* The on-disk size of the activity log, calculated from offsets, and
+ * the size of the activity log calculated from the stripe settings,
+ * should match.
+ * Though we could relax this a bit: it is ok, if the striped activity log
+ * fits in the available on-disk activity log size.
+ * Right now, that would break how resize is implemented.
+ * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
+ * of possible unused padding space in the on disk layout. */
+ if (in_core->al_offset < 0) {
+ if (in_core->bm_offset > in_core->al_offset)
+ goto err;
+ on_disk_al_sect = -in_core->al_offset;
+ on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
+ } else {
+ if (in_core->al_offset != MD_4kB_SECT)
+ goto err;
+ if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
+ on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
+ }
+
+ /* old fixed size meta data is exactly that: fixed. */
+ if (in_core->meta_dev_idx >= 0) {
+ if (in_core->md_size_sect != MD_128MB_SECT
+ || in_core->al_offset != MD_4kB_SECT
+ || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
+ || in_core->al_stripes != 1
+ || in_core->al_stripe_size_4k != MD_32kB_SECT/8)
+ goto err;
+ }
+
+ if (capacity < in_core->md_size_sect)
+ goto err;
+ if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
+ goto err;
+
+ /* should be aligned, and at least 32k */
+ if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
+ goto err;
+
+ /* should fit (for now: exactly) into the available on-disk space;
+ * overflow prevention is in check_activity_log_stripe_size() above. */
+ if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ /* again, should be aligned */
+ if (in_core->bm_offset & 7)
+ goto err;
+
+ /* FIXME check for device grow with flex external meta data? */
+
+ /* can the available bitmap space cover the last agreed device size? */
+ if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
+ goto err;
+
+ return 0;
+
+err:
+ drbd_err(device, "meta data offsets don't make sense: idx=%d "
+ "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
+ "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
+ in_core->meta_dev_idx,
+ in_core->al_stripes, in_core->al_stripe_size_4k,
+ in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
+ (unsigned long long)in_core->la_size_sect,
+ (unsigned long long)capacity);
+
+ return -EINVAL;
}
+
/**
* drbd_md_read() - Reads in the meta data super block
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @bdev: Device from which the meta data should be read in.
*
- * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
- * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
+ * Return NO_ERROR on success, and an enum drbd_ret_code in case
+ * something goes wrong.
+ *
+ * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
+ * even before @bdev is assigned to @device->ldev.
*/
-int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
{
struct meta_data_on_disk *buffer;
+ u32 magic, flags;
int i, rv = NO_ERROR;
- if (!get_ldev_if_state(mdev, D_ATTACHING))
- return ERR_IO_MD_DISK;
+ if (device->state.disk != D_DISKLESS)
+ return ERR_DISK_CONFIGURED;
+
+ buffer = drbd_md_get_buffer(device);
+ if (!buffer)
+ return ERR_NOMEM;
- mutex_lock(&mdev->md_io_mutex);
- buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+ /* First, figure out where our meta data superblock is located,
+ * and read it. */
+ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
+ bdev->md.md_offset = drbd_md_ss(bdev);
- if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
- /* NOTE: cant do normal error processing here as this is
+ if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
+ /* NOTE: can't do normal error processing here as this is
called BEFORE disk is attached */
- dev_err(DEV, "Error while reading metadata.\n");
+ drbd_err(device, "Error while reading metadata.\n");
rv = ERR_IO_MD_DISK;
goto err;
}
- if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
- dev_err(DEV, "Error while reading metadata, magic not found.\n");
- rv = ERR_MD_INVALID;
+ magic = be32_to_cpu(buffer->magic);
+ flags = be32_to_cpu(buffer->flags);
+ if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
+ (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
+ /* btw: that's Activity Log clean, not "all" clean. */
+ drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
+ rv = ERR_MD_UNCLEAN;
goto err;
}
- if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
- dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
- be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
- rv = ERR_MD_INVALID;
- goto err;
- }
- if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
- dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
- be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
- rv = ERR_MD_INVALID;
- goto err;
- }
- if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
- dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
- be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
- rv = ERR_MD_INVALID;
+
+ rv = ERR_MD_INVALID;
+ if (magic != DRBD_MD_MAGIC_08) {
+ if (magic == DRBD_MD_MAGIC_07)
+ drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
+ else
+ drbd_err(device, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
goto err;
}
if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
- dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+ drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
- rv = ERR_MD_INVALID;
goto err;
}
- bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
+
+ /* convert to in_core endian */
+ bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
for (i = UI_CURRENT; i < UI_SIZE; i++)
bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
bdev->md.flags = be32_to_cpu(buffer->flags);
- mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
- if (mdev->sync_conf.al_extents < 7)
- mdev->sync_conf.al_extents = 127;
+ bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
+ bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
+ bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
- err:
- mutex_unlock(&mdev->md_io_mutex);
- put_ldev(mdev);
+ if (check_activity_log_stripe_size(device, buffer, &bdev->md))
+ goto err;
+ if (check_offsets_and_sizes(device, bdev))
+ goto err;
- return rv;
-}
+ if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+ drbd_err(device, "unexpected bm_offset: %d (expected %d)\n",
+ be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+ goto err;
+ }
+ if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+ drbd_err(device, "unexpected md_size: %u (expected %u)\n",
+ be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+ goto err;
+ }
-static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
-{
- static char *uuid_str[UI_EXTENDED_SIZE] = {
- [UI_CURRENT] = "CURRENT",
- [UI_BITMAP] = "BITMAP",
- [UI_HISTORY_START] = "HISTORY_START",
- [UI_HISTORY_END] = "HISTORY_END",
- [UI_SIZE] = "SIZE",
- [UI_FLAGS] = "FLAGS",
- };
+ rv = NO_ERROR;
- if (index >= UI_EXTENDED_SIZE) {
- dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
- return;
+ spin_lock_irq(&device->resource->req_lock);
+ if (device->state.conn < C_CONNECTED) {
+ unsigned int peer;
+ peer = be32_to_cpu(buffer->la_peer_max_bio_size);
+ peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
+ device->peer_max_bio_size = peer;
}
+ spin_unlock_irq(&device->resource->req_lock);
- dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
- uuid_str[index],
- (unsigned long long)mdev->ldev->md.uuid[index]);
-}
+ err:
+ drbd_md_put_buffer(device);
+ return rv;
+}
/**
* drbd_md_mark_dirty() - Mark meta data super block as dirty
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Call this function if you change anything that should be written to
* the meta-data super block. This function sets MD_DIRTY, and starts a
* timer that ensures that within five seconds you have to call drbd_md_sync().
*/
#ifdef DEBUG
-void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
+void drbd_md_mark_dirty_(struct drbd_device *device, unsigned int line, const char *func)
{
- if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
- mod_timer(&mdev->md_sync_timer, jiffies + HZ);
- mdev->last_md_mark_dirty.line = line;
- mdev->last_md_mark_dirty.func = func;
+ if (!test_and_set_bit(MD_DIRTY, &device->flags)) {
+ mod_timer(&device->md_sync_timer, jiffies + HZ);
+ device->last_md_mark_dirty.line = line;
+ device->last_md_mark_dirty.func = func;
}
}
#else
-void drbd_md_mark_dirty(struct drbd_conf *mdev)
+void drbd_md_mark_dirty(struct drbd_device *device)
{
- if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
- mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
+ if (!test_and_set_bit(MD_DIRTY, &device->flags))
+ mod_timer(&device->md_sync_timer, jiffies + 5*HZ);
}
#endif
-static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
+void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local)
{
int i;
- for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
- mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
- debug_drbd_uuid(mdev, i+1);
- }
+ for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+ device->ldev->md.uuid[i+1] = device->ldev->md.uuid[i];
}
-void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
{
if (idx == UI_CURRENT) {
- if (mdev->state.role == R_PRIMARY)
+ if (device->state.role == R_PRIMARY)
val |= 1;
else
val &= ~((u64)1);
- drbd_set_ed_uuid(mdev, val);
+ drbd_set_ed_uuid(device, val);
}
- mdev->ldev->md.uuid[idx] = val;
- debug_drbd_uuid(mdev, idx);
- drbd_md_mark_dirty(mdev);
+ device->ldev->md.uuid[idx] = val;
+ drbd_md_mark_dirty(device);
}
+void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+ __drbd_uuid_set(device, idx, val);
+ spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+}
-void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
{
- if (mdev->ldev->md.uuid[idx]) {
- drbd_uuid_move_history(mdev);
- mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
- debug_drbd_uuid(mdev, UI_HISTORY_START);
+ unsigned long flags;
+ spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+ if (device->ldev->md.uuid[idx]) {
+ drbd_uuid_move_history(device);
+ device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx];
}
- _drbd_uuid_set(mdev, idx, val);
+ __drbd_uuid_set(device, idx, val);
+ spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
}
/**
* drbd_uuid_new_current() - Creates a new current UUID
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Creates a new current UUID, and rotates the old current UUID into
* the bitmap slot. Causes an incremental resync upon next connect.
*/
-void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
+void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
{
u64 val;
-
- dev_info(DEV, "Creating new current UUID\n");
- D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
- mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
- debug_drbd_uuid(mdev, UI_BITMAP);
+ unsigned long long bm_uuid;
get_random_bytes(&val, sizeof(u64));
- _drbd_uuid_set(mdev, UI_CURRENT, val);
+
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+
+ if (bm_uuid)
+ drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
+
+ device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT];
+ __drbd_uuid_set(device, UI_CURRENT, val);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ drbd_print_uuids(device, "new current UUID");
/* get it to stable storage _now_ */
- drbd_md_sync(mdev);
+ drbd_md_sync(device);
}
-void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
+void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
{
- if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
+ unsigned long flags;
+ if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
return;
+ spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
if (val == 0) {
- drbd_uuid_move_history(mdev);
- mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
- mdev->ldev->md.uuid[UI_BITMAP] = 0;
- debug_drbd_uuid(mdev, UI_HISTORY_START);
- debug_drbd_uuid(mdev, UI_BITMAP);
+ drbd_uuid_move_history(device);
+ device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+ device->ldev->md.uuid[UI_BITMAP] = 0;
} else {
- if (mdev->ldev->md.uuid[UI_BITMAP])
- dev_warn(DEV, "bm UUID already set");
+ unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+ if (bm_uuid)
+ drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
- mdev->ldev->md.uuid[UI_BITMAP] = val;
- mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
-
- debug_drbd_uuid(mdev, UI_BITMAP);
+ device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
}
- drbd_md_mark_dirty(mdev);
+ spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+
+ drbd_md_mark_dirty(device);
}
/**
* drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Sets all bits in the bitmap and writes the whole bitmap to stable storage.
*/
-int drbd_bmio_set_n_write(struct drbd_conf *mdev)
+int drbd_bmio_set_n_write(struct drbd_device *device)
{
int rv = -EIO;
- if (get_ldev_if_state(mdev, D_ATTACHING)) {
- drbd_md_set_flag(mdev, MDF_FULL_SYNC);
- drbd_md_sync(mdev);
- drbd_bm_set_all(mdev);
+ if (get_ldev_if_state(device, D_ATTACHING)) {
+ drbd_md_set_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ drbd_bm_set_all(device);
- rv = drbd_bm_write(mdev);
+ rv = drbd_bm_write(device);
if (!rv) {
- drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
- drbd_md_sync(mdev);
+ drbd_md_clear_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
}
- put_ldev(mdev);
+ put_ldev(device);
}
return rv;
@@ -3728,85 +3490,111 @@ int drbd_bmio_set_n_write(struct drbd_conf *mdev)
/**
* drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Clears all bits in the bitmap and writes the whole bitmap to stable storage.
*/
-int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
+int drbd_bmio_clear_n_write(struct drbd_device *device)
{
int rv = -EIO;
- drbd_resume_al(mdev);
- if (get_ldev_if_state(mdev, D_ATTACHING)) {
- drbd_bm_clear_all(mdev);
- rv = drbd_bm_write(mdev);
- put_ldev(mdev);
+ drbd_resume_al(device);
+ if (get_ldev_if_state(device, D_ATTACHING)) {
+ drbd_bm_clear_all(device);
+ rv = drbd_bm_write(device);
+ put_ldev(device);
}
return rv;
}
-static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_bitmap_io(struct drbd_work *w, int unused)
{
- struct bm_io_work *work = container_of(w, struct bm_io_work, w);
- int rv;
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, bm_io_work.w);
+ struct bm_io_work *work = &device->bm_io_work;
+ int rv = -EIO;
- D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
+ D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0);
- drbd_bm_lock(mdev, work->why);
- rv = work->io_fn(mdev);
- drbd_bm_unlock(mdev);
+ if (get_ldev(device)) {
+ drbd_bm_lock(device, work->why, work->flags);
+ rv = work->io_fn(device);
+ drbd_bm_unlock(device);
+ put_ldev(device);
+ }
- clear_bit(BITMAP_IO, &mdev->flags);
- wake_up(&mdev->misc_wait);
+ clear_bit_unlock(BITMAP_IO, &device->flags);
+ wake_up(&device->misc_wait);
if (work->done)
- work->done(mdev, rv);
+ work->done(device, rv);
- clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
+ clear_bit(BITMAP_IO_QUEUED, &device->flags);
work->why = NULL;
+ work->flags = 0;
- return 1;
+ return 0;
}
-void drbd_ldev_destroy(struct drbd_conf *mdev)
+void drbd_ldev_destroy(struct drbd_device *device)
{
- lc_destroy(mdev->resync);
- mdev->resync = NULL;
- lc_destroy(mdev->act_log);
- mdev->act_log = NULL;
+ lc_destroy(device->resync);
+ device->resync = NULL;
+ lc_destroy(device->act_log);
+ device->act_log = NULL;
__no_warn(local,
- drbd_free_bc(mdev->ldev);
- mdev->ldev = NULL;);
+ drbd_free_bc(device->ldev);
+ device->ldev = NULL;);
- if (mdev->md_io_tmpp) {
- __free_page(mdev->md_io_tmpp);
- mdev->md_io_tmpp = NULL;
- }
- clear_bit(GO_DISKLESS, &mdev->flags);
+ clear_bit(GO_DISKLESS, &device->flags);
}
-static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_go_diskless(struct drbd_work *w, int unused)
{
- D_ASSERT(mdev->state.disk == D_FAILED);
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, go_diskless);
+
+ D_ASSERT(device, device->state.disk == D_FAILED);
/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
* inc/dec it frequently. Once we are D_DISKLESS, no one will touch
* the protected members anymore, though, so once put_ldev reaches zero
* again, it will be safe to free them. */
- drbd_force_state(mdev, NS(disk, D_DISKLESS));
- return 1;
-}
-void drbd_go_diskless(struct drbd_conf *mdev)
-{
- D_ASSERT(mdev->state.disk == D_FAILED);
- if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
- drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
+ /* Try to write changed bitmap pages, read errors may have just
+ * set some bits outside the area covered by the activity log.
+ *
+ * If we have an IO error during the bitmap writeout,
+ * we will want a full sync next time, just in case.
+ * (Do we want a specific meta data flag for this?)
+ *
+ * If that does not make it to stable storage either,
+ * we cannot do anything about that anymore.
+ *
+ * We still need to check if both bitmap and ldev are present, we may
+ * end up here after a failed attach, before ldev was even assigned.
+ */
+ if (device->bitmap && device->ldev) {
+ /* An interrupted resync or similar is allowed to recounts bits
+ * while we detach.
+ * Any modifications would not be expected anymore, though.
+ */
+ if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+ "detach", BM_LOCKED_TEST_ALLOWED)) {
+ if (test_bit(WAS_READ_ERROR, &device->flags)) {
+ drbd_md_set_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ }
+ }
+ }
+
+ drbd_force_state(device, NS(disk, D_DISKLESS));
+ return 0;
}
/**
* drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @io_fn: IO callback to be called when bitmap IO is possible
* @done: callback to be called after the bitmap IO was performed
* @why: Descriptive text of the reason for doing the IO
@@ -3816,73 +3604,77 @@ void drbd_go_diskless(struct drbd_conf *mdev)
* called from worker context. It MUST NOT be used while a previous such
* work is still pending!
*/
-void drbd_queue_bitmap_io(struct drbd_conf *mdev,
- int (*io_fn)(struct drbd_conf *),
- void (*done)(struct drbd_conf *, int),
- char *why)
-{
- D_ASSERT(current == mdev->worker.task);
-
- D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
- D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
- D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
- if (mdev->bm_io_work.why)
- dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
- why, mdev->bm_io_work.why);
-
- mdev->bm_io_work.io_fn = io_fn;
- mdev->bm_io_work.done = done;
- mdev->bm_io_work.why = why;
-
- set_bit(BITMAP_IO, &mdev->flags);
- if (atomic_read(&mdev->ap_bio_cnt) == 0) {
- if (list_empty(&mdev->bm_io_work.w.list)) {
- set_bit(BITMAP_IO_QUEUED, &mdev->flags);
- drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
- } else
- dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
+void drbd_queue_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ void (*done)(struct drbd_device *, int),
+ char *why, enum bm_flag flags)
+{
+ D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+ D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags));
+ D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags));
+ D_ASSERT(device, list_empty(&device->bm_io_work.w.list));
+ if (device->bm_io_work.why)
+ drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n",
+ why, device->bm_io_work.why);
+
+ device->bm_io_work.io_fn = io_fn;
+ device->bm_io_work.done = done;
+ device->bm_io_work.why = why;
+ device->bm_io_work.flags = flags;
+
+ spin_lock_irq(&device->resource->req_lock);
+ set_bit(BITMAP_IO, &device->flags);
+ if (atomic_read(&device->ap_bio_cnt) == 0) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &device->bm_io_work.w);
}
+ spin_unlock_irq(&device->resource->req_lock);
}
/**
* drbd_bitmap_io() - Does an IO operation on the whole bitmap
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @io_fn: IO callback to be called when bitmap IO is possible
* @why: Descriptive text of the reason for doing the IO
*
* freezes application IO while that the actual IO operations runs. This
* functions MAY NOT be called from worker context.
*/
-int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
+int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags)
{
int rv;
- D_ASSERT(current != mdev->worker.task);
+ D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
- drbd_suspend_io(mdev);
+ if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+ drbd_suspend_io(device);
- drbd_bm_lock(mdev, why);
- rv = io_fn(mdev);
- drbd_bm_unlock(mdev);
+ drbd_bm_lock(device, why, flags);
+ rv = io_fn(device);
+ drbd_bm_unlock(device);
- drbd_resume_io(mdev);
+ if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+ drbd_resume_io(device);
return rv;
}
-void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local)
{
- if ((mdev->ldev->md.flags & flag) != flag) {
- drbd_md_mark_dirty(mdev);
- mdev->ldev->md.flags |= flag;
+ if ((device->ldev->md.flags & flag) != flag) {
+ drbd_md_mark_dirty(device);
+ device->ldev->md.flags |= flag;
}
}
-void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local)
{
- if ((mdev->ldev->md.flags & flag) != 0) {
- drbd_md_mark_dirty(mdev);
- mdev->ldev->md.flags &= ~flag;
+ if ((device->ldev->md.flags & flag) != 0) {
+ drbd_md_mark_dirty(device);
+ device->ldev->md.flags &= ~flag;
}
}
int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
@@ -3892,20 +3684,131 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
static void md_sync_timer_fn(unsigned long data)
{
- struct drbd_conf *mdev = (struct drbd_conf *) data;
+ struct drbd_device *device = (struct drbd_device *) data;
- drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
+ /* must not double-queue! */
+ if (list_empty(&device->md_sync_work.list))
+ drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
+ &device->md_sync_work);
}
-static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_md_sync(struct drbd_work *w, int unused)
{
- dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, md_sync_work);
+
+ drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
- dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
- mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
+ drbd_warn(device, "last md_mark_dirty: %s:%u\n",
+ device->last_md_mark_dirty.func, device->last_md_mark_dirty.line);
#endif
- drbd_md_sync(mdev);
- return 1;
+ drbd_md_sync(device);
+ return 0;
+}
+
+const char *cmdname(enum drbd_packet cmd)
+{
+ /* THINK may need to become several global tables
+ * when we want to support more than
+ * one PRO_VERSION */
+ static const char *cmdnames[] = {
+ [P_DATA] = "Data",
+ [P_DATA_REPLY] = "DataReply",
+ [P_RS_DATA_REPLY] = "RSDataReply",
+ [P_BARRIER] = "Barrier",
+ [P_BITMAP] = "ReportBitMap",
+ [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
+ [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
+ [P_UNPLUG_REMOTE] = "UnplugRemote",
+ [P_DATA_REQUEST] = "DataRequest",
+ [P_RS_DATA_REQUEST] = "RSDataRequest",
+ [P_SYNC_PARAM] = "SyncParam",
+ [P_SYNC_PARAM89] = "SyncParam89",
+ [P_PROTOCOL] = "ReportProtocol",
+ [P_UUIDS] = "ReportUUIDs",
+ [P_SIZES] = "ReportSizes",
+ [P_STATE] = "ReportState",
+ [P_SYNC_UUID] = "ReportSyncUUID",
+ [P_AUTH_CHALLENGE] = "AuthChallenge",
+ [P_AUTH_RESPONSE] = "AuthResponse",
+ [P_PING] = "Ping",
+ [P_PING_ACK] = "PingAck",
+ [P_RECV_ACK] = "RecvAck",
+ [P_WRITE_ACK] = "WriteAck",
+ [P_RS_WRITE_ACK] = "RSWriteAck",
+ [P_SUPERSEDED] = "Superseded",
+ [P_NEG_ACK] = "NegAck",
+ [P_NEG_DREPLY] = "NegDReply",
+ [P_NEG_RS_DREPLY] = "NegRSDReply",
+ [P_BARRIER_ACK] = "BarrierAck",
+ [P_STATE_CHG_REQ] = "StateChgRequest",
+ [P_STATE_CHG_REPLY] = "StateChgReply",
+ [P_OV_REQUEST] = "OVRequest",
+ [P_OV_REPLY] = "OVReply",
+ [P_OV_RESULT] = "OVResult",
+ [P_CSUM_RS_REQUEST] = "CsumRSRequest",
+ [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
+ [P_COMPRESSED_BITMAP] = "CBitmap",
+ [P_DELAY_PROBE] = "DelayProbe",
+ [P_OUT_OF_SYNC] = "OutOfSync",
+ [P_RETRY_WRITE] = "RetryWrite",
+ [P_RS_CANCEL] = "RSCancel",
+ [P_CONN_ST_CHG_REQ] = "conn_st_chg_req",
+ [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
+ [P_RETRY_WRITE] = "retry_write",
+ [P_PROTOCOL_UPDATE] = "protocol_update",
+
+ /* enum drbd_packet, but not commands - obsoleted flags:
+ * P_MAY_IGNORE
+ * P_MAX_OPT_CMD
+ */
+ };
+
+ /* too big for the array: 0xfffX */
+ if (cmd == P_INITIAL_META)
+ return "InitialMeta";
+ if (cmd == P_INITIAL_DATA)
+ return "InitialData";
+ if (cmd == P_CONNECTION_FEATURES)
+ return "ConnectionFeatures";
+ if (cmd >= ARRAY_SIZE(cmdnames))
+ return "Unknown";
+ return cmdnames[cmd];
+}
+
+/**
+ * drbd_wait_misc - wait for a request to make progress
+ * @device: device associated with the request
+ * @i: the struct drbd_interval embedded in struct drbd_request or
+ * struct drbd_peer_request
+ */
+int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
+{
+ struct net_conf *nc;
+ DEFINE_WAIT(wait);
+ long timeout;
+
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -ETIMEDOUT;
+ }
+ timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
+ rcu_read_unlock();
+
+ /* Indicate to wake up device->misc_wait on progress. */
+ i->waiting = true;
+ prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&device->resource->req_lock);
+ timeout = schedule_timeout(timeout);
+ finish_wait(&device->misc_wait, &wait);
+ spin_lock_irq(&device->resource->req_lock);
+ if (!timeout || device->state.conn < C_CONNECTED)
+ return -ETIMEDOUT;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ return 0;
}
#ifdef CONFIG_DRBD_FAULT_INJECTION
@@ -3957,20 +3860,20 @@ _drbd_fault_str(unsigned int type) {
}
unsigned int
-_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
+_drbd_insert_fault(struct drbd_device *device, unsigned int type)
{
static struct fault_random_state rrs = {0, 0};
unsigned int ret = (
(fault_devs == 0 ||
- ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
+ ((1 << device_to_minor(device)) & fault_devs) != 0) &&
(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
if (ret) {
fault_count++;
if (__ratelimit(&drbd_ratelimit_state))
- dev_warn(DEV, "***Simulating %s failure\n",
+ drbd_warn(device, "***Simulating %s failure\n",
_drbd_fault_str(type));
}
@@ -3986,12 +3889,11 @@ const char *drbd_buildtag(void)
static char buildtag[38] = "\0uilt-in";
if (buildtag[0] == 0) {
-#ifdef CONFIG_MODULES
- if (THIS_MODULE != NULL)
- sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
- else
+#ifdef MODULE
+ sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+#else
+ buildtag[0] = 'b';
#endif
- buildtag[0] = 'b';
}
return buildtag;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 29e5c70e4e2..3f2e1673808 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -29,165 +29,339 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
-#include <linux/connector.h>
#include <linux/blkpg.h>
#include <linux/cpumask.h>
#include "drbd_int.h"
+#include "drbd_protocol.h"
#include "drbd_req.h"
-#include "drbd_wrappers.h"
#include <asm/unaligned.h>
-#include <linux/drbd_tag_magic.h>
#include <linux/drbd_limits.h>
-#include <linux/compiler.h>
#include <linux/kthread.h>
-static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
-static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
-static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
-
-/* see get_sb_bdev and bd_claim */
+#include <net/genetlink.h>
+
+/* .doit */
+// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+/* .dumpit */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+#include <linux/genl_magic_func.h>
+
+/* used blkdev_get_by_path, to claim our meta data device(s) */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
-/* Generate the tag_list to struct functions */
-#define NL_PACKET(name, number, fields) \
-static int name ## _from_tags(struct drbd_conf *mdev, \
- unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
-static int name ## _from_tags(struct drbd_conf *mdev, \
- unsigned short *tags, struct name *arg) \
-{ \
- int tag; \
- int dlen; \
- \
- while ((tag = get_unaligned(tags++)) != TT_END) { \
- dlen = get_unaligned(tags++); \
- switch (tag_number(tag)) { \
- fields \
- default: \
- if (tag & T_MANDATORY) { \
- dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
- return 0; \
- } \
- } \
- tags = (unsigned short *)((char *)tags + dlen); \
- } \
- return 1; \
-}
-#define NL_INTEGER(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
- arg->member = get_unaligned((int *)(tags)); \
- break;
-#define NL_INT64(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
- arg->member = get_unaligned((u64 *)(tags)); \
+static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+{
+ genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+ if (genlmsg_reply(skb, info))
+ printk(KERN_ERR "drbd: error sending genl reply\n");
+}
+
+/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
+ * reason it could fail was no space in skb, and there are 4k available. */
+int drbd_msg_put_info(struct sk_buff *skb, const char *info)
+{
+ struct nlattr *nla;
+ int err = -EMSGSIZE;
+
+ if (!info || !info[0])
+ return 0;
+
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
+ if (!nla)
+ return err;
+
+ err = nla_put_string(skb, T_info_text, info);
+ if (err) {
+ nla_nest_cancel(skb, nla);
+ return err;
+ } else
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+/* This would be a good candidate for a "pre_doit" hook,
+ * and per-family private info->pointers.
+ * But we need to stay compatible with older kernels.
+ * If it returns successfully, adm_ctx members are valid.
+ *
+ * At this point, we still rely on the global genl_lock().
+ * If we want to avoid that, and allow "genl_family.parallel_ops", we may need
+ * to add additional synchronization against object destruction/modification.
+ */
+#define DRBD_ADM_NEED_MINOR 1
+#define DRBD_ADM_NEED_RESOURCE 2
+#define DRBD_ADM_NEED_CONNECTION 4
+static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
+ struct sk_buff *skb, struct genl_info *info, unsigned flags)
+{
+ struct drbd_genlmsghdr *d_in = info->userhdr;
+ const u8 cmd = info->genlhdr->cmd;
+ int err;
+
+ memset(adm_ctx, 0, sizeof(*adm_ctx));
+
+ /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
+ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!adm_ctx->reply_skb) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb,
+ info, &drbd_genl_family, 0, cmd);
+ /* put of a few bytes into a fresh skb of >= 4k will always succeed.
+ * but anyways */
+ if (!adm_ctx->reply_dh) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx->reply_dh->minor = d_in->minor;
+ adm_ctx->reply_dh->ret_code = NO_ERROR;
+
+ adm_ctx->volume = VOLUME_UNSPECIFIED;
+ if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+ struct nlattr *nla;
+ /* parse and validate only */
+ err = drbd_cfg_context_from_attrs(NULL, info);
+ if (err)
+ goto fail;
+
+ /* It was present, and valid,
+ * copy it over to the reply skb. */
+ err = nla_put_nohdr(adm_ctx->reply_skb,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]);
+ if (err)
+ goto fail;
+
+ /* and assign stuff to the adm_ctx */
+ nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+ if (nla)
+ adm_ctx->volume = nla_get_u32(nla);
+ nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+ if (nla)
+ adm_ctx->resource_name = nla_data(nla);
+ adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+ adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+ if ((adm_ctx->my_addr &&
+ nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) ||
+ (adm_ctx->peer_addr &&
+ nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) {
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ adm_ctx->minor = d_in->minor;
+ adm_ctx->device = minor_to_device(d_in->minor);
+
+ /* We are protected by the global genl_lock().
+ * But we may explicitly drop it/retake it in drbd_adm_set_role(),
+ * so make sure this object stays around. */
+ if (adm_ctx->device)
+ kref_get(&adm_ctx->device->kref);
+
+ if (adm_ctx->resource_name) {
+ adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name);
+ }
+
+ if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor");
+ return ERR_MINOR_INVALID;
+ }
+ if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource");
+ if (adm_ctx->resource_name)
+ return ERR_RES_NOT_KNOWN;
+ return ERR_INVALID_REQUEST;
+ }
+
+ if (flags & DRBD_ADM_NEED_CONNECTION) {
+ if (adm_ctx->resource) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx->device) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx->my_addr && adm_ctx->peer_addr)
+ adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr),
+ nla_len(adm_ctx->my_addr),
+ nla_data(adm_ctx->peer_addr),
+ nla_len(adm_ctx->peer_addr));
+ if (!adm_ctx->connection) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection");
+ return ERR_INVALID_REQUEST;
+ }
+ }
+
+ /* some more paranoia, if the request was over-determined */
+ if (adm_ctx->device && adm_ctx->resource &&
+ adm_ctx->device->resource != adm_ctx->resource) {
+ pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
+ adm_ctx->minor, adm_ctx->resource->name,
+ adm_ctx->device->resource->name);
+ drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx->device &&
+ adm_ctx->volume != VOLUME_UNSPECIFIED &&
+ adm_ctx->volume != adm_ctx->device->vnr) {
+ pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+ adm_ctx->minor, adm_ctx->volume,
+ adm_ctx->device->vnr,
+ adm_ctx->device->resource->name);
+ drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume");
+ return ERR_INVALID_REQUEST;
+ }
+
+ /* still, provide adm_ctx->resource always, if possible. */
+ if (!adm_ctx->resource) {
+ adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource
+ : adm_ctx->connection ? adm_ctx->connection->resource : NULL;
+ if (adm_ctx->resource)
+ kref_get(&adm_ctx->resource->kref);
+ }
+
+ return NO_ERROR;
+
+fail:
+ nlmsg_free(adm_ctx->reply_skb);
+ adm_ctx->reply_skb = NULL;
+ return err;
+}
+
+static int drbd_adm_finish(struct drbd_config_context *adm_ctx,
+ struct genl_info *info, int retcode)
+{
+ if (adm_ctx->device) {
+ kref_put(&adm_ctx->device->kref, drbd_destroy_device);
+ adm_ctx->device = NULL;
+ }
+ if (adm_ctx->connection) {
+ kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection);
+ adm_ctx->connection = NULL;
+ }
+ if (adm_ctx->resource) {
+ kref_put(&adm_ctx->resource->kref, drbd_destroy_resource);
+ adm_ctx->resource = NULL;
+ }
+
+ if (!adm_ctx->reply_skb)
+ return -ENOMEM;
+
+ adm_ctx->reply_dh->ret_code = retcode;
+ drbd_adm_send_reply(adm_ctx->reply_skb, info);
+ return 0;
+}
+
+static void setup_khelper_env(struct drbd_connection *connection, char **envp)
+{
+ char *afs;
+
+ /* FIXME: A future version will not allow this case. */
+ if (connection->my_addr_len == 0 || connection->peer_addr_len == 0)
+ return;
+
+ switch (((struct sockaddr *)&connection->peer_addr)->sa_family) {
+ case AF_INET6:
+ afs = "ipv6";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+ &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr);
break;
-#define NL_BIT(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
- arg->member = *(char *)(tags) ? 1 : 0; \
+ case AF_INET:
+ afs = "ipv4";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
break;
-#define NL_STRING(pn, pr, member, len) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
- if (dlen > len) { \
- dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
- #member, dlen, (unsigned int)len); \
- return 0; \
- } \
- arg->member ## _len = dlen; \
- memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
- break;
-#include "linux/drbd_nl.h"
-
-/* Generate the struct to tag_list functions */
-#define NL_PACKET(name, number, fields) \
-static unsigned short* \
-name ## _to_tags(struct drbd_conf *mdev, \
- struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
-static unsigned short* \
-name ## _to_tags(struct drbd_conf *mdev, \
- struct name *arg, unsigned short *tags) \
-{ \
- fields \
- return tags; \
-}
-
-#define NL_INTEGER(pn, pr, member) \
- put_unaligned(pn | pr | TT_INTEGER, tags++); \
- put_unaligned(sizeof(int), tags++); \
- put_unaligned(arg->member, (int *)tags); \
- tags = (unsigned short *)((char *)tags+sizeof(int));
-#define NL_INT64(pn, pr, member) \
- put_unaligned(pn | pr | TT_INT64, tags++); \
- put_unaligned(sizeof(u64), tags++); \
- put_unaligned(arg->member, (u64 *)tags); \
- tags = (unsigned short *)((char *)tags+sizeof(u64));
-#define NL_BIT(pn, pr, member) \
- put_unaligned(pn | pr | TT_BIT, tags++); \
- put_unaligned(sizeof(char), tags++); \
- *(char *)tags = arg->member; \
- tags = (unsigned short *)((char *)tags+sizeof(char));
-#define NL_STRING(pn, pr, member, len) \
- put_unaligned(pn | pr | TT_STRING, tags++); \
- put_unaligned(arg->member ## _len, tags++); \
- memcpy(tags, arg->member, arg->member ## _len); \
- tags = (unsigned short *)((char *)tags + arg->member ## _len);
-#include "linux/drbd_nl.h"
-
-void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
-void drbd_nl_send_reply(struct cn_msg *, int);
-
-int drbd_khelper(struct drbd_conf *mdev, char *cmd)
+ default:
+ afs = "ssocks";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
+ }
+ snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+}
+
+int drbd_khelper(struct drbd_device *device, char *cmd)
{
char *envp[] = { "HOME=/",
"TERM=linux",
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
- NULL, /* Will be set to address family */
- NULL, /* Will be set to address */
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
NULL };
-
- char mb[12], af[20], ad[60], *afs;
+ char mb[12];
char *argv[] = {usermode_helper, cmd, mb, NULL };
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct sib_info sib;
int ret;
- snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
+ if (current == connection->worker.task)
+ set_bit(CALLBACK_PENDING, &connection->flags);
- if (get_net_conf(mdev)) {
- switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
- case AF_INET6:
- afs = "ipv6";
- snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
- &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
- break;
- case AF_INET:
- afs = "ipv4";
- snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
- &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
- break;
- default:
- afs = "ssocks";
- snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
- &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
- }
- snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
- envp[3]=af;
- envp[4]=ad;
- put_net_conf(mdev);
- }
+ snprintf(mb, 12, "minor-%d", device_to_minor(device));
+ setup_khelper_env(connection, envp);
/* The helper may take some time.
* write out any unsynced meta data changes now */
- drbd_md_sync(mdev);
-
- dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
+ drbd_md_sync(device);
- drbd_bcast_ev_helper(mdev, cmd);
- ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+ drbd_info(device, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
+ sib.sib_reason = SIB_HELPER_PRE;
+ sib.helper_name = cmd;
+ drbd_bcast_event(device, &sib);
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
if (ret)
- dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+ drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
(ret >> 8) & 0xff, ret);
else
- dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+ drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
(ret >> 8) & 0xff, ret);
+ sib.sib_reason = SIB_HELPER_POST;
+ sib.helper_exit_code = ret;
+ drbd_bcast_event(device, &sib);
+
+ if (current == connection->worker.task)
+ clear_bit(CALLBACK_PENDING, &connection->flags);
if (ret < 0) /* Ignore any ERRNOs we got. */
ret = 0;
@@ -195,322 +369,429 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
return ret;
}
-enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
+static int conn_khelper(struct drbd_connection *connection, char *cmd)
+{
+ char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
+ NULL };
+ char *resource_name = connection->resource->name;
+ char *argv[] = {usermode_helper, cmd, resource_name, NULL };
+ int ret;
+
+ setup_khelper_env(connection, envp);
+ conn_md_sync(connection);
+
+ drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
+ /* TODO: conn_bcast_event() ?? */
+
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+ if (ret)
+ drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, resource_name,
+ (ret >> 8) & 0xff, ret);
+ else
+ drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, resource_name,
+ (ret >> 8) & 0xff, ret);
+ /* TODO: conn_bcast_event() ?? */
+
+ if (ret < 0) /* Ignore any ERRNOs we got. */
+ ret = 0;
+
+ return ret;
+}
+
+static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection)
+{
+ enum drbd_fencing_p fp = FP_NOT_AVAIL;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (get_ldev_if_state(device, D_CONSISTENT)) {
+ struct disk_conf *disk_conf =
+ rcu_dereference(peer_device->device->ldev->disk_conf);
+ fp = max_t(enum drbd_fencing_p, fp, disk_conf->fencing);
+ put_ldev(device);
+ }
+ }
+ rcu_read_unlock();
+
+ if (fp == FP_NOT_AVAIL) {
+ /* IO Suspending works on the whole resource.
+ Do it only for one device. */
+ vnr = 0;
+ peer_device = idr_get_next(&connection->peer_devices, &vnr);
+ drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
+ }
+
+ return fp;
+}
+
+bool conn_try_outdate_peer(struct drbd_connection *connection)
{
+ unsigned int connect_cnt;
+ union drbd_state mask = { };
+ union drbd_state val = { };
+ enum drbd_fencing_p fp;
char *ex_to_string;
int r;
- enum drbd_disk_state nps;
- enum drbd_fencing_p fp;
- D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+ spin_lock_irq(&connection->resource->req_lock);
+ if (connection->cstate >= C_WF_REPORT_PARAMS) {
+ drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
+ spin_unlock_irq(&connection->resource->req_lock);
+ return false;
+ }
- if (get_ldev_if_state(mdev, D_CONSISTENT)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- } else {
- dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
- nps = mdev->state.pdsk;
+ connect_cnt = connection->connect_cnt;
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ fp = highest_fencing_policy(connection);
+ switch (fp) {
+ case FP_NOT_AVAIL:
+ drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
goto out;
+ case FP_DONT_CARE:
+ return true;
+ default: ;
}
- r = drbd_khelper(mdev, "fence-peer");
+ r = conn_khelper(connection, "fence-peer");
switch ((r>>8) & 0xff) {
case 3: /* peer is inconsistent */
ex_to_string = "peer is inconsistent or worse";
- nps = D_INCONSISTENT;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_INCONSISTENT;
break;
case 4: /* peer got outdated, or was already outdated */
ex_to_string = "peer was fenced";
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
break;
case 5: /* peer was down */
- if (mdev->state.disk == D_UP_TO_DATE) {
+ if (conn_highest_disk(connection) == D_UP_TO_DATE) {
/* we will(have) create(d) a new UUID anyways... */
ex_to_string = "peer is unreachable, assumed to be dead";
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
} else {
ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
- nps = mdev->state.pdsk;
}
break;
case 6: /* Peer is primary, voluntarily outdate myself.
* This is useful when an unconnected R_SECONDARY is asked to
* become R_PRIMARY, but finds the other peer being active. */
ex_to_string = "peer is active";
- dev_warn(DEV, "Peer is primary, outdating myself.\n");
- nps = D_UNKNOWN;
- _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
+ drbd_warn(connection, "Peer is primary, outdating myself.\n");
+ mask.disk = D_MASK;
+ val.disk = D_OUTDATED;
break;
case 7:
if (fp != FP_STONITH)
- dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
+ drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
ex_to_string = "peer was stonithed";
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
break;
default:
/* The script is broken ... */
- nps = D_UNKNOWN;
- dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
- return nps;
+ drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+ return false; /* Eventually leave IO frozen */
}
- dev_info(DEV, "fence-peer helper returned %d (%s)\n",
- (r>>8) & 0xff, ex_to_string);
+ drbd_info(connection, "fence-peer helper returned %d (%s)\n",
+ (r>>8) & 0xff, ex_to_string);
-out:
- if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
- /* The handler was not successful... unfreeze here, the
- state engine can not unfreeze... */
- _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
+ out:
+
+ /* Not using
+ conn_request_state(connection, mask, val, CS_VERBOSE);
+ here, because we might were able to re-establish the connection in the
+ meantime. */
+ spin_lock_irq(&connection->resource->req_lock);
+ if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
+ if (connection->connect_cnt != connect_cnt)
+ /* In case the connection was established and droped
+ while the fence-peer handler was running, ignore it */
+ drbd_info(connection, "Ignoring fence-peer exit code\n");
+ else
+ _conn_request_state(connection, mask, val, CS_VERBOSE);
}
+ spin_unlock_irq(&connection->resource->req_lock);
- return nps;
+ return conn_highest_pdsk(connection) <= D_OUTDATED;
}
static int _try_outdate_peer_async(void *data)
{
- struct drbd_conf *mdev = (struct drbd_conf *)data;
- enum drbd_disk_state nps;
+ struct drbd_connection *connection = (struct drbd_connection *)data;
- nps = drbd_try_outdate_peer(mdev);
- drbd_request_state(mdev, NS(pdsk, nps));
+ conn_try_outdate_peer(connection);
+ kref_put(&connection->kref, drbd_destroy_connection);
return 0;
}
-void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
+void conn_try_outdate_peer_async(struct drbd_connection *connection)
{
struct task_struct *opa;
- opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
- if (IS_ERR(opa))
- dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
+ kref_get(&connection->kref);
+ /* We may just have force_sig()'ed this thread
+ * to get it out of some blocking network function.
+ * Clear signals; otherwise kthread_run(), which internally uses
+ * wait_on_completion_killable(), will mistake our pending signal
+ * for a new fatal signal and fail. */
+ flush_signals(current);
+ opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h");
+ if (IS_ERR(opa)) {
+ drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n");
+ kref_put(&connection->kref, drbd_destroy_connection);
+ }
}
-int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
+enum drbd_state_rv
+drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
{
const int max_tries = 4;
- int r = 0;
+ enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
+ struct net_conf *nc;
int try = 0;
int forced = 0;
union drbd_state mask, val;
- enum drbd_disk_state nps;
- if (new_role == R_PRIMARY)
- request_ping(mdev); /* Detect a dead peer ASAP */
+ if (new_role == R_PRIMARY) {
+ struct drbd_connection *connection;
+
+ /* Detect dead peers as soon as possible. */
+
+ rcu_read_lock();
+ for_each_connection(connection, device->resource)
+ request_ping(connection);
+ rcu_read_unlock();
+ }
- mutex_lock(&mdev->state_mutex);
+ mutex_lock(device->state_mutex);
mask.i = 0; mask.role = R_MASK;
val.i = 0; val.role = new_role;
while (try++ < max_tries) {
- r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
+ rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE);
/* in case we first succeeded to outdate,
* but now suddenly could establish a connection */
- if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+ if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
val.pdsk = 0;
mask.pdsk = 0;
continue;
}
- if (r == SS_NO_UP_TO_DATE_DISK && force &&
- (mdev->state.disk < D_UP_TO_DATE &&
- mdev->state.disk >= D_INCONSISTENT)) {
+ if (rv == SS_NO_UP_TO_DATE_DISK && force &&
+ (device->state.disk < D_UP_TO_DATE &&
+ device->state.disk >= D_INCONSISTENT)) {
mask.disk = D_MASK;
val.disk = D_UP_TO_DATE;
forced = 1;
continue;
}
- if (r == SS_NO_UP_TO_DATE_DISK &&
- mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
- D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
- nps = drbd_try_outdate_peer(mdev);
+ if (rv == SS_NO_UP_TO_DATE_DISK &&
+ device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
+ D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
- if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
+ if (conn_try_outdate_peer(first_peer_device(device)->connection)) {
val.disk = D_UP_TO_DATE;
mask.disk = D_MASK;
}
-
- val.pdsk = nps;
- mask.pdsk = D_MASK;
-
continue;
}
- if (r == SS_NOTHING_TO_DO)
- goto fail;
- if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
- nps = drbd_try_outdate_peer(mdev);
+ if (rv == SS_NOTHING_TO_DO)
+ goto out;
+ if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
+ if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) {
+ drbd_warn(device, "Forced into split brain situation!\n");
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
- if (force && nps > D_OUTDATED) {
- dev_warn(DEV, "Forced into split brain situation!\n");
- nps = D_OUTDATED;
}
-
- mask.pdsk = D_MASK;
- val.pdsk = nps;
-
continue;
}
- if (r == SS_TWO_PRIMARIES) {
+ if (rv == SS_TWO_PRIMARIES) {
/* Maybe the peer is detected as dead very soon...
retry at most once more in this case. */
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
+ int timeo;
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
if (try < max_tries)
try = max_tries - 1;
continue;
}
- if (r < SS_SUCCESS) {
- r = _drbd_request_state(mdev, mask, val,
+ if (rv < SS_SUCCESS) {
+ rv = _drbd_request_state(device, mask, val,
CS_VERBOSE + CS_WAIT_COMPLETE);
- if (r < SS_SUCCESS)
- goto fail;
+ if (rv < SS_SUCCESS)
+ goto out;
}
break;
}
- if (r < SS_SUCCESS)
- goto fail;
+ if (rv < SS_SUCCESS)
+ goto out;
if (forced)
- dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
+ drbd_warn(device, "Forced to consider local data as UpToDate!\n");
/* Wait until nothing is on the fly :) */
- wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
+ wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0);
+
+ /* FIXME also wait for all pending P_BARRIER_ACK? */
if (new_role == R_SECONDARY) {
- set_disk_ro(mdev->vdisk, TRUE);
- if (get_ldev(mdev)) {
- mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
- put_ldev(mdev);
+ set_disk_ro(device->vdisk, true);
+ if (get_ldev(device)) {
+ device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+ put_ldev(device);
}
} else {
- if (get_net_conf(mdev)) {
- mdev->net_conf->want_lose = 0;
- put_net_conf(mdev);
- }
- set_disk_ro(mdev->vdisk, FALSE);
- if (get_ldev(mdev)) {
- if (((mdev->state.conn < C_CONNECTED ||
- mdev->state.pdsk <= D_FAILED)
- && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
- drbd_uuid_new_current(mdev);
-
- mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
- put_ldev(mdev);
+ /* Called from drbd_adm_set_role only.
+ * We are still holding the conf_update mutex. */
+ nc = first_peer_device(device)->connection->net_conf;
+ if (nc)
+ nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+
+ set_disk_ro(device->vdisk, false);
+ if (get_ldev(device)) {
+ if (((device->state.conn < C_CONNECTED ||
+ device->state.pdsk <= D_FAILED)
+ && device->ldev->md.uuid[UI_BITMAP] == 0) || forced)
+ drbd_uuid_new_current(device);
+
+ device->ldev->md.uuid[UI_CURRENT] |= (u64)1;
+ put_ldev(device);
}
}
- if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
- drbd_al_to_on_disk_bm(mdev);
- put_ldev(mdev);
- }
+ /* writeout of activity log covered areas of the bitmap
+ * to stable storage done in after state change already */
- if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
+ if (device->state.conn >= C_WF_REPORT_PARAMS) {
/* if this was forced, we should consider sync */
if (forced)
- drbd_send_uuids(mdev);
- drbd_send_state(mdev);
+ drbd_send_uuids(first_peer_device(device));
+ drbd_send_current_state(first_peer_device(device));
}
- drbd_md_sync(mdev);
+ drbd_md_sync(device);
- kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
- fail:
- mutex_unlock(&mdev->state_mutex);
- return r;
+ kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+out:
+ mutex_unlock(device->state_mutex);
+ return rv;
}
-static struct drbd_conf *ensure_mdev(int minor, int create)
+static const char *from_attrs_err_to_txt(int err)
{
- struct drbd_conf *mdev;
-
- if (minor >= minor_count)
- return NULL;
-
- mdev = minor_to_mdev(minor);
-
- if (!mdev && create) {
- struct gendisk *disk = NULL;
- mdev = drbd_new_device(minor);
-
- spin_lock_irq(&drbd_pp_lock);
- if (minor_table[minor] == NULL) {
- minor_table[minor] = mdev;
- disk = mdev->vdisk;
- mdev = NULL;
- } /* else: we lost the race */
- spin_unlock_irq(&drbd_pp_lock);
-
- if (disk) /* we won the race above */
- /* in case we ever add a drbd_delete_device(),
- * don't forget the del_gendisk! */
- add_disk(disk);
- else /* we lost the race above */
- drbd_free_mdev(mdev);
-
- mdev = minor_to_mdev(minor);
- }
-
- return mdev;
+ return err == -ENOMSG ? "required attribute missing" :
+ err == -EOPNOTSUPP ? "unknown mandatory attribute" :
+ err == -EEXIST ? "can not change invariant setting" :
+ "invalid attribute value";
}
-static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
{
- struct primary primary_args;
-
- memset(&primary_args, 0, sizeof(struct primary));
- if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
- }
+ struct drbd_config_context adm_ctx;
+ struct set_role_parms parms;
+ int err;
+ enum drbd_ret_code retcode;
- reply->ret_code =
- drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force);
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- return 0;
-}
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
+ err = set_role_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+ genl_unlock();
+ mutex_lock(&adm_ctx.resource->adm_mutex);
-static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
-{
- reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
+ if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
+ retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
+ else
+ retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ genl_lock();
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-/* initializes the md.*_offset members, so we are able to find
- * the on disk meta data */
-static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
+/* Initializes the md.*_offset members, so we are able to find
+ * the on disk meta data.
+ *
+ * We currently have two possible layouts:
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * Activity log size used to be fixed 32kB,
+ * but is about to become configurable.
+ */
+static void drbd_md_set_sector_offsets(struct drbd_device *device,
struct drbd_backing_dev *bdev)
{
sector_t md_size_sect = 0;
- switch (bdev->dc.meta_dev_idx) {
+ unsigned int al_size_sect = bdev->md.al_size_4k * 8;
+
+ bdev->md.md_offset = drbd_md_ss(bdev);
+
+ switch (bdev->md.meta_dev_idx) {
default:
/* v07 style fixed size indexed meta data */
- bdev->md.md_size_sect = MD_RESERVED_SECT;
- bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
- bdev->md.al_offset = MD_AL_OFFSET;
- bdev->md.bm_offset = MD_BM_OFFSET;
+ bdev->md.md_size_sect = MD_128MB_SECT;
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
break;
case DRBD_MD_INDEX_FLEX_EXT:
/* just occupy the full device; unit: sectors */
bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
- bdev->md.md_offset = 0;
- bdev->md.al_offset = MD_AL_OFFSET;
- bdev->md.bm_offset = MD_BM_OFFSET;
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
break;
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
- bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
/* al size is still fixed */
- bdev->md.al_offset = -MD_AL_MAX_SIZE;
+ bdev->md.al_offset = -al_size_sect;
/* we need (slightly less than) ~ this much bitmap sectors: */
md_size_sect = drbd_get_capacity(bdev->backing_bdev);
md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
@@ -519,26 +800,28 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
/* plus the "drbd meta data super block",
* and the activity log; */
- md_size_sect += MD_BM_OFFSET;
+ md_size_sect += MD_4kB_SECT + al_size_sect;
bdev->md.md_size_sect = md_size_sect;
/* bitmap offset is adjusted by 'super' block size */
- bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
+ bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
break;
}
}
+/* input size is expected to be in KB */
char *ppsize(char *buf, unsigned long long size)
{
- /* Needs 9 bytes at max. */
+ /* Needs 9 bytes at max including trailing NUL:
+ * -1ULL ==> "16384 EB" */
static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
int base = 0;
- while (size >= 10000) {
+ while (size >= 10000 && base < sizeof(units)-1) {
/* shift + round */
size = (size >> 10) + !!(size & (1<<9));
base++;
}
- sprintf(buf, "%lu %cB", (long)size, units[base]);
+ sprintf(buf, "%u %cB", (unsigned)size, units[base]);
return buf;
}
@@ -556,36 +839,47 @@ char *ppsize(char *buf, unsigned long long size)
* R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
* peer may not initiate a resize.
*/
-void drbd_suspend_io(struct drbd_conf *mdev)
+/* Note these are not to be confused with
+ * drbd_adm_suspend_io/drbd_adm_resume_io,
+ * which are (sub) state changes triggered by admin (drbdsetup),
+ * and can be long lived.
+ * This changes an device->flag, is triggered by drbd internals,
+ * and should be short-lived. */
+void drbd_suspend_io(struct drbd_device *device)
{
- set_bit(SUSPEND_IO, &mdev->flags);
- if (is_susp(mdev->state))
+ set_bit(SUSPEND_IO, &device->flags);
+ if (drbd_suspended(device))
return;
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+ wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
}
-void drbd_resume_io(struct drbd_conf *mdev)
+void drbd_resume_io(struct drbd_device *device)
{
- clear_bit(SUSPEND_IO, &mdev->flags);
- wake_up(&mdev->misc_wait);
+ clear_bit(SUSPEND_IO, &device->flags);
+ wake_up(&device->misc_wait);
}
/**
* drbd_determine_dev_size() - Sets the right device size obeying all constraints
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Returns 0 on success, negative return values indicate errors.
* You should call drbd_md_sync() after calling this function.
*/
-enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
+enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
{
sector_t prev_first_sect, prev_size; /* previous meta location */
- sector_t la_size;
+ sector_t la_size_sect, u_size;
+ struct drbd_md *md = &device->ldev->md;
+ u32 prev_al_stripe_size_4k;
+ u32 prev_al_stripes;
sector_t size;
char ppb[10];
+ void *buffer;
int md_moved, la_size_changed;
- enum determine_dev_size rv = unchanged;
+ enum determine_dev_size rv = DS_UNCHANGED;
/* race:
* application request passes inc_ap_bio,
@@ -596,93 +890,153 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_
* Suspend IO right here.
* still lock the act_log to not trigger ASSERTs there.
*/
- drbd_suspend_io(mdev);
+ drbd_suspend_io(device);
+ buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
+ if (!buffer) {
+ drbd_resume_io(device);
+ return DS_ERROR;
+ }
/* no wait necessary anymore, actually we could assert that */
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+ wait_event(device->al_wait, lc_try_lock(device->act_log));
+
+ prev_first_sect = drbd_md_first_sector(device->ldev);
+ prev_size = device->ldev->md.md_size_sect;
+ la_size_sect = device->ldev->md.la_size_sect;
- prev_first_sect = drbd_md_first_sector(mdev->ldev);
- prev_size = mdev->ldev->md.md_size_sect;
- la_size = mdev->ldev->md.la_size_sect;
+ if (rs) {
+ /* rs is non NULL if we should change the AL layout only */
- /* TODO: should only be some assert here, not (re)init... */
- drbd_md_set_sector_offsets(mdev, mdev->ldev);
+ prev_al_stripes = md->al_stripes;
+ prev_al_stripe_size_4k = md->al_stripe_size_4k;
- size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
+ md->al_stripes = rs->al_stripes;
+ md->al_stripe_size_4k = rs->al_stripe_size / 4;
+ md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
+ }
+
+ drbd_md_set_sector_offsets(device, device->ldev);
+
+ rcu_read_lock();
+ u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
+
+ if (size < la_size_sect) {
+ if (rs && u_size == 0) {
+ /* Remove "rs &&" later. This check should always be active, but
+ right now the receiver expects the permissive behavior */
+ drbd_warn(device, "Implicit shrink not allowed. "
+ "Use --size=%llus for explicit shrink.\n",
+ (unsigned long long)size);
+ rv = DS_ERROR_SHRINK;
+ }
+ if (u_size > size)
+ rv = DS_ERROR_SPACE_MD;
+ if (rv != DS_UNCHANGED)
+ goto err_out;
+ }
- if (drbd_get_capacity(mdev->this_bdev) != size ||
- drbd_bm_capacity(mdev) != size) {
+ if (drbd_get_capacity(device->this_bdev) != size ||
+ drbd_bm_capacity(device) != size) {
int err;
- err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
+ err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
if (unlikely(err)) {
/* currently there is only one error: ENOMEM! */
- size = drbd_bm_capacity(mdev)>>1;
+ size = drbd_bm_capacity(device)>>1;
if (size == 0) {
- dev_err(DEV, "OUT OF MEMORY! "
+ drbd_err(device, "OUT OF MEMORY! "
"Could not allocate bitmap!\n");
} else {
- dev_err(DEV, "BM resizing failed. "
+ drbd_err(device, "BM resizing failed. "
"Leaving size unchanged at size = %lu KB\n",
(unsigned long)size);
}
- rv = dev_size_error;
+ rv = DS_ERROR;
}
/* racy, see comments above. */
- drbd_set_my_capacity(mdev, size);
- mdev->ldev->md.la_size_sect = size;
- dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
+ drbd_set_my_capacity(device, size);
+ device->ldev->md.la_size_sect = size;
+ drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
(unsigned long long)size>>1);
}
- if (rv == dev_size_error)
- goto out;
+ if (rv <= DS_ERROR)
+ goto err_out;
+
+ la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
+
+ md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
+ || prev_size != device->ldev->md.md_size_sect;
+
+ if (la_size_changed || md_moved || rs) {
+ u32 prev_flags;
- la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
+ drbd_al_shrink(device); /* All extents inactive. */
- md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
- || prev_size != mdev->ldev->md.md_size_sect;
+ prev_flags = md->flags;
+ md->flags &= ~MDF_PRIMARY_IND;
+ drbd_md_write(device, buffer);
- if (la_size_changed || md_moved) {
- drbd_al_shrink(mdev); /* All extents inactive. */
- dev_info(DEV, "Writing the whole bitmap, %s\n",
+ drbd_info(device, "Writing the whole bitmap, %s\n",
la_size_changed && md_moved ? "size changed and md moved" :
la_size_changed ? "size changed" : "md moved");
- rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
- drbd_md_mark_dirty(mdev);
+ /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
+ drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+ "size changed", BM_LOCKED_MASK);
+ drbd_initialize_al(device, buffer);
+
+ md->flags = prev_flags;
+ drbd_md_write(device, buffer);
+
+ if (rs)
+ drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
+ md->al_stripes, md->al_stripe_size_4k * 4);
}
- if (size > la_size)
- rv = grew;
- if (size < la_size)
- rv = shrunk;
-out:
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
- drbd_resume_io(mdev);
+ if (size > la_size_sect)
+ rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
+ if (size < la_size_sect)
+ rv = DS_SHRUNK;
+
+ if (0) {
+ err_out:
+ if (rs) {
+ md->al_stripes = prev_al_stripes;
+ md->al_stripe_size_4k = prev_al_stripe_size_4k;
+ md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
+
+ drbd_md_set_sector_offsets(device, device->ldev);
+ }
+ }
+ lc_unlock(device->act_log);
+ wake_up(&device->al_wait);
+ drbd_md_put_buffer(device);
+ drbd_resume_io(device);
return rv;
}
sector_t
-drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space)
+drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ sector_t u_size, int assume_peer_has_space)
{
- sector_t p_size = mdev->p_size; /* partner's disk size. */
- sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
+ sector_t p_size = device->p_size; /* partner's disk size. */
+ sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
sector_t m_size; /* my size */
- sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
sector_t size = 0;
m_size = drbd_get_max_capacity(bdev);
- if (mdev->state.conn < C_CONNECTED && assume_peer_has_space) {
- dev_warn(DEV, "Resize while not connected was forced by the user!\n");
+ if (device->state.conn < C_CONNECTED && assume_peer_has_space) {
+ drbd_warn(device, "Resize while not connected was forced by the user!\n");
p_size = m_size;
}
if (p_size && m_size) {
size = min_t(sector_t, p_size, m_size);
} else {
- if (la_size) {
- size = la_size;
+ if (la_size_sect) {
+ size = la_size_sect;
if (m_size && m_size < size)
size = m_size;
if (p_size && p_size < size)
@@ -696,11 +1050,11 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int ass
}
if (size == 0)
- dev_err(DEV, "Both nodes diskless!\n");
+ drbd_err(device, "Both nodes diskless!\n");
if (u_size) {
if (u_size > size)
- dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
+ drbd_err(device, "Requested disk size is too big (%lu > %lu)\n",
(unsigned long)u_size>>1, (unsigned long)size>>1);
else
size = u_size;
@@ -711,162 +1065,407 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int ass
/**
* drbd_check_al_size() - Ensures that the AL is of the right size
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
* failed, and 0 on success. You should call drbd_md_sync() after you called
* this function.
*/
-static int drbd_check_al_size(struct drbd_conf *mdev)
+static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
{
struct lru_cache *n, *t;
struct lc_element *e;
unsigned int in_use;
int i;
- ERR_IF(mdev->sync_conf.al_extents < 7)
- mdev->sync_conf.al_extents = 127;
-
- if (mdev->act_log &&
- mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
+ if (device->act_log &&
+ device->act_log->nr_elements == dc->al_extents)
return 0;
in_use = 0;
- t = mdev->act_log;
- n = lc_create("act_log", drbd_al_ext_cache,
- mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
+ t = device->act_log;
+ n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
+ dc->al_extents, sizeof(struct lc_element), 0);
if (n == NULL) {
- dev_err(DEV, "Cannot allocate act_log lru!\n");
+ drbd_err(device, "Cannot allocate act_log lru!\n");
return -ENOMEM;
}
- spin_lock_irq(&mdev->al_lock);
+ spin_lock_irq(&device->al_lock);
if (t) {
for (i = 0; i < t->nr_elements; i++) {
e = lc_element_by_index(t, i);
if (e->refcnt)
- dev_err(DEV, "refcnt(%d)==%d\n",
+ drbd_err(device, "refcnt(%d)==%d\n",
e->lc_number, e->refcnt);
in_use += e->refcnt;
}
}
if (!in_use)
- mdev->act_log = n;
- spin_unlock_irq(&mdev->al_lock);
+ device->act_log = n;
+ spin_unlock_irq(&device->al_lock);
if (in_use) {
- dev_err(DEV, "Activity log still in use!\n");
+ drbd_err(device, "Activity log still in use!\n");
lc_destroy(n);
return -EBUSY;
} else {
if (t)
lc_destroy(t);
}
- drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */
+ drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
return 0;
}
-void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
+static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size)
{
- struct request_queue * const q = mdev->rq_queue;
- struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
- int max_segments = mdev->ldev->dc.max_bio_bvecs;
+ struct request_queue * const q = device->rq_queue;
+ unsigned int max_hw_sectors = max_bio_size >> 9;
+ unsigned int max_segments = 0;
+ struct request_queue *b = NULL;
- max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
+ if (get_ldev_if_state(device, D_ATTACHING)) {
+ b = device->ldev->backing_bdev->bd_disk->queue;
+
+ max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
+ rcu_read_lock();
+ max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
+ rcu_read_unlock();
+
+ blk_set_stacking_limits(&q->limits);
+ blk_queue_max_write_same_sectors(q, 0);
+ }
- blk_queue_max_hw_sectors(q, max_seg_s >> 9);
- blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
- blk_queue_max_segment_size(q, max_seg_s);
blk_queue_logical_block_size(q, 512);
- blk_queue_segment_boundary(q, PAGE_SIZE-1);
- blk_stack_limits(&q->limits, &b->limits, 0);
+ blk_queue_max_hw_sectors(q, max_hw_sectors);
+ /* This is the workaround for "bio would need to, but cannot, be split" */
+ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
+ blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
+
+ if (b) {
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+
+ if (blk_queue_discard(b) &&
+ (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
+ /* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+ q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
+
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ /* REALLY? Is stacking secdiscard "legal"? */
+ if (blk_queue_secdiscard(b))
+ queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
+ } else {
+ q->limits.max_discard_sectors = 0;
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+ queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q);
+ }
- dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
+ blk_queue_stack_limits(q, b);
- if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
- dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
- q->backing_dev_info.ra_pages,
- b->backing_dev_info.ra_pages);
- q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+ if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
+ drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
+ q->backing_dev_info.ra_pages,
+ b->backing_dev_info.ra_pages);
+ q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+ }
+ put_ldev(device);
}
}
-/* serialize deconfig (worker exiting, doing cleanup)
- * and reconfig (drbdsetup disk, drbdsetup net)
- *
- * Wait for a potentially exiting worker, then restart it,
- * or start a new one. Flush any pending work, there may still be an
- * after_state_change queued.
- */
-static void drbd_reconfig_start(struct drbd_conf *mdev)
+void drbd_reconsider_max_bio_size(struct drbd_device *device)
{
- wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
- wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
- drbd_thread_start(&mdev->worker);
- drbd_flush_workqueue(mdev);
+ unsigned int now, new, local, peer;
+
+ now = queue_max_hw_sectors(device->rq_queue) << 9;
+ local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
+ peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
+
+ if (get_ldev_if_state(device, D_ATTACHING)) {
+ local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+ device->local_max_bio_size = local;
+ put_ldev(device);
+ }
+ local = min(local, DRBD_MAX_BIO_SIZE);
+
+ /* We may ignore peer limits if the peer is modern enough.
+ Because new from 8.3.8 onwards the peer can use multiple
+ BIOs for a single peer_request */
+ if (device->state.conn >= C_WF_REPORT_PARAMS) {
+ if (first_peer_device(device)->connection->agreed_pro_version < 94)
+ peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+ else if (first_peer_device(device)->connection->agreed_pro_version == 94)
+ peer = DRBD_MAX_SIZE_H80_PACKET;
+ else if (first_peer_device(device)->connection->agreed_pro_version < 100)
+ peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
+ else
+ peer = DRBD_MAX_BIO_SIZE;
+
+ /* We may later detach and re-attach on a disconnected Primary.
+ * Avoid this setting to jump back in that case.
+ * We want to store what we know the peer DRBD can handle,
+ * not what the peer IO backend can handle. */
+ if (peer > device->peer_max_bio_size)
+ device->peer_max_bio_size = peer;
+ }
+ new = min(local, peer);
+
+ if (device->state.role == R_PRIMARY && new < now)
+ drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
+
+ if (new != now)
+ drbd_info(device, "max BIO size = %u\n", new);
+
+ drbd_setup_queue_param(device, new);
}
-/* if still unconfigured, stops worker again.
- * if configured now, clears CONFIG_PENDING.
- * wakes potential waiters */
-static void drbd_reconfig_done(struct drbd_conf *mdev)
+/* Starts the worker thread */
+static void conn_reconfig_start(struct drbd_connection *connection)
{
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.disk == D_DISKLESS &&
- mdev->state.conn == C_STANDALONE &&
- mdev->state.role == R_SECONDARY) {
- set_bit(DEVICE_DYING, &mdev->flags);
- drbd_thread_stop_nowait(&mdev->worker);
- } else
- clear_bit(CONFIG_PENDING, &mdev->flags);
- spin_unlock_irq(&mdev->req_lock);
- wake_up(&mdev->state_wait);
+ drbd_thread_start(&connection->worker);
+ drbd_flush_workqueue(&connection->sender_work);
+}
+
+/* if still unconfigured, stops worker again. */
+static void conn_reconfig_done(struct drbd_connection *connection)
+{
+ bool stop_threads;
+ spin_lock_irq(&connection->resource->req_lock);
+ stop_threads = conn_all_vols_unconf(connection) &&
+ connection->cstate == C_STANDALONE;
+ spin_unlock_irq(&connection->resource->req_lock);
+ if (stop_threads) {
+ /* asender is implicitly stopped by receiver
+ * in conn_disconnect() */
+ drbd_thread_stop(&connection->receiver);
+ drbd_thread_stop(&connection->worker);
+ }
}
/* Make sure IO is suspended before calling this function(). */
-static void drbd_suspend_al(struct drbd_conf *mdev)
+static void drbd_suspend_al(struct drbd_device *device)
{
int s = 0;
- if (lc_try_lock(mdev->act_log)) {
- drbd_al_shrink(mdev);
- lc_unlock(mdev->act_log);
- } else {
- dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
+ if (!lc_try_lock(device->act_log)) {
+ drbd_warn(device, "Failed to lock al in drbd_suspend_al()\n");
return;
}
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.conn < C_CONNECTED)
- s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
-
- spin_unlock_irq(&mdev->req_lock);
+ drbd_al_shrink(device);
+ spin_lock_irq(&device->resource->req_lock);
+ if (device->state.conn < C_CONNECTED)
+ s = !test_and_set_bit(AL_SUSPENDED, &device->flags);
+ spin_unlock_irq(&device->resource->req_lock);
+ lc_unlock(device->act_log);
if (s)
- dev_info(DEV, "Suspended AL updates\n");
+ drbd_info(device, "Suspended AL updates\n");
+}
+
+
+static bool should_set_defaults(struct genl_info *info)
+{
+ unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
+ return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
+}
+
+static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
+{
+ /* This is limited by 16 bit "slot" numbers,
+ * and by available on-disk context storage.
+ *
+ * Also (u16)~0 is special (denotes a "free" extent).
+ *
+ * One transaction occupies one 4kB on-disk block,
+ * we have n such blocks in the on disk ring buffer,
+ * the "current" transaction may fail (n-1),
+ * and there is 919 slot numbers context information per transaction.
+ *
+ * 72 transaction blocks amounts to more than 2**16 context slots,
+ * so cap there first.
+ */
+ const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
+ const unsigned int sufficient_on_disk =
+ (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
+ /AL_CONTEXT_PER_TRANSACTION;
+
+ unsigned int al_size_4k = bdev->md.al_size_4k;
+
+ if (al_size_4k > sufficient_on_disk)
+ return max_al_nr;
+
+ return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
}
-/* does always return 0;
- * interesting return code is in reply->ret_code */
-static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
{
- enum drbd_ret_codes retcode;
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct drbd_device *device;
+ struct disk_conf *new_disk_conf, *old_disk_conf;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+ int err, fifo_size;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ device = adm_ctx.device;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ /* we also need a disk
+ * to change the options on */
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ mutex_lock(&device->resource->conf_update);
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ if (should_set_defaults(info))
+ set_disk_conf_defaults(new_disk_conf);
+
+ err = disk_conf_from_attrs_for_change(new_disk_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail_unlock;
+ }
+
+ if (!expect(new_disk_conf->resync_rate >= 1))
+ new_disk_conf->resync_rate = 1;
+
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
+ new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
+
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != device->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
+ drbd_err(device, "kmalloc of fifo_buffer failed");
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+ }
+
+ drbd_suspend_io(device);
+ wait_event(device->al_wait, lc_try_lock(device->act_log));
+ drbd_al_shrink(device);
+ err = drbd_check_al_size(device, new_disk_conf);
+ lc_unlock(device->act_log);
+ wake_up(&device->al_wait);
+ drbd_resume_io(device);
+
+ if (err) {
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+
+ write_lock_irq(&global_state_lock);
+ retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+ if (retcode == NO_ERROR) {
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ drbd_resync_after_changed(device);
+ }
+ write_unlock_irq(&global_state_lock);
+
+ if (retcode != NO_ERROR)
+ goto fail_unlock;
+
+ if (new_plan) {
+ old_plan = device->rs_plan_s;
+ rcu_assign_pointer(device->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&device->resource->conf_update);
+
+ if (new_disk_conf->al_updates)
+ device->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ device->ldev->md.flags |= MDF_AL_DISABLED;
+
+ if (new_disk_conf->md_flushes)
+ clear_bit(MD_NO_FUA, &device->flags);
+ else
+ set_bit(MD_NO_FUA, &device->flags);
+
+ drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+
+ drbd_md_sync(device);
+
+ if (device->state.conn >= C_CONNECTED) {
+ struct drbd_peer_device *peer_device;
+
+ for_each_peer_device(peer_device, device)
+ drbd_send_sync_param(peer_device);
+ }
+
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ kfree(old_plan);
+ mod_timer(&device->request_timer, jiffies + HZ);
+ goto success;
+
+fail_unlock:
+ mutex_unlock(&device->resource->conf_update);
+ fail:
+ kfree(new_disk_conf);
+ kfree(new_plan);
+success:
+ put_ldev(device);
+ out:
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ int err;
+ enum drbd_ret_code retcode;
enum determine_dev_size dd;
sector_t max_possible_sectors;
sector_t min_md_device_sectors;
struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
- struct inode *inode, *inode2;
+ struct disk_conf *new_disk_conf = NULL;
+ struct block_device *bdev;
struct lru_cache *resync_lru = NULL;
+ struct fifo_buffer *new_plan = NULL;
union drbd_state ns, os;
- unsigned int max_seg_s;
- int rv;
- int cp_discovered = 0;
- int logical_block_size;
+ enum drbd_state_rv rv;
+ struct net_conf *nc;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
- drbd_reconfig_start(mdev);
+ device = adm_ctx.device;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ conn_reconfig_start(first_peer_device(device)->connection);
/* if you want to reconfigure, please tear down first */
- if (mdev->state.disk > D_DISKLESS) {
+ if (device->state.disk > D_DISKLESS) {
retcode = ERR_DISK_CONFIGURED;
goto fail;
}
@@ -874,282 +1473,260 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
* drbd_ldev_destroy is done already, we may end up here very fast,
* e.g. if someone calls attach from the on-io-error handler,
* to realize a "hot spare" feature (not that I'd recommend that) */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+ wait_event(device->misc_wait, !atomic_read(&device->local_cnt));
- /* allocation not in the IO path, cqueue thread context */
+ /* make sure there is no leftover from previous force-detach attempts */
+ clear_bit(FORCE_DETACH, &device->flags);
+ clear_bit(WAS_IO_ERROR, &device->flags);
+ clear_bit(WAS_READ_ERROR, &device->flags);
+
+ /* and no leftover from previously aborted resync or verify, either */
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ atomic_set(&device->rs_pending_cnt, 0);
+
+ /* allocation not in the IO path, drbdsetup context */
nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
if (!nbc) {
retcode = ERR_NOMEM;
goto fail;
}
+ spin_lock_init(&nbc->md.uuid_lock);
- nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF;
- nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF;
- nbc->dc.fencing = DRBD_FENCING_DEF;
- nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ nbc->disk_conf = new_disk_conf;
- if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
+ set_disk_conf_defaults(new_disk_conf);
+ err = disk_conf_from_attrs(new_disk_conf, info);
+ if (err) {
retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
goto fail;
}
- if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+ new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
+ if (!new_plan) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
retcode = ERR_MD_IDX_INVALID;
goto fail;
}
- if (get_net_conf(mdev)) {
- int prot = mdev->net_conf->wire_protocol;
- put_net_conf(mdev);
- if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
+ write_lock_irq(&global_state_lock);
+ retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+ write_unlock_irq(&global_state_lock);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (nc) {
+ if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
+ rcu_read_unlock();
retcode = ERR_STONITH_AND_PROT_A;
goto fail;
}
}
+ rcu_read_unlock();
- nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
- if (IS_ERR(nbc->lo_file)) {
- dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
- PTR_ERR(nbc->lo_file));
- nbc->lo_file = NULL;
+ bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL, device);
+ if (IS_ERR(bdev)) {
+ drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
+ PTR_ERR(bdev));
retcode = ERR_OPEN_DISK;
goto fail;
}
+ nbc->backing_bdev = bdev;
- inode = nbc->lo_file->f_dentry->d_inode;
-
- if (!S_ISBLK(inode->i_mode)) {
- retcode = ERR_DISK_NOT_BDEV;
- goto fail;
- }
-
- nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
- if (IS_ERR(nbc->md_file)) {
- dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
- PTR_ERR(nbc->md_file));
- nbc->md_file = NULL;
+ /*
+ * meta_dev_idx >= 0: external fixed size, possibly multiple
+ * drbd sharing one meta device. TODO in that case, paranoia
+ * check that [md_bdev, meta_dev_idx] is not yet used by some
+ * other drbd minor! (if you use drbd.conf + drbdadm, that
+ * should check it for you already; but if you don't, or
+ * someone fooled it, we need to double check here)
+ */
+ bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ (new_disk_conf->meta_dev_idx < 0) ?
+ (void *)device : (void *)drbd_m_holder);
+ if (IS_ERR(bdev)) {
+ drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
+ PTR_ERR(bdev));
retcode = ERR_OPEN_MD_DISK;
goto fail;
}
+ nbc->md_bdev = bdev;
- inode2 = nbc->md_file->f_dentry->d_inode;
-
- if (!S_ISBLK(inode2->i_mode)) {
- retcode = ERR_MD_NOT_BDEV;
- goto fail;
- }
-
- nbc->backing_bdev = inode->i_bdev;
- if (bd_claim(nbc->backing_bdev, mdev)) {
- printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
- nbc->backing_bdev, mdev,
- nbc->backing_bdev->bd_holder,
- nbc->backing_bdev->bd_contains->bd_holder,
- nbc->backing_bdev->bd_holders);
- retcode = ERR_BDCLAIM_DISK;
+ if ((nbc->backing_bdev == nbc->md_bdev) !=
+ (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+ retcode = ERR_MD_IDX_INVALID;
goto fail;
}
resync_lru = lc_create("resync", drbd_bm_ext_cache,
- 61, sizeof(struct bm_extent),
+ 1, 61, sizeof(struct bm_extent),
offsetof(struct bm_extent, lce));
if (!resync_lru) {
retcode = ERR_NOMEM;
- goto release_bdev_fail;
- }
-
- /* meta_dev_idx >= 0: external fixed size,
- * possibly multiple drbd sharing one meta device.
- * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
- * not yet used by some other drbd minor!
- * (if you use drbd.conf + drbdadm,
- * that should check it for you already; but if you don't, or someone
- * fooled it, we need to double check here) */
- nbc->md_bdev = inode2->i_bdev;
- if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
- : (void *) drbd_m_holder)) {
- retcode = ERR_BDCLAIM_MD_DISK;
- goto release_bdev_fail;
+ goto fail;
}
- if ((nbc->backing_bdev == nbc->md_bdev) !=
- (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
- nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
- retcode = ERR_MD_IDX_INVALID;
- goto release_bdev2_fail;
- }
+ /* Read our meta data super block early.
+ * This also sets other on-disk offsets. */
+ retcode = drbd_md_read(device, nbc);
+ if (retcode != NO_ERROR)
+ goto fail;
- /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
- drbd_md_set_sector_offsets(mdev, nbc);
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
+ new_disk_conf->al_extents = drbd_al_extents_max(nbc);
- if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
- dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
+ if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
+ drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
(unsigned long long) drbd_get_max_capacity(nbc),
- (unsigned long long) nbc->dc.disk_size);
- retcode = ERR_DISK_TO_SMALL;
- goto release_bdev2_fail;
+ (unsigned long long) new_disk_conf->disk_size);
+ retcode = ERR_DISK_TOO_SMALL;
+ goto fail;
}
- if (nbc->dc.meta_dev_idx < 0) {
+ if (new_disk_conf->meta_dev_idx < 0) {
max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
/* at least one MB, otherwise it does not make sense */
min_md_device_sectors = (2<<10);
} else {
max_possible_sectors = DRBD_MAX_SECTORS;
- min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+ min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
}
if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
- retcode = ERR_MD_DISK_TO_SMALL;
- dev_warn(DEV, "refusing attach: md-device too small, "
+ retcode = ERR_MD_DISK_TOO_SMALL;
+ drbd_warn(device, "refusing attach: md-device too small, "
"at least %llu sectors needed for this meta-disk type\n",
(unsigned long long) min_md_device_sectors);
- goto release_bdev2_fail;
+ goto fail;
}
/* Make sure the new disk is big enough
* (we may currently be R_PRIMARY with no local disk...) */
if (drbd_get_max_capacity(nbc) <
- drbd_get_capacity(mdev->this_bdev)) {
- retcode = ERR_DISK_TO_SMALL;
- goto release_bdev2_fail;
+ drbd_get_capacity(device->this_bdev)) {
+ retcode = ERR_DISK_TOO_SMALL;
+ goto fail;
}
nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
if (nbc->known_size > max_possible_sectors) {
- dev_warn(DEV, "==> truncating very big lower level device "
+ drbd_warn(device, "==> truncating very big lower level device "
"to currently maximum possible %llu sectors <==\n",
(unsigned long long) max_possible_sectors);
- if (nbc->dc.meta_dev_idx >= 0)
- dev_warn(DEV, "==>> using internal or flexible "
+ if (new_disk_conf->meta_dev_idx >= 0)
+ drbd_warn(device, "==>> using internal or flexible "
"meta data may help <<==\n");
}
- drbd_suspend_io(mdev);
+ drbd_suspend_io(device);
/* also wait for the last barrier ack. */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
+ /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
+ * We need a way to either ignore barrier acks for barriers sent before a device
+ * was attached, or a way to wait for all pending barrier acks to come in.
+ * As barriers are counted per resource,
+ * we'd need to suspend io on all devices of a resource.
+ */
+ wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
/* and for any other previously queued work */
- drbd_flush_workqueue(mdev);
+ drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
- retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
- drbd_resume_io(mdev);
- if (retcode < SS_SUCCESS)
- goto release_bdev2_fail;
+ rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
+ retcode = rv; /* FIXME: Type mismatch. */
+ drbd_resume_io(device);
+ if (rv < SS_SUCCESS)
+ goto fail;
- if (!get_ldev_if_state(mdev, D_ATTACHING))
+ if (!get_ldev_if_state(device, D_ATTACHING))
goto force_diskless;
- drbd_md_set_sector_offsets(mdev, nbc);
-
- /* allocate a second IO page if logical_block_size != 512 */
- logical_block_size = bdev_logical_block_size(nbc->md_bdev);
- if (logical_block_size == 0)
- logical_block_size = MD_SECTOR_SIZE;
-
- if (logical_block_size != MD_SECTOR_SIZE) {
- if (!mdev->md_io_tmpp) {
- struct page *page = alloc_page(GFP_NOIO);
- if (!page)
- goto force_diskless_dec;
-
- dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
- logical_block_size, MD_SECTOR_SIZE);
- dev_warn(DEV, "Workaround engaged (has performance impact).\n");
-
- mdev->md_io_tmpp = page;
- }
- }
-
- if (!mdev->bitmap) {
- if (drbd_bm_init(mdev)) {
+ if (!device->bitmap) {
+ if (drbd_bm_init(device)) {
retcode = ERR_NOMEM;
goto force_diskless_dec;
}
}
- retcode = drbd_md_read(mdev, nbc);
- if (retcode != NO_ERROR)
- goto force_diskless_dec;
-
- if (mdev->state.conn < C_CONNECTED &&
- mdev->state.role == R_PRIMARY &&
- (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
- dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
- (unsigned long long)mdev->ed_uuid);
+ if (device->state.conn < C_CONNECTED &&
+ device->state.role == R_PRIMARY && device->ed_uuid &&
+ (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
+ drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
+ (unsigned long long)device->ed_uuid);
retcode = ERR_DATA_NOT_CURRENT;
goto force_diskless_dec;
}
/* Since we are diskless, fix the activity log first... */
- if (drbd_check_al_size(mdev)) {
+ if (drbd_check_al_size(device, new_disk_conf)) {
retcode = ERR_NOMEM;
goto force_diskless_dec;
}
/* Prevent shrinking of consistent devices ! */
if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
- drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
- dev_warn(DEV, "refusing to truncate a consistent device\n");
- retcode = ERR_DISK_TO_SMALL;
- goto force_diskless_dec;
- }
-
- if (!drbd_al_read_log(mdev, nbc)) {
- retcode = ERR_IO_MD_DISK;
+ drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
+ drbd_warn(device, "refusing to truncate a consistent device\n");
+ retcode = ERR_DISK_TOO_SMALL;
goto force_diskless_dec;
}
/* Reset the "barriers don't work" bits here, then force meta data to
* be written, to ensure we determine if barriers are supported. */
- if (nbc->dc.no_md_flush)
- set_bit(MD_NO_FUA, &mdev->flags);
+ if (new_disk_conf->md_flushes)
+ clear_bit(MD_NO_FUA, &device->flags);
else
- clear_bit(MD_NO_FUA, &mdev->flags);
+ set_bit(MD_NO_FUA, &device->flags);
/* Point of no return reached.
* Devices and memory are no longer released by error cleanup below.
- * now mdev takes over responsibility, and the state engine should
+ * now device takes over responsibility, and the state engine should
* clean it up somewhere. */
- D_ASSERT(mdev->ldev == NULL);
- mdev->ldev = nbc;
- mdev->resync = resync_lru;
+ D_ASSERT(device, device->ldev == NULL);
+ device->ldev = nbc;
+ device->resync = resync_lru;
+ device->rs_plan_s = new_plan;
nbc = NULL;
resync_lru = NULL;
+ new_disk_conf = NULL;
+ new_plan = NULL;
- mdev->write_ordering = WO_bdev_flush;
- drbd_bump_write_ordering(mdev, WO_bdev_flush);
+ drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
- if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
- set_bit(CRASHED_PRIMARY, &mdev->flags);
+ if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
+ set_bit(CRASHED_PRIMARY, &device->flags);
else
- clear_bit(CRASHED_PRIMARY, &mdev->flags);
-
- if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
- !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
- set_bit(CRASHED_PRIMARY, &mdev->flags);
- cp_discovered = 1;
- }
+ clear_bit(CRASHED_PRIMARY, &device->flags);
- mdev->send_cnt = 0;
- mdev->recv_cnt = 0;
- mdev->read_cnt = 0;
- mdev->writ_cnt = 0;
+ if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+ !(device->state.role == R_PRIMARY && device->resource->susp_nod))
+ set_bit(CRASHED_PRIMARY, &device->flags);
- max_seg_s = DRBD_MAX_SEGMENT_SIZE;
- if (mdev->state.conn == C_CONNECTED) {
- /* We are Primary, Connected, and now attach a new local
- * backing store. We must not increase the user visible maximum
- * bio size on this device to something the peer may not be
- * able to handle. */
- if (mdev->agreed_pro_version < 94)
- max_seg_s = queue_max_segment_size(mdev->rq_queue);
- else if (mdev->agreed_pro_version == 94)
- max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
- /* else: drbd 8.3.9 and later, stay with default */
- }
+ device->send_cnt = 0;
+ device->recv_cnt = 0;
+ device->read_cnt = 0;
+ device->writ_cnt = 0;
- drbd_setup_queue_param(mdev, max_seg_s);
+ drbd_reconsider_max_bio_size(device);
/* If I am currently not R_PRIMARY,
* but meta data primary indicator is set,
@@ -1165,51 +1742,50 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
* so we can automatically recover from a crash of a
* degraded but active "cluster" after a certain timeout.
*/
- clear_bit(USE_DEGR_WFC_T, &mdev->flags);
- if (mdev->state.role != R_PRIMARY &&
- drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
- !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
- set_bit(USE_DEGR_WFC_T, &mdev->flags);
-
- dd = drbd_determin_dev_size(mdev, 0);
- if (dd == dev_size_error) {
+ clear_bit(USE_DEGR_WFC_T, &device->flags);
+ if (device->state.role != R_PRIMARY &&
+ drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+ !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
+ set_bit(USE_DEGR_WFC_T, &device->flags);
+
+ dd = drbd_determine_dev_size(device, 0, NULL);
+ if (dd <= DS_ERROR) {
retcode = ERR_NOMEM_BITMAP;
goto force_diskless_dec;
- } else if (dd == grew)
- set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+ } else if (dd == DS_GREW)
+ set_bit(RESYNC_AFTER_NEG, &device->flags);
- if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
- dev_info(DEV, "Assuming that all blocks are out of sync "
+ if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
+ (test_bit(CRASHED_PRIMARY, &device->flags) &&
+ drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
+ drbd_info(device, "Assuming that all blocks are out of sync "
"(aka FullSync)\n");
- if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
+ if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+ "set_n_write from attaching", BM_LOCKED_MASK)) {
retcode = ERR_IO_MD_DISK;
goto force_diskless_dec;
}
} else {
- if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
+ if (drbd_bitmap_io(device, &drbd_bm_read,
+ "read from attaching", BM_LOCKED_MASK)) {
retcode = ERR_IO_MD_DISK;
goto force_diskless_dec;
}
}
- if (cp_discovered) {
- drbd_al_apply_to_bm(mdev);
- drbd_al_to_on_disk_bm(mdev);
- }
-
- if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
- drbd_suspend_al(mdev); /* IO is still suspended here... */
+ if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
+ drbd_suspend_al(device); /* IO is still suspended here... */
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
- ns.i = os.i;
+ spin_lock_irq(&device->resource->req_lock);
+ os = drbd_read_state(device);
+ ns = os;
/* If MDF_CONSISTENT is not set go into inconsistent state,
otherwise investigate MDF_WasUpToDate...
If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
otherwise into D_CONSISTENT state.
*/
- if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
- if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
+ if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) {
+ if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
ns.disk = D_CONSISTENT;
else
ns.disk = D_OUTDATED;
@@ -1217,11 +1793,12 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
ns.disk = D_INCONSISTENT;
}
- if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
+ if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
ns.pdsk = D_OUTDATED;
- if ( ns.disk == D_CONSISTENT &&
- (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
+ rcu_read_lock();
+ if (ns.disk == D_CONSISTENT &&
+ (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE))
ns.disk = D_UP_TO_DATE;
/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
@@ -1229,1328 +1806,1855 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
this point, because drbd_request_state() modifies these
flags. */
+ if (rcu_dereference(device->ldev->disk_conf)->al_updates)
+ device->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ device->ldev->md.flags |= MDF_AL_DISABLED;
+
+ rcu_read_unlock();
+
/* In case we are C_CONNECTED postpone any decision on the new disk
state after the negotiation phase. */
- if (mdev->state.conn == C_CONNECTED) {
- mdev->new_state_tmp.i = ns.i;
+ if (device->state.conn == C_CONNECTED) {
+ device->new_state_tmp.i = ns.i;
ns.i = os.i;
ns.disk = D_NEGOTIATING;
/* We expect to receive up-to-date UUIDs soon.
To avoid a race in receive_state, free p_uuid while
holding req_lock. I.e. atomic with the state change */
- kfree(mdev->p_uuid);
- mdev->p_uuid = NULL;
+ kfree(device->p_uuid);
+ device->p_uuid = NULL;
}
- rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
+ rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
+ spin_unlock_irq(&device->resource->req_lock);
if (rv < SS_SUCCESS)
goto force_diskless_dec;
- if (mdev->state.role == R_PRIMARY)
- mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
+ mod_timer(&device->request_timer, jiffies + HZ);
+
+ if (device->state.role == R_PRIMARY)
+ device->ldev->md.uuid[UI_CURRENT] |= (u64)1;
else
- mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+ device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
- drbd_md_mark_dirty(mdev);
- drbd_md_sync(mdev);
+ drbd_md_mark_dirty(device);
+ drbd_md_sync(device);
- kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
- put_ldev(mdev);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+ put_ldev(device);
+ conn_reconfig_done(first_peer_device(device)->connection);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
force_diskless_dec:
- put_ldev(mdev);
+ put_ldev(device);
force_diskless:
- drbd_force_state(mdev, NS(disk, D_FAILED));
- drbd_md_sync(mdev);
- release_bdev2_fail:
- if (nbc)
- bd_release(nbc->md_bdev);
- release_bdev_fail:
- if (nbc)
- bd_release(nbc->backing_bdev);
+ drbd_force_state(device, NS(disk, D_DISKLESS));
+ drbd_md_sync(device);
fail:
+ conn_reconfig_done(first_peer_device(device)->connection);
if (nbc) {
- if (nbc->lo_file)
- fput(nbc->lo_file);
- if (nbc->md_file)
- fput(nbc->md_file);
+ if (nbc->backing_bdev)
+ blkdev_put(nbc->backing_bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ if (nbc->md_bdev)
+ blkdev_put(nbc->md_bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL);
kfree(nbc);
}
+ kfree(new_disk_conf);
lc_destroy(resync_lru);
-
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ kfree(new_plan);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
+static int adm_detach(struct drbd_device *device, int force)
+{
+ enum drbd_state_rv retcode;
+ int ret;
+
+ if (force) {
+ set_bit(FORCE_DETACH, &device->flags);
+ drbd_force_state(device, NS(disk, D_FAILED));
+ retcode = SS_SUCCESS;
+ goto out;
+ }
+
+ drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
+ drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */
+ retcode = drbd_request_state(device, NS(disk, D_FAILED));
+ drbd_md_put_buffer(device);
+ /* D_FAILED will transition to DISKLESS. */
+ ret = wait_event_interruptible(device->misc_wait,
+ device->state.disk != D_FAILED);
+ drbd_resume_io(device);
+ if ((int)retcode == (int)SS_IS_DISKLESS)
+ retcode = SS_NOTHING_TO_DO;
+ if (ret)
+ retcode = ERR_INTR;
+out:
+ return retcode;
+}
+
/* Detaching the disk is a process in multiple stages. First we need to lock
* out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
* Then we transition to D_DISKLESS, and wait for put_ldev() to return all
* internal references as well.
* Only then we have finally detached. */
-static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
-{
- drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
- reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
- if (mdev->state.disk == D_DISKLESS)
- wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
- drbd_resume_io(mdev);
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct detach_parms parms = { };
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
+ err = detach_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = adm_detach(adm_ctx.device, parms.force_detach);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+static bool conn_resync_running(struct drbd_connection *connection)
{
- int i, ns;
- enum drbd_ret_codes retcode;
- struct net_conf *new_conf = NULL;
- struct crypto_hash *tfm = NULL;
- struct crypto_hash *integrity_w_tfm = NULL;
- struct crypto_hash *integrity_r_tfm = NULL;
- struct hlist_head *new_tl_hash = NULL;
- struct hlist_head *new_ee_hash = NULL;
- struct drbd_conf *odev;
- char hmac_name[CRYPTO_MAX_ALG_NAME];
- void *int_dig_out = NULL;
- void *int_dig_in = NULL;
- void *int_dig_vv = NULL;
- struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+ struct drbd_peer_device *peer_device;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (device->state.conn == C_SYNC_SOURCE ||
+ device->state.conn == C_SYNC_TARGET ||
+ device->state.conn == C_PAUSED_SYNC_S ||
+ device->state.conn == C_PAUSED_SYNC_T) {
+ rv = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
- drbd_reconfig_start(mdev);
+ return rv;
+}
- if (mdev->state.conn > C_STANDALONE) {
- retcode = ERR_NET_CONFIGURED;
- goto fail;
+static bool conn_ov_running(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (device->state.conn == C_VERIFY_S ||
+ device->state.conn == C_VERIFY_T) {
+ rv = true;
+ break;
+ }
}
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static enum drbd_ret_code
+_check_net_options(struct drbd_connection *connection, struct net_conf *old_net_conf, struct net_conf *new_net_conf)
+{
+ struct drbd_peer_device *peer_device;
+ int i;
+
+ if (old_net_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) {
+ if (new_net_conf->wire_protocol != old_net_conf->wire_protocol)
+ return ERR_NEED_APV_100;
- /* allocation not in the IO path, cqueue thread context */
- new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
- if (!new_conf) {
+ if (new_net_conf->two_primaries != old_net_conf->two_primaries)
+ return ERR_NEED_APV_100;
+
+ if (strcmp(new_net_conf->integrity_alg, old_net_conf->integrity_alg))
+ return ERR_NEED_APV_100;
+ }
+
+ if (!new_net_conf->two_primaries &&
+ conn_highest_role(connection) == R_PRIMARY &&
+ conn_highest_peer(connection) == R_PRIMARY)
+ return ERR_NEED_ALLOW_TWO_PRI;
+
+ if (new_net_conf->two_primaries &&
+ (new_net_conf->wire_protocol != DRBD_PROT_C))
+ return ERR_NOT_PROTO_C;
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ struct drbd_device *device = peer_device->device;
+ if (get_ldev(device)) {
+ enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ put_ldev(device);
+ if (new_net_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
+ return ERR_STONITH_AND_PROT_A;
+ }
+ if (device->state.role == R_PRIMARY && new_net_conf->discard_my_data)
+ return ERR_DISCARD_IMPOSSIBLE;
+ }
+
+ if (new_net_conf->on_congestion != OC_BLOCK && new_net_conf->wire_protocol != DRBD_PROT_A)
+ return ERR_CONG_NOT_PROTO_A;
+
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf)
+{
+ static enum drbd_ret_code rv;
+ struct drbd_peer_device *peer_device;
+ int i;
+
+ rcu_read_lock();
+ rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf);
+ rcu_read_unlock();
+
+ /* connection->volumes protected by genl_lock() here */
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ struct drbd_device *device = peer_device->device;
+ if (!device->bitmap) {
+ if (drbd_bm_init(device))
+ return ERR_NOMEM;
+ }
+ }
+
+ return rv;
+}
+
+struct crypto {
+ struct crypto_hash *verify_tfm;
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm;
+};
+
+static int
+alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
+{
+ if (!tfm_name[0])
+ return NO_ERROR;
+
+ *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(*tfm)) {
+ *tfm = NULL;
+ return err_alg;
+ }
+
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf)
+{
+ char hmac_name[CRYPTO_MAX_ALG_NAME];
+ enum drbd_ret_code rv;
+
+ rv = alloc_hash(&crypto->csums_tfm, new_net_conf->csums_alg,
+ ERR_CSUMS_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->verify_tfm, new_net_conf->verify_alg,
+ ERR_VERIFY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->integrity_tfm, new_net_conf->integrity_alg,
+ ERR_INTEGRITY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ if (new_net_conf->cram_hmac_alg[0] != 0) {
+ snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+ new_net_conf->cram_hmac_alg);
+
+ rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
+ ERR_AUTH_ALG);
+ }
+
+ return rv;
+}
+
+static void free_crypto(struct crypto *crypto)
+{
+ crypto_free_hash(crypto->cram_hmac_tfm);
+ crypto_free_hash(crypto->integrity_tfm);
+ crypto_free_hash(crypto->csums_tfm);
+ crypto_free_hash(crypto->verify_tfm);
+}
+
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct drbd_connection *connection;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ int err;
+ int ovr; /* online verify running */
+ int rsr; /* re-sync running */
+ struct crypto crypto = { };
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ connection = adm_ctx.connection;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
retcode = ERR_NOMEM;
+ goto out;
+ }
+
+ conn_reconfig_start(connection);
+
+ mutex_lock(&connection->data.mutex);
+ mutex_lock(&connection->resource->conf_update);
+ old_net_conf = connection->net_conf;
+
+ if (!old_net_conf) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect");
+ retcode = ERR_INVALID_REQUEST;
goto fail;
}
- new_conf->timeout = DRBD_TIMEOUT_DEF;
- new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
- new_conf->ping_int = DRBD_PING_INT_DEF;
- new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
- new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
- new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
- new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
- new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
- new_conf->ko_count = DRBD_KO_COUNT_DEF;
- new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
- new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
- new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
- new_conf->want_lose = 0;
- new_conf->two_primaries = 0;
- new_conf->wire_protocol = DRBD_PROT_C;
- new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
- new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
-
- if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
+ *new_net_conf = *old_net_conf;
+ if (should_set_defaults(info))
+ set_net_conf_defaults(new_net_conf);
+
+ err = net_conf_from_attrs_for_change(new_net_conf, info);
+ if (err && err != -ENOMSG) {
retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
goto fail;
}
- if (new_conf->two_primaries
- && (new_conf->wire_protocol != DRBD_PROT_C)) {
- retcode = ERR_NOT_PROTO_C;
+ retcode = check_net_options(connection, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ /* re-sync running */
+ rsr = conn_resync_running(connection);
+ if (rsr && strcmp(new_net_conf->csums_alg, old_net_conf->csums_alg)) {
+ retcode = ERR_CSUMS_RESYNC_RUNNING;
goto fail;
}
- if (get_ldev(mdev)) {
- enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
- retcode = ERR_STONITH_AND_PROT_A;
- goto fail;
- }
+ /* online verify running */
+ ovr = conn_ov_running(connection);
+ if (ovr && strcmp(new_net_conf->verify_alg, old_net_conf->verify_alg)) {
+ retcode = ERR_VERIFY_RUNNING;
+ goto fail;
}
- if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
- retcode = ERR_DISCARD;
+ retcode = alloc_crypto(&crypto, new_net_conf);
+ if (retcode != NO_ERROR)
goto fail;
+
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+ if (!rsr) {
+ crypto_free_hash(connection->csums_tfm);
+ connection->csums_tfm = crypto.csums_tfm;
+ crypto.csums_tfm = NULL;
+ }
+ if (!ovr) {
+ crypto_free_hash(connection->verify_tfm);
+ connection->verify_tfm = crypto.verify_tfm;
+ crypto.verify_tfm = NULL;
}
- retcode = NO_ERROR;
+ crypto_free_hash(connection->integrity_tfm);
+ connection->integrity_tfm = crypto.integrity_tfm;
+ if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100)
+ /* Do this without trying to take connection->data.mutex again. */
+ __drbd_send_protocol(connection, P_PROTOCOL_UPDATE);
- new_my_addr = (struct sockaddr *)&new_conf->my_addr;
- new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev || odev == mdev)
- continue;
- if (get_net_conf(odev)) {
- taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
- if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
- !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
- retcode = ERR_LOCAL_ADDR;
+ crypto_free_hash(connection->cram_hmac_tfm);
+ connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
- taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
- if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
- !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
- retcode = ERR_PEER_ADDR;
+ mutex_unlock(&connection->resource->conf_update);
+ mutex_unlock(&connection->data.mutex);
+ synchronize_rcu();
+ kfree(old_net_conf);
- put_net_conf(odev);
- if (retcode != NO_ERROR)
- goto fail;
- }
+ if (connection->cstate >= C_WF_REPORT_PARAMS) {
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ drbd_send_sync_param(peer_device);
}
- if (new_conf->cram_hmac_alg[0] != 0) {
- snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
- new_conf->cram_hmac_alg);
- tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm)) {
- tfm = NULL;
- retcode = ERR_AUTH_ALG;
- goto fail;
- }
+ goto done;
- if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
- retcode = ERR_AUTH_ALG_ND;
- goto fail;
- }
- }
+ fail:
+ mutex_unlock(&connection->resource->conf_update);
+ mutex_unlock(&connection->data.mutex);
+ free_crypto(&crypto);
+ kfree(new_net_conf);
+ done:
+ conn_reconfig_done(connection);
+ out:
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
- if (new_conf->integrity_alg[0]) {
- integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(integrity_w_tfm)) {
- integrity_w_tfm = NULL;
- retcode=ERR_INTEGRITY_ALG;
- goto fail;
- }
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_peer_device *peer_device;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ struct crypto crypto = { };
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+ enum drbd_ret_code retcode;
+ int i;
+ int err;
- if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
- retcode=ERR_INTEGRITY_ALG_ND;
- goto fail;
- }
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
- integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(integrity_r_tfm)) {
- integrity_r_tfm = NULL;
- retcode=ERR_INTEGRITY_ALG;
- goto fail;
- }
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+ if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
}
- ns = new_conf->max_epoch_size/8;
- if (mdev->tl_hash_s != ns) {
- new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
- if (!new_tl_hash) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- }
+ /* No need for _rcu here. All reconfiguration is
+ * strictly serialized on genl_lock(). We are protected against
+ * concurrent reconfiguration/addition/deletion */
+ for_each_resource(resource, &drbd_resources) {
+ for_each_connection(connection, resource) {
+ if (nla_len(adm_ctx.my_addr) == connection->my_addr_len &&
+ !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr,
+ connection->my_addr_len)) {
+ retcode = ERR_LOCAL_ADDR;
+ goto out;
+ }
- ns = new_conf->max_buffers/8;
- if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
- new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
- if (!new_ee_hash) {
- retcode = ERR_NOMEM;
- goto fail;
+ if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len &&
+ !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr,
+ connection->peer_addr_len)) {
+ retcode = ERR_PEER_ADDR;
+ goto out;
+ }
}
}
- ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ connection = first_connection(adm_ctx.resource);
+ conn_reconfig_start(connection);
- if (integrity_w_tfm) {
- i = crypto_hash_digestsize(integrity_w_tfm);
- int_dig_out = kmalloc(i, GFP_KERNEL);
- if (!int_dig_out) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- int_dig_in = kmalloc(i, GFP_KERNEL);
- if (!int_dig_in) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- int_dig_vv = kmalloc(i, GFP_KERNEL);
- if (!int_dig_vv) {
- retcode = ERR_NOMEM;
- goto fail;
- }
+ if (connection->cstate > C_STANDALONE) {
+ retcode = ERR_NET_CONFIGURED;
+ goto fail;
}
- if (!mdev->bitmap) {
- if(drbd_bm_init(mdev)) {
- retcode = ERR_NOMEM;
- goto fail;
- }
+ /* allocation not in the IO path, drbdsetup / netlink process context */
+ new_net_conf = kzalloc(sizeof(*new_net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
}
- drbd_flush_workqueue(mdev);
- spin_lock_irq(&mdev->req_lock);
- if (mdev->net_conf != NULL) {
- retcode = ERR_NET_CONFIGURED;
- spin_unlock_irq(&mdev->req_lock);
+ set_net_conf_defaults(new_net_conf);
+
+ err = net_conf_from_attrs(new_net_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
goto fail;
}
- mdev->net_conf = new_conf;
- mdev->send_cnt = 0;
- mdev->recv_cnt = 0;
+ retcode = check_net_options(connection, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
- if (new_tl_hash) {
- kfree(mdev->tl_hash);
- mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
- mdev->tl_hash = new_tl_hash;
- }
+ retcode = alloc_crypto(&crypto, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
- if (new_ee_hash) {
- kfree(mdev->ee_hash);
- mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
- mdev->ee_hash = new_ee_hash;
+ ((char *)new_net_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+
+ drbd_flush_workqueue(&connection->sender_work);
+
+ mutex_lock(&adm_ctx.resource->conf_update);
+ old_net_conf = connection->net_conf;
+ if (old_net_conf) {
+ retcode = ERR_NET_CONFIGURED;
+ mutex_unlock(&adm_ctx.resource->conf_update);
+ goto fail;
}
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+ conn_free_crypto(connection);
+ connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
+ connection->integrity_tfm = crypto.integrity_tfm;
+ connection->csums_tfm = crypto.csums_tfm;
+ connection->verify_tfm = crypto.verify_tfm;
- crypto_free_hash(mdev->cram_hmac_tfm);
- mdev->cram_hmac_tfm = tfm;
+ connection->my_addr_len = nla_len(adm_ctx.my_addr);
+ memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len);
+ connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
+ memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
- crypto_free_hash(mdev->integrity_w_tfm);
- mdev->integrity_w_tfm = integrity_w_tfm;
+ mutex_unlock(&adm_ctx.resource->conf_update);
- crypto_free_hash(mdev->integrity_r_tfm);
- mdev->integrity_r_tfm = integrity_r_tfm;
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ struct drbd_device *device = peer_device->device;
+ device->send_cnt = 0;
+ device->recv_cnt = 0;
+ }
+ rcu_read_unlock();
- kfree(mdev->int_dig_out);
- kfree(mdev->int_dig_in);
- kfree(mdev->int_dig_vv);
- mdev->int_dig_out=int_dig_out;
- mdev->int_dig_in=int_dig_in;
- mdev->int_dig_vv=int_dig_vv;
- retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
- kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ conn_reconfig_done(connection);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
fail:
- kfree(int_dig_out);
- kfree(int_dig_in);
- kfree(int_dig_vv);
- crypto_free_hash(tfm);
- crypto_free_hash(integrity_w_tfm);
- crypto_free_hash(integrity_r_tfm);
- kfree(new_tl_hash);
- kfree(new_ee_hash);
- kfree(new_conf);
-
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ free_crypto(&crypto);
+ kfree(new_net_conf);
+
+ conn_reconfig_done(connection);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force)
{
- int retcode;
+ enum drbd_state_rv rv;
+
+ rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+ force ? CS_HARD : 0);
+
+ switch (rv) {
+ case SS_NOTHING_TO_DO:
+ break;
+ case SS_ALREADY_STANDALONE:
+ return SS_SUCCESS;
+ case SS_PRIMARY_NOP:
+ /* Our state checking code wants to see the peer outdated. */
+ rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
- retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
+ if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
+ rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE);
- if (retcode == SS_NOTHING_TO_DO)
- goto done;
- else if (retcode == SS_ALREADY_STANDALONE)
- goto done;
- else if (retcode == SS_PRIMARY_NOP) {
- /* Our statche checking code wants to see the peer outdated. */
- retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
- pdsk, D_OUTDATED));
- } else if (retcode == SS_CW_FAILED_BY_PEER) {
+ break;
+ case SS_CW_FAILED_BY_PEER:
/* The peer probably wants to see us outdated. */
- retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
- disk, D_OUTDATED),
- CS_ORDERED);
- if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- retcode = SS_SUCCESS;
+ rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING,
+ disk, D_OUTDATED), 0);
+ if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
+ rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+ CS_HARD);
}
+ break;
+ default:;
+ /* no special handling necessary */
+ }
+
+ if (rv >= SS_SUCCESS) {
+ enum drbd_state_rv rv2;
+ /* No one else can reconfigure the network while I am here.
+ * The state handling only uses drbd_thread_stop_nowait(),
+ * we want to really wait here until the receiver is no more.
+ */
+ drbd_thread_stop(&connection->receiver);
+
+ /* Race breaker. This additional state change request may be
+ * necessary, if this was a forced disconnect during a receiver
+ * restart. We may have "killed" the receiver thread just
+ * after drbd_receiver() returned. Typically, we should be
+ * C_STANDALONE already, now, and this becomes a no-op.
+ */
+ rv2 = conn_request_state(connection, NS(conn, C_STANDALONE),
+ CS_VERBOSE | CS_HARD);
+ if (rv2 < SS_SUCCESS)
+ drbd_err(connection,
+ "unexpected rv2=%d in conn_try_disconnect()\n",
+ rv2);
}
+ return rv;
+}
- if (retcode < SS_SUCCESS)
- goto fail;
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct disconnect_parms parms;
+ struct drbd_connection *connection;
+ enum drbd_state_rv rv;
+ enum drbd_ret_code retcode;
+ int err;
- if (wait_event_interruptible(mdev->state_wait,
- mdev->state.conn != C_DISCONNECTING)) {
- /* Do not test for mdev->state.conn == C_STANDALONE, since
- someone else might connect us in the mean time! */
- retcode = ERR_INTR;
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
goto fail;
+
+ connection = adm_ctx.connection;
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
+ err = disconnect_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
}
- done:
- retcode = NO_ERROR;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ rv = conn_try_disconnect(connection, parms.force_disconnect);
+ if (rv < SS_SUCCESS)
+ retcode = rv; /* FIXME: Type mismatch. */
+ else
+ retcode = NO_ERROR;
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
fail:
- drbd_md_sync(mdev);
- reply->ret_code = retcode;
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-void resync_after_online_grow(struct drbd_conf *mdev)
+void resync_after_online_grow(struct drbd_device *device)
{
int iass; /* I am sync source */
- dev_info(DEV, "Resync of new storage after online grow\n");
- if (mdev->state.role != mdev->state.peer)
- iass = (mdev->state.role == R_PRIMARY);
+ drbd_info(device, "Resync of new storage after online grow\n");
+ if (device->state.role != device->state.peer)
+ iass = (device->state.role == R_PRIMARY);
else
- iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+ iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
if (iass)
- drbd_start_resync(mdev, C_SYNC_SOURCE);
+ drbd_start_resync(device, C_SYNC_SOURCE);
else
- _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+ _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
}
-static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
{
- struct resize rs;
- int retcode = NO_ERROR;
+ struct drbd_config_context adm_ctx;
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+ struct resize_parms rs;
+ struct drbd_device *device;
+ enum drbd_ret_code retcode;
enum determine_dev_size dd;
+ bool change_al_layout = false;
enum dds_flags ddsf;
+ sector_t u_size;
+ int err;
- memset(&rs, 0, sizeof(struct resize));
- if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
- retcode = ERR_MANDATORY_TAG;
- goto fail;
- }
-
- if (mdev->state.conn > C_CONNECTED) {
- retcode = ERR_RESIZE_RESYNC;
- goto fail;
- }
-
- if (mdev->state.role == R_SECONDARY &&
- mdev->state.peer == R_SECONDARY) {
- retcode = ERR_NO_PRIMARY;
- goto fail;
- }
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
- if (!get_ldev(mdev)) {
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ device = adm_ctx.device;
+ if (!get_ldev(device)) {
retcode = ERR_NO_DISK;
goto fail;
}
- if (rs.no_resync && mdev->agreed_pro_version < 93) {
- retcode = ERR_NEED_APV_93;
- goto fail;
- }
-
- if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
- mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
-
- mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
- ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
- dd = drbd_determin_dev_size(mdev, ddsf);
- drbd_md_sync(mdev);
- put_ldev(mdev);
- if (dd == dev_size_error) {
- retcode = ERR_NOMEM_BITMAP;
- goto fail;
- }
-
- if (mdev->state.conn == C_CONNECTED) {
- if (dd == grew)
- set_bit(RESIZE_PENDING, &mdev->flags);
-
- drbd_send_uuids(mdev);
- drbd_send_sizes(mdev, 1, ddsf);
+ memset(&rs, 0, sizeof(struct resize_parms));
+ rs.al_stripes = device->ldev->md.al_stripes;
+ rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4;
+ if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
+ err = resize_parms_from_attrs(&rs, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail_ldev;
+ }
}
- fail:
- reply->ret_code = retcode;
- return 0;
-}
-
-static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
-{
- int retcode = NO_ERROR;
- int err;
- int ovr; /* online verify running */
- int rsr; /* re-sync running */
- struct crypto_hash *verify_tfm = NULL;
- struct crypto_hash *csums_tfm = NULL;
- struct syncer_conf sc;
- cpumask_var_t new_cpu_mask;
- int *rs_plan_s = NULL;
- int fifo_size;
-
- if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
- retcode = ERR_NOMEM;
- goto fail;
+ if (device->state.conn > C_CONNECTED) {
+ retcode = ERR_RESIZE_RESYNC;
+ goto fail_ldev;
}
- if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
- memset(&sc, 0, sizeof(struct syncer_conf));
- sc.rate = DRBD_RATE_DEF;
- sc.after = DRBD_AFTER_DEF;
- sc.al_extents = DRBD_AL_EXTENTS_DEF;
- sc.on_no_data = DRBD_ON_NO_DATA_DEF;
- sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
- sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
- sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
- sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
- sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
- } else
- memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
-
- if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
- retcode = ERR_MANDATORY_TAG;
- goto fail;
+ if (device->state.role == R_SECONDARY &&
+ device->state.peer == R_SECONDARY) {
+ retcode = ERR_NO_PRIMARY;
+ goto fail_ldev;
}
- /* re-sync running */
- rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
- mdev->state.conn == C_SYNC_TARGET ||
- mdev->state.conn == C_PAUSED_SYNC_S ||
- mdev->state.conn == C_PAUSED_SYNC_T );
-
- if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
- retcode = ERR_CSUMS_RESYNC_RUNNING;
- goto fail;
+ if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) {
+ retcode = ERR_NEED_APV_93;
+ goto fail_ldev;
}
- if (!rsr && sc.csums_alg[0]) {
- csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(csums_tfm)) {
- csums_tfm = NULL;
- retcode = ERR_CSUMS_ALG;
- goto fail;
- }
-
- if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
- retcode = ERR_CSUMS_ALG_ND;
- goto fail;
+ rcu_read_lock();
+ u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ if (u_size != (sector_t)rs.resize_size) {
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail_ldev;
}
}
- /* online verify running */
- ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
+ if (device->ldev->md.al_stripes != rs.al_stripes ||
+ device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
+ u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
- if (ovr) {
- if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
- retcode = ERR_VERIFY_RUNNING;
- goto fail;
+ if (al_size_k > (16 * 1024 * 1024)) {
+ retcode = ERR_MD_LAYOUT_TOO_BIG;
+ goto fail_ldev;
}
- }
- if (!ovr && sc.verify_alg[0]) {
- verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(verify_tfm)) {
- verify_tfm = NULL;
- retcode = ERR_VERIFY_ALG;
- goto fail;
+ if (al_size_k < MD_32kB_SECT/2) {
+ retcode = ERR_MD_LAYOUT_TOO_SMALL;
+ goto fail_ldev;
}
- if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
- retcode = ERR_VERIFY_ALG_ND;
- goto fail;
+ if (device->state.conn != C_CONNECTED && !rs.resize_force) {
+ retcode = ERR_MD_LAYOUT_CONNECTED;
+ goto fail_ldev;
}
- }
- /* silently ignore cpu mask on UP kernel */
- if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
- err = __bitmap_parse(sc.cpu_mask, 32, 0,
- cpumask_bits(new_cpu_mask), nr_cpu_ids);
- if (err) {
- dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
- retcode = ERR_CPU_MASK_PARSE;
- goto fail;
- }
+ change_al_layout = true;
}
- ERR_IF (sc.rate < 1) sc.rate = 1;
- ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
-#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
- if (sc.al_extents > AL_MAX) {
- dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
- sc.al_extents = AL_MAX;
- }
-#undef AL_MAX
+ if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev))
+ device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
- /* to avoid spurious errors when configuring minors before configuring
- * the minors they depend on: if necessary, first create the minor we
- * depend on */
- if (sc.after >= 0)
- ensure_mdev(sc.after, 1);
+ if (new_disk_conf) {
+ mutex_lock(&device->resource->conf_update);
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = (sector_t)rs.resize_size;
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&device->resource->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ }
- /* most sanity checks done, try to assign the new sync-after
- * dependency. need to hold the global lock in there,
- * to avoid a race in the dependency loop check. */
- retcode = drbd_alter_sa(mdev, sc.after);
- if (retcode != NO_ERROR)
+ ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+ dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL);
+ drbd_md_sync(device);
+ put_ldev(device);
+ if (dd == DS_ERROR) {
+ retcode = ERR_NOMEM_BITMAP;
+ goto fail;
+ } else if (dd == DS_ERROR_SPACE_MD) {
+ retcode = ERR_MD_LAYOUT_NO_FIT;
+ goto fail;
+ } else if (dd == DS_ERROR_SHRINK) {
+ retcode = ERR_IMPLICIT_SHRINK;
goto fail;
-
- fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
- if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
- rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
- if (!rs_plan_s) {
- dev_err(DEV, "kmalloc of fifo_buffer failed");
- retcode = ERR_NOMEM;
- goto fail;
- }
}
- /* ok, assign the rest of it as well.
- * lock against receive_SyncParam() */
- spin_lock(&mdev->peer_seq_lock);
- mdev->sync_conf = sc;
+ if (device->state.conn == C_CONNECTED) {
+ if (dd == DS_GREW)
+ set_bit(RESIZE_PENDING, &device->flags);
- if (!rsr) {
- crypto_free_hash(mdev->csums_tfm);
- mdev->csums_tfm = csums_tfm;
- csums_tfm = NULL;
+ drbd_send_uuids(first_peer_device(device));
+ drbd_send_sizes(first_peer_device(device), 1, ddsf);
}
- if (!ovr) {
- crypto_free_hash(mdev->verify_tfm);
- mdev->verify_tfm = verify_tfm;
- verify_tfm = NULL;
- }
+ fail:
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
- if (fifo_size != mdev->rs_plan_s.size) {
- kfree(mdev->rs_plan_s.values);
- mdev->rs_plan_s.values = rs_plan_s;
- mdev->rs_plan_s.size = fifo_size;
- mdev->rs_planed = 0;
- rs_plan_s = NULL;
- }
+ fail_ldev:
+ put_ldev(device);
+ goto fail;
+}
- spin_unlock(&mdev->peer_seq_lock);
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct res_opts res_opts;
+ int err;
- if (get_ldev(mdev)) {
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
- drbd_al_shrink(mdev);
- err = drbd_check_al_size(mdev);
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto fail;
- put_ldev(mdev);
- drbd_md_sync(mdev);
+ res_opts = adm_ctx.resource->res_opts;
+ if (should_set_defaults(info))
+ set_res_opts_defaults(&res_opts);
- if (err) {
- retcode = ERR_NOMEM;
- goto fail;
- }
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
}
- if (mdev->state.conn >= C_CONNECTED)
- drbd_send_sync_param(mdev, &sc);
-
- if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
- cpumask_copy(mdev->cpu_mask, new_cpu_mask);
- drbd_calc_cpu_mask(mdev);
- mdev->receiver.reset_cpu_mask = 1;
- mdev->asender.reset_cpu_mask = 1;
- mdev->worker.reset_cpu_mask = 1;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ err = set_resource_options(adm_ctx.resource, &res_opts);
+ if (err) {
+ retcode = ERR_INVALID_REQUEST;
+ if (err == -ENOMEM)
+ retcode = ERR_NOMEM;
}
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
- kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
fail:
- kfree(rs_plan_s);
- free_cpumask_var(new_cpu_mask);
- crypto_free_hash(csums_tfm);
- crypto_free_hash(verify_tfm);
- reply->ret_code = retcode;
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
{
- int retcode;
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
- retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
- retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ device = adm_ctx.device;
- while (retcode == SS_NEED_CONNECTION) {
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.conn < C_CONNECTED)
- retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ /* If there is still bitmap IO pending, probably because of a previous
+ * resync just being finished, wait for it before requesting a new resync.
+ * Also wait for it's after_state_ch(). */
+ drbd_suspend_io(device);
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
- if (retcode != SS_NEED_CONNECTION)
- break;
+ /* If we happen to be C_STANDALONE R_SECONDARY, just change to
+ * D_INCONSISTENT, and set all bits in the bitmap. Otherwise,
+ * try to start a resync handshake as sync target for full sync.
+ */
+ if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) {
+ retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+ "set_n_write from invalidate", BM_LOCKED_MASK))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
+ retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
+ drbd_resume_io(device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
- retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
- }
+static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
+ union drbd_state mask, union drbd_state val)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
- reply->ret_code = retcode;
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = drbd_request_state(adm_ctx.device, mask, val);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
+static int drbd_bmio_set_susp_al(struct drbd_device *device)
{
int rv;
- rv = drbd_bmio_set_n_write(mdev);
- drbd_suspend_al(mdev);
+ rv = drbd_bmio_set_n_write(device);
+ drbd_suspend_al(device);
return rv;
}
-static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
{
- int retcode;
+ struct drbd_config_context adm_ctx;
+ int retcode; /* drbd_ret_code, drbd_state_rv */
+ struct drbd_device *device;
- retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- if (retcode < SS_SUCCESS) {
- if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
- /* The peer will get a resync upon connect anyways. Just make that
- into a full resync. */
- retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
- if (retcode >= SS_SUCCESS) {
- /* open coded drbd_bitmap_io() */
- if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
- "set_n_write from invalidate_peer"))
- retcode = ERR_IO_MD_DISK;
- }
- } else
- retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
- }
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ device = adm_ctx.device;
- reply->ret_code = retcode;
+ /* If there is still bitmap IO pending, probably because of a previous
+ * resync just being finished, wait for it before requesting a new resync.
+ * Also wait for it's after_state_ch(). */
+ drbd_suspend_io(device);
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+ /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
+ * in the bitmap. Otherwise, try to start a resync handshake
+ * as sync source for full sync.
+ */
+ if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) {
+ /* The peer will get a resync upon connect anyways. Just make that
+ into a full resync. */
+ retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al,
+ "set_n_write from invalidate_peer",
+ BM_LOCKED_SET_ALLOWED))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
+ retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
+ drbd_resume_io(device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
- if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
- retcode = ERR_PAUSE_IS_SET;
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- reply->ret_code = retcode;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+ retcode = ERR_PAUSE_IS_SET;
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
+ struct drbd_config_context adm_ctx;
+ union drbd_dev_state s;
+ enum drbd_ret_code retcode;
- if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
- retcode = ERR_PAUSE_IS_CLEAR;
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- reply->ret_code = retcode;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+ s = adm_ctx.device->state;
+ if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
+ retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
+ s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
+ } else {
+ retcode = ERR_PAUSE_IS_CLEAR;
+ }
+ }
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
{
- reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
-
- return 0;
+ return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
}
-static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
{
- if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
- drbd_uuid_new_current(mdev);
- clear_bit(NEW_CUR_UUID, &mdev->flags);
- }
- drbd_suspend_io(mdev);
- reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
- if (reply->ret_code == SS_SUCCESS) {
- if (mdev->state.conn < C_CONNECTED)
- tl_clear(mdev);
- if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
- tl_restart(mdev, fail_frozen_disk_io);
- }
- drbd_resume_io(mdev);
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ device = adm_ctx.device;
+ if (test_bit(NEW_CUR_UUID, &device->flags)) {
+ drbd_uuid_new_current(device);
+ clear_bit(NEW_CUR_UUID, &device->flags);
+ }
+ drbd_suspend_io(device);
+ retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+ if (retcode == SS_SUCCESS) {
+ if (device->state.conn < C_CONNECTED)
+ tl_clear(first_peer_device(device)->connection);
+ if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED)
+ tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
+ }
+ drbd_resume_io(device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
{
- reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
+ return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
+}
+
+static int nla_put_drbd_cfg_context(struct sk_buff *skb,
+ struct drbd_resource *resource,
+ struct drbd_connection *connection,
+ struct drbd_device *device)
+{
+ struct nlattr *nla;
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
+ if (!nla)
+ goto nla_put_failure;
+ if (device &&
+ nla_put_u32(skb, T_ctx_volume, device->vnr))
+ goto nla_put_failure;
+ if (nla_put_string(skb, T_ctx_resource_name, resource->name))
+ goto nla_put_failure;
+ if (connection) {
+ if (connection->my_addr_len &&
+ nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr))
+ goto nla_put_failure;
+ if (connection->peer_addr_len &&
+ nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nla);
return 0;
+
+nla_put_failure:
+ if (nla)
+ nla_nest_cancel(skb, nla);
+ return -EMSGSIZE;
}
-static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+/*
+ * Return the connection of @resource if @resource has exactly one connection.
+ */
+static struct drbd_connection *the_only_connection(struct drbd_resource *resource)
{
- unsigned short *tl;
+ struct list_head *connections = &resource->connections;
- tl = reply->tag_list;
+ if (list_empty(connections) || connections->next->next != connections)
+ return NULL;
+ return list_first_entry(&resource->connections, struct drbd_connection, connections);
+}
- if (get_ldev(mdev)) {
- tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
- put_ldev(mdev);
- }
+int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
+ const struct sib_info *sib)
+{
+ struct drbd_resource *resource = device->resource;
+ struct state_info *si = NULL; /* for sizeof(si->member); */
+ struct nlattr *nla;
+ int got_ldev;
+ int err = 0;
+ int exclude_sensitive;
+
+ /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+ * to. So we better exclude_sensitive information.
+ *
+ * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+ * in the context of the requesting user process. Exclude sensitive
+ * information, unless current has superuser.
+ *
+ * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+ * relies on the current implementation of netlink_dump(), which
+ * executes the dump callback successively from netlink_recvmsg(),
+ * always in the context of the receiving process */
+ exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
+
+ got_ldev = get_ldev(device);
+
+ /* We need to add connection name and volume number information still.
+ * Minor number is in drbd_genlmsghdr. */
+ if (nla_put_drbd_cfg_context(skb, resource, the_only_connection(resource), device))
+ goto nla_put_failure;
+
+ if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ if (got_ldev) {
+ struct disk_conf *disk_conf;
+
+ disk_conf = rcu_dereference(device->ldev->disk_conf);
+ err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
+ }
+ if (!err) {
+ struct net_conf *nc;
+
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (nc)
+ err = net_conf_to_skb(skb, nc, exclude_sensitive);
+ }
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+
+ nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
+ if (!nla)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+ nla_put_u32(skb, T_current_state, device->state.i) ||
+ nla_put_u64(skb, T_ed_uuid, device->ed_uuid) ||
+ nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) ||
+ nla_put_u64(skb, T_send_cnt, device->send_cnt) ||
+ nla_put_u64(skb, T_recv_cnt, device->recv_cnt) ||
+ nla_put_u64(skb, T_read_cnt, device->read_cnt) ||
+ nla_put_u64(skb, T_writ_cnt, device->writ_cnt) ||
+ nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) ||
+ nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
+ nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
+ nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
+ nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
+ goto nla_put_failure;
+
+ if (got_ldev) {
+ int err;
- if (get_net_conf(mdev)) {
- tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
- put_net_conf(mdev);
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ if (err)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
+ nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) ||
+ nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device)))
+ goto nla_put_failure;
+ if (C_SYNC_SOURCE <= device->state.conn &&
+ C_PAUSED_SYNC_T >= device->state.conn) {
+ if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) ||
+ nla_put_u64(skb, T_bits_rs_failed, device->rs_failed))
+ goto nla_put_failure;
+ }
}
- tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ if (sib) {
+ switch(sib->sib_reason) {
+ case SIB_SYNC_PROGRESS:
+ case SIB_GET_STATUS_REPLY:
+ break;
+ case SIB_STATE_CHANGE:
+ if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+ nla_put_u32(skb, T_new_state, sib->ns.i))
+ goto nla_put_failure;
+ break;
+ case SIB_HELPER_POST:
+ if (nla_put_u32(skb, T_helper_exit_code,
+ sib->helper_exit_code))
+ goto nla_put_failure;
+ /* fall through */
+ case SIB_HELPER_PRE:
+ if (nla_put_string(skb, T_helper, sib->helper_name))
+ goto nla_put_failure;
+ break;
+ }
+ }
+ nla_nest_end(skb, nla);
- return (int)((char *)tl - (char *)reply->tag_list);
+ if (0)
+nla_put_failure:
+ err = -EMSGSIZE;
+ if (got_ldev)
+ put_ldev(device);
+ return err;
}
-static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
{
- unsigned short *tl = reply->tag_list;
- union drbd_state s = mdev->state;
- unsigned long rs_left;
- unsigned int res;
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ int err;
- tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- /* no local ref, no bitmap, no syncer progress. */
- if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
- if (get_ldev(mdev)) {
- drbd_get_syncer_progress(mdev, &rs_left, &res);
- tl = tl_add_int(tl, T_sync_progress, &res);
- put_ldev(mdev);
- }
+ err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
}
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- return (int)((char *)tl - (char *)reply->tag_list);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
}
-static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
{
- unsigned short *tl;
+ struct drbd_device *device;
+ struct drbd_genlmsghdr *dh;
+ struct drbd_resource *pos = (struct drbd_resource *)cb->args[0];
+ struct drbd_resource *resource = NULL;
+ struct drbd_resource *tmp;
+ unsigned volume = cb->args[1];
+
+ /* Open coded, deferred, iteration:
+ * for_each_resource_safe(resource, tmp, &drbd_resources) {
+ * connection = "first connection of resource or undefined";
+ * idr_for_each_entry(&resource->devices, device, i) {
+ * ...
+ * }
+ * }
+ * where resource is cb->args[0];
+ * and i is cb->args[1];
+ *
+ * cb->args[2] indicates if we shall loop over all resources,
+ * or just dump all volumes of a single resource.
+ *
+ * This may miss entries inserted after this dump started,
+ * or entries deleted before they are reached.
+ *
+ * We need to make sure the device won't disappear while
+ * we are looking at it, and revalidate our iterators
+ * on each iteration.
+ */
+
+ /* synchronize with conn_create()/drbd_destroy_connection() */
+ rcu_read_lock();
+ /* revalidate iterator position */
+ for_each_resource_rcu(tmp, &drbd_resources) {
+ if (pos == NULL) {
+ /* first iteration */
+ pos = tmp;
+ resource = pos;
+ break;
+ }
+ if (tmp == pos) {
+ resource = pos;
+ break;
+ }
+ }
+ if (resource) {
+next_resource:
+ device = idr_get_next(&resource->devices, &volume);
+ if (!device) {
+ /* No more volumes to dump on this resource.
+ * Advance resource iterator. */
+ pos = list_entry_rcu(resource->resources.next,
+ struct drbd_resource, resources);
+ /* Did we dump any volume of this resource yet? */
+ if (volume != 0) {
+ /* If we reached the end of the list,
+ * or only a single resource dump was requested,
+ * we are done. */
+ if (&pos->resources == &drbd_resources || cb->args[2])
+ goto out;
+ volume = 0;
+ resource = pos;
+ goto next_resource;
+ }
+ }
- tl = reply->tag_list;
+ dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &drbd_genl_family,
+ NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+ if (!dh)
+ goto out;
+
+ if (!device) {
+ /* This is a connection without a single volume.
+ * Suprisingly enough, it may have a network
+ * configuration. */
+ struct drbd_connection *connection;
+
+ dh->minor = -1U;
+ dh->ret_code = NO_ERROR;
+ connection = the_only_connection(resource);
+ if (nla_put_drbd_cfg_context(skb, resource, connection, NULL))
+ goto cancel;
+ if (connection) {
+ struct net_conf *nc;
+
+ nc = rcu_dereference(connection->net_conf);
+ if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+ goto cancel;
+ }
+ goto done;
+ }
- if (get_ldev(mdev)) {
- tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
- tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
- put_ldev(mdev);
+ D_ASSERT(device, device->vnr == volume);
+ D_ASSERT(device, device->resource == resource);
+
+ dh->minor = device_to_minor(device);
+ dh->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(skb, device, NULL)) {
+cancel:
+ genlmsg_cancel(skb, dh);
+ goto out;
+ }
+done:
+ genlmsg_end(skb, dh);
}
- put_unaligned(TT_END, tl++); /* Close the tag list */
- return (int)((char *)tl - (char *)reply->tag_list);
+out:
+ rcu_read_unlock();
+ /* where to start the next iteration */
+ cb->args[0] = (long)pos;
+ cb->args[1] = (pos == resource) ? volume + 1 : 0;
+
+ /* No more resources/volumes/minors found results in an empty skb.
+ * Which will terminate the dump. */
+ return skb->len;
}
-/**
- * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
- * @mdev: DRBD device.
- * @nlp: Netlink/connector packet from drbdsetup
- * @reply: Reply packet for drbdsetup
+/*
+ * Request status of all resources, or of all volumes within a single resource.
+ *
+ * This is a dump, as the answer may not fit in a single reply skb otherwise.
+ * Which means we cannot use the family->attrbuf or other such members, because
+ * dump is NOT protected by the genl_lock(). During dump, we only have access
+ * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+ *
+ * Once things are setup properly, we call into get_one_status().
*/
-static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
{
- unsigned short *tl;
- char rv;
+ const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+ struct nlattr *nla;
+ const char *resource_name;
+ struct drbd_resource *resource;
+ int maxtype;
+
+ /* Is this a followup call? */
+ if (cb->args[0]) {
+ /* ... of a single resource dump,
+ * and the resource iterator has been advanced already? */
+ if (cb->args[2] && cb->args[2] != cb->args[0])
+ return 0; /* DONE. */
+ goto dump;
+ }
+
+ /* First call (from netlink_dump_start). We need to figure out
+ * which resource(s) the user wants us to dump. */
+ nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+ nlmsg_attrlen(cb->nlh, hdrlen),
+ DRBD_NLA_CFG_CONTEXT);
+
+ /* No explicit context given. Dump all. */
+ if (!nla)
+ goto dump;
+ maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+ nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+ if (IS_ERR(nla))
+ return PTR_ERR(nla);
+ /* context given, but no name present? */
+ if (!nla)
+ return -EINVAL;
+ resource_name = nla_data(nla);
+ if (!*resource_name)
+ return -ENODEV;
+ resource = drbd_find_resource(resource_name);
+ if (!resource)
+ return -ENODEV;
+
+ kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */
+
+ /* prime iterators, and set "filter" mode mark:
+ * only dump this connection. */
+ cb->args[0] = (long)resource;
+ /* cb->args[1] = 0; passed in this way. */
+ cb->args[2] = (long)resource;
+
+dump:
+ return get_one_status(skb, cb);
+}
- tl = reply->tag_list;
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct timeout_parms tp;
+ int err;
- rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
- test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ tp.timeout_type =
+ adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+ test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED :
+ UT_DEFAULT;
- return (int)((char *)tl - (char *)reply->tag_list);
+ err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
+ }
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
}
-static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
{
- /* default to resume from last known position, if possible */
- struct start_ov args =
- { .start_sector = mdev->ov_start_sector };
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ enum drbd_ret_code retcode;
+ struct start_ov_parms parms;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
+ device = adm_ctx.device;
+
+ /* resume from last known position, if possible */
+ parms.ov_start_sector = device->ov_start_sector;
+ parms.ov_stop_sector = ULLONG_MAX;
+ if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
+ int err = start_ov_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
}
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
/* w_make_ov_request expects position to be aligned */
- mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
- reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+ device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+ device->ov_stop_sector = parms.ov_stop_sector;
+
+ /* If there is still bitmap IO pending, e.g. previous resync or verify
+ * just being finished, wait for it before requesting a new resync. */
+ drbd_suspend_io(device);
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
+ drbd_resume_io(device);
+
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ enum drbd_ret_code retcode;
int skip_initial_sync = 0;
int err;
+ struct new_c_uuid_parms args;
- struct new_c_uuid args;
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out_nolock;
- memset(&args, 0, sizeof(struct new_c_uuid));
- if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
+ device = adm_ctx.device;
+ memset(&args, 0, sizeof(args));
+ if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
+ err = new_c_uuid_parms_from_attrs(&args, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out_nolock;
+ }
}
- mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */
- if (!get_ldev(mdev)) {
+ if (!get_ldev(device)) {
retcode = ERR_NO_DISK;
goto out;
}
/* this is "skip initial sync", assume to be clean */
- if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
- mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
- dev_info(DEV, "Preparing to skip initial sync\n");
+ if (device->state.conn == C_CONNECTED &&
+ first_peer_device(device)->connection->agreed_pro_version >= 90 &&
+ device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
+ drbd_info(device, "Preparing to skip initial sync\n");
skip_initial_sync = 1;
- } else if (mdev->state.conn != C_STANDALONE) {
+ } else if (device->state.conn != C_STANDALONE) {
retcode = ERR_CONNECTED;
goto out_dec;
}
- drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
- drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
+ drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
+ drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */
if (args.clear_bm) {
- err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
+ err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+ "clear_n_write from new_c_uuid", BM_LOCKED_MASK);
if (err) {
- dev_err(DEV, "Writing bitmap failed with %d\n",err);
+ drbd_err(device, "Writing bitmap failed with %d\n", err);
retcode = ERR_IO_MD_DISK;
}
if (skip_initial_sync) {
- drbd_send_uuids_skip_initial_sync(mdev);
- _drbd_uuid_set(mdev, UI_BITMAP, 0);
- spin_lock_irq(&mdev->req_lock);
- _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+ drbd_send_uuids_skip_initial_sync(first_peer_device(device));
+ _drbd_uuid_set(device, UI_BITMAP, 0);
+ drbd_print_uuids(device, "cleared bitmap UUID");
+ spin_lock_irq(&device->resource->req_lock);
+ _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&device->resource->req_lock);
}
}
- drbd_md_sync(mdev);
+ drbd_md_sync(device);
out_dec:
- put_ldev(mdev);
+ put_ldev(device);
out:
- mutex_unlock(&mdev->state_mutex);
-
- reply->ret_code = retcode;
+ mutex_unlock(device->state_mutex);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out_nolock:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-struct cn_handler_struct {
- int (*function)(struct drbd_conf *,
- struct drbd_nl_cfg_req *,
- struct drbd_nl_cfg_reply *);
- int reply_body_size;
-};
-
-static struct cn_handler_struct cnd_table[] = {
- [ P_primary ] = { &drbd_nl_primary, 0 },
- [ P_secondary ] = { &drbd_nl_secondary, 0 },
- [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 },
- [ P_detach ] = { &drbd_nl_detach, 0 },
- [ P_net_conf ] = { &drbd_nl_net_conf, 0 },
- [ P_disconnect ] = { &drbd_nl_disconnect, 0 },
- [ P_resize ] = { &drbd_nl_resize, 0 },
- [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 },
- [ P_invalidate ] = { &drbd_nl_invalidate, 0 },
- [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 },
- [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 },
- [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 },
- [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 },
- [ P_resume_io ] = { &drbd_nl_resume_io, 0 },
- [ P_outdate ] = { &drbd_nl_outdate, 0 },
- [ P_get_config ] = { &drbd_nl_get_config,
- sizeof(struct syncer_conf_tag_len_struct) +
- sizeof(struct disk_conf_tag_len_struct) +
- sizeof(struct net_conf_tag_len_struct) },
- [ P_get_state ] = { &drbd_nl_get_state,
- sizeof(struct get_state_tag_len_struct) +
- sizeof(struct sync_progress_tag_len_struct) },
- [ P_get_uuids ] = { &drbd_nl_get_uuids,
- sizeof(struct get_uuids_tag_len_struct) },
- [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag,
- sizeof(struct get_timeout_flag_tag_len_struct)},
- [ P_start_ov ] = { &drbd_nl_start_ov, 0 },
- [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 },
-};
-
-static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
+static enum drbd_ret_code
+drbd_check_resource_name(struct drbd_config_context *adm_ctx)
{
- struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
- struct cn_handler_struct *cm;
- struct cn_msg *cn_reply;
- struct drbd_nl_cfg_reply *reply;
- struct drbd_conf *mdev;
- int retcode, rr;
- int reply_size = sizeof(struct cn_msg)
- + sizeof(struct drbd_nl_cfg_reply)
- + sizeof(short int);
-
- if (!try_module_get(THIS_MODULE)) {
- printk(KERN_ERR "drbd: try_module_get() failed!\n");
- return;
- }
+ const char *name = adm_ctx->resource_name;
+ if (!name || !name[0]) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing");
+ return ERR_MANDATORY_TAG;
+ }
+ /* if we want to use these in sysfs/configfs/debugfs some day,
+ * we must not allow slashes */
+ if (strchr(name, '/')) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name");
+ return ERR_INVALID_REQUEST;
+ }
+ return NO_ERROR;
+}
- if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) {
- retcode = ERR_PERM;
- goto fail;
- }
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct res_opts res_opts;
+ int err;
- mdev = ensure_mdev(nlp->drbd_minor,
- (nlp->flags & DRBD_NL_CREATE_DEVICE));
- if (!mdev) {
- retcode = ERR_MINOR_INVALID;
- goto fail;
- }
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- if (nlp->packet_type >= P_nl_after_last_packet) {
- retcode = ERR_PACKET_NR;
- goto fail;
+ set_res_opts_defaults(&res_opts);
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
}
- cm = cnd_table + nlp->packet_type;
+ retcode = drbd_check_resource_name(&adm_ctx);
+ if (retcode != NO_ERROR)
+ goto out;
- /* This may happen if packet number is 0: */
- if (cm->function == NULL) {
- retcode = ERR_PACKET_NR;
- goto fail;
+ if (adm_ctx.resource) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
+ retcode = ERR_INVALID_REQUEST;
+ drbd_msg_put_info(adm_ctx.reply_skb, "resource exists");
+ }
+ /* else: still NO_ERROR */
+ goto out;
}
- reply_size += cm->reply_body_size;
-
- /* allocation not in the IO path, cqueue thread context */
- cn_reply = kmalloc(reply_size, GFP_KERNEL);
- if (!cn_reply) {
+ /* not yet safe for genl_family.parallel_ops */
+ if (!conn_create(adm_ctx.resource_name, &res_opts))
retcode = ERR_NOMEM;
- goto fail;
- }
- reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
-
- reply->packet_type =
- cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
- reply->minor = nlp->drbd_minor;
- reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
- /* reply->tag_list; might be modified by cm->function. */
-
- rr = cm->function(mdev, nlp, reply);
-
- cn_reply->id = req->id;
- cn_reply->seq = req->seq;
- cn_reply->ack = req->ack + 1;
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
- cn_reply->flags = 0;
-
- rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
- if (rr && rr != -ESRCH)
- printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
-
- kfree(cn_reply);
- module_put(THIS_MODULE);
- return;
- fail:
- drbd_nl_send_reply(req, retcode);
- module_put(THIS_MODULE);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
}
-static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
-
-static unsigned short *
-__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
- unsigned short len, int nul_terminated)
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
{
- unsigned short l = tag_descriptions[tag_number(tag)].max_len;
- len = (len < l) ? len : l;
- put_unaligned(tag, tl++);
- put_unaligned(len, tl++);
- memcpy(tl, data, len);
- tl = (unsigned short*)((char*)tl + len);
- if (nul_terminated)
- *((char*)tl - 1) = 0;
- return tl;
-}
+ struct drbd_config_context adm_ctx;
+ struct drbd_genlmsghdr *dh = info->userhdr;
+ enum drbd_ret_code retcode;
-static unsigned short *
-tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
-{
- return __tl_add_blob(tl, tag, data, len, 0);
-}
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
-static unsigned short *
-tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
-{
- return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
-}
+ if (dh->minor > MINORMASK) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+ if (adm_ctx.volume > DRBD_VOLUME_MAX) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
-static unsigned short *
-tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
-{
- put_unaligned(tag, tl++);
- switch(tag_type(tag)) {
- case TT_INTEGER:
- put_unaligned(sizeof(int), tl++);
- put_unaligned(*(int *)val, (int *)tl);
- tl = (unsigned short*)((char*)tl+sizeof(int));
- break;
- case TT_INT64:
- put_unaligned(sizeof(u64), tl++);
- put_unaligned(*(u64 *)val, (u64 *)tl);
- tl = (unsigned short*)((char*)tl+sizeof(u64));
- break;
- default:
- /* someone did something stupid. */
- ;
+ /* drbd_adm_prepare made sure already
+ * that first_peer_device(device)->connection and device->vnr match the request. */
+ if (adm_ctx.device) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
+ retcode = ERR_MINOR_EXISTS;
+ /* else: still NO_ERROR */
+ goto out;
}
- return tl;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = drbd_create_device(&adm_ctx, dh->minor);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
}
-void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
+static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
{
- char buffer[sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct get_state_tag_len_struct)+
- sizeof(short int)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- unsigned short *tl = reply->tag_list;
-
- /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
-
- tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
-
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
-
- cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
- cn_reply->ack = 0; /* not used here. */
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char *)tl - (char *)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_get_state;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
-
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+ if (device->state.disk == D_DISKLESS &&
+ /* no need to be device->state.conn == C_STANDALONE &&
+ * we may want to delete a minor from a live replication group.
+ */
+ device->state.role == R_SECONDARY) {
+ _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
+ CS_VERBOSE + CS_WAIT_COMPLETE);
+ drbd_delete_device(device);
+ return NO_ERROR;
+ } else
+ return ERR_MINOR_CONFIGURED;
}
-void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
{
- char buffer[sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct call_helper_tag_len_struct)+
- sizeof(short int)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- unsigned short *tl = reply->tag_list;
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
- /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
-
- tl = tl_add_str(tl, T_helper, helper_name);
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
-
- cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
- cn_reply->ack = 0; /* not used here. */
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char *)tl - (char *)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_call_helper;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = adm_del_minor(adm_ctx.device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
}
-void drbd_bcast_ee(struct drbd_conf *mdev,
- const char *reason, const int dgs,
- const char* seen_hash, const char* calc_hash,
- const struct drbd_epoch_entry* e)
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
{
- struct cn_msg *cn_reply;
- struct drbd_nl_cfg_reply *reply;
- unsigned short *tl;
- struct page *page;
- unsigned len;
-
- if (!e)
- return;
- if (!reason || !reason[0])
- return;
+ struct drbd_config_context adm_ctx;
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+ struct drbd_device *device;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+ unsigned i;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ resource = adm_ctx.resource;
+ mutex_lock(&resource->adm_mutex);
+ /* demote */
+ for_each_connection(connection, resource) {
+ struct drbd_peer_device *peer_device;
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote");
+ goto out;
+ }
+ }
- /* apparently we have to memcpy twice, first to prepare the data for the
- * struct cn_msg, then within cn_netlink_send from the cn_msg to the
- * netlink skb. */
- /* receiver thread context, which is not in the writeout path (of this node),
- * but may be in the writeout path of the _other_ node.
- * GFP_NOIO to avoid potential "distributed deadlock". */
- cn_reply = kmalloc(
- sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct dump_ee_tag_len_struct)+
- sizeof(short int),
- GFP_NOIO);
-
- if (!cn_reply) {
- dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
- (unsigned long long)e->sector, e->size);
- return;
+ retcode = conn_try_disconnect(connection, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect");
+ goto out;
+ }
}
- reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
- tl = reply->tag_list;
-
- tl = tl_add_str(tl, T_dump_ee_reason, reason);
- tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
- tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
- tl = tl_add_int(tl, T_ee_sector, &e->sector);
- tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
-
- put_unaligned(T_ee_data, tl++);
- put_unaligned(e->size, tl++);
-
- len = e->size;
- page = e->pages;
- page_chain_for_each(page) {
- void *d = kmap_atomic(page, KM_USER0);
- unsigned l = min_t(unsigned, len, PAGE_SIZE);
- memcpy(tl, d, l);
- kunmap_atomic(d, KM_USER0);
- tl = (unsigned short*)((char*)tl + l);
- len -= l;
+ /* detach */
+ idr_for_each_entry(&resource->devices, device, i) {
+ retcode = adm_detach(device, 0);
+ if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach");
+ goto out;
+ }
}
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
-
- cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
- cn_reply->ack = 0; // not used here.
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char*)tl - (char*)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_dump_ee;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
-
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
- kfree(cn_reply);
-}
-
-void drbd_bcast_sync_progress(struct drbd_conf *mdev)
-{
- char buffer[sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct sync_progress_tag_len_struct)+
- sizeof(short int)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- unsigned short *tl = reply->tag_list;
- unsigned long rs_left;
- unsigned int res;
- /* no local ref, no bitmap, no syncer progress, no broadcast. */
- if (!get_ldev(mdev))
- return;
- drbd_get_syncer_progress(mdev, &rs_left, &res);
- put_ldev(mdev);
-
- tl = tl_add_int(tl, T_sync_progress, &res);
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ /* If we reach this, all volumes (of this connection) are Secondary,
+ * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
+ * actually stopped, state handling only does drbd_thread_stop_nowait(). */
+ for_each_connection(connection, resource)
+ drbd_thread_stop(&connection->worker);
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
+ /* Now, nothing can fail anymore */
- cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
- cn_reply->ack = 0; /* not used here. */
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char *)tl - (char *)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_sync_progress;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
+ /* delete volumes */
+ idr_for_each_entry(&resource->devices, device, i) {
+ retcode = adm_del_minor(device);
+ if (retcode != NO_ERROR) {
+ /* "can not happen" */
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume");
+ goto out;
+ }
+ }
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+ list_del_rcu(&resource->resources);
+ synchronize_rcu();
+ drbd_free_resource(resource);
+ retcode = NO_ERROR;
+out:
+ mutex_unlock(&resource->adm_mutex);
+finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
}
-int __init drbd_nl_init(void)
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
{
- static struct cb_id cn_id_drbd;
- int err, try=10;
-
- cn_id_drbd.val = CN_VAL_DRBD;
- do {
- cn_id_drbd.idx = cn_idx;
- err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
- if (!err)
- break;
- cn_idx = (cn_idx + CN_IDX_STEP);
- } while (try--);
-
- if (err) {
- printk(KERN_ERR "drbd: cn_drbd failed to register\n");
- return err;
+ struct drbd_config_context adm_ctx;
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ resource = adm_ctx.resource;
+ mutex_lock(&resource->adm_mutex);
+ for_each_connection(connection, resource) {
+ if (connection->cstate > C_STANDALONE) {
+ retcode = ERR_NET_CONFIGURED;
+ goto out;
+ }
+ }
+ if (!idr_is_empty(&resource->devices)) {
+ retcode = ERR_RES_IN_USE;
+ goto out;
}
+ list_del_rcu(&resource->resources);
+ for_each_connection(connection, resource)
+ drbd_thread_stop(&connection->worker);
+ synchronize_rcu();
+ drbd_free_resource(resource);
+ retcode = NO_ERROR;
+out:
+ mutex_unlock(&resource->adm_mutex);
+finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
return 0;
}
-void drbd_nl_cleanup(void)
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
{
- static struct cb_id cn_id_drbd;
-
- cn_id_drbd.idx = cn_idx;
- cn_id_drbd.val = CN_VAL_DRBD;
-
- cn_del_callback(&cn_id_drbd);
-}
+ static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+ struct sk_buff *msg;
+ struct drbd_genlmsghdr *d_out;
+ unsigned seq;
+ int err = -ENOMEM;
+
+ if (sib->sib_reason == SIB_SYNC_PROGRESS) {
+ if (time_after(jiffies, device->rs_last_bcast + HZ))
+ device->rs_last_bcast = jiffies;
+ else
+ return;
+ }
-void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
-{
- char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- int rr;
+ seq = atomic_inc_return(&drbd_genl_seq);
+ msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+ if (!msg)
+ goto failed;
- cn_reply->id = req->id;
+ err = -EMSGSIZE;
+ d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
+ if (!d_out) /* cannot happen, but anyways. */
+ goto nla_put_failure;
+ d_out->minor = device_to_minor(device);
+ d_out->ret_code = NO_ERROR;
- cn_reply->seq = req->seq;
- cn_reply->ack = req->ack + 1;
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
- cn_reply->flags = 0;
+ if (nla_put_status_info(msg, device, sib))
+ goto nla_put_failure;
+ genlmsg_end(msg, d_out);
+ err = drbd_genl_multicast_events(msg, 0);
+ /* msg has been consumed or freed in netlink_broadcast() */
+ if (err && err != -ESRCH)
+ goto failed;
- reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
- reply->ret_code = ret_code;
+ return;
- rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
- if (rr && rr != -ESRCH)
- printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+nla_put_failure:
+ nlmsg_free(msg);
+failed:
+ drbd_err(device, "Error %d while broadcasting event. "
+ "Event seq:%u sib_reason:%u\n",
+ err, seq, sib->sib_reason);
}
-
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
new file mode 100644
index 00000000000..b2d4791498a
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+
+static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
+{
+ struct nlattr *head = nla_data(nla);
+ int len = nla_len(nla);
+ int rem;
+
+ /*
+ * validate_nla (called from nla_parse_nested) ignores attributes
+ * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
+ * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
+ * flag set also, check and remove that flag before calling
+ * nla_parse_nested.
+ */
+
+ nla_for_each_attr(nla, head, len, rem) {
+ if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
+ nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
+ if (nla_type(nla) > maxtype)
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy)
+{
+ int err;
+
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (!err)
+ err = nla_parse_nested(tb, maxtype, nla, policy);
+
+ return err;
+}
+
+struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
+{
+ int err;
+ /*
+ * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
+ * we don't know about that attribute, reject all the nested
+ * attributes.
+ */
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (err)
+ return ERR_PTR(err);
+ return nla_find_nested(nla, attrtype);
+}
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h
new file mode 100644
index 00000000000..679c2d5b453
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.h
@@ -0,0 +1,8 @@
+#ifndef __DRBD_NLA_H
+#define __DRBD_NLA_H
+
+extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy);
+extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
+
+#endif /* __DRBD_NLA_H */
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 7e6ac307e2d..89736bdbbc7 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -34,6 +34,7 @@
#include "drbd_int.h"
static int drbd_proc_open(struct inode *inode, struct file *file);
+static int drbd_proc_release(struct inode *inode, struct file *file);
struct proc_dir_entry *drbd_proc;
@@ -42,9 +43,22 @@ const struct file_operations drbd_proc_fops = {
.open = drbd_proc_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = drbd_proc_release,
};
+static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
+{
+ /* v is in kB/sec. We don't expect TiByte/sec yet. */
+ if (unlikely(v >= 1000000)) {
+ /* cool: > GiByte/s */
+ seq_printf(seq, "%ld,", v / 1000000);
+ v %= 1000000;
+ seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
+ } else if (likely(v >= 1000))
+ seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
+ else
+ seq_printf(seq, "%ld", v);
+}
/*lge
* progress bars shamelessly adapted from driver/md/md.c
@@ -52,14 +66,14 @@ const struct file_operations drbd_proc_fops = {
* [=====>..............] 33.5% (23456/123456)
* finish: 2:20:20 speed: 6,345 (6,456) K/sec
*/
-static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
+static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq)
{
unsigned long db, dt, dbdt, rt, rs_left;
unsigned int res;
int i, x, y;
int stalled = 0;
- drbd_get_syncer_progress(mdev, &rs_left, &res);
+ drbd_get_syncer_progress(device, &rs_left, &res);
x = res/50;
y = 20-x;
@@ -71,16 +85,21 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
seq_printf(seq, ".");
seq_printf(seq, "] ");
- seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
- /* if more than 1 GB display in MB */
- if (mdev->rs_total > 0x100000L)
- seq_printf(seq, "(%lu/%lu)M\n\t",
+ if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+ seq_printf(seq, "verified:");
+ else
+ seq_printf(seq, "sync'ed:");
+ seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
+
+ /* if more than a few GB, display in MB */
+ if (device->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+ seq_printf(seq, "(%lu/%lu)M",
(unsigned long) Bit2KB(rs_left >> 10),
- (unsigned long) Bit2KB(mdev->rs_total >> 10));
+ (unsigned long) Bit2KB(device->rs_total >> 10));
else
seq_printf(seq, "(%lu/%lu)K\n\t",
(unsigned long) Bit2KB(rs_left),
- (unsigned long) Bit2KB(mdev->rs_total));
+ (unsigned long) Bit2KB(device->rs_total));
/* see drivers/md/md.c
* We do not want to overflow, so the order of operands and
@@ -94,48 +113,79 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
/* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
* at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
* least DRBD_SYNC_MARK_STEP time before it will be modified. */
- i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
- dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
- if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
+ /* ------------------------ ~18s average ------------------------ */
+ i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS;
+ dt = (jiffies - device->rs_mark_time[i]) / HZ;
+ if (dt > 180)
stalled = 1;
if (!dt)
dt++;
- db = mdev->rs_mark_left[i] - rs_left;
+ db = device->rs_mark_left[i] - rs_left;
rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
seq_printf(seq, "finish: %lu:%02lu:%02lu",
rt / 3600, (rt % 3600) / 60, rt % 60);
- /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
dbdt = Bit2KB(db/dt);
- if (dbdt > 1000)
- seq_printf(seq, " speed: %ld,%03ld",
- dbdt/1000, dbdt % 1000);
- else
- seq_printf(seq, " speed: %ld", dbdt);
+ seq_printf(seq, " speed: ");
+ seq_printf_with_thousands_grouping(seq, dbdt);
+ seq_printf(seq, " (");
+ /* ------------------------- ~3s average ------------------------ */
+ if (proc_details >= 1) {
+ /* this is what drbd_rs_should_slow_down() uses */
+ i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+ dt = (jiffies - device->rs_mark_time[i]) / HZ;
+ if (!dt)
+ dt++;
+ db = device->rs_mark_left[i] - rs_left;
+ dbdt = Bit2KB(db/dt);
+ seq_printf_with_thousands_grouping(seq, dbdt);
+ seq_printf(seq, " -- ");
+ }
+ /* --------------------- long term average ---------------------- */
/* mean speed since syncer started
* we do account for PausedSync periods */
- dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+ dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
if (dt == 0)
dt = 1;
- db = mdev->rs_total - rs_left;
+ db = device->rs_total - rs_left;
dbdt = Bit2KB(db/dt);
- if (dbdt > 1000)
- seq_printf(seq, " (%ld,%03ld)",
- dbdt/1000, dbdt % 1000);
- else
- seq_printf(seq, " (%ld)", dbdt);
-
- if (mdev->state.conn == C_SYNC_TARGET) {
- if (mdev->c_sync_rate > 1000)
- seq_printf(seq, " want: %d,%03d",
- mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
- else
- seq_printf(seq, " want: %d", mdev->c_sync_rate);
+ seq_printf_with_thousands_grouping(seq, dbdt);
+ seq_printf(seq, ")");
+
+ if (device->state.conn == C_SYNC_TARGET ||
+ device->state.conn == C_VERIFY_S) {
+ seq_printf(seq, " want: ");
+ seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
}
seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
+
+ if (proc_details >= 1) {
+ /* 64 bit:
+ * we convert to sectors in the display below. */
+ unsigned long bm_bits = drbd_bm_bits(device);
+ unsigned long bit_pos;
+ unsigned long long stop_sector = 0;
+ if (device->state.conn == C_VERIFY_S ||
+ device->state.conn == C_VERIFY_T) {
+ bit_pos = bm_bits - device->ov_left;
+ if (verify_can_do_stop_sector(device))
+ stop_sector = device->ov_stop_sector;
+ } else
+ bit_pos = device->bm_resync_fo;
+ /* Total sectors may be slightly off for oddly
+ * sized devices. So what. */
+ seq_printf(seq,
+ "\t%3d%% sector pos: %llu/%llu",
+ (int)(bit_pos / (bm_bits/100+1)),
+ (unsigned long long)bit_pos * BM_SECT_PER_BIT,
+ (unsigned long long)bm_bits * BM_SECT_PER_BIT);
+ if (stop_sector != 0 && stop_sector != ULLONG_MAX)
+ seq_printf(seq, " stop sector: %llu", stop_sector);
+ seq_printf(seq, "\n");
+ }
}
static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
@@ -150,9 +200,11 @@ static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
static int drbd_seq_show(struct seq_file *seq, void *v)
{
- int i, hole = 0;
+ int i, prev_i = -1;
const char *sn;
- struct drbd_conf *mdev;
+ struct drbd_device *device;
+ struct net_conf *nc;
+ char wp;
static char write_ordering_chars[] = {
[WO_none] = 'n',
@@ -183,89 +235,99 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
oos .. known out-of-sync kB
*/
- for (i = 0; i < minor_count; i++) {
- mdev = minor_to_mdev(i);
- if (!mdev) {
- hole = 1;
- continue;
- }
- if (hole) {
- hole = 0;
+ rcu_read_lock();
+ idr_for_each_entry(&drbd_devices, device, i) {
+ if (prev_i != i - 1)
seq_printf(seq, "\n");
- }
+ prev_i = i;
- sn = drbd_conn_str(mdev->state.conn);
+ sn = drbd_conn_str(device->state.conn);
- if (mdev->state.conn == C_STANDALONE &&
- mdev->state.disk == D_DISKLESS &&
- mdev->state.role == R_SECONDARY) {
+ if (device->state.conn == C_STANDALONE &&
+ device->state.disk == D_DISKLESS &&
+ device->state.role == R_SECONDARY) {
seq_printf(seq, "%2d: cs:Unconfigured\n", i);
} else {
+ /* reset device->congestion_reason */
+ bdi_rw_congested(&device->rq_queue->backing_dev_info);
+
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
"lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
i, sn,
- drbd_role_str(mdev->state.role),
- drbd_role_str(mdev->state.peer),
- drbd_disk_str(mdev->state.disk),
- drbd_disk_str(mdev->state.pdsk),
- (mdev->net_conf == NULL ? ' ' :
- (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
- is_susp(mdev->state) ? 's' : 'r',
- mdev->state.aftr_isp ? 'a' : '-',
- mdev->state.peer_isp ? 'p' : '-',
- mdev->state.user_isp ? 'u' : '-',
- mdev->congestion_reason ?: '-',
- test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
- mdev->send_cnt/2,
- mdev->recv_cnt/2,
- mdev->writ_cnt/2,
- mdev->read_cnt/2,
- mdev->al_writ_cnt,
- mdev->bm_writ_cnt,
- atomic_read(&mdev->local_cnt),
- atomic_read(&mdev->ap_pending_cnt) +
- atomic_read(&mdev->rs_pending_cnt),
- atomic_read(&mdev->unacked_cnt),
- atomic_read(&mdev->ap_bio_cnt),
- mdev->epochs,
- write_ordering_chars[mdev->write_ordering]
+ drbd_role_str(device->state.role),
+ drbd_role_str(device->state.peer),
+ drbd_disk_str(device->state.disk),
+ drbd_disk_str(device->state.pdsk),
+ wp,
+ drbd_suspended(device) ? 's' : 'r',
+ device->state.aftr_isp ? 'a' : '-',
+ device->state.peer_isp ? 'p' : '-',
+ device->state.user_isp ? 'u' : '-',
+ device->congestion_reason ?: '-',
+ test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
+ device->send_cnt/2,
+ device->recv_cnt/2,
+ device->writ_cnt/2,
+ device->read_cnt/2,
+ device->al_writ_cnt,
+ device->bm_writ_cnt,
+ atomic_read(&device->local_cnt),
+ atomic_read(&device->ap_pending_cnt) +
+ atomic_read(&device->rs_pending_cnt),
+ atomic_read(&device->unacked_cnt),
+ atomic_read(&device->ap_bio_cnt),
+ first_peer_device(device)->connection->epochs,
+ write_ordering_chars[first_peer_device(device)->connection->write_ordering]
);
- seq_printf(seq, " oos:%lu\n",
- Bit2KB(drbd_bm_total_weight(mdev)));
+ seq_printf(seq, " oos:%llu\n",
+ Bit2KB((unsigned long long)
+ drbd_bm_total_weight(device)));
}
- if (mdev->state.conn == C_SYNC_SOURCE ||
- mdev->state.conn == C_SYNC_TARGET)
- drbd_syncer_progress(mdev, seq);
-
- if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
- seq_printf(seq, "\t%3d%% %lu/%lu\n",
- (int)((mdev->rs_total-mdev->ov_left) /
- (mdev->rs_total/100+1)),
- mdev->rs_total - mdev->ov_left,
- mdev->rs_total);
-
- if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
- lc_seq_printf_stats(seq, mdev->resync);
- lc_seq_printf_stats(seq, mdev->act_log);
- put_ldev(mdev);
+ if (device->state.conn == C_SYNC_SOURCE ||
+ device->state.conn == C_SYNC_TARGET ||
+ device->state.conn == C_VERIFY_S ||
+ device->state.conn == C_VERIFY_T)
+ drbd_syncer_progress(device, seq);
+
+ if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
+ lc_seq_printf_stats(seq, device->resync);
+ lc_seq_printf_stats(seq, device->act_log);
+ put_ldev(device);
}
if (proc_details >= 2) {
- if (mdev->resync) {
- lc_seq_dump_details(seq, mdev->resync, "rs_left",
+ if (device->resync) {
+ lc_seq_dump_details(seq, device->resync, "rs_left",
resync_dump_detail);
}
}
}
+ rcu_read_unlock();
return 0;
}
static int drbd_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, drbd_seq_show, PDE(inode)->data);
+ int err;
+
+ if (try_module_get(THIS_MODULE)) {
+ err = single_open(file, drbd_seq_show, PDE_DATA(inode));
+ if (err)
+ module_put(THIS_MODULE);
+ return err;
+ }
+ return -ENODEV;
+}
+
+static int drbd_proc_release(struct inode *inode, struct file *file)
+{
+ module_put(THIS_MODULE);
+ return single_release(inode, file);
}
/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
new file mode 100644
index 00000000000..2da9104a385
--- /dev/null
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -0,0 +1,307 @@
+#ifndef __DRBD_PROTOCOL_H
+#define __DRBD_PROTOCOL_H
+
+enum drbd_packet {
+ /* receiver (data socket) */
+ P_DATA = 0x00,
+ P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
+ P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */
+ P_BARRIER = 0x03,
+ P_BITMAP = 0x04,
+ P_BECOME_SYNC_TARGET = 0x05,
+ P_BECOME_SYNC_SOURCE = 0x06,
+ P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */
+ P_DATA_REQUEST = 0x08, /* Used to ask for a data block */
+ P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */
+ P_SYNC_PARAM = 0x0a,
+ P_PROTOCOL = 0x0b,
+ P_UUIDS = 0x0c,
+ P_SIZES = 0x0d,
+ P_STATE = 0x0e,
+ P_SYNC_UUID = 0x0f,
+ P_AUTH_CHALLENGE = 0x10,
+ P_AUTH_RESPONSE = 0x11,
+ P_STATE_CHG_REQ = 0x12,
+
+ /* asender (meta socket */
+ P_PING = 0x13,
+ P_PING_ACK = 0x14,
+ P_RECV_ACK = 0x15, /* Used in protocol B */
+ P_WRITE_ACK = 0x16, /* Used in protocol C */
+ P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
+ P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */
+ P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
+ P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
+ P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
+ P_BARRIER_ACK = 0x1c,
+ P_STATE_CHG_REPLY = 0x1d,
+
+ /* "new" commands, no longer fitting into the ordering scheme above */
+
+ P_OV_REQUEST = 0x1e, /* data socket */
+ P_OV_REPLY = 0x1f,
+ P_OV_RESULT = 0x20, /* meta socket */
+ P_CSUM_RS_REQUEST = 0x21, /* data socket */
+ P_RS_IS_IN_SYNC = 0x22, /* meta socket */
+ P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
+ P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
+ /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
+ /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
+ P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
+ P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */
+ P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
+ P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */
+ P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
+ P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */
+ P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */
+ /* 0x2e to 0x30 reserved, used in drbd 9 */
+
+ /* REQ_DISCARD. We used "discard" in different contexts before,
+ * which is why I chose TRIM here, to disambiguate. */
+ P_TRIM = 0x31,
+
+ P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
+ P_MAX_OPT_CMD = 0x101,
+
+ /* special command ids for handshake */
+
+ P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */
+ P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */
+
+ P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */
+};
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+/* This is the layout for a packet on the wire.
+ * The byteorder is the network byte order.
+ * (except block_id and barrier fields.
+ * these are pointers to local structs
+ * and have no relevance for the partner,
+ * which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
+ */
+struct p_header80 {
+ u32 magic;
+ u16 command;
+ u16 length; /* bytes of data after this header */
+} __packed;
+
+/* Header for big packets, Used for data packets exceeding 64kB */
+struct p_header95 {
+ u16 magic; /* use DRBD_MAGIC_BIG here */
+ u16 command;
+ u32 length;
+} __packed;
+
+struct p_header100 {
+ u32 magic;
+ u16 volume;
+ u16 command;
+ u32 length;
+ u32 pad;
+} __packed;
+
+/* these defines must not be changed without changing the protocol version */
+#define DP_HARDBARRIER 1 /* depricated */
+#define DP_RW_SYNC 2 /* equals REQ_SYNC */
+#define DP_MAY_SET_IN_SYNC 4
+#define DP_UNPLUG 8 /* not used anymore */
+#define DP_FUA 16 /* equals REQ_FUA */
+#define DP_FLUSH 32 /* equals REQ_FLUSH */
+#define DP_DISCARD 64 /* equals REQ_DISCARD */
+#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
+#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
+
+struct p_data {
+ u64 sector; /* 64 bits sector number */
+ u64 block_id; /* to identify the request in protocol B&C */
+ u32 seq_num;
+ u32 dp_flags;
+} __packed;
+
+struct p_trim {
+ struct p_data p_data;
+ u32 size; /* == bio->bi_size */
+} __packed;
+
+/*
+ * commands which share a struct:
+ * p_block_ack:
+ * P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
+ * P_SUPERSEDED (proto C, two-primaries conflict detection)
+ * p_block_req:
+ * P_DATA_REQUEST, P_RS_DATA_REQUEST
+ */
+struct p_block_ack {
+ u64 sector;
+ u64 block_id;
+ u32 blksize;
+ u32 seq_num;
+} __packed;
+
+struct p_block_req {
+ u64 sector;
+ u64 block_id;
+ u32 blksize;
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+/*
+ * commands with their own struct for additional fields:
+ * P_CONNECTION_FEATURES
+ * P_BARRIER
+ * P_BARRIER_ACK
+ * P_SYNC_PARAM
+ * ReportParams
+ */
+
+#define FF_TRIM 1
+
+struct p_connection_features {
+ u32 protocol_min;
+ u32 feature_flags;
+ u32 protocol_max;
+
+ /* should be more than enough for future enhancements
+ * for now, feature_flags and the reserved array shall be zero.
+ */
+
+ u32 _pad;
+ u64 reserved[7];
+} __packed;
+
+struct p_barrier {
+ u32 barrier; /* barrier number _handle_ only */
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+struct p_barrier_ack {
+ u32 barrier;
+ u32 set_size;
+} __packed;
+
+struct p_rs_param {
+ u32 resync_rate;
+
+ /* Since protocol version 88 and higher. */
+ char verify_alg[0];
+} __packed;
+
+struct p_rs_param_89 {
+ u32 resync_rate;
+ /* protocol version 89: */
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+} __packed;
+
+struct p_rs_param_95 {
+ u32 resync_rate;
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+ u32 c_plan_ahead;
+ u32 c_delay_target;
+ u32 c_fill_target;
+ u32 c_max_rate;
+} __packed;
+
+enum drbd_conn_flags {
+ CF_DISCARD_MY_DATA = 1,
+ CF_DRY_RUN = 2,
+};
+
+struct p_protocol {
+ u32 protocol;
+ u32 after_sb_0p;
+ u32 after_sb_1p;
+ u32 after_sb_2p;
+ u32 conn_flags;
+ u32 two_primaries;
+
+ /* Since protocol version 87 and higher. */
+ char integrity_alg[0];
+
+} __packed;
+
+struct p_uuids {
+ u64 uuid[UI_EXTENDED_SIZE];
+} __packed;
+
+struct p_rs_uuid {
+ u64 uuid;
+} __packed;
+
+struct p_sizes {
+ u64 d_size; /* size of disk */
+ u64 u_size; /* user requested size */
+ u64 c_size; /* current exported size */
+ u32 max_bio_size; /* Maximal size of a BIO */
+ u16 queue_order_type; /* not yet implemented in DRBD*/
+ u16 dds_flags; /* use enum dds_flags here. */
+} __packed;
+
+struct p_state {
+ u32 state;
+} __packed;
+
+struct p_req_state {
+ u32 mask;
+ u32 val;
+} __packed;
+
+struct p_req_state_reply {
+ u32 retcode;
+} __packed;
+
+struct p_drbd06_param {
+ u64 size;
+ u32 state;
+ u32 blksize;
+ u32 protocol;
+ u32 version;
+ u32 gen_cnt[5];
+ u32 bit_map_gen[5];
+} __packed;
+
+struct p_block_desc {
+ u64 sector;
+ u32 blksize;
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+/* Valid values for the encoding field.
+ * Bump proto version when changing this. */
+enum drbd_bitmap_code {
+ /* RLE_VLI_Bytes = 0,
+ * and other bit variants had been defined during
+ * algorithm evaluation. */
+ RLE_VLI_Bits = 2,
+};
+
+struct p_compressed_bm {
+ /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
+ * (encoding & 0x80): polarity (set/unset) of first runlength
+ * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
+ * used to pad up to head.length bytes
+ */
+ u8 encoding;
+
+ u8 code[0];
+} __packed;
+
+struct p_delay_probe93 {
+ u32 seq_num; /* sequence number to match the two probe packets */
+ u32 offset; /* usecs the probe got sent after the reference time point */
+} __packed;
+
+/*
+ * Bitmap packets need to fit within a single page on the sender and receiver,
+ * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
+ */
+#define DRBD_SOCKET_BUFFER_SIZE 4096
+
+#endif /* __DRBD_PROTOCOL_H */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 24487d4fb20..5b17ec88ea0 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -44,21 +44,31 @@
#include <linux/string.h>
#include <linux/scatterlist.h>
#include "drbd_int.h"
+#include "drbd_protocol.h"
#include "drbd_req.h"
-
#include "drbd_vli.h"
+#define PRO_FEATURES (FF_TRIM)
+
+struct packet_info {
+ enum drbd_packet cmd;
+ unsigned int size;
+ unsigned int vnr;
+ void *data;
+};
+
enum finish_epoch {
FE_STILL_LIVE,
FE_DESTROYED,
FE_RECYCLED,
};
-static int drbd_do_handshake(struct drbd_conf *mdev);
-static int drbd_do_auth(struct drbd_conf *mdev);
-
-static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
-static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
+static int drbd_do_features(struct drbd_connection *connection);
+static int drbd_do_auth(struct drbd_connection *connection);
+static int drbd_disconnected(struct drbd_peer_device *);
+static void conn_wait_active_ee_empty(struct drbd_connection *connection);
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_work *, int);
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
@@ -142,11 +152,12 @@ static void page_chain_add(struct page **head,
*head = chain_first;
}
-static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
+static struct page *__drbd_alloc_pages(struct drbd_device *device,
+ unsigned int number)
{
struct page *page = NULL;
struct page *tmp = NULL;
- int i = 0;
+ unsigned int i = 0;
/* Yes, testing drbd_pp_vacant outside the lock is racy.
* So what. It saves a spin_lock. */
@@ -175,7 +186,7 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int
return page;
/* Not enough pages immediately available this time.
- * No need to jump around here, drbd_pp_alloc will retry this
+ * No need to jump around here, drbd_alloc_pages will retry this
* function "soon". */
if (page) {
tmp = page_chain_tail(page, NULL);
@@ -187,76 +198,80 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int
return NULL;
}
-/* kick lower level device, if we have more than (arbitrary number)
- * reference counts on it, which typically are locally submitted io
- * requests. don't use unacked_cnt, so we speed up proto A and B, too. */
-static void maybe_kick_lo(struct drbd_conf *mdev)
+static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
+ struct list_head *to_be_freed)
{
- if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
- drbd_kick_lo(mdev);
-}
-
-static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
-{
- struct drbd_epoch_entry *e;
- struct list_head *le, *tle;
+ struct drbd_peer_request *peer_req, *tmp;
/* The EEs are always appended to the end of the list. Since
they are sent in order over the wire, they have to finish
in order. As soon as we see the first not finished we can
stop to examine the list... */
- list_for_each_safe(le, tle, &mdev->net_ee) {
- e = list_entry(le, struct drbd_epoch_entry, w.list);
- if (drbd_ee_has_active_page(e))
+ list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
+ if (drbd_peer_req_has_active_page(peer_req))
break;
- list_move(le, to_be_freed);
+ list_move(&peer_req->w.list, to_be_freed);
}
}
-static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
+static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
{
LIST_HEAD(reclaimed);
- struct drbd_epoch_entry *e, *t;
+ struct drbd_peer_request *peer_req, *t;
- maybe_kick_lo(mdev);
- spin_lock_irq(&mdev->req_lock);
- reclaim_net_ee(mdev, &reclaimed);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&device->resource->req_lock);
+ reclaim_finished_net_peer_reqs(device, &reclaimed);
+ spin_unlock_irq(&device->resource->req_lock);
- list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_net_ee(mdev, e);
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(device, peer_req);
}
/**
- * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
- * @mdev: DRBD device.
+ * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
+ * @device: DRBD device.
* @number: number of pages requested
* @retry: whether to retry, if not enough pages are available right now
*
* Tries to allocate number pages, first from our own page pool, then from
- * the kernel, unless this allocation would exceed the max_buffers setting.
+ * the kernel.
* Possibly retry until DRBD frees sufficient pages somewhere else.
*
+ * If this allocation would exceed the max_buffers setting, we throttle
+ * allocation (schedule_timeout) to give the system some room to breathe.
+ *
+ * We do not use max-buffers as hard limit, because it could lead to
+ * congestion and further to a distributed deadlock during online-verify or
+ * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * resync-rate settings are mis-configured.
+ *
* Returns a page chain linked via page->private.
*/
-static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
+struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
+ bool retry)
{
+ struct drbd_device *device = peer_device->device;
struct page *page = NULL;
+ struct net_conf *nc;
DEFINE_WAIT(wait);
+ unsigned int mxb;
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000;
+ rcu_read_unlock();
- /* Yes, we may run up to @number over max_buffers. If we
- * follow it strictly, the admin will get it wrong anyways. */
- if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
- page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+ if (atomic_read(&device->pp_in_use) < mxb)
+ page = __drbd_alloc_pages(device, number);
while (page == NULL) {
prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
- drbd_kick_lo_and_reclaim_net(mdev);
+ drbd_kick_lo_and_reclaim_net(device);
- if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
- page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+ if (atomic_read(&device->pp_in_use) < mxb) {
+ page = __drbd_alloc_pages(device, number);
if (page)
break;
}
@@ -265,29 +280,33 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
break;
if (signal_pending(current)) {
- dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
+ drbd_warn(device, "drbd_alloc_pages interrupted!\n");
break;
}
- schedule();
+ if (schedule_timeout(HZ/10) == 0)
+ mxb = UINT_MAX;
}
finish_wait(&drbd_pp_wait, &wait);
if (page)
- atomic_add(number, &mdev->pp_in_use);
+ atomic_add(number, &device->pp_in_use);
return page;
}
-/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
- * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
+/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
+ * Is also used from inside an other spin_lock_irq(&resource->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
-static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
+static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
{
- atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
+ atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
int i;
- if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
+ if (page == NULL)
+ return;
+
+ if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
i = page_chain_free(page);
else {
struct page *tmp;
@@ -299,7 +318,7 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
}
i = atomic_sub_return(i, a);
if (i < 0)
- dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
+ drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
is_net ? "pp_in_use_by_net" : "pp_in_use", i);
wake_up(&drbd_pp_wait);
}
@@ -309,255 +328,221 @@ You need to hold the req_lock:
_drbd_wait_ee_list_empty()
You must not have the req_lock:
- drbd_free_ee()
- drbd_alloc_ee()
- drbd_init_ee()
- drbd_release_ee()
+ drbd_free_peer_req()
+ drbd_alloc_peer_req()
+ drbd_free_peer_reqs()
drbd_ee_fix_bhs()
- drbd_process_done_ee()
+ drbd_finish_peer_reqs()
drbd_clear_done_ee()
drbd_wait_ee_list_empty()
*/
-struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
- u64 id,
- sector_t sector,
- unsigned int data_size,
- gfp_t gfp_mask) __must_hold(local)
+struct drbd_peer_request *
+drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+ unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
{
- struct drbd_epoch_entry *e;
- struct page *page;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_request *peer_req;
+ struct page *page = NULL;
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
- if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
+ if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
return NULL;
- e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
- if (!e) {
+ peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+ if (!peer_req) {
if (!(gfp_mask & __GFP_NOWARN))
- dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
+ drbd_err(device, "%s: allocation failed\n", __func__);
return NULL;
}
- page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
- if (!page)
- goto fail;
+ if (has_payload && data_size) {
+ page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
+ if (!page)
+ goto fail;
+ }
+
+ drbd_clear_interval(&peer_req->i);
+ peer_req->i.size = data_size;
+ peer_req->i.sector = sector;
+ peer_req->i.local = false;
+ peer_req->i.waiting = false;
- INIT_HLIST_NODE(&e->colision);
- e->epoch = NULL;
- e->mdev = mdev;
- e->pages = page;
- atomic_set(&e->pending_bios, 0);
- e->size = data_size;
- e->flags = 0;
- e->sector = sector;
- e->block_id = id;
+ peer_req->epoch = NULL;
+ peer_req->peer_device = peer_device;
+ peer_req->pages = page;
+ atomic_set(&peer_req->pending_bios, 0);
+ peer_req->flags = 0;
+ /*
+ * The block_id is opaque to the receiver. It is not endianness
+ * converted, and sent back to the sender unchanged.
+ */
+ peer_req->block_id = id;
- return e;
+ return peer_req;
fail:
- mempool_free(e, drbd_ee_mempool);
+ mempool_free(peer_req, drbd_ee_mempool);
return NULL;
}
-void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
+void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
+ int is_net)
{
- if (e->flags & EE_HAS_DIGEST)
- kfree(e->digest);
- drbd_pp_free(mdev, e->pages, is_net);
- D_ASSERT(atomic_read(&e->pending_bios) == 0);
- D_ASSERT(hlist_unhashed(&e->colision));
- mempool_free(e, drbd_ee_mempool);
+ if (peer_req->flags & EE_HAS_DIGEST)
+ kfree(peer_req->digest);
+ drbd_free_pages(device, peer_req->pages, is_net);
+ D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
+ D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+ mempool_free(peer_req, drbd_ee_mempool);
}
-int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
+int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
{
LIST_HEAD(work_list);
- struct drbd_epoch_entry *e, *t;
+ struct drbd_peer_request *peer_req, *t;
int count = 0;
- int is_net = list == &mdev->net_ee;
+ int is_net = list == &device->net_ee;
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&device->resource->req_lock);
list_splice_init(list, &work_list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&device->resource->req_lock);
- list_for_each_entry_safe(e, t, &work_list, w.list) {
- drbd_free_some_ee(mdev, e, is_net);
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ __drbd_free_peer_req(device, peer_req, is_net);
count++;
}
return count;
}
-
/*
- * This function is called from _asender only_
- * but see also comments in _req_mod(,barrier_acked)
- * and receive_Barrier.
- *
- * Move entries from net_ee to done_ee, if ready.
- * Grab done_ee, call all callbacks, free the entries.
- * The callbacks typically send out ACKs.
+ * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
*/
-static int drbd_process_done_ee(struct drbd_conf *mdev)
+static int drbd_finish_peer_reqs(struct drbd_device *device)
{
LIST_HEAD(work_list);
LIST_HEAD(reclaimed);
- struct drbd_epoch_entry *e, *t;
- int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
+ struct drbd_peer_request *peer_req, *t;
+ int err = 0;
- spin_lock_irq(&mdev->req_lock);
- reclaim_net_ee(mdev, &reclaimed);
- list_splice_init(&mdev->done_ee, &work_list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&device->resource->req_lock);
+ reclaim_finished_net_peer_reqs(device, &reclaimed);
+ list_splice_init(&device->done_ee, &work_list);
+ spin_unlock_irq(&device->resource->req_lock);
- list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_net_ee(mdev, e);
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(device, peer_req);
/* possible callbacks here:
- * e_end_block, and e_end_resync_block, e_send_discard_ack.
+ * e_end_block, and e_end_resync_block, e_send_superseded.
* all ignore the last argument.
*/
- list_for_each_entry_safe(e, t, &work_list, w.list) {
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ int err2;
+
/* list_del not necessary, next/prev members not touched */
- ok = e->w.cb(mdev, &e->w, !ok) && ok;
- drbd_free_ee(mdev, e);
+ err2 = peer_req->w.cb(&peer_req->w, !!err);
+ if (!err)
+ err = err2;
+ drbd_free_peer_req(device, peer_req);
}
- wake_up(&mdev->ee_wait);
+ wake_up(&device->ee_wait);
- return ok;
+ return err;
}
-void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+static void _drbd_wait_ee_list_empty(struct drbd_device *device,
+ struct list_head *head)
{
DEFINE_WAIT(wait);
/* avoids spin_lock/unlock
* and calling prepare_to_wait in the fast path */
while (!list_empty(head)) {
- prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&mdev->req_lock);
- drbd_kick_lo(mdev);
- schedule();
- finish_wait(&mdev->ee_wait, &wait);
- spin_lock_irq(&mdev->req_lock);
+ prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&device->resource->req_lock);
+ io_schedule();
+ finish_wait(&device->ee_wait, &wait);
+ spin_lock_irq(&device->resource->req_lock);
}
}
-void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+static void drbd_wait_ee_list_empty(struct drbd_device *device,
+ struct list_head *head)
{
- spin_lock_irq(&mdev->req_lock);
- _drbd_wait_ee_list_empty(mdev, head);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&device->resource->req_lock);
+ _drbd_wait_ee_list_empty(device, head);
+ spin_unlock_irq(&device->resource->req_lock);
}
-/* see also kernel_accept; which is only present since 2.6.18.
- * also we want to log which part of it failed, exactly */
-static int drbd_accept(struct drbd_conf *mdev, const char **what,
- struct socket *sock, struct socket **newsock)
+static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
{
- struct sock *sk = sock->sk;
- int err = 0;
-
- *what = "listen";
- err = sock->ops->listen(sock, 5);
- if (err < 0)
- goto out;
-
- *what = "sock_create_lite";
- err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
- newsock);
- if (err < 0)
- goto out;
-
- *what = "accept";
- err = sock->ops->accept(sock, *newsock, 0);
- if (err < 0) {
- sock_release(*newsock);
- *newsock = NULL;
- goto out;
- }
- (*newsock)->ops = sock->ops;
-
-out:
- return err;
-}
-
-static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
- void *buf, size_t size, int flags)
-{
- mm_segment_t oldfs;
struct kvec iov = {
.iov_base = buf,
.iov_len = size,
};
struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&iov,
.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
};
- int rv;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
- set_fs(oldfs);
-
- return rv;
+ return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);
}
-static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
+static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
{
- mm_segment_t oldfs;
- struct kvec iov = {
- .iov_base = buf,
- .iov_len = size,
- };
- struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&iov,
- .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
- };
int rv;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
+ rv = drbd_recv_short(connection->data.socket, buf, size, 0);
- for (;;) {
- rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
- if (rv == size)
- break;
+ if (rv < 0) {
+ if (rv == -ECONNRESET)
+ drbd_info(connection, "sock was reset by peer\n");
+ else if (rv != -ERESTARTSYS)
+ drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+ } else if (rv == 0) {
+ if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+ long t;
+ rcu_read_lock();
+ t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+ rcu_read_unlock();
- /* Note:
- * ECONNRESET other side closed the connection
- * ERESTARTSYS (on sock) we got a signal
- */
+ t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
- if (rv < 0) {
- if (rv == -ECONNRESET)
- dev_info(DEV, "sock was reset by peer\n");
- else if (rv != -ERESTARTSYS)
- dev_err(DEV, "sock_recvmsg returned %d\n", rv);
- break;
- } else if (rv == 0) {
- dev_info(DEV, "sock was shut down by peer\n");
- break;
- } else {
- /* signal came in, or peer/link went down,
- * after we read a partial message
- */
- /* D_ASSERT(signal_pending(current)); */
- break;
+ if (t)
+ goto out;
}
- };
-
- set_fs(oldfs);
+ drbd_info(connection, "sock was shut down by peer\n");
+ }
if (rv != size)
- drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+ conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+out:
return rv;
}
+static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv(connection, buf, size);
+ if (err != size) {
+ if (err >= 0)
+ err = -EIO;
+ } else
+ err = 0;
+ return err;
+}
+
+static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv_all(connection, buf, size);
+ if (err && !signal_pending(current))
+ drbd_warn(connection, "short read (expected size %d)\n", (int)size);
+ return err;
+}
+
/* quoting tcp(7):
* On individual connections, the socket buffer size must be set prior to the
* listen(2) or connect(2) calls in order to have it take effect.
@@ -577,29 +562,50 @@ static void drbd_setbufsize(struct socket *sock, unsigned int snd,
}
}
-static struct socket *drbd_try_connect(struct drbd_conf *mdev)
+static struct socket *drbd_try_connect(struct drbd_connection *connection)
{
const char *what;
struct socket *sock;
struct sockaddr_in6 src_in6;
- int err;
+ struct sockaddr_in6 peer_in6;
+ struct net_conf *nc;
+ int err, peer_addr_len, my_addr_len;
+ int sndbuf_size, rcvbuf_size, connect_int;
int disconnect_on_error = 1;
- if (!get_net_conf(mdev))
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
return NULL;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
+ memcpy(&src_in6, &connection->my_addr, my_addr_len);
+
+ if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
+ src_in6.sin6_port = 0;
+ else
+ ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+ peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
+ memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
what = "sock_create_kern";
- err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
- SOCK_STREAM, IPPROTO_TCP, &sock);
+ err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (err < 0) {
sock = NULL;
goto out;
}
sock->sk->sk_rcvtimeo =
- sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
- drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
- mdev->net_conf->rcvbuf_size);
+ sock->sk->sk_sndtimeo = connect_int * HZ;
+ drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
/* explicitly bind to the configured IP as source IP
* for the outgoing connections.
@@ -608,17 +614,8 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev)
* Make sure to use 0 as port number, so linux selects
* a free one dynamically.
*/
- memcpy(&src_in6, mdev->net_conf->my_addr,
- min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
- if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
- src_in6.sin6_port = 0;
- else
- ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
-
what = "bind before connect";
- err = sock->ops->bind(sock,
- (struct sockaddr *) &src_in6,
- mdev->net_conf->my_addr_len);
+ err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
if (err < 0)
goto out;
@@ -626,9 +623,7 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev)
* stay C_WF_CONNECTION, don't go Disconnecting! */
disconnect_on_error = 0;
what = "connect";
- err = sock->ops->connect(sock,
- (struct sockaddr *)mdev->net_conf->peer_addr,
- mdev->net_conf->peer_addr_len, 0);
+ err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
out:
if (err < 0) {
@@ -646,108 +641,219 @@ out:
disconnect_on_error = 0;
break;
default:
- dev_err(DEV, "%s failed, err = %d\n", what, err);
+ drbd_err(connection, "%s failed, err = %d\n", what, err);
}
if (disconnect_on_error)
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
}
- put_net_conf(mdev);
+
return sock;
}
-static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
+struct accept_wait_data {
+ struct drbd_connection *connection;
+ struct socket *s_listen;
+ struct completion door_bell;
+ void (*original_sk_state_change)(struct sock *sk);
+
+};
+
+static void drbd_incoming_connection(struct sock *sk)
{
- int timeo, err;
- struct socket *s_estab = NULL, *s_listen;
+ struct accept_wait_data *ad = sk->sk_user_data;
+ void (*state_change)(struct sock *sk);
+
+ state_change = ad->original_sk_state_change;
+ if (sk->sk_state == TCP_ESTABLISHED)
+ complete(&ad->door_bell);
+ state_change(sk);
+}
+
+static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
+{
+ int err, sndbuf_size, rcvbuf_size, my_addr_len;
+ struct sockaddr_in6 my_addr;
+ struct socket *s_listen;
+ struct net_conf *nc;
const char *what;
- if (!get_net_conf(mdev))
- return NULL;
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -EIO;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
+ memcpy(&my_addr, &connection->my_addr, my_addr_len);
what = "sock_create_kern";
- err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
- SOCK_STREAM, IPPROTO_TCP, &s_listen);
+ err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &s_listen);
if (err) {
s_listen = NULL;
goto out;
}
- timeo = mdev->net_conf->try_connect_int * HZ;
- timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
-
- s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
- s_listen->sk->sk_rcvtimeo = timeo;
- s_listen->sk->sk_sndtimeo = timeo;
- drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
- mdev->net_conf->rcvbuf_size);
+ s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
what = "bind before listen";
- err = s_listen->ops->bind(s_listen,
- (struct sockaddr *) mdev->net_conf->my_addr,
- mdev->net_conf->my_addr_len);
+ err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
if (err < 0)
goto out;
- err = drbd_accept(mdev, &what, s_listen, &s_estab);
+ ad->s_listen = s_listen;
+ write_lock_bh(&s_listen->sk->sk_callback_lock);
+ ad->original_sk_state_change = s_listen->sk->sk_state_change;
+ s_listen->sk->sk_state_change = drbd_incoming_connection;
+ s_listen->sk->sk_user_data = ad;
+ write_unlock_bh(&s_listen->sk->sk_callback_lock);
+ what = "listen";
+ err = s_listen->ops->listen(s_listen, 5);
+ if (err < 0)
+ goto out;
+
+ return 0;
out:
if (s_listen)
sock_release(s_listen);
if (err < 0) {
if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
- dev_err(DEV, "%s failed, err = %d\n", what, err);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ drbd_err(connection, "%s failed, err = %d\n", what, err);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
}
}
- put_net_conf(mdev);
- return s_estab;
+ return -EIO;
}
-static int drbd_send_fp(struct drbd_conf *mdev,
- struct socket *sock, enum drbd_packets cmd)
+static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
{
- struct p_header80 *h = &mdev->data.sbuf.header.h80;
-
- return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
+ write_lock_bh(&sk->sk_callback_lock);
+ sk->sk_state_change = ad->original_sk_state_change;
+ sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
}
-static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
+static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
{
- struct p_header80 *h = &mdev->data.rbuf.header.h80;
- int rr;
+ int timeo, connect_int, err = 0;
+ struct socket *s_estab = NULL;
+ struct net_conf *nc;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ timeo = connect_int * HZ;
+ /* 28.5% random jitter */
+ timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;
- rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
+ err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
+ if (err <= 0)
+ return NULL;
+
+ err = kernel_accept(ad->s_listen, &s_estab, 0);
+ if (err < 0) {
+ if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+ drbd_err(connection, "accept failed, err = %d\n", err);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ }
- if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
- return be16_to_cpu(h->command);
+ if (s_estab)
+ unregister_state_change(s_estab->sk, ad);
- return 0xffff;
+ return s_estab;
+}
+
+static int decode_header(struct drbd_connection *, void *, struct packet_info *);
+
+static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
+ enum drbd_packet cmd)
+{
+ if (!conn_prepare_command(connection, sock))
+ return -EIO;
+ return conn_send_command(connection, sock, cmd, 0, NULL, 0);
+}
+
+static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
+{
+ unsigned int header_size = drbd_header_size(connection);
+ struct packet_info pi;
+ int err;
+
+ err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
+ if (err != header_size) {
+ if (err >= 0)
+ err = -EIO;
+ return err;
+ }
+ err = decode_header(connection, connection->data.rbuf, &pi);
+ if (err)
+ return err;
+ return pi.cmd;
}
/**
* drbd_socket_okay() - Free the socket if its connection is not okay
- * @mdev: DRBD device.
* @sock: pointer to the pointer to the socket.
*/
-static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
+static int drbd_socket_okay(struct socket **sock)
{
int rr;
char tb[4];
if (!*sock)
- return FALSE;
+ return false;
- rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+ rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
if (rr > 0 || rr == -EAGAIN) {
- return TRUE;
+ return true;
} else {
sock_release(*sock);
*sock = NULL;
- return FALSE;
+ return false;
}
}
+/* Gets called if a connection is established, or if a new minor gets created
+ in a connection */
+int drbd_connected(struct drbd_peer_device *peer_device)
+{
+ struct drbd_device *device = peer_device->device;
+ int err;
+
+ atomic_set(&device->packet_seq, 0);
+ device->peer_seq = 0;
+
+ device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
+ &peer_device->connection->cstate_mutex :
+ &device->own_state_mutex;
+
+ err = drbd_send_sync_param(peer_device);
+ if (!err)
+ err = drbd_send_sizes(peer_device, 0, 0);
+ if (!err)
+ err = drbd_send_uuids(peer_device);
+ if (!err)
+ err = drbd_send_current_state(peer_device);
+ clear_bit(USE_DEGR_WFC_T, &device->flags);
+ clear_bit(RESIZE_PENDING, &device->flags);
+ atomic_set(&device->ap_in_flight, 0);
+ mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
+ return err;
+}
/*
* return values:
@@ -757,239 +863,331 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
* no point in trying again, please go standalone.
* -2 We do not have a network config...
*/
-static int drbd_connect(struct drbd_conf *mdev)
+static int conn_connect(struct drbd_connection *connection)
{
- struct socket *s, *sock, *msock;
- int try, h, ok;
-
- D_ASSERT(!mdev->data.socket);
+ struct drbd_socket sock, msock;
+ struct drbd_peer_device *peer_device;
+ struct net_conf *nc;
+ int vnr, timeout, h, ok;
+ bool discard_my_data;
+ enum drbd_state_rv rv;
+ struct accept_wait_data ad = {
+ .connection = connection,
+ .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
+ };
- if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
+ clear_bit(DISCONNECT_SENT, &connection->flags);
+ if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
return -2;
- clear_bit(DISCARD_CONCURRENT, &mdev->flags);
+ mutex_init(&sock.mutex);
+ sock.sbuf = connection->data.sbuf;
+ sock.rbuf = connection->data.rbuf;
+ sock.socket = NULL;
+ mutex_init(&msock.mutex);
+ msock.sbuf = connection->meta.sbuf;
+ msock.rbuf = connection->meta.rbuf;
+ msock.socket = NULL;
- sock = NULL;
- msock = NULL;
+ /* Assume that the peer only understands protocol 80 until we know better. */
+ connection->agreed_pro_version = 80;
+
+ if (prepare_listen_socket(connection, &ad))
+ return 0;
do {
- for (try = 0;;) {
- /* 3 tries, this should take less than a second! */
- s = drbd_try_connect(mdev);
- if (s || ++try >= 3)
- break;
- /* give the other side time to call bind() & listen() */
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ / 10);
- }
+ struct socket *s;
+ s = drbd_try_connect(connection);
if (s) {
- if (!sock) {
- drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
- sock = s;
- s = NULL;
- } else if (!msock) {
- drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
- msock = s;
- s = NULL;
+ if (!sock.socket) {
+ sock.socket = s;
+ send_first_packet(connection, &sock, P_INITIAL_DATA);
+ } else if (!msock.socket) {
+ clear_bit(RESOLVE_CONFLICTS, &connection->flags);
+ msock.socket = s;
+ send_first_packet(connection, &msock, P_INITIAL_META);
} else {
- dev_err(DEV, "Logic error in drbd_connect()\n");
+ drbd_err(connection, "Logic error in conn_connect()\n");
goto out_release_sockets;
}
}
- if (sock && msock) {
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ / 10);
- ok = drbd_socket_okay(mdev, &sock);
- ok = drbd_socket_okay(mdev, &msock) && ok;
+ if (sock.socket && msock.socket) {
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ timeout = nc->ping_timeo * HZ / 10;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeout);
+ ok = drbd_socket_okay(&sock.socket);
+ ok = drbd_socket_okay(&msock.socket) && ok;
if (ok)
break;
}
retry:
- s = drbd_wait_for_connect(mdev);
+ s = drbd_wait_for_connect(connection, &ad);
if (s) {
- try = drbd_recv_fp(mdev, s);
- drbd_socket_okay(mdev, &sock);
- drbd_socket_okay(mdev, &msock);
- switch (try) {
- case P_HAND_SHAKE_S:
- if (sock) {
- dev_warn(DEV, "initial packet S crossed\n");
- sock_release(sock);
+ int fp = receive_first_packet(connection, s);
+ drbd_socket_okay(&sock.socket);
+ drbd_socket_okay(&msock.socket);
+ switch (fp) {
+ case P_INITIAL_DATA:
+ if (sock.socket) {
+ drbd_warn(connection, "initial packet S crossed\n");
+ sock_release(sock.socket);
+ sock.socket = s;
+ goto randomize;
}
- sock = s;
+ sock.socket = s;
break;
- case P_HAND_SHAKE_M:
- if (msock) {
- dev_warn(DEV, "initial packet M crossed\n");
- sock_release(msock);
+ case P_INITIAL_META:
+ set_bit(RESOLVE_CONFLICTS, &connection->flags);
+ if (msock.socket) {
+ drbd_warn(connection, "initial packet M crossed\n");
+ sock_release(msock.socket);
+ msock.socket = s;
+ goto randomize;
}
- msock = s;
- set_bit(DISCARD_CONCURRENT, &mdev->flags);
+ msock.socket = s;
break;
default:
- dev_warn(DEV, "Error receiving initial packet\n");
+ drbd_warn(connection, "Error receiving initial packet\n");
sock_release(s);
- if (random32() & 1)
+randomize:
+ if (prandom_u32() & 1)
goto retry;
}
}
- if (mdev->state.conn <= C_DISCONNECTING)
+ if (connection->cstate <= C_DISCONNECTING)
goto out_release_sockets;
if (signal_pending(current)) {
flush_signals(current);
smp_rmb();
- if (get_t_state(&mdev->receiver) == Exiting)
+ if (get_t_state(&connection->receiver) == EXITING)
goto out_release_sockets;
}
- if (sock && msock) {
- ok = drbd_socket_okay(mdev, &sock);
- ok = drbd_socket_okay(mdev, &msock) && ok;
- if (ok)
- break;
- }
- } while (1);
+ ok = drbd_socket_okay(&sock.socket);
+ ok = drbd_socket_okay(&msock.socket) && ok;
+ } while (!ok);
+
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
- msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
- sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+ sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- sock->sk->sk_allocation = GFP_NOIO;
- msock->sk->sk_allocation = GFP_NOIO;
+ sock.socket->sk->sk_allocation = GFP_NOIO;
+ msock.socket->sk->sk_allocation = GFP_NOIO;
- sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
- msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+ sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+ msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
/* NOT YET ...
- * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
- * first set it to the P_HAND_SHAKE timeout,
+ * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
+ * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ * first set it to the P_CONNECTION_FEATURES timeout,
* which we set to 4x the configured ping_timeout. */
- sock->sk->sk_sndtimeo =
- sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
- msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+ sock.socket->sk->sk_sndtimeo =
+ sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
- /* we don't want delays.
- * we use TCP_CORK where apropriate, though */
- drbd_tcp_nodelay(sock);
- drbd_tcp_nodelay(msock);
+ msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+ timeout = nc->timeout * HZ / 10;
+ discard_my_data = nc->discard_my_data;
+ rcu_read_unlock();
+
+ msock.socket->sk->sk_sndtimeo = timeout;
- mdev->data.socket = sock;
- mdev->meta.socket = msock;
- mdev->last_received = jiffies;
+ /* we don't want delays.
+ * we use TCP_CORK where appropriate, though */
+ drbd_tcp_nodelay(sock.socket);
+ drbd_tcp_nodelay(msock.socket);
- D_ASSERT(mdev->asender.task == NULL);
+ connection->data.socket = sock.socket;
+ connection->meta.socket = msock.socket;
+ connection->last_received = jiffies;
- h = drbd_do_handshake(mdev);
+ h = drbd_do_features(connection);
if (h <= 0)
return h;
- if (mdev->cram_hmac_tfm) {
- /* drbd_request_state(mdev, NS(conn, WFAuth)); */
- switch (drbd_do_auth(mdev)) {
+ if (connection->cram_hmac_tfm) {
+ /* drbd_request_state(device, NS(conn, WFAuth)); */
+ switch (drbd_do_auth(connection)) {
case -1:
- dev_err(DEV, "Authentication of peer failed\n");
+ drbd_err(connection, "Authentication of peer failed\n");
return -1;
case 0:
- dev_err(DEV, "Authentication of peer failed, trying again.\n");
+ drbd_err(connection, "Authentication of peer failed, trying again.\n");
return 0;
}
}
- if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
- return 0;
+ connection->data.socket->sk->sk_sndtimeo = timeout;
+ connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
- sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ if (drbd_send_protocol(connection) == -EOPNOTSUPP)
+ return -1;
- atomic_set(&mdev->packet_seq, 0);
- mdev->peer_seq = 0;
+ /* Prevent a race between resync-handshake and
+ * being promoted to Primary.
+ *
+ * Grab and release the state mutex, so we know that any current
+ * drbd_set_role() is finished, and any incoming drbd_set_role
+ * will see the STATE_SENT flag, and wait for it to be cleared.
+ */
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ mutex_lock(peer_device->device->state_mutex);
- drbd_thread_start(&mdev->asender);
+ set_bit(STATE_SENT, &connection->flags);
- if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
- drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
- put_ldev(mdev);
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ mutex_unlock(peer_device->device->state_mutex);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+
+ if (discard_my_data)
+ set_bit(DISCARD_MY_DATA, &device->flags);
+ else
+ clear_bit(DISCARD_MY_DATA, &device->flags);
+
+ drbd_connected(peer_device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
}
+ rcu_read_unlock();
- if (!drbd_send_protocol(mdev))
- return -1;
- drbd_send_sync_param(mdev, &mdev->sync_conf);
- drbd_send_sizes(mdev, 0, 0);
- drbd_send_uuids(mdev);
- drbd_send_state(mdev);
- clear_bit(USE_DEGR_WFC_T, &mdev->flags);
- clear_bit(RESIZE_PENDING, &mdev->flags);
+ rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
+ if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
+ clear_bit(STATE_SENT, &connection->flags);
+ return 0;
+ }
- return 1;
+ drbd_thread_start(&connection->asender);
+
+ mutex_lock(&connection->resource->conf_update);
+ /* The discard_my_data flag is a single-shot modifier to the next
+ * connection attempt, the handshake of which is now well underway.
+ * No need for rcu style copying of the whole struct
+ * just to clear a single value. */
+ connection->net_conf->discard_my_data = 0;
+ mutex_unlock(&connection->resource->conf_update);
+
+ return h;
out_release_sockets:
- if (sock)
- sock_release(sock);
- if (msock)
- sock_release(msock);
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
+ if (sock.socket)
+ sock_release(sock.socket);
+ if (msock.socket)
+ sock_release(msock.socket);
return -1;
}
-static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
+static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
{
- union p_header *h = &mdev->data.rbuf.header;
- int r;
-
- r = drbd_recv(mdev, h, sizeof(*h));
- if (unlikely(r != sizeof(*h))) {
- dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
- return FALSE;
- }
-
- if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
- *cmd = be16_to_cpu(h->h80.command);
- *packet_size = be16_to_cpu(h->h80.length);
- } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
- *cmd = be16_to_cpu(h->h95.command);
- *packet_size = be32_to_cpu(h->h95.length);
+ unsigned int header_size = drbd_header_size(connection);
+
+ if (header_size == sizeof(struct p_header100) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
+ struct p_header100 *h = header;
+ if (h->pad != 0) {
+ drbd_err(connection, "Header padding is not zero\n");
+ return -EINVAL;
+ }
+ pi->vnr = be16_to_cpu(h->volume);
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ } else if (header_size == sizeof(struct p_header95) &&
+ *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
+ struct p_header95 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ pi->vnr = 0;
+ } else if (header_size == sizeof(struct p_header80) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
+ struct p_header80 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be16_to_cpu(h->length);
+ pi->vnr = 0;
} else {
- dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
- be32_to_cpu(h->h80.magic),
- be16_to_cpu(h->h80.command),
- be16_to_cpu(h->h80.length));
- return FALSE;
+ drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
+ be32_to_cpu(*(__be32 *)header),
+ connection->agreed_pro_version);
+ return -EINVAL;
}
- mdev->last_received = jiffies;
+ pi->data = header + header_size;
+ return 0;
+}
- return TRUE;
+static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
+{
+ void *buffer = connection->data.rbuf;
+ int err;
+
+ err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
+ if (err)
+ return err;
+
+ err = decode_header(connection, buffer, pi);
+ connection->last_received = jiffies;
+
+ return err;
}
-static void drbd_flush(struct drbd_conf *mdev)
+static void drbd_flush(struct drbd_connection *connection)
{
int rv;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ if (connection->write_ordering >= WO_bdev_flush) {
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ if (!get_ldev(device))
+ continue;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+
+ rv = blkdev_issue_flush(device->ldev->backing_bdev,
+ GFP_NOIO, NULL);
+ if (rv) {
+ drbd_info(device, "local disk flush failed with status %d\n", rv);
+ /* would rather check on EOPNOTSUPP, but that is not reliable.
+ * don't try again for ANY return value != 0
+ * if (rv == -EOPNOTSUPP) */
+ drbd_bump_write_ordering(connection, WO_drain_io);
+ }
+ put_ldev(device);
+ kref_put(&device->kref, drbd_destroy_device);
- if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
- rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
- NULL);
- if (rv) {
- dev_err(DEV, "local disk flush failed with status %d\n", rv);
- /* would rather check on EOPNOTSUPP, but that is not reliable.
- * don't try again for ANY return value != 0
- * if (rv == -EOPNOTSUPP) */
- drbd_bump_write_ordering(mdev, WO_drain_io);
+ rcu_read_lock();
+ if (rv)
+ break;
}
- put_ldev(mdev);
+ rcu_read_unlock();
}
}
/**
* drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @epoch: Epoch object.
* @ev: Epoch event.
*/
-static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
struct drbd_epoch *epoch,
enum epoch_event ev)
{
@@ -997,7 +1195,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
struct drbd_epoch *next_epoch;
enum finish_epoch rv = FE_STILL_LIVE;
- spin_lock(&mdev->epoch_lock);
+ spin_lock(&connection->epoch_lock);
do {
next_epoch = NULL;
@@ -1017,19 +1215,24 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
if (epoch_size != 0 &&
atomic_read(&epoch->active) == 0 &&
- test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
+ (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
if (!(ev & EV_CLEANUP)) {
- spin_unlock(&mdev->epoch_lock);
- drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
- spin_lock(&mdev->epoch_lock);
+ spin_unlock(&connection->epoch_lock);
+ drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
+ spin_lock(&connection->epoch_lock);
}
- dec_unacked(mdev);
+#if 0
+ /* FIXME: dec unacked on connection, once we have
+ * something to count pending connection packets in. */
+ if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+ dec_unacked(epoch->connection);
+#endif
- if (mdev->current_epoch != epoch) {
+ if (connection->current_epoch != epoch) {
next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
list_del(&epoch->list);
ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
- mdev->epochs--;
+ connection->epochs--;
kfree(epoch);
if (rv == FE_STILL_LIVE)
@@ -1040,7 +1243,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
/* atomic_set(&epoch->active, 0); is already zero */
if (rv == FE_STILL_LIVE)
rv = FE_RECYCLED;
- wake_up(&mdev->ee_wait);
}
}
@@ -1050,104 +1252,159 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
epoch = next_epoch;
} while (1);
- spin_unlock(&mdev->epoch_lock);
+ spin_unlock(&connection->epoch_lock);
return rv;
}
/**
* drbd_bump_write_ordering() - Fall back to an other write ordering method
- * @mdev: DRBD device.
+ * @connection: DRBD connection.
* @wo: Write ordering method to try.
*/
-void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
+void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo)
{
+ struct disk_conf *dc;
+ struct drbd_peer_device *peer_device;
enum write_ordering_e pwo;
+ int vnr;
static char *write_ordering_str[] = {
[WO_none] = "none",
[WO_drain_io] = "drain",
[WO_bdev_flush] = "flush",
};
- pwo = mdev->write_ordering;
+ pwo = connection->write_ordering;
wo = min(pwo, wo);
- if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
- wo = WO_drain_io;
- if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
- wo = WO_none;
- mdev->write_ordering = wo;
- if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
- dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ if (!get_ldev_if_state(device, D_ATTACHING))
+ continue;
+ dc = rcu_dereference(device->ldev->disk_conf);
+
+ if (wo == WO_bdev_flush && !dc->disk_flushes)
+ wo = WO_drain_io;
+ if (wo == WO_drain_io && !dc->disk_drain)
+ wo = WO_none;
+ put_ldev(device);
+ }
+ rcu_read_unlock();
+ connection->write_ordering = wo;
+ if (pwo != connection->write_ordering || wo == WO_bdev_flush)
+ drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]);
}
/**
- * drbd_submit_ee()
- * @mdev: DRBD device.
- * @e: epoch entry
+ * drbd_submit_peer_request()
+ * @device: DRBD device.
+ * @peer_req: peer request
* @rw: flag field, see bio->bi_rw
+ *
+ * May spread the pages to multiple bios,
+ * depending on bio_add_page restrictions.
+ *
+ * Returns 0 if all bios have been submitted,
+ * -ENOMEM if we could not allocate enough bios,
+ * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
+ * single page to an empty bio (which should never happen and likely indicates
+ * that the lower level IO stack is in some way broken). This has been observed
+ * on certain Xen deployments.
*/
/* TODO allocate from our own bio_set. */
-int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
- const unsigned rw, const int fault_type)
+int drbd_submit_peer_request(struct drbd_device *device,
+ struct drbd_peer_request *peer_req,
+ const unsigned rw, const int fault_type)
{
struct bio *bios = NULL;
struct bio *bio;
- struct page *page = e->pages;
- sector_t sector = e->sector;
- unsigned ds = e->size;
+ struct page *page = peer_req->pages;
+ sector_t sector = peer_req->i.sector;
+ unsigned ds = peer_req->i.size;
unsigned n_bios = 0;
unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
+ int err = -ENOMEM;
+
+ if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
+ /* wait for all pending IO completions, before we start
+ * zeroing things out. */
+ conn_wait_active_ee_empty(first_peer_device(device)->connection);
+ if (blkdev_issue_zeroout(device->ldev->backing_bdev,
+ sector, ds >> 9, GFP_NOIO))
+ peer_req->flags |= EE_WAS_ERROR;
+ drbd_endio_write_sec_final(peer_req);
+ return 0;
+ }
+
+ /* Discards don't have any payload.
+ * But the scsi layer still expects a bio_vec it can use internally,
+ * see sd_setup_discard_cmnd() and blk_add_request_payload(). */
+ if (peer_req->flags & EE_IS_TRIM)
+ nr_pages = 1;
/* In most cases, we will only need one bio. But in case the lower
* level restrictions happen to be different at this offset on this
* side than those of the sending peer, we may need to submit the
- * request in more than one bio. */
+ * request in more than one bio.
+ *
+ * Plain bio_alloc is good enough here, this is no DRBD internally
+ * generated bio, but a bio allocated on behalf of the peer.
+ */
next_bio:
bio = bio_alloc(GFP_NOIO, nr_pages);
if (!bio) {
- dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
+ drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
goto fail;
}
- /* > e->sector, unless this is the first bio */
- bio->bi_sector = sector;
- bio->bi_bdev = mdev->ldev->backing_bdev;
- /* we special case some flags in the multi-bio case, see below
- * (REQ_UNPLUG) */
+ /* > peer_req->i.sector, unless this is the first bio */
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = device->ldev->backing_bdev;
bio->bi_rw = rw;
- bio->bi_private = e;
- bio->bi_end_io = drbd_endio_sec;
+ bio->bi_private = peer_req;
+ bio->bi_end_io = drbd_peer_request_endio;
bio->bi_next = bios;
bios = bio;
++n_bios;
+ if (rw & REQ_DISCARD) {
+ bio->bi_iter.bi_size = ds;
+ goto submit;
+ }
+
page_chain_for_each(page) {
unsigned len = min_t(unsigned, ds, PAGE_SIZE);
if (!bio_add_page(bio, page, len, 0)) {
- /* a single page must always be possible! */
- BUG_ON(bio->bi_vcnt == 0);
+ /* A single page must always be possible!
+ * But in case it fails anyways,
+ * we deal with it, and complain (below). */
+ if (bio->bi_vcnt == 0) {
+ drbd_err(device,
+ "bio_add_page failed for len=%u, "
+ "bi_vcnt=0 (bi_sector=%llu)\n",
+ len, (uint64_t)bio->bi_iter.bi_sector);
+ err = -ENOSPC;
+ goto fail;
+ }
goto next_bio;
}
ds -= len;
sector += len >> 9;
--nr_pages;
}
- D_ASSERT(page == NULL);
- D_ASSERT(ds == 0);
+ D_ASSERT(device, ds == 0);
+submit:
+ D_ASSERT(device, page == NULL);
- atomic_set(&e->pending_bios, n_bios);
+ atomic_set(&peer_req->pending_bios, n_bios);
do {
bio = bios;
bios = bios->bi_next;
bio->bi_next = NULL;
- /* strip off REQ_UNPLUG unless it is the last bio */
- if (bios)
- bio->bi_rw &= ~REQ_UNPLUG;
-
- drbd_generic_make_request(mdev, fault_type, bio);
+ drbd_generic_make_request(device, fault_type, bio);
} while (bios);
- maybe_kick_lo(mdev);
return 0;
fail:
@@ -1156,32 +1413,68 @@ fail:
bios = bios->bi_next;
bio_put(bio);
}
- return -ENOMEM;
+ return err;
}
-static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
+ struct drbd_peer_request *peer_req)
{
- int rv;
- struct p_barrier *p = &mdev->data.rbuf.barrier;
- struct drbd_epoch *epoch;
+ struct drbd_interval *i = &peer_req->i;
+
+ drbd_remove_interval(&device->write_requests, i);
+ drbd_clear_interval(i);
- inc_unacked(mdev);
+ /* Wake up any processes waiting for this peer request to complete. */
+ if (i->waiting)
+ wake_up(&device->misc_wait);
+}
- if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
- drbd_kick_lo(mdev);
+static void conn_wait_active_ee_empty(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
- mdev->current_epoch->barrier_nr = p->barrier;
- rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_wait_ee_list_empty(device, &device->active_ee);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static struct drbd_peer_device *
+conn_peer_device(struct drbd_connection *connection, int volume_number)
+{
+ return idr_find(&connection->peer_devices, volume_number);
+}
+
+static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
+{
+ int rv;
+ struct p_barrier *p = pi->data;
+ struct drbd_epoch *epoch;
+
+ /* FIXME these are unacked on connection,
+ * not a specific (peer)device.
+ */
+ connection->current_epoch->barrier_nr = p->barrier;
+ connection->current_epoch->connection = connection;
+ rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
* the activity log, which means it would not be resynced in case the
* R_PRIMARY crashes now.
* Therefore we must send the barrier_ack after the barrier request was
* completed. */
- switch (mdev->write_ordering) {
+ switch (connection->write_ordering) {
case WO_none:
if (rv == FE_RECYCLED)
- return TRUE;
+ return 0;
/* receiver context, in the writeout path of the other node.
* avoid potential distributed deadlock */
@@ -1189,84 +1482,90 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
if (epoch)
break;
else
- dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+ drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
/* Fall through */
case WO_bdev_flush:
case WO_drain_io:
- drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
- drbd_flush(mdev);
+ conn_wait_active_ee_empty(connection);
+ drbd_flush(connection);
- if (atomic_read(&mdev->current_epoch->epoch_size)) {
+ if (atomic_read(&connection->current_epoch->epoch_size)) {
epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
if (epoch)
break;
}
- epoch = mdev->current_epoch;
- wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
-
- D_ASSERT(atomic_read(&epoch->active) == 0);
- D_ASSERT(epoch->flags == 0);
-
- return TRUE;
+ return 0;
default:
- dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
- return FALSE;
+ drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering);
+ return -EIO;
}
epoch->flags = 0;
atomic_set(&epoch->epoch_size, 0);
atomic_set(&epoch->active, 0);
- spin_lock(&mdev->epoch_lock);
- if (atomic_read(&mdev->current_epoch->epoch_size)) {
- list_add(&epoch->list, &mdev->current_epoch->list);
- mdev->current_epoch = epoch;
- mdev->epochs++;
+ spin_lock(&connection->epoch_lock);
+ if (atomic_read(&connection->current_epoch->epoch_size)) {
+ list_add(&epoch->list, &connection->current_epoch->list);
+ connection->current_epoch = epoch;
+ connection->epochs++;
} else {
/* The current_epoch got recycled while we allocated this one... */
kfree(epoch);
}
- spin_unlock(&mdev->epoch_lock);
+ spin_unlock(&connection->epoch_lock);
- return TRUE;
+ return 0;
}
/* used from receive_RSDataReply (recv_resync_read)
* and from receive_Data */
-static struct drbd_epoch_entry *
-read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
+static struct drbd_peer_request *
+read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+ struct packet_info *pi) __must_hold(local)
{
- const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
- struct drbd_epoch_entry *e;
+ struct drbd_device *device = peer_device->device;
+ const sector_t capacity = drbd_get_capacity(device->this_bdev);
+ struct drbd_peer_request *peer_req;
struct page *page;
- int dgs, ds, rr;
- void *dig_in = mdev->int_dig_in;
- void *dig_vv = mdev->int_dig_vv;
+ int dgs, ds, err;
+ int data_size = pi->size;
+ void *dig_in = peer_device->connection->int_dig_in;
+ void *dig_vv = peer_device->connection->int_dig_vv;
unsigned long *data;
-
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
- crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
-
- if (dgs) {
- rr = drbd_recv(mdev, dig_in, dgs);
- if (rr != dgs) {
- dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
- rr, dgs);
+ struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
+
+ dgs = 0;
+ if (!trim && peer_device->connection->peer_integrity_tfm) {
+ dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+ /*
+ * FIXME: Receive the incoming digest into the receive buffer
+ * here, together with its struct p_data?
+ */
+ err = drbd_recv_all_warn(peer_device->connection, dig_in, dgs);
+ if (err)
return NULL;
- }
+ data_size -= dgs;
}
- data_size -= dgs;
+ if (trim) {
+ D_ASSERT(peer_device, data_size == 0);
+ data_size = be32_to_cpu(trim->size);
+ }
- ERR_IF(data_size & 0x1ff) return NULL;
- ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
+ if (!expect(IS_ALIGNED(data_size, 512)))
+ return NULL;
+ /* prepare for larger trim requests. */
+ if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
+ return NULL;
/* even though we trust out peer,
* we sometimes have to double check. */
if (sector + (data_size>>9) > capacity) {
- dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
+ drbd_err(device, "request from peer beyond end of local disk: "
+ "capacity: %llus < sector: %llus + size: %u\n",
(unsigned long long)capacity,
(unsigned long long)sector, data_size);
return NULL;
@@ -1275,308 +1574,419 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
- if (!e)
+ peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
+ if (!peer_req)
return NULL;
+ if (trim)
+ return peer_req;
+
ds = data_size;
- page = e->pages;
+ page = peer_req->pages;
page_chain_for_each(page) {
unsigned len = min_t(int, ds, PAGE_SIZE);
data = kmap(page);
- rr = drbd_recv(mdev, data, len);
- if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
- dev_err(DEV, "Fault injection: Corrupting data on receive\n");
+ err = drbd_recv_all_warn(peer_device->connection, data, len);
+ if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
+ drbd_err(device, "Fault injection: Corrupting data on receive\n");
data[0] = data[0] ^ (unsigned long)-1;
}
kunmap(page);
- if (rr != len) {
- drbd_free_ee(mdev, e);
- dev_warn(DEV, "short read receiving data: read %d expected %d\n",
- rr, len);
+ if (err) {
+ drbd_free_peer_req(device, peer_req);
return NULL;
}
- ds -= rr;
+ ds -= len;
}
if (dgs) {
- drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
+ drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
if (memcmp(dig_in, dig_vv, dgs)) {
- dev_err(DEV, "Digest integrity check FAILED.\n");
- drbd_bcast_ee(mdev, "digest failed",
- dgs, dig_in, dig_vv, e);
- drbd_free_ee(mdev, e);
+ drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
+ (unsigned long long)sector, data_size);
+ drbd_free_peer_req(device, peer_req);
return NULL;
}
}
- mdev->recv_cnt += data_size>>9;
- return e;
+ device->recv_cnt += data_size>>9;
+ return peer_req;
}
/* drbd_drain_block() just takes a data block
* out of the socket input buffer, and discards it.
*/
-static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
+static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
{
struct page *page;
- int rr, rv = 1;
+ int err = 0;
void *data;
if (!data_size)
- return TRUE;
+ return 0;
- page = drbd_pp_alloc(mdev, 1, 1);
+ page = drbd_alloc_pages(peer_device, 1, 1);
data = kmap(page);
while (data_size) {
- rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
- if (rr != min_t(int, data_size, PAGE_SIZE)) {
- rv = 0;
- dev_warn(DEV, "short read receiving data: read %d expected %d\n",
- rr, min_t(int, data_size, PAGE_SIZE));
+ unsigned int len = min_t(int, data_size, PAGE_SIZE);
+
+ err = drbd_recv_all_warn(peer_device->connection, data, len);
+ if (err)
break;
- }
- data_size -= rr;
+ data_size -= len;
}
kunmap(page);
- drbd_pp_free(mdev, page, 0);
- return rv;
+ drbd_free_pages(peer_device->device, page, 0);
+ return err;
}
-static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
+static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
sector_t sector, int data_size)
{
- struct bio_vec *bvec;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
struct bio *bio;
- int dgs, rr, i, expect;
- void *dig_in = mdev->int_dig_in;
- void *dig_vv = mdev->int_dig_vv;
-
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
- crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+ int dgs, err, expect;
+ void *dig_in = peer_device->connection->int_dig_in;
+ void *dig_vv = peer_device->connection->int_dig_vv;
- if (dgs) {
- rr = drbd_recv(mdev, dig_in, dgs);
- if (rr != dgs) {
- dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
- rr, dgs);
- return 0;
- }
+ dgs = 0;
+ if (peer_device->connection->peer_integrity_tfm) {
+ dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+ err = drbd_recv_all_warn(peer_device->connection, dig_in, dgs);
+ if (err)
+ return err;
+ data_size -= dgs;
}
- data_size -= dgs;
-
/* optimistically update recv_cnt. if receiving fails below,
* we disconnect anyways, and counters will be reset. */
- mdev->recv_cnt += data_size>>9;
+ peer_device->device->recv_cnt += data_size>>9;
bio = req->master_bio;
- D_ASSERT(sector == bio->bi_sector);
-
- bio_for_each_segment(bvec, bio, i) {
- expect = min_t(int, data_size, bvec->bv_len);
- rr = drbd_recv(mdev,
- kmap(bvec->bv_page)+bvec->bv_offset,
- expect);
- kunmap(bvec->bv_page);
- if (rr != expect) {
- dev_warn(DEV, "short read receiving data reply: "
- "read %d expected %d\n",
- rr, expect);
- return 0;
- }
- data_size -= rr;
+ D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
+
+ bio_for_each_segment(bvec, bio, iter) {
+ void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
+ expect = min_t(int, data_size, bvec.bv_len);
+ err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
+ kunmap(bvec.bv_page);
+ if (err)
+ return err;
+ data_size -= expect;
}
if (dgs) {
- drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+ drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
if (memcmp(dig_in, dig_vv, dgs)) {
- dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
- return 0;
+ drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
+ return -EINVAL;
}
}
- D_ASSERT(data_size == 0);
- return 1;
+ D_ASSERT(peer_device->device, data_size == 0);
+ return 0;
}
-/* e_end_resync_block() is called via
- * drbd_process_done_ee() by asender only */
-static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+/*
+ * e_end_resync_block() is called in asender context via
+ * drbd_finish_peer_reqs().
+ */
+static int e_end_resync_block(struct drbd_work *w, int unused)
{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- sector_t sector = e->sector;
- int ok;
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ sector_t sector = peer_req->i.sector;
+ int err;
- D_ASSERT(hlist_unhashed(&e->colision));
+ D_ASSERT(device, drbd_interval_empty(&peer_req->i));
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- drbd_set_in_sync(mdev, sector, e->size);
- ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ drbd_set_in_sync(device, sector, peer_req->i.size);
+ err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
} else {
/* Record failure to sync */
- drbd_rs_failed_io(mdev, sector, e->size);
+ drbd_rs_failed_io(device, sector, peer_req->i.size);
- ok = drbd_send_ack(mdev, P_NEG_ACK, e);
+ err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
}
- dec_unacked(mdev);
+ dec_unacked(device);
- return ok;
+ return err;
}
-static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
+static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
+ struct packet_info *pi) __releases(local)
{
- struct drbd_epoch_entry *e;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_request *peer_req;
- e = read_in_block(mdev, ID_SYNCER, sector, data_size);
- if (!e)
+ peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
+ if (!peer_req)
goto fail;
- dec_rs_pending(mdev);
+ dec_rs_pending(device);
- inc_unacked(mdev);
+ inc_unacked(device);
/* corresponding dec_unacked() in e_end_resync_block()
* respective _drbd_clear_done_ee */
- e->w.cb = e_end_resync_block;
+ peer_req->w.cb = e_end_resync_block;
- spin_lock_irq(&mdev->req_lock);
- list_add(&e->w.list, &mdev->sync_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&device->resource->req_lock);
+ list_add(&peer_req->w.list, &device->sync_ee);
+ spin_unlock_irq(&device->resource->req_lock);
- atomic_add(data_size >> 9, &mdev->rs_sect_ev);
- if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
- return TRUE;
+ atomic_add(pi->size >> 9, &device->rs_sect_ev);
+ if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
+ return 0;
- /* drbd_submit_ee currently fails for one reason only:
- * not being able to allocate enough bios.
- * Is dropping the connection going to help? */
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- spin_unlock_irq(&mdev->req_lock);
+ /* don't care for the reason here */
+ drbd_err(device, "submit failed, triggering re-connect\n");
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(device, peer_req);
fail:
- put_ldev(mdev);
- return FALSE;
+ put_ldev(device);
+ return -EIO;
}
-static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static struct drbd_request *
+find_request(struct drbd_device *device, struct rb_root *root, u64 id,
+ sector_t sector, bool missing_ok, const char *func)
{
struct drbd_request *req;
+
+ /* Request object according to our peer */
+ req = (struct drbd_request *)(unsigned long)id;
+ if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
+ return req;
+ if (!missing_ok) {
+ drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
+ (unsigned long)id, (unsigned long long)sector);
+ }
+ return NULL;
+}
+
+static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct drbd_request *req;
sector_t sector;
- int ok;
- struct p_data *p = &mdev->data.rbuf.data;
+ int err;
+ struct p_data *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
sector = be64_to_cpu(p->sector);
- spin_lock_irq(&mdev->req_lock);
- req = _ar_id_to_req(mdev, p->block_id, sector);
- spin_unlock_irq(&mdev->req_lock);
- if (unlikely(!req)) {
- dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
- return FALSE;
- }
+ spin_lock_irq(&device->resource->req_lock);
+ req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (unlikely(!req))
+ return -EIO;
- /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
+ /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
* special casing it there for the various failure cases.
* still no race with drbd_fail_pending_reads */
- ok = recv_dless_read(mdev, req, sector, data_size);
-
- if (ok)
- req_mod(req, data_received);
+ err = recv_dless_read(peer_device, req, sector, pi->size);
+ if (!err)
+ req_mod(req, DATA_RECEIVED);
/* else: nothing. handled from drbd_disconnect...
* I don't think we may complete this just yet
* in case we are "on-disconnect: freeze" */
- return ok;
+ return err;
}
-static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
sector_t sector;
- int ok;
- struct p_data *p = &mdev->data.rbuf.data;
+ int err;
+ struct p_data *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
sector = be64_to_cpu(p->sector);
- D_ASSERT(p->block_id == ID_SYNCER);
+ D_ASSERT(device, p->block_id == ID_SYNCER);
- if (get_ldev(mdev)) {
+ if (get_ldev(device)) {
/* data is submitted to disk within recv_resync_read.
* corresponding put_ldev done below on error,
- * or in drbd_endio_write_sec. */
- ok = recv_resync_read(mdev, sector, data_size);
+ * or in drbd_peer_request_endio. */
+ err = recv_resync_read(peer_device, sector, pi);
} else {
if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Can not write resync data to local disk.\n");
+ drbd_err(device, "Can not write resync data to local disk.\n");
- ok = drbd_drain_block(mdev, data_size);
+ err = drbd_drain_block(peer_device, pi->size);
- drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
+ drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
}
- atomic_add(data_size >> 9, &mdev->rs_sect_in);
+ atomic_add(pi->size >> 9, &device->rs_sect_in);
- return ok;
+ return err;
}
-/* e_end_block() is called via drbd_process_done_ee().
- * this means this function only runs in the asender thread
+static void restart_conflicting_writes(struct drbd_device *device,
+ sector_t sector, int size)
+{
+ struct drbd_interval *i;
+ struct drbd_request *req;
+
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED))
+ continue;
+ /* as it is RQ_POSTPONED, this will cause it to
+ * be queued on the retry workqueue. */
+ __req_mod(req, CONFLICT_RESOLVED, NULL);
+ }
+}
+
+/*
+ * e_end_block() is called in asender context via drbd_finish_peer_reqs().
*/
-static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static int e_end_block(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- sector_t sector = e->sector;
- int ok = 1, pcmd;
-
- if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
- mdev->state.conn <= C_PAUSED_SYNC_T &&
- e->flags & EE_MAY_SET_IN_SYNC) ?
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ sector_t sector = peer_req->i.sector;
+ int err = 0, pcmd;
+
+ if (peer_req->flags & EE_SEND_WRITE_ACK) {
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ pcmd = (device->state.conn >= C_SYNC_SOURCE &&
+ device->state.conn <= C_PAUSED_SYNC_T &&
+ peer_req->flags & EE_MAY_SET_IN_SYNC) ?
P_RS_WRITE_ACK : P_WRITE_ACK;
- ok &= drbd_send_ack(mdev, pcmd, e);
+ err = drbd_send_ack(peer_device, pcmd, peer_req);
if (pcmd == P_RS_WRITE_ACK)
- drbd_set_in_sync(mdev, sector, e->size);
+ drbd_set_in_sync(device, sector, peer_req->i.size);
} else {
- ok = drbd_send_ack(mdev, P_NEG_ACK, e);
+ err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
/* we expect it to be marked out of sync anyways...
* maybe assert this? */
}
- dec_unacked(mdev);
+ dec_unacked(device);
}
/* we delete from the conflict detection hash _after_ we sent out the
* P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
- if (mdev->net_conf->two_primaries) {
- spin_lock_irq(&mdev->req_lock);
- D_ASSERT(!hlist_unhashed(&e->colision));
- hlist_del_init(&e->colision);
- spin_unlock_irq(&mdev->req_lock);
- } else {
- D_ASSERT(hlist_unhashed(&e->colision));
- }
+ if (peer_req->flags & EE_IN_INTERVAL_TREE) {
+ spin_lock_irq(&device->resource->req_lock);
+ D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
+ drbd_remove_epoch_entry_interval(device, peer_req);
+ if (peer_req->flags & EE_RESTART_REQUESTS)
+ restart_conflicting_writes(device, sector, peer_req->i.size);
+ spin_unlock_irq(&device->resource->req_lock);
+ } else
+ D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+
+ drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+
+ return err;
+}
+
+static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ int err;
- drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+ err = drbd_send_ack(peer_device, ack, peer_req);
+ dec_unacked(peer_device->device);
- return ok;
+ return err;
}
-static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int e_send_superseded(struct drbd_work *w, int unused)
{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- int ok = 1;
+ return e_send_ack(w, P_SUPERSEDED);
+}
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
+static int e_send_retry_write(struct drbd_work *w, int unused)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_connection *connection = peer_req->peer_device->connection;
- spin_lock_irq(&mdev->req_lock);
- D_ASSERT(!hlist_unhashed(&e->colision));
- hlist_del_init(&e->colision);
- spin_unlock_irq(&mdev->req_lock);
+ return e_send_ack(w, connection->agreed_pro_version >= 100 ?
+ P_RETRY_WRITE : P_SUPERSEDED);
+}
- dec_unacked(mdev);
+static bool seq_greater(u32 a, u32 b)
+{
+ /*
+ * We assume 32-bit wrap-around here.
+ * For 24-bit wrap-around, we would have to shift:
+ * a <<= 8; b <<= 8;
+ */
+ return (s32)a - (s32)b > 0;
+}
- return ok;
+static u32 seq_max(u32 a, u32 b)
+{
+ return seq_greater(a, b) ? a : b;
+}
+
+static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
+{
+ struct drbd_device *device = peer_device->device;
+ unsigned int newest_peer_seq;
+
+ if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
+ spin_lock(&device->peer_seq_lock);
+ newest_peer_seq = seq_max(device->peer_seq, peer_seq);
+ device->peer_seq = newest_peer_seq;
+ spin_unlock(&device->peer_seq_lock);
+ /* wake up only if we actually changed device->peer_seq */
+ if (peer_seq == newest_peer_seq)
+ wake_up(&device->seq_wait);
+ }
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+ return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
+
+/* maybe change sync_ee into interval trees as well? */
+static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+ struct drbd_peer_request *rs_req;
+ bool rv = 0;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_for_each_entry(rs_req, &device->sync_ee, w.list) {
+ if (overlaps(peer_req->i.sector, peer_req->i.size,
+ rs_req->i.sector, rs_req->i.size)) {
+ rv = 1;
+ break;
+ }
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+
+ return rv;
}
/* Called from receive_Data.
@@ -1588,9 +1998,9 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
*
* Note: we don't care for Ack packets overtaking P_DATA packets.
*
- * In case packet_seq is larger than mdev->peer_seq number, there are
+ * In case packet_seq is larger than device->peer_seq number, there are
* outstanding packets on the msock. We wait for them to arrive.
- * In case we are the logically next packet, we update mdev->peer_seq
+ * In case we are the logically next packet, we update device->peer_seq
* ourselves. Correctly handles 32bit wrap around.
*
* Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
@@ -1600,276 +2010,351 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
*
* returns 0 if we may process the packet,
* -ERESTARTSYS if we were interrupted (by disconnect signal). */
-static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
+static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
{
+ struct drbd_device *device = peer_device->device;
DEFINE_WAIT(wait);
- unsigned int p_seq;
long timeout;
- int ret = 0;
- spin_lock(&mdev->peer_seq_lock);
+ int ret = 0, tp;
+
+ if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
+ return 0;
+
+ spin_lock(&device->peer_seq_lock);
for (;;) {
- prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
- if (seq_le(packet_seq, mdev->peer_seq+1))
+ if (!seq_greater(peer_seq - 1, device->peer_seq)) {
+ device->peer_seq = seq_max(device->peer_seq, peer_seq);
break;
+ }
+
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
- p_seq = mdev->peer_seq;
- spin_unlock(&mdev->peer_seq_lock);
- timeout = schedule_timeout(30*HZ);
- spin_lock(&mdev->peer_seq_lock);
- if (timeout == 0 && p_seq == mdev->peer_seq) {
+
+ rcu_read_lock();
+ tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries;
+ rcu_read_unlock();
+
+ if (!tp)
+ break;
+
+ /* Only need to wait if two_primaries is enabled */
+ prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock(&device->peer_seq_lock);
+ rcu_read_lock();
+ timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
+ rcu_read_unlock();
+ timeout = schedule_timeout(timeout);
+ spin_lock(&device->peer_seq_lock);
+ if (!timeout) {
ret = -ETIMEDOUT;
- dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
+ drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
break;
}
}
- finish_wait(&mdev->seq_wait, &wait);
- if (mdev->peer_seq+1 == packet_seq)
- mdev->peer_seq++;
- spin_unlock(&mdev->peer_seq_lock);
+ spin_unlock(&device->peer_seq_lock);
+ finish_wait(&device->seq_wait, &wait);
return ret;
}
-static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
+/* see also bio_flags_to_wire()
+ * DRBD_REQ_*, because we need to semantically map the flags to data packet
+ * flags and back. We may replicate to other kernel versions. */
+static unsigned long wire_flags_to_bio(u32 dpf)
{
- if (mdev->agreed_pro_version >= 95)
- return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
- (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
- (dpf & DP_FUA ? REQ_FUA : 0) |
- (dpf & DP_FLUSH ? REQ_FUA : 0) |
- (dpf & DP_DISCARD ? REQ_DISCARD : 0);
- else
- return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
+ return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+ (dpf & DP_FUA ? REQ_FUA : 0) |
+ (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
+ (dpf & DP_DISCARD ? REQ_DISCARD : 0);
}
-/* mirrored write */
-static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
+ unsigned int size)
{
- sector_t sector;
- struct drbd_epoch_entry *e;
- struct p_data *p = &mdev->data.rbuf.data;
- int rw = WRITE;
- u32 dp_flags;
-
- if (!get_ldev(mdev)) {
- if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Can not write mirrored data block "
- "to local disk.\n");
- spin_lock(&mdev->peer_seq_lock);
- if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
- mdev->peer_seq++;
- spin_unlock(&mdev->peer_seq_lock);
-
- drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
- atomic_inc(&mdev->current_epoch->epoch_size);
- return drbd_drain_block(mdev, data_size);
- }
+ struct drbd_interval *i;
- /* get_ldev(mdev) successful.
- * Corresponding put_ldev done either below (on various errors),
- * or in drbd_endio_write_sec, if we successfully submit the data at
- * the end of this function. */
+ repeat:
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ struct drbd_request *req;
+ struct bio_and_error m;
- sector = be64_to_cpu(p->sector);
- e = read_in_block(mdev, p->block_id, sector, data_size);
- if (!e) {
- put_ldev(mdev);
- return FALSE;
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (!(req->rq_state & RQ_POSTPONED))
+ continue;
+ req->rq_state &= ~RQ_POSTPONED;
+ __req_mod(req, NEG_ACKED, &m);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (m.bio)
+ complete_master_bio(device, &m);
+ spin_lock_irq(&device->resource->req_lock);
+ goto repeat;
}
+}
- e->w.cb = e_end_block;
-
- spin_lock(&mdev->epoch_lock);
- e->epoch = mdev->current_epoch;
- atomic_inc(&e->epoch->epoch_size);
- atomic_inc(&e->epoch->active);
- spin_unlock(&mdev->epoch_lock);
+static int handle_write_conflicts(struct drbd_device *device,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_connection *connection = peer_req->peer_device->connection;
+ bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
+ sector_t sector = peer_req->i.sector;
+ const unsigned int size = peer_req->i.size;
+ struct drbd_interval *i;
+ bool equal;
+ int err;
- dp_flags = be32_to_cpu(p->dp_flags);
- rw |= write_flags_to_bio(mdev, dp_flags);
+ /*
+ * Inserting the peer request into the write_requests tree will prevent
+ * new conflicting local requests from being added.
+ */
+ drbd_insert_interval(&device->write_requests, &peer_req->i);
- if (dp_flags & DP_MAY_SET_IN_SYNC)
- e->flags |= EE_MAY_SET_IN_SYNC;
+ repeat:
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ if (i == &peer_req->i)
+ continue;
- /* I'm the receiver, I do hold a net_cnt reference. */
- if (!mdev->net_conf->two_primaries) {
- spin_lock_irq(&mdev->req_lock);
- } else {
- /* don't get the req_lock yet,
- * we may sleep in drbd_wait_peer_seq */
- const int size = e->size;
- const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
- DEFINE_WAIT(wait);
- struct drbd_request *i;
- struct hlist_node *n;
- struct hlist_head *slot;
- int first;
-
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- BUG_ON(mdev->ee_hash == NULL);
- BUG_ON(mdev->tl_hash == NULL);
-
- /* conflict detection and handling:
- * 1. wait on the sequence number,
- * in case this data packet overtook ACK packets.
- * 2. check our hash tables for conflicting requests.
- * we only need to walk the tl_hash, since an ee can not
- * have a conflict with an other ee: on the submitting
- * node, the corresponding req had already been conflicting,
- * and a conflicting req is never sent.
- *
- * Note: for two_primaries, we are protocol C,
- * so there cannot be any request that is DONE
- * but still on the transfer log.
- *
- * unconditionally add to the ee_hash.
- *
- * if no conflicting request is found:
- * submit.
- *
- * if any conflicting request is found
- * that has not yet been acked,
- * AND I have the "discard concurrent writes" flag:
- * queue (via done_ee) the P_DISCARD_ACK; OUT.
- *
- * if any conflicting request is found:
- * block the receiver, waiting on misc_wait
- * until no more conflicting requests are there,
- * or we get interrupted (disconnect).
- *
- * we do not just write after local io completion of those
- * requests, but only after req is done completely, i.e.
- * we wait for the P_DISCARD_ACK to arrive!
- *
- * then proceed normally, i.e. submit.
- */
- if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
- goto out_interrupted;
+ if (!i->local) {
+ /*
+ * Our peer has sent a conflicting remote request; this
+ * should not happen in a two-node setup. Wait for the
+ * earlier peer request to complete.
+ */
+ err = drbd_wait_misc(device, i);
+ if (err)
+ goto out;
+ goto repeat;
+ }
- spin_lock_irq(&mdev->req_lock);
-
- hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
-
-#define OVERLAPS overlaps(i->sector, i->size, sector, size)
- slot = tl_hash_slot(mdev, sector);
- first = 1;
- for (;;) {
- int have_unacked = 0;
- int have_conflict = 0;
- prepare_to_wait(&mdev->misc_wait, &wait,
- TASK_INTERRUPTIBLE);
- hlist_for_each_entry(i, n, slot, colision) {
- if (OVERLAPS) {
- /* only ALERT on first iteration,
- * we may be woken up early... */
- if (first)
- dev_alert(DEV, "%s[%u] Concurrent local write detected!"
- " new: %llus +%u; pending: %llus +%u\n",
- current->comm, current->pid,
- (unsigned long long)sector, size,
- (unsigned long long)i->sector, i->size);
- if (i->rq_state & RQ_NET_PENDING)
- ++have_unacked;
- ++have_conflict;
+ equal = i->sector == sector && i->size == size;
+ if (resolve_conflicts) {
+ /*
+ * If the peer request is fully contained within the
+ * overlapping request, it can be considered overwritten
+ * and thus superseded; otherwise, it will be retried
+ * once all overlapping requests have completed.
+ */
+ bool superseded = i->sector <= sector && i->sector +
+ (i->size >> 9) >= sector + (size >> 9);
+
+ if (!equal)
+ drbd_alert(device, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u, "
+ "assuming %s came first\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size,
+ superseded ? "local" : "remote");
+
+ inc_unacked(device);
+ peer_req->w.cb = superseded ? e_send_superseded :
+ e_send_retry_write;
+ list_add_tail(&peer_req->w.list, &device->done_ee);
+ wake_asender(connection);
+
+ err = -ENOENT;
+ goto out;
+ } else {
+ struct drbd_request *req =
+ container_of(i, struct drbd_request, i);
+
+ if (!equal)
+ drbd_alert(device, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size);
+
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED)) {
+ /*
+ * Wait for the node with the discard flag to
+ * decide if this request has been superseded
+ * or needs to be retried.
+ * Requests that have been superseded will
+ * disappear from the write_requests tree.
+ *
+ * In addition, wait for the conflicting
+ * request to finish locally before submitting
+ * the conflicting peer request.
+ */
+ err = drbd_wait_misc(device, &req->i);
+ if (err) {
+ _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
+ fail_postponed_requests(device, sector, size);
+ goto out;
}
+ goto repeat;
}
-#undef OVERLAPS
- if (!have_conflict)
- break;
+ /*
+ * Remember to restart the conflicting requests after
+ * the new peer request has completed.
+ */
+ peer_req->flags |= EE_RESTART_REQUESTS;
+ }
+ }
+ err = 0;
- /* Discard Ack only for the _first_ iteration */
- if (first && discard && have_unacked) {
- dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
- (unsigned long long)sector);
- inc_unacked(mdev);
- e->w.cb = e_send_discard_ack;
- list_add_tail(&e->w.list, &mdev->done_ee);
+ out:
+ if (err)
+ drbd_remove_epoch_entry_interval(device, peer_req);
+ return err;
+}
- spin_unlock_irq(&mdev->req_lock);
+/* mirrored write */
+static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ sector_t sector;
+ struct drbd_peer_request *peer_req;
+ struct p_data *p = pi->data;
+ u32 peer_seq = be32_to_cpu(p->seq_num);
+ int rw = WRITE;
+ u32 dp_flags;
+ int err, tp;
- /* we could probably send that P_DISCARD_ACK ourselves,
- * but I don't like the receiver using the msock */
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
- put_ldev(mdev);
- wake_asender(mdev);
- finish_wait(&mdev->misc_wait, &wait);
- return TRUE;
- }
+ if (!get_ldev(device)) {
+ int err2;
- if (signal_pending(current)) {
- hlist_del_init(&e->colision);
+ err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+ drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+ atomic_inc(&connection->current_epoch->epoch_size);
+ err2 = drbd_drain_block(peer_device, pi->size);
+ if (!err)
+ err = err2;
+ return err;
+ }
- spin_unlock_irq(&mdev->req_lock);
+ /*
+ * Corresponding put_ldev done either below (on various errors), or in
+ * drbd_peer_request_endio, if we successfully submit the data at the
+ * end of this function.
+ */
- finish_wait(&mdev->misc_wait, &wait);
- goto out_interrupted;
- }
+ sector = be64_to_cpu(p->sector);
+ peer_req = read_in_block(peer_device, p->block_id, sector, pi);
+ if (!peer_req) {
+ put_ldev(device);
+ return -EIO;
+ }
+
+ peer_req->w.cb = e_end_block;
+
+ dp_flags = be32_to_cpu(p->dp_flags);
+ rw |= wire_flags_to_bio(dp_flags);
+ if (pi->cmd == P_TRIM) {
+ struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+ peer_req->flags |= EE_IS_TRIM;
+ if (!blk_queue_discard(q))
+ peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+ D_ASSERT(peer_device, peer_req->i.size > 0);
+ D_ASSERT(peer_device, rw & REQ_DISCARD);
+ D_ASSERT(peer_device, peer_req->pages == NULL);
+ } else if (peer_req->pages == NULL) {
+ D_ASSERT(device, peer_req->i.size == 0);
+ D_ASSERT(device, dp_flags & DP_FLUSH);
+ }
- spin_unlock_irq(&mdev->req_lock);
- if (first) {
- first = 0;
- dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
- "sec=%llus\n", (unsigned long long)sector);
- } else if (discard) {
- /* we had none on the first iteration.
- * there must be none now. */
- D_ASSERT(have_unacked == 0);
+ if (dp_flags & DP_MAY_SET_IN_SYNC)
+ peer_req->flags |= EE_MAY_SET_IN_SYNC;
+
+ spin_lock(&connection->epoch_lock);
+ peer_req->epoch = connection->current_epoch;
+ atomic_inc(&peer_req->epoch->epoch_size);
+ atomic_inc(&peer_req->epoch->active);
+ spin_unlock(&connection->epoch_lock);
+
+ rcu_read_lock();
+ tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+ rcu_read_unlock();
+ if (tp) {
+ peer_req->flags |= EE_IN_INTERVAL_TREE;
+ err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+ if (err)
+ goto out_interrupted;
+ spin_lock_irq(&device->resource->req_lock);
+ err = handle_write_conflicts(device, peer_req);
+ if (err) {
+ spin_unlock_irq(&device->resource->req_lock);
+ if (err == -ENOENT) {
+ put_ldev(device);
+ return 0;
}
- schedule();
- spin_lock_irq(&mdev->req_lock);
+ goto out_interrupted;
+ }
+ } else {
+ update_peer_seq(peer_device, peer_seq);
+ spin_lock_irq(&device->resource->req_lock);
+ }
+ /* if we use the zeroout fallback code, we process synchronously
+ * and we wait for all pending requests, respectively wait for
+ * active_ee to become empty in drbd_submit_peer_request();
+ * better not add ourselves here. */
+ if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
+ list_add(&peer_req->w.list, &device->active_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (device->state.conn == C_SYNC_TARGET)
+ wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
+
+ if (peer_device->connection->agreed_pro_version < 100) {
+ rcu_read_lock();
+ switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) {
+ case DRBD_PROT_C:
+ dp_flags |= DP_SEND_WRITE_ACK;
+ break;
+ case DRBD_PROT_B:
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ break;
}
- finish_wait(&mdev->misc_wait, &wait);
+ rcu_read_unlock();
}
- list_add(&e->w.list, &mdev->active_ee);
- spin_unlock_irq(&mdev->req_lock);
-
- switch (mdev->net_conf->wire_protocol) {
- case DRBD_PROT_C:
- inc_unacked(mdev);
+ if (dp_flags & DP_SEND_WRITE_ACK) {
+ peer_req->flags |= EE_SEND_WRITE_ACK;
+ inc_unacked(device);
/* corresponding dec_unacked() in e_end_block()
* respective _drbd_clear_done_ee */
- break;
- case DRBD_PROT_B:
+ }
+
+ if (dp_flags & DP_SEND_RECEIVE_ACK) {
/* I really don't like it that the receiver thread
* sends on the msock, but anyways */
- drbd_send_ack(mdev, P_RECV_ACK, e);
- break;
- case DRBD_PROT_A:
- /* nothing to do */
- break;
+ drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
}
- if (mdev->state.pdsk < D_INCONSISTENT) {
+ if (device->state.pdsk < D_INCONSISTENT) {
/* In case we have the only disk of the cluster, */
- drbd_set_out_of_sync(mdev, e->sector, e->size);
- e->flags |= EE_CALL_AL_COMPLETE_IO;
- e->flags &= ~EE_MAY_SET_IN_SYNC;
- drbd_al_begin_io(mdev, e->sector);
+ drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
+ peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
+ peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
+ drbd_al_begin_io(device, &peer_req->i, true);
}
- if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
- return TRUE;
+ err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
+ if (!err)
+ return 0;
- /* drbd_submit_ee currently fails for one reason only:
- * not being able to allocate enough bios.
- * Is dropping the connection going to help? */
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- hlist_del_init(&e->colision);
- spin_unlock_irq(&mdev->req_lock);
- if (e->flags & EE_CALL_AL_COMPLETE_IO)
- drbd_al_complete_io(mdev, e->sector);
+ /* don't care for the reason here */
+ drbd_err(device, "submit failed, triggering re-connect\n");
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ drbd_remove_epoch_entry_interval(device, peer_req);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+ drbd_al_complete_io(device, &peer_req->i);
out_interrupted:
- /* yes, the epoch_size now is imbalanced.
- * but we drop the connection anyways, so we don't have a chance to
- * receive a barrier... atomic_inc(&mdev->epoch_size); */
- put_ldev(mdev);
- drbd_free_ee(mdev, e);
- return FALSE;
+ drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
+ put_ldev(device);
+ drbd_free_peer_req(device, peer_req);
+ return err;
}
/* We may throttle resync, if the lower device seems to be busy,
@@ -1883,139 +2368,181 @@ out_interrupted:
* The current sync rate used here uses only the most recent two step marks,
* to have a short time average so we can react faster.
*/
-int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+{
+ struct lc_element *tmp;
+ bool throttle = true;
+
+ if (!drbd_rs_c_min_rate_throttle(device))
+ return false;
+
+ spin_lock_irq(&device->al_lock);
+ tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
+ if (tmp) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_PRIORITY, &bm_ext->flags))
+ throttle = false;
+ /* Do not slow down if app IO is already waiting for this extent */
+ }
+ spin_unlock_irq(&device->al_lock);
+
+ return throttle;
+}
+
+bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
{
- struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
+ struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
unsigned long db, dt, dbdt;
+ unsigned int c_min_rate;
int curr_events;
- int throttle = 0;
+
+ rcu_read_lock();
+ c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
+ rcu_read_unlock();
/* feature disabled? */
- if (mdev->sync_conf.c_min_rate == 0)
- return 0;
+ if (c_min_rate == 0)
+ return false;
curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
(int)part_stat_read(&disk->part0, sectors[1]) -
- atomic_read(&mdev->rs_sect_ev);
- if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
+ atomic_read(&device->rs_sect_ev);
+ if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
unsigned long rs_left;
int i;
- mdev->rs_last_events = curr_events;
+ device->rs_last_events = curr_events;
/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
* approx. */
- i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
- rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+ i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
- dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
+ if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+ rs_left = device->ov_left;
+ else
+ rs_left = drbd_bm_total_weight(device) - device->rs_failed;
+
+ dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
if (!dt)
dt++;
- db = mdev->rs_mark_left[i] - rs_left;
+ db = device->rs_mark_left[i] - rs_left;
dbdt = Bit2KB(db/dt);
- if (dbdt > mdev->sync_conf.c_min_rate)
- throttle = 1;
+ if (dbdt > c_min_rate)
+ return true;
}
- return throttle;
+ return false;
}
-
-static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
+static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
sector_t sector;
- const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
- struct drbd_epoch_entry *e;
+ sector_t capacity;
+ struct drbd_peer_request *peer_req;
struct digest_info *di = NULL;
int size, verb;
unsigned int fault_type;
- struct p_block_req *p = &mdev->data.rbuf.block_req;
+ struct p_block_req *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+ capacity = drbd_get_capacity(device->this_bdev);
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
- dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+ drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
(unsigned long long)sector, size);
- return FALSE;
+ return -EINVAL;
}
if (sector + (size>>9) > capacity) {
- dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+ drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
(unsigned long long)sector, size);
- return FALSE;
+ return -EINVAL;
}
- if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
+ if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
verb = 1;
- switch (cmd) {
+ switch (pi->cmd) {
case P_DATA_REQUEST:
- drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
+ drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
break;
case P_RS_DATA_REQUEST:
case P_CSUM_RS_REQUEST:
case P_OV_REQUEST:
- drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
+ drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
break;
case P_OV_REPLY:
verb = 0;
- dec_rs_pending(mdev);
- drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
+ dec_rs_pending(device);
+ drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
break;
default:
- dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
- cmdname(cmd));
+ BUG();
}
if (verb && __ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Can not satisfy peer's read request, "
+ drbd_err(device, "Can not satisfy peer's read request, "
"no local data.\n");
/* drain possibly payload */
- return drbd_drain_block(mdev, digest_size);
+ return drbd_drain_block(peer_device, pi->size);
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
- if (!e) {
- put_ldev(mdev);
- return FALSE;
+ peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
+ true /* has real payload */, GFP_NOIO);
+ if (!peer_req) {
+ put_ldev(device);
+ return -ENOMEM;
}
- switch (cmd) {
+ switch (pi->cmd) {
case P_DATA_REQUEST:
- e->w.cb = w_e_end_data_req;
+ peer_req->w.cb = w_e_end_data_req;
fault_type = DRBD_FAULT_DT_RD;
/* application IO, don't drbd_rs_begin_io */
goto submit;
case P_RS_DATA_REQUEST:
- e->w.cb = w_e_end_rsdata_req;
+ peer_req->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
+ /* used in the sector offset progress display */
+ device->bm_resync_fo = BM_SECT_TO_BIT(sector);
break;
case P_OV_REPLY:
case P_CSUM_RS_REQUEST:
fault_type = DRBD_FAULT_RS_RD;
- di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
+ di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
if (!di)
goto out_free_e;
- di->digest_size = digest_size;
+ di->digest_size = pi->size;
di->digest = (((char *)di)+sizeof(struct digest_info));
- e->digest = di;
- e->flags |= EE_HAS_DIGEST;
+ peer_req->digest = di;
+ peer_req->flags |= EE_HAS_DIGEST;
- if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
+ if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
goto out_free_e;
- if (cmd == P_CSUM_RS_REQUEST) {
- D_ASSERT(mdev->agreed_pro_version >= 89);
- e->w.cb = w_e_end_csum_rs_req;
- } else if (cmd == P_OV_REPLY) {
- e->w.cb = w_e_end_ov_reply;
- dec_rs_pending(mdev);
+ if (pi->cmd == P_CSUM_RS_REQUEST) {
+ D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+ peer_req->w.cb = w_e_end_csum_rs_req;
+ /* used in the sector offset progress display */
+ device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ } else if (pi->cmd == P_OV_REPLY) {
+ /* track progress, we may need to throttle */
+ atomic_add(size >> 9, &device->rs_sect_in);
+ peer_req->w.cb = w_e_end_ov_reply;
+ dec_rs_pending(device);
/* drbd_rs_begin_io done when we sent this request,
* but accounting still needs to be done. */
goto submit_for_resync;
@@ -2023,23 +2550,27 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
break;
case P_OV_REQUEST:
- if (mdev->ov_start_sector == ~(sector_t)0 &&
- mdev->agreed_pro_version >= 90) {
- mdev->ov_start_sector = sector;
- mdev->ov_position = sector;
- mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
- dev_info(DEV, "Online Verify start sector: %llu\n",
+ if (device->ov_start_sector == ~(sector_t)0 &&
+ peer_device->connection->agreed_pro_version >= 90) {
+ unsigned long now = jiffies;
+ int i;
+ device->ov_start_sector = sector;
+ device->ov_position = sector;
+ device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
+ device->rs_total = device->ov_left;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = device->ov_left;
+ device->rs_mark_time[i] = now;
+ }
+ drbd_info(device, "Online Verify start sector: %llu\n",
(unsigned long long)sector);
}
- e->w.cb = w_e_end_ov_req;
+ peer_req->w.cb = w_e_end_ov_req;
fault_type = DRBD_FAULT_RS_RD;
break;
default:
- dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
- cmdname(cmd));
- fault_type = DRBD_FAULT_MAX;
- goto out_free_e;
+ BUG();
}
/* Throttle, drbd_rs_begin_io and submit should become asynchronous
@@ -2064,53 +2595,61 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
* we would also throttle its application reads.
* In that case, throttling is done on the SyncTarget only.
*/
- if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
- msleep(100);
- if (drbd_rs_begin_io(mdev, e->sector))
+ if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector))
+ schedule_timeout_uninterruptible(HZ/10);
+ if (drbd_rs_begin_io(device, sector))
goto out_free_e;
submit_for_resync:
- atomic_add(size >> 9, &mdev->rs_sect_ev);
+ atomic_add(size >> 9, &device->rs_sect_ev);
submit:
- inc_unacked(mdev);
- spin_lock_irq(&mdev->req_lock);
- list_add_tail(&e->w.list, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
-
- if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
- return TRUE;
-
- /* drbd_submit_ee currently fails for one reason only:
- * not being able to allocate enough bios.
- * Is dropping the connection going to help? */
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- spin_unlock_irq(&mdev->req_lock);
+ inc_unacked(device);
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
+ return 0;
+
+ /* don't care for the reason here */
+ drbd_err(device, "submit failed, triggering re-connect\n");
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
/* no drbd_rs_complete_io(), we are dropping the connection anyways */
out_free_e:
- put_ldev(mdev);
- drbd_free_ee(mdev, e);
- return FALSE;
+ put_ldev(device);
+ drbd_free_peer_req(device, peer_req);
+ return -EIO;
}
-static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
+/**
+ * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries
+ */
+static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
{
+ struct drbd_device *device = peer_device->device;
int self, peer, rv = -100;
unsigned long ch_self, ch_peer;
+ enum drbd_after_sb_p after_sb_0p;
- self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
- peer = mdev->p_uuid[UI_BITMAP] & 1;
+ self = device->ldev->md.uuid[UI_BITMAP] & 1;
+ peer = device->p_uuid[UI_BITMAP] & 1;
- ch_peer = mdev->p_uuid[UI_SIZE];
- ch_self = mdev->comm_bm_set;
+ ch_peer = device->p_uuid[UI_SIZE];
+ ch_self = device->comm_bm_set;
- switch (mdev->net_conf->after_sb_0p) {
+ rcu_read_lock();
+ after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
+ rcu_read_unlock();
+ switch (after_sb_0p) {
case ASB_CONSENSUS:
case ASB_DISCARD_SECONDARY:
case ASB_CALL_HELPER:
- dev_err(DEV, "Configuration error.\n");
+ case ASB_VIOLENTLY:
+ drbd_err(device, "Configuration error.\n");
break;
case ASB_DISCONNECT:
break;
@@ -2134,18 +2673,18 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
break;
}
/* Else fall through to one of the other strategies... */
- dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
+ drbd_warn(device, "Discard younger/older primary did not find a decision\n"
"Using discard-least-changes instead\n");
case ASB_DISCARD_ZERO_CHG:
if (ch_peer == 0 && ch_self == 0) {
- rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+ rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
? -1 : 1;
break;
} else {
if (ch_peer == 0) { rv = 1; break; }
if (ch_self == 0) { rv = -1; break; }
}
- if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
+ if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
break;
case ASB_DISCARD_LEAST_CHG:
if (ch_self < ch_peer)
@@ -2154,7 +2693,7 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
rv = 1;
else /* ( ch_self == ch_peer ) */
/* Well, then use something else. */
- rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+ rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
? -1 : 1;
break;
case ASB_DISCARD_LOCAL:
@@ -2167,47 +2706,54 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
return rv;
}
-static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
+/**
+ * drbd_asb_recover_1p - Recover after split-brain with one remaining primary
+ */
+static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
{
- int self, peer, hg, rv = -100;
-
- self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
- peer = mdev->p_uuid[UI_BITMAP] & 1;
-
- switch (mdev->net_conf->after_sb_1p) {
+ struct drbd_device *device = peer_device->device;
+ int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_1p;
+
+ rcu_read_lock();
+ after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
+ rcu_read_unlock();
+ switch (after_sb_1p) {
case ASB_DISCARD_YOUNGER_PRI:
case ASB_DISCARD_OLDER_PRI:
case ASB_DISCARD_LEAST_CHG:
case ASB_DISCARD_LOCAL:
case ASB_DISCARD_REMOTE:
- dev_err(DEV, "Configuration error.\n");
+ case ASB_DISCARD_ZERO_CHG:
+ drbd_err(device, "Configuration error.\n");
break;
case ASB_DISCONNECT:
break;
case ASB_CONSENSUS:
- hg = drbd_asb_recover_0p(mdev);
- if (hg == -1 && mdev->state.role == R_SECONDARY)
+ hg = drbd_asb_recover_0p(peer_device);
+ if (hg == -1 && device->state.role == R_SECONDARY)
rv = hg;
- if (hg == 1 && mdev->state.role == R_PRIMARY)
+ if (hg == 1 && device->state.role == R_PRIMARY)
rv = hg;
break;
case ASB_VIOLENTLY:
- rv = drbd_asb_recover_0p(mdev);
+ rv = drbd_asb_recover_0p(peer_device);
break;
case ASB_DISCARD_SECONDARY:
- return mdev->state.role == R_PRIMARY ? 1 : -1;
+ return device->state.role == R_PRIMARY ? 1 : -1;
case ASB_CALL_HELPER:
- hg = drbd_asb_recover_0p(mdev);
- if (hg == -1 && mdev->state.role == R_PRIMARY) {
- self = drbd_set_role(mdev, R_SECONDARY, 0);
+ hg = drbd_asb_recover_0p(peer_device);
+ if (hg == -1 && device->state.role == R_PRIMARY) {
+ enum drbd_state_rv rv2;
+
/* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
* we might be here in C_WF_REPORT_PARAMS which is transient.
* we do not need to wait for the after state change work either. */
- self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
- if (self != SS_SUCCESS) {
- drbd_khelper(mdev, "pri-lost-after-sb");
+ rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (rv2 != SS_SUCCESS) {
+ drbd_khelper(device, "pri-lost-after-sb");
} else {
- dev_warn(DEV, "Successfully gave up primary role.\n");
+ drbd_warn(device, "Successfully gave up primary role.\n");
rv = hg;
}
} else
@@ -2217,14 +2763,19 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
return rv;
}
-static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
+/**
+ * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries
+ */
+static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
{
- int self, peer, hg, rv = -100;
-
- self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
- peer = mdev->p_uuid[UI_BITMAP] & 1;
-
- switch (mdev->net_conf->after_sb_2p) {
+ struct drbd_device *device = peer_device->device;
+ int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_2p;
+
+ rcu_read_lock();
+ after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
+ rcu_read_unlock();
+ switch (after_sb_2p) {
case ASB_DISCARD_YOUNGER_PRI:
case ASB_DISCARD_OLDER_PRI:
case ASB_DISCARD_LEAST_CHG:
@@ -2232,24 +2783,27 @@ static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
case ASB_DISCARD_REMOTE:
case ASB_CONSENSUS:
case ASB_DISCARD_SECONDARY:
- dev_err(DEV, "Configuration error.\n");
+ case ASB_DISCARD_ZERO_CHG:
+ drbd_err(device, "Configuration error.\n");
break;
case ASB_VIOLENTLY:
- rv = drbd_asb_recover_0p(mdev);
+ rv = drbd_asb_recover_0p(peer_device);
break;
case ASB_DISCONNECT:
break;
case ASB_CALL_HELPER:
- hg = drbd_asb_recover_0p(mdev);
+ hg = drbd_asb_recover_0p(peer_device);
if (hg == -1) {
+ enum drbd_state_rv rv2;
+
/* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
* we might be here in C_WF_REPORT_PARAMS which is transient.
* we do not need to wait for the after state change work either. */
- self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
- if (self != SS_SUCCESS) {
- drbd_khelper(mdev, "pri-lost-after-sb");
+ rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (rv2 != SS_SUCCESS) {
+ drbd_khelper(device, "pri-lost-after-sb");
} else {
- dev_warn(DEV, "Successfully gave up primary role.\n");
+ drbd_warn(device, "Successfully gave up primary role.\n");
rv = hg;
}
} else
@@ -2259,14 +2813,14 @@ static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
return rv;
}
-static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
+static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
u64 bits, u64 flags)
{
if (!uuid) {
- dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
+ drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
return;
}
- dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+ drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
text,
(unsigned long long)uuid[UI_CURRENT],
(unsigned long long)uuid[UI_BITMAP],
@@ -2285,14 +2839,16 @@ static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
-2 C_SYNC_TARGET set BitMap
-100 after split brain, disconnect
-1000 unrelated data
+-1091 requires proto 91
+-1096 requires proto 96
*/
-static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
+static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local)
{
u64 self, peer;
int i, j;
- self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
- peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+ self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+ peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
*rule_nr = 10;
if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
@@ -2311,44 +2867,46 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
if (self == peer) {
int rct, dc; /* roles at crash time */
- if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+ if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
- if (mdev->agreed_pro_version < 91)
- return -1001;
+ if (first_peer_device(device)->connection->agreed_pro_version < 91)
+ return -1091;
- if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
- (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
- dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
- drbd_uuid_set_bm(mdev, 0UL);
+ if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
+ (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
+ drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
+ drbd_uuid_move_history(device);
+ device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+ device->ldev->md.uuid[UI_BITMAP] = 0;
- drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
- mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+ drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+ device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
*rule_nr = 34;
} else {
- dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
+ drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
*rule_nr = 36;
}
return 1;
}
- if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
+ if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
- if (mdev->agreed_pro_version < 91)
- return -1001;
+ if (first_peer_device(device)->connection->agreed_pro_version < 91)
+ return -1091;
- if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
- (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
- dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+ if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
+ (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
+ drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
- mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
- mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
- mdev->p_uuid[UI_BITMAP] = 0UL;
+ device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
+ device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
+ device->p_uuid[UI_BITMAP] = 0UL;
- drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+ drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
*rule_nr = 35;
} else {
- dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
+ drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
*rule_nr = 37;
}
@@ -2356,8 +2914,8 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
}
/* Common power [off|failure] */
- rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
- (mdev->p_uuid[UI_FLAGS] & 2);
+ rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
+ (device->p_uuid[UI_FLAGS] & 2);
/* lowest bit is set when we were primary,
* next bit (weight 2) is set when peer was primary */
*rule_nr = 40;
@@ -2367,67 +2925,72 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
case 1: /* self_pri && !peer_pri */ return 1;
case 2: /* !self_pri && peer_pri */ return -1;
case 3: /* self_pri && peer_pri */
- dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+ dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
return dc ? -1 : 1;
}
}
*rule_nr = 50;
- peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+ peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
if (self == peer)
return -1;
*rule_nr = 51;
- peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
+ peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
if (self == peer) {
- self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
- peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
- if (self == peer) {
+ if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+ (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
+ (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
+ peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
/* The last P_SYNC_UUID did not get though. Undo the last start of
resync as sync source modifications of the peer's UUIDs. */
- if (mdev->agreed_pro_version < 91)
- return -1001;
+ if (first_peer_device(device)->connection->agreed_pro_version < 91)
+ return -1091;
+
+ device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
+ device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
+
+ drbd_info(device, "Lost last syncUUID packet, corrected:\n");
+ drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
- mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
- mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
return -1;
}
}
*rule_nr = 60;
- self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+ self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
- peer = mdev->p_uuid[i] & ~((u64)1);
+ peer = device->p_uuid[i] & ~((u64)1);
if (self == peer)
return -2;
}
*rule_nr = 70;
- self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
- peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+ self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+ peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
if (self == peer)
return 1;
*rule_nr = 71;
- self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+ self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
if (self == peer) {
- self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
- peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
- if (self == peer) {
+ if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+ (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
+ (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
+ self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
/* The last P_SYNC_UUID did not get though. Undo the last start of
resync as sync source modifications of our UUIDs. */
- if (mdev->agreed_pro_version < 91)
- return -1001;
+ if (first_peer_device(device)->connection->agreed_pro_version < 91)
+ return -1091;
- _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
- _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
+ __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
+ __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
- dev_info(DEV, "Undid last start of resync:\n");
-
- drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
- mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+ drbd_info(device, "Last syncUUID did not get through, corrected:\n");
+ drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+ device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
return 1;
}
@@ -2435,24 +2998,24 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
*rule_nr = 80;
- peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+ peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
- self = mdev->ldev->md.uuid[i] & ~((u64)1);
+ self = device->ldev->md.uuid[i] & ~((u64)1);
if (self == peer)
return 2;
}
*rule_nr = 90;
- self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
- peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+ self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+ peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
if (self == peer && self != ((u64)0))
return 100;
*rule_nr = 100;
for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
- self = mdev->ldev->md.uuid[i] & ~((u64)1);
+ self = device->ldev->md.uuid[i] & ~((u64)1);
for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
- peer = mdev->p_uuid[j] & ~((u64)1);
+ peer = device->p_uuid[j] & ~((u64)1);
if (self == peer)
return -100;
}
@@ -2464,32 +3027,38 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
/* drbd_sync_handshake() returns the new conn state on success, or
CONN_MASK (-1) on failure.
*/
-static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
+static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
+ enum drbd_role peer_role,
enum drbd_disk_state peer_disk) __must_hold(local)
{
- int hg, rule_nr;
+ struct drbd_device *device = peer_device->device;
enum drbd_conns rv = C_MASK;
enum drbd_disk_state mydisk;
+ struct net_conf *nc;
+ int hg, rule_nr, rr_conflict, tentative;
- mydisk = mdev->state.disk;
+ mydisk = device->state.disk;
if (mydisk == D_NEGOTIATING)
- mydisk = mdev->new_state_tmp.disk;
+ mydisk = device->new_state_tmp.disk;
+
+ drbd_info(device, "drbd_sync_handshake:\n");
- dev_info(DEV, "drbd_sync_handshake:\n");
- drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
- drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
- mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
+ drbd_uuid_dump(device, "peer", device->p_uuid,
+ device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
- hg = drbd_uuid_compare(mdev, &rule_nr);
+ hg = drbd_uuid_compare(device, &rule_nr);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
- dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+ drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
if (hg == -1000) {
- dev_alert(DEV, "Unrelated data, aborting!\n");
+ drbd_alert(device, "Unrelated data, aborting!\n");
return C_MASK;
}
- if (hg == -1001) {
- dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
+ if (hg < -1000) {
+ drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
return C_MASK;
}
@@ -2499,35 +3068,38 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
hg = mydisk > D_INCONSISTENT ? 1 : -1;
if (f)
hg = hg*2;
- dev_info(DEV, "Becoming sync %s due to disk states.\n",
+ drbd_info(device, "Becoming sync %s due to disk states.\n",
hg > 0 ? "source" : "target");
}
if (abs(hg) == 100)
- drbd_khelper(mdev, "initial-split-brain");
+ drbd_khelper(device, "initial-split-brain");
- if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
- int pcount = (mdev->state.role == R_PRIMARY)
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+
+ if (hg == 100 || (hg == -100 && nc->always_asbp)) {
+ int pcount = (device->state.role == R_PRIMARY)
+ (peer_role == R_PRIMARY);
int forced = (hg == -100);
switch (pcount) {
case 0:
- hg = drbd_asb_recover_0p(mdev);
+ hg = drbd_asb_recover_0p(peer_device);
break;
case 1:
- hg = drbd_asb_recover_1p(mdev);
+ hg = drbd_asb_recover_1p(peer_device);
break;
case 2:
- hg = drbd_asb_recover_2p(mdev);
+ hg = drbd_asb_recover_2p(peer_device);
break;
}
if (abs(hg) < 100) {
- dev_warn(DEV, "Split-Brain detected, %d primaries, "
+ drbd_warn(device, "Split-Brain detected, %d primaries, "
"automatically solved. Sync from %s node\n",
pcount, (hg < 0) ? "peer" : "this");
if (forced) {
- dev_warn(DEV, "Doing a full sync, since"
+ drbd_warn(device, "Doing a full sync, since"
" UUIDs where ambiguous.\n");
hg = hg*2;
}
@@ -2535,60 +3107,64 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
}
if (hg == -100) {
- if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
+ if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
hg = -1;
- if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
+ if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
hg = 1;
if (abs(hg) < 100)
- dev_warn(DEV, "Split-Brain detected, manually solved. "
+ drbd_warn(device, "Split-Brain detected, manually solved. "
"Sync from %s node\n",
(hg < 0) ? "peer" : "this");
}
+ rr_conflict = nc->rr_conflict;
+ tentative = nc->tentative;
+ rcu_read_unlock();
if (hg == -100) {
/* FIXME this log message is not correct if we end up here
* after an attempted attach on a diskless node.
* We just refuse to attach -- well, we drop the "connection"
* to that disk, in a way... */
- dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
- drbd_khelper(mdev, "split-brain");
+ drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
+ drbd_khelper(device, "split-brain");
return C_MASK;
}
if (hg > 0 && mydisk <= D_INCONSISTENT) {
- dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
+ drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
return C_MASK;
}
if (hg < 0 && /* by intention we do not use mydisk here. */
- mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
- switch (mdev->net_conf->rr_conflict) {
+ device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
+ switch (rr_conflict) {
case ASB_CALL_HELPER:
- drbd_khelper(mdev, "pri-lost");
+ drbd_khelper(device, "pri-lost");
/* fall through */
case ASB_DISCONNECT:
- dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
+ drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
return C_MASK;
case ASB_VIOLENTLY:
- dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
+ drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
"assumption\n");
}
}
- if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) {
+ if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
if (hg == 0)
- dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
+ drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
else
- dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
+ drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
abs(hg) >= 2 ? "full" : "bit-map based");
return C_MASK;
}
if (abs(hg) >= 2) {
- dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
- if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
+ drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
+ if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
+ BM_LOCKED_SET_ALLOWED))
return C_MASK;
}
@@ -2598,42 +3174,38 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
rv = C_WF_BITMAP_T;
} else {
rv = C_CONNECTED;
- if (drbd_bm_total_weight(mdev)) {
- dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
- drbd_bm_total_weight(mdev));
+ if (drbd_bm_total_weight(device)) {
+ drbd_info(device, "No resync, but %lu bits in bitmap!\n",
+ drbd_bm_total_weight(device));
}
}
return rv;
}
-/* returns 1 if invalid */
-static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
+static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
{
/* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
- if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
- (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
- return 0;
+ if (peer == ASB_DISCARD_REMOTE)
+ return ASB_DISCARD_LOCAL;
/* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
- if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
- self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
- return 1;
+ if (peer == ASB_DISCARD_LOCAL)
+ return ASB_DISCARD_REMOTE;
/* everything else is valid if they are equal on both sides. */
- if (peer == self)
- return 0;
-
- /* everything es is invalid. */
- return 1;
+ return peer;
}
-static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_protocol *p = &mdev->data.rbuf.protocol;
- int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
- int p_want_lose, p_two_primaries, cf;
- char p_integrity_alg[SHARED_SECRET_MAX] = "";
+ struct p_protocol *p = pi->data;
+ enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+ int p_proto, p_discard_my_data, p_two_primaries, cf;
+ struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
+ char integrity_alg[SHARED_SECRET_MAX] = "";
+ struct crypto_hash *peer_integrity_tfm = NULL;
+ void *int_dig_in = NULL, *int_dig_vv = NULL;
p_proto = be32_to_cpu(p->protocol);
p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
@@ -2641,63 +3213,138 @@ static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsig
p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
p_two_primaries = be32_to_cpu(p->two_primaries);
cf = be32_to_cpu(p->conn_flags);
- p_want_lose = cf & CF_WANT_LOSE;
+ p_discard_my_data = cf & CF_DISCARD_MY_DATA;
- clear_bit(CONN_DRY_RUN, &mdev->flags);
+ if (connection->agreed_pro_version >= 87) {
+ int err;
- if (cf & CF_DRY_RUN)
- set_bit(CONN_DRY_RUN, &mdev->flags);
-
- if (p_proto != mdev->net_conf->wire_protocol) {
- dev_err(DEV, "incompatible communication protocols\n");
- goto disconnect;
+ if (pi->size > sizeof(integrity_alg))
+ return -EIO;
+ err = drbd_recv_all(connection, integrity_alg, pi->size);
+ if (err)
+ return err;
+ integrity_alg[SHARED_SECRET_MAX - 1] = 0;
}
- if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
- dev_err(DEV, "incompatible after-sb-0pri settings\n");
- goto disconnect;
- }
+ if (pi->cmd != P_PROTOCOL_UPDATE) {
+ clear_bit(CONN_DRY_RUN, &connection->flags);
- if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
- dev_err(DEV, "incompatible after-sb-1pri settings\n");
- goto disconnect;
- }
+ if (cf & CF_DRY_RUN)
+ set_bit(CONN_DRY_RUN, &connection->flags);
- if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
- dev_err(DEV, "incompatible after-sb-2pri settings\n");
- goto disconnect;
- }
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
- if (p_want_lose && mdev->net_conf->want_lose) {
- dev_err(DEV, "both sides have the 'want_lose' flag set\n");
- goto disconnect;
- }
+ if (p_proto != nc->wire_protocol) {
+ drbd_err(connection, "incompatible %s settings\n", "protocol");
+ goto disconnect_rcu_unlock;
+ }
- if (p_two_primaries != mdev->net_conf->two_primaries) {
- dev_err(DEV, "incompatible setting of the two-primaries options\n");
- goto disconnect;
+ if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
+ drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
+ drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
+ drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_discard_my_data && nc->discard_my_data) {
+ drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_two_primaries != nc->two_primaries) {
+ drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (strcmp(integrity_alg, nc->integrity_alg)) {
+ drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
+ goto disconnect_rcu_unlock;
+ }
+
+ rcu_read_unlock();
}
- if (mdev->agreed_pro_version >= 87) {
- unsigned char *my_alg = mdev->net_conf->integrity_alg;
+ if (integrity_alg[0]) {
+ int hash_size;
+
+ /*
+ * We can only change the peer data integrity algorithm
+ * here. Changing our own data integrity algorithm
+ * requires that we send a P_PROTOCOL_UPDATE packet at
+ * the same time; otherwise, the peer has no way to
+ * tell between which packets the algorithm should
+ * change.
+ */
- if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
- return FALSE;
+ peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
+ if (!peer_integrity_tfm) {
+ drbd_err(connection, "peer data-integrity-alg %s not supported\n",
+ integrity_alg);
+ goto disconnect;
+ }
- p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
- if (strcmp(p_integrity_alg, my_alg)) {
- dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
+ hash_size = crypto_hash_digestsize(peer_integrity_tfm);
+ int_dig_in = kmalloc(hash_size, GFP_KERNEL);
+ int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
+ if (!(int_dig_in && int_dig_vv)) {
+ drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
goto disconnect;
}
- dev_info(DEV, "data-integrity-alg: %s\n",
- my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
}
- return TRUE;
+ new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ drbd_err(connection, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+
+ mutex_lock(&connection->data.mutex);
+ mutex_lock(&connection->resource->conf_update);
+ old_net_conf = connection->net_conf;
+ *new_net_conf = *old_net_conf;
+
+ new_net_conf->wire_protocol = p_proto;
+ new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
+ new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
+ new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
+ new_net_conf->two_primaries = p_two_primaries;
+
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+ mutex_unlock(&connection->resource->conf_update);
+ mutex_unlock(&connection->data.mutex);
+
+ crypto_free_hash(connection->peer_integrity_tfm);
+ kfree(connection->int_dig_in);
+ kfree(connection->int_dig_vv);
+ connection->peer_integrity_tfm = peer_integrity_tfm;
+ connection->int_dig_in = int_dig_in;
+ connection->int_dig_vv = int_dig_vv;
+ if (strcmp(old_net_conf->integrity_alg, integrity_alg))
+ drbd_info(connection, "peer data-integrity-alg: %s\n",
+ integrity_alg[0] ? integrity_alg : "(none)");
+
+ synchronize_rcu();
+ kfree(old_net_conf);
+ return 0;
+
+disconnect_rcu_unlock:
+ rcu_read_unlock();
disconnect:
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return FALSE;
+ crypto_free_hash(peer_integrity_tfm);
+ kfree(int_dig_in);
+ kfree(int_dig_vv);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
/* helper function
@@ -2705,7 +3352,8 @@ disconnect:
* return: NULL (alg name was "")
* ERR_PTR(error) if something goes wrong
* or the crypto hash ptr, if it worked out ok. */
-struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
+static
+struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
const char *alg, const char *name)
{
struct crypto_hash *tfm;
@@ -2715,28 +3363,70 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm)) {
- dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
+ drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
alg, name, PTR_ERR(tfm));
return tfm;
}
- if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
- crypto_free_hash(tfm);
- dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
- return ERR_PTR(-EINVAL);
- }
return tfm;
}
-static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
+static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
{
- int ok = TRUE;
- struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
+ void *buffer = connection->data.rbuf;
+ int size = pi->size;
+
+ while (size) {
+ int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
+ s = drbd_recv(connection, buffer, s);
+ if (s <= 0) {
+ if (s < 0)
+ return s;
+ break;
+ }
+ size -= s;
+ }
+ if (size)
+ return -EIO;
+ return 0;
+}
+
+/*
+ * config_unknown_volume - device configuration command for unknown volume
+ *
+ * When a device is added to an existing connection, the node on which the
+ * device is added first will send configuration commands to its peer but the
+ * peer will not know about the device yet. It will warn and ignore these
+ * commands. Once the device is added on the second node, the second node will
+ * send the same device configuration commands, but in the other direction.
+ *
+ * (We can also end up here if drbd is misconfigured.)
+ */
+static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
+{
+ drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
+ cmdname(pi->cmd), pi->vnr);
+ return ignore_remaining_packet(connection, pi);
+}
+
+static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_rs_param_95 *p;
unsigned int header_size, data_size, exp_max_sz;
struct crypto_hash *verify_tfm = NULL;
struct crypto_hash *csums_tfm = NULL;
- const int apv = mdev->agreed_pro_version;
- int *rs_plan_s = NULL;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
+ const int apv = connection->agreed_pro_version;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
int fifo_size = 0;
+ int err;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
@@ -2744,66 +3434,84 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
: apv <= 94 ? sizeof(struct p_rs_param_89)
: /* apv >= 95 */ sizeof(struct p_rs_param_95);
- if (packet_size > exp_max_sz) {
- dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
- packet_size, exp_max_sz);
- return FALSE;
+ if (pi->size > exp_max_sz) {
+ drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
+ pi->size, exp_max_sz);
+ return -EIO;
}
if (apv <= 88) {
- header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
- data_size = packet_size - header_size;
+ header_size = sizeof(struct p_rs_param);
+ data_size = pi->size - header_size;
} else if (apv <= 94) {
- header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
- data_size = packet_size - header_size;
- D_ASSERT(data_size == 0);
+ header_size = sizeof(struct p_rs_param_89);
+ data_size = pi->size - header_size;
+ D_ASSERT(device, data_size == 0);
} else {
- header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
- data_size = packet_size - header_size;
- D_ASSERT(data_size == 0);
+ header_size = sizeof(struct p_rs_param_95);
+ data_size = pi->size - header_size;
+ D_ASSERT(device, data_size == 0);
}
/* initialize verify_alg and csums_alg */
+ p = pi->data;
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
- if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
- return FALSE;
+ err = drbd_recv_all(peer_device->connection, p, header_size);
+ if (err)
+ return err;
+
+ mutex_lock(&connection->resource->conf_update);
+ old_net_conf = peer_device->connection->net_conf;
+ if (get_ldev(device)) {
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ put_ldev(device);
+ mutex_unlock(&connection->resource->conf_update);
+ drbd_err(device, "Allocation of new disk_conf failed\n");
+ return -ENOMEM;
+ }
+
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
- mdev->sync_conf.rate = be32_to_cpu(p->rate);
+ new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
+ }
if (apv >= 88) {
if (apv == 88) {
- if (data_size > SHARED_SECRET_MAX) {
- dev_err(DEV, "verify-alg too long, "
- "peer wants %u, accepting only %u byte\n",
- data_size, SHARED_SECRET_MAX);
- return FALSE;
+ if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+ drbd_err(device, "verify-alg of wrong size, "
+ "peer wants %u, accepting only up to %u byte\n",
+ data_size, SHARED_SECRET_MAX);
+ err = -EIO;
+ goto reconnect;
}
- if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
- return FALSE;
-
+ err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
+ if (err)
+ goto reconnect;
/* we expect NUL terminated string */
/* but just in case someone tries to be evil */
- D_ASSERT(p->verify_alg[data_size-1] == 0);
+ D_ASSERT(device, p->verify_alg[data_size-1] == 0);
p->verify_alg[data_size-1] = 0;
} else /* apv >= 89 */ {
/* we still expect NUL terminated strings */
/* but just in case someone tries to be evil */
- D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
- D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+ D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+ D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
p->verify_alg[SHARED_SECRET_MAX-1] = 0;
p->csums_alg[SHARED_SECRET_MAX-1] = 0;
}
- if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
- if (mdev->state.conn == C_WF_REPORT_PARAMS) {
- dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
- mdev->sync_conf.verify_alg, p->verify_alg);
+ if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
+ if (device->state.conn == C_WF_REPORT_PARAMS) {
+ drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+ old_net_conf->verify_alg, p->verify_alg);
goto disconnect;
}
- verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
+ verify_tfm = drbd_crypto_alloc_digest_safe(device,
p->verify_alg, "verify-alg");
if (IS_ERR(verify_tfm)) {
verify_tfm = NULL;
@@ -2811,13 +3519,13 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
}
}
- if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
- if (mdev->state.conn == C_WF_REPORT_PARAMS) {
- dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
- mdev->sync_conf.csums_alg, p->csums_alg);
+ if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
+ if (device->state.conn == C_WF_REPORT_PARAMS) {
+ drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+ old_net_conf->csums_alg, p->csums_alg);
goto disconnect;
}
- csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
+ csums_tfm = drbd_crypto_alloc_digest_safe(device,
p->csums_alg, "csums-alg");
if (IS_ERR(csums_tfm)) {
csums_tfm = NULL;
@@ -2825,67 +3533,95 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
}
}
- if (apv > 94) {
- mdev->sync_conf.rate = be32_to_cpu(p->rate);
- mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
- mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
- mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
- mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
-
- fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
- if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
- rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
- if (!rs_plan_s) {
- dev_err(DEV, "kmalloc of fifo_buffer failed");
+ if (apv > 94 && new_disk_conf) {
+ new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+ new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
+ new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
+ new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
+
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != device->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
+ drbd_err(device, "kmalloc of fifo_buffer failed");
+ put_ldev(device);
goto disconnect;
}
}
}
- spin_lock(&mdev->peer_seq_lock);
- /* lock against drbd_nl_syncer_conf() */
- if (verify_tfm) {
- strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
- mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
- crypto_free_hash(mdev->verify_tfm);
- mdev->verify_tfm = verify_tfm;
- dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
- }
- if (csums_tfm) {
- strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
- mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
- crypto_free_hash(mdev->csums_tfm);
- mdev->csums_tfm = csums_tfm;
- dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
- }
- if (fifo_size != mdev->rs_plan_s.size) {
- kfree(mdev->rs_plan_s.values);
- mdev->rs_plan_s.values = rs_plan_s;
- mdev->rs_plan_s.size = fifo_size;
- mdev->rs_planed = 0;
+ if (verify_tfm || csums_tfm) {
+ new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ drbd_err(device, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+
+ *new_net_conf = *old_net_conf;
+
+ if (verify_tfm) {
+ strcpy(new_net_conf->verify_alg, p->verify_alg);
+ new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
+ crypto_free_hash(peer_device->connection->verify_tfm);
+ peer_device->connection->verify_tfm = verify_tfm;
+ drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
+ }
+ if (csums_tfm) {
+ strcpy(new_net_conf->csums_alg, p->csums_alg);
+ new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
+ crypto_free_hash(peer_device->connection->csums_tfm);
+ peer_device->connection->csums_tfm = csums_tfm;
+ drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
+ }
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
}
- spin_unlock(&mdev->peer_seq_lock);
}
- return ok;
+ if (new_disk_conf) {
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ put_ldev(device);
+ }
+
+ if (new_plan) {
+ old_plan = device->rs_plan_s;
+ rcu_assign_pointer(device->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&connection->resource->conf_update);
+ synchronize_rcu();
+ if (new_net_conf)
+ kfree(old_net_conf);
+ kfree(old_disk_conf);
+ kfree(old_plan);
+
+ return 0;
+
+reconnect:
+ if (new_disk_conf) {
+ put_ldev(device);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&connection->resource->conf_update);
+ return -EIO;
+
disconnect:
+ kfree(new_plan);
+ if (new_disk_conf) {
+ put_ldev(device);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&connection->resource->conf_update);
/* just for completeness: actually not needed,
* as this is not reached if csums_tfm was ok. */
crypto_free_hash(csums_tfm);
/* but free the verify_tfm again, if csums_tfm did not work out */
crypto_free_hash(verify_tfm);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return FALSE;
-}
-
-static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
-{
- /* sorry, we currently have no working implementation
- * of distributed TCQ */
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
/* warn if the arguments differ by more than 12.5% */
-static void warn_if_differ_considerably(struct drbd_conf *mdev,
+static void warn_if_differ_considerably(struct drbd_device *device,
const char *s, sector_t a, sector_t b)
{
sector_t d;
@@ -2893,180 +3629,211 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
return;
d = (a > b) ? (a - b) : (b - a);
if (d > (a>>3) || d > (b>>3))
- dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
+ drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
(unsigned long long)a, (unsigned long long)b);
}
-static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_sizes *p = &mdev->data.rbuf.sizes;
- enum determine_dev_size dd = unchanged;
- unsigned int max_seg_s;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_sizes *p = pi->data;
+ enum determine_dev_size dd = DS_UNCHANGED;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf;
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
p_size = be64_to_cpu(p->d_size);
p_usize = be64_to_cpu(p->u_size);
- if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
- dev_err(DEV, "some backing storage is needed\n");
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return FALSE;
- }
-
/* just store the peer's disk size for now.
* we still need to figure out whether we accept that. */
- mdev->p_size = p_size;
+ device->p_size = p_size;
+
+ if (get_ldev(device)) {
+ rcu_read_lock();
+ my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
- if (get_ldev(mdev)) {
- warn_if_differ_considerably(mdev, "lower level device sizes",
- p_size, drbd_get_max_capacity(mdev->ldev));
- warn_if_differ_considerably(mdev, "user requested size",
- p_usize, mdev->ldev->dc.disk_size);
+ warn_if_differ_considerably(device, "lower level device sizes",
+ p_size, drbd_get_max_capacity(device->ldev));
+ warn_if_differ_considerably(device, "user requested size",
+ p_usize, my_usize);
/* if this is the first connect, or an otherwise expected
* param exchange, choose the minimum */
- if (mdev->state.conn == C_WF_REPORT_PARAMS)
- p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
- p_usize);
-
- my_usize = mdev->ldev->dc.disk_size;
-
- if (mdev->ldev->dc.disk_size != p_usize) {
- mdev->ldev->dc.disk_size = p_usize;
- dev_info(DEV, "Peer sets u_size to %lu sectors\n",
- (unsigned long)mdev->ldev->dc.disk_size);
- }
+ if (device->state.conn == C_WF_REPORT_PARAMS)
+ p_usize = min_not_zero(my_usize, p_usize);
/* Never shrink a device with usable data during connect.
But allow online shrinking if we are connected. */
- if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
- drbd_get_capacity(mdev->this_bdev) &&
- mdev->state.disk >= D_OUTDATED &&
- mdev->state.conn < C_CONNECTED) {
- dev_err(DEV, "The peer's disk size is too small!\n");
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- mdev->ldev->dc.disk_size = my_usize;
- put_ldev(mdev);
- return FALSE;
+ if (drbd_new_dev_size(device, device->ldev, p_usize, 0) <
+ drbd_get_capacity(device->this_bdev) &&
+ device->state.disk >= D_OUTDATED &&
+ device->state.conn < C_CONNECTED) {
+ drbd_err(device, "The peer's disk size is too small!\n");
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ put_ldev(device);
+ return -EIO;
}
- put_ldev(mdev);
+
+ if (my_usize != p_usize) {
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ drbd_err(device, "Allocation of new disk_conf failed\n");
+ put_ldev(device);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&connection->resource->conf_update);
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = p_usize;
+
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&connection->resource->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+
+ drbd_info(device, "Peer sets u_size to %lu sectors\n",
+ (unsigned long)my_usize);
+ }
+
+ put_ldev(device);
}
-#undef min_not_zero
+
+ device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
+ drbd_reconsider_max_bio_size(device);
+ /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
+ In case we cleared the QUEUE_FLAG_DISCARD from our queue in
+ drbd_reconsider_max_bio_size(), we can be sure that after
+ drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
ddsf = be16_to_cpu(p->dds_flags);
- if (get_ldev(mdev)) {
- dd = drbd_determin_dev_size(mdev, ddsf);
- put_ldev(mdev);
- if (dd == dev_size_error)
- return FALSE;
- drbd_md_sync(mdev);
+ if (get_ldev(device)) {
+ dd = drbd_determine_dev_size(device, ddsf, NULL);
+ put_ldev(device);
+ if (dd == DS_ERROR)
+ return -EIO;
+ drbd_md_sync(device);
} else {
/* I am diskless, need to accept the peer's size. */
- drbd_set_my_capacity(mdev, p_size);
+ drbd_set_my_capacity(device, p_size);
}
- if (get_ldev(mdev)) {
- if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
- mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+ if (get_ldev(device)) {
+ if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
+ device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
ldsc = 1;
}
- if (mdev->agreed_pro_version < 94)
- max_seg_s = be32_to_cpu(p->max_segment_size);
- else if (mdev->agreed_pro_version == 94)
- max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
- else /* drbd 8.3.8 onwards */
- max_seg_s = DRBD_MAX_SEGMENT_SIZE;
-
- if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
- drbd_setup_queue_param(mdev, max_seg_s);
-
- drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
- put_ldev(mdev);
+ put_ldev(device);
}
- if (mdev->state.conn > C_WF_REPORT_PARAMS) {
+ if (device->state.conn > C_WF_REPORT_PARAMS) {
if (be64_to_cpu(p->c_size) !=
- drbd_get_capacity(mdev->this_bdev) || ldsc) {
+ drbd_get_capacity(device->this_bdev) || ldsc) {
/* we have different sizes, probably peer
* needs to know my new size... */
- drbd_send_sizes(mdev, 0, ddsf);
+ drbd_send_sizes(peer_device, 0, ddsf);
}
- if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
- (dd == grew && mdev->state.conn == C_CONNECTED)) {
- if (mdev->state.pdsk >= D_INCONSISTENT &&
- mdev->state.disk >= D_INCONSISTENT) {
+ if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
+ (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
+ if (device->state.pdsk >= D_INCONSISTENT &&
+ device->state.disk >= D_INCONSISTENT) {
if (ddsf & DDSF_NO_RESYNC)
- dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
+ drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
else
- resync_after_online_grow(mdev);
+ resync_after_online_grow(device);
} else
- set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+ set_bit(RESYNC_AFTER_NEG, &device->flags);
}
}
- return TRUE;
+ return 0;
}
-static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_uuids *p = &mdev->data.rbuf.uuids;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_uuids *p = pi->data;
u64 *p_uuid;
- int i;
+ int i, updated_uuids = 0;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
+ if (!p_uuid) {
+ drbd_err(device, "kmalloc of p_uuid failed\n");
+ return false;
+ }
for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
p_uuid[i] = be64_to_cpu(p->uuid[i]);
- kfree(mdev->p_uuid);
- mdev->p_uuid = p_uuid;
+ kfree(device->p_uuid);
+ device->p_uuid = p_uuid;
- if (mdev->state.conn < C_CONNECTED &&
- mdev->state.disk < D_INCONSISTENT &&
- mdev->state.role == R_PRIMARY &&
- (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
- dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
- (unsigned long long)mdev->ed_uuid);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return FALSE;
+ if (device->state.conn < C_CONNECTED &&
+ device->state.disk < D_INCONSISTENT &&
+ device->state.role == R_PRIMARY &&
+ (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+ drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
+ (unsigned long long)device->ed_uuid);
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
- if (get_ldev(mdev)) {
+ if (get_ldev(device)) {
int skip_initial_sync =
- mdev->state.conn == C_CONNECTED &&
- mdev->agreed_pro_version >= 90 &&
- mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
+ device->state.conn == C_CONNECTED &&
+ peer_device->connection->agreed_pro_version >= 90 &&
+ device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
(p_uuid[UI_FLAGS] & 8);
if (skip_initial_sync) {
- dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
- drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
- "clear_n_write from receive_uuids");
- _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
- _drbd_uuid_set(mdev, UI_BITMAP, 0);
- _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+ drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
+ drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+ "clear_n_write from receive_uuids",
+ BM_LOCKED_TEST_ALLOWED);
+ _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
+ _drbd_uuid_set(device, UI_BITMAP, 0);
+ _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
CS_VERBOSE, NULL);
- drbd_md_sync(mdev);
+ drbd_md_sync(device);
+ updated_uuids = 1;
}
- put_ldev(mdev);
- } else if (mdev->state.disk < D_INCONSISTENT &&
- mdev->state.role == R_PRIMARY) {
+ put_ldev(device);
+ } else if (device->state.disk < D_INCONSISTENT &&
+ device->state.role == R_PRIMARY) {
/* I am a diskless primary, the peer just created a new current UUID
for me. */
- drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
+ updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
}
/* Before we test for the disk state, we should wait until an eventually
ongoing cluster wide state change is finished. That is important if
we are primary and are detaching from our disk. We need to see the
new disk state... */
- wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
- if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
- drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
+ mutex_lock(device->state_mutex);
+ mutex_unlock(device->state_mutex);
+ if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
+ updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
- return TRUE;
+ if (updated_uuids)
+ drbd_print_uuids(device, "receiver updated UUIDs to");
+
+ return 0;
}
/**
@@ -3078,6 +3845,7 @@ static union drbd_state convert_state(union drbd_state ps)
union drbd_state ms;
static enum drbd_conns c_tab[] = {
+ [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
[C_CONNECTED] = C_CONNECTED,
[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
@@ -3099,56 +3867,105 @@ static union drbd_state convert_state(union drbd_state ps)
return ms;
}
-static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_req_state *p = &mdev->data.rbuf.req_state;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_req_state *p = pi->data;
union drbd_state mask, val;
- int rv;
+ enum drbd_state_rv rv;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
mask.i = be32_to_cpu(p->mask);
val.i = be32_to_cpu(p->val);
- if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
- test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
- drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
- return TRUE;
+ if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
+ mutex_is_locked(device->state_mutex)) {
+ drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
+ return 0;
}
mask = convert_state(mask);
val = convert_state(val);
- rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
+ rv = drbd_change_state(device, CS_VERBOSE, mask, val);
+ drbd_send_sr_reply(peer_device, rv);
+
+ drbd_md_sync(device);
+
+ return 0;
+}
+
+static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct p_req_state *p = pi->data;
+ union drbd_state mask, val;
+ enum drbd_state_rv rv;
- drbd_send_sr_reply(mdev, rv);
- drbd_md_sync(mdev);
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
- return TRUE;
+ if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
+ mutex_is_locked(&connection->cstate_mutex)) {
+ conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
+ return 0;
+ }
+
+ mask = convert_state(mask);
+ val = convert_state(val);
+
+ rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
+ conn_send_sr_reply(connection, rv);
+
+ return 0;
}
-static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_state *p = &mdev->data.rbuf.state;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_state *p = pi->data;
union drbd_state os, ns, peer_state;
enum drbd_disk_state real_peer_disk;
enum chg_state_flags cs_flags;
int rv;
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
peer_state.i = be32_to_cpu(p->state);
real_peer_disk = peer_state.disk;
if (peer_state.disk == D_NEGOTIATING) {
- real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
- dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
+ real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
+ drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
}
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&device->resource->req_lock);
retry:
- os = ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
-
- /* peer says his disk is uptodate, while we think it is inconsistent,
- * and this happens while we think we have a sync going on. */
- if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+ os = ns = drbd_read_state(device);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ /* If some other part of the code (asender thread, timeout)
+ * already decided to close the connection again,
+ * we must not "re-establish" it here. */
+ if (os.conn <= C_TEAR_DOWN)
+ return -ECONNRESET;
+
+ /* If this is the "end of sync" confirmation, usually the peer disk
+ * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+ * set) resync started in PausedSyncT, or if the timing of pause-/
+ * unpause-sync events has been "just right", the peer disk may
+ * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+ */
+ if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+ real_peer_disk == D_UP_TO_DATE &&
os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
/* If we are (becoming) SyncSource, but peer is still in sync
* preparation, ignore its uptodate-ness to avoid flapping, it
@@ -3165,12 +3982,20 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
* Maybe we should finish it up, too? */
else if (os.conn >= C_SYNC_SOURCE &&
peer_state.conn == C_CONNECTED) {
- if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
- drbd_resync_finished(mdev);
- return TRUE;
+ if (drbd_bm_total_weight(device) <= device->rs_failed)
+ drbd_resync_finished(device);
+ return 0;
}
}
+ /* explicit verify finished notification, stop sector reached. */
+ if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
+ peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
+ return 0;
+ }
+
/* peer says his disk is inconsistent, while we think it is uptodate,
* and this happens while the peer still thinks we have a sync going on,
* but we think we are already done with the sync.
@@ -3183,8 +4008,11 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
if (ns.conn == C_WF_REPORT_PARAMS)
ns.conn = C_CONNECTED;
- if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
- get_ldev_if_state(mdev, D_NEGOTIATING)) {
+ if (peer_state.conn == C_AHEAD)
+ ns.conn = C_BEHIND;
+
+ if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
+ get_ldev_if_state(device, D_NEGOTIATING)) {
int cr; /* consider resync */
/* if we established a new connection */
@@ -3196,7 +4024,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
os.disk == D_NEGOTIATING));
/* if we have both been inconsistent, and the peer has been
* forced to be UpToDate with --overwrite-data */
- cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
+ cr |= test_bit(CONSIDER_RESYNC, &device->flags);
/* if we had been plain connected, and the admin requested to
* start a sync by "invalidate" or "invalidate-remote" */
cr |= (os.conn == C_CONNECTED &&
@@ -3204,56 +4032,56 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
peer_state.conn <= C_WF_BITMAP_T));
if (cr)
- ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+ ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
- put_ldev(mdev);
+ put_ldev(device);
if (ns.conn == C_MASK) {
ns.conn = C_CONNECTED;
- if (mdev->state.disk == D_NEGOTIATING) {
- drbd_force_state(mdev, NS(disk, D_FAILED));
+ if (device->state.disk == D_NEGOTIATING) {
+ drbd_force_state(device, NS(disk, D_FAILED));
} else if (peer_state.disk == D_NEGOTIATING) {
- dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
+ drbd_err(device, "Disk attach process on the peer node was aborted.\n");
peer_state.disk = D_DISKLESS;
real_peer_disk = D_DISKLESS;
} else {
- if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
- return FALSE;
- D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return FALSE;
+ if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
+ return -EIO;
+ D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
}
}
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.i != os.i)
+ spin_lock_irq(&device->resource->req_lock);
+ if (os.i != drbd_read_state(device).i)
goto retry;
- clear_bit(CONSIDER_RESYNC, &mdev->flags);
+ clear_bit(CONSIDER_RESYNC, &device->flags);
ns.peer = peer_state.role;
ns.pdsk = real_peer_disk;
ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
- ns.disk = mdev->new_state_tmp.disk;
+ ns.disk = device->new_state_tmp.disk;
cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
- if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
- test_bit(NEW_CUR_UUID, &mdev->flags)) {
- /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
+ if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+ test_bit(NEW_CUR_UUID, &device->flags)) {
+ /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
for temporal network outages! */
- spin_unlock_irq(&mdev->req_lock);
- dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
- tl_clear(mdev);
- drbd_uuid_new_current(mdev);
- clear_bit(NEW_CUR_UUID, &mdev->flags);
- drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
- return FALSE;
- }
- rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
- ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&device->resource->req_lock);
+ drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+ tl_clear(peer_device->connection);
+ drbd_uuid_new_current(device);
+ clear_bit(NEW_CUR_UUID, &device->flags);
+ conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
+ return -EIO;
+ }
+ rv = _drbd_set_state(device, ns, cs_flags, NULL);
+ ns = drbd_read_state(device);
+ spin_unlock_irq(&device->resource->req_lock);
if (rv < SS_SUCCESS) {
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return FALSE;
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
if (os.conn > C_WF_REPORT_PARAMS) {
@@ -3262,76 +4090,116 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
/* we want resync, peer has not yet decided to sync... */
/* Nowadays only used when forcing a node into primary role and
setting its disk to UpToDate with that */
- drbd_send_uuids(mdev);
- drbd_send_state(mdev);
+ drbd_send_uuids(peer_device);
+ drbd_send_current_state(peer_device);
}
}
- mdev->net_conf->want_lose = 0;
+ clear_bit(DISCARD_MY_DATA, &device->flags);
- drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
+ drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
- return TRUE;
+ return 0;
}
-static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_rs_uuid *p = pi->data;
- wait_event(mdev->misc_wait,
- mdev->state.conn == C_WF_SYNC_UUID ||
- mdev->state.conn < C_CONNECTED ||
- mdev->state.disk < D_NEGOTIATING);
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
- /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
+ wait_event(device->misc_wait,
+ device->state.conn == C_WF_SYNC_UUID ||
+ device->state.conn == C_BEHIND ||
+ device->state.conn < C_CONNECTED ||
+ device->state.disk < D_NEGOTIATING);
+
+ /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */
/* Here the _drbd_uuid_ functions are right, current should
_not_ be rotated into the history */
- if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
- _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
- _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
+ if (get_ldev_if_state(device, D_NEGOTIATING)) {
+ _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
+ _drbd_uuid_set(device, UI_BITMAP, 0UL);
- drbd_start_resync(mdev, C_SYNC_TARGET);
+ drbd_print_uuids(device, "updated sync uuid");
+ drbd_start_resync(device, C_SYNC_TARGET);
- put_ldev(mdev);
+ put_ldev(device);
} else
- dev_err(DEV, "Ignoring SyncUUID packet!\n");
+ drbd_err(device, "Ignoring SyncUUID packet!\n");
- return TRUE;
+ return 0;
}
-enum receive_bitmap_ret { OK, DONE, FAILED };
-
-static enum receive_bitmap_ret
-receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
- unsigned long *buffer, struct bm_xfer_ctx *c)
+/**
+ * receive_bitmap_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
+ unsigned long *p, struct bm_xfer_ctx *c)
{
- unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
- unsigned want = num_words * sizeof(long);
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
+ drbd_header_size(peer_device->connection);
+ unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ unsigned int want = num_words * sizeof(*p);
+ int err;
- if (want != data_size) {
- dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
- return FAILED;
+ if (want != size) {
+ drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
+ return -EIO;
}
if (want == 0)
- return DONE;
- if (drbd_recv(mdev, buffer, want) != want)
- return FAILED;
+ return 0;
+ err = drbd_recv_all(peer_device->connection, p, want);
+ if (err)
+ return err;
- drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
+ drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
c->word_offset += num_words;
c->bit_offset = c->word_offset * BITS_PER_LONG;
if (c->bit_offset > c->bm_bits)
c->bit_offset = c->bm_bits;
- return OK;
+ return 1;
+}
+
+static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+{
+ return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static int dcbp_get_start(struct p_compressed_bm *p)
+{
+ return (p->encoding & 0x80) != 0;
+}
+
+static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+{
+ return (p->encoding >> 4) & 0x7;
}
-static enum receive_bitmap_ret
-recv_bm_rle_bits(struct drbd_conf *mdev,
+/**
+ * recv_bm_rle_bits
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+recv_bm_rle_bits(struct drbd_peer_device *peer_device,
struct p_compressed_bm *p,
- struct bm_xfer_ctx *c)
+ struct bm_xfer_ctx *c,
+ unsigned int len)
{
struct bitstream bs;
u64 look_ahead;
@@ -3339,44 +4207,47 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
u64 tmp;
unsigned long s = c->bit_offset;
unsigned long e;
- int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
- int toggle = DCBP_get_start(p);
+ int toggle = dcbp_get_start(p);
int have;
int bits;
- bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
+ bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
bits = bitstream_get_bits(&bs, &look_ahead, 64);
if (bits < 0)
- return FAILED;
+ return -EIO;
for (have = bits; have > 0; s += rl, toggle = !toggle) {
bits = vli_decode_bits(&rl, look_ahead);
if (bits <= 0)
- return FAILED;
+ return -EIO;
if (toggle) {
e = s + rl -1;
if (e >= c->bm_bits) {
- dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
- return FAILED;
+ drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+ return -EIO;
}
- _drbd_bm_set_bits(mdev, s, e);
+ _drbd_bm_set_bits(peer_device->device, s, e);
}
if (have < bits) {
- dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+ drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
have, bits, look_ahead,
(unsigned int)(bs.cur.b - p->code),
(unsigned int)bs.buf_len);
- return FAILED;
+ return -EIO;
}
- look_ahead >>= bits;
+ /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
+ if (likely(bits < 64))
+ look_ahead >>= bits;
+ else
+ look_ahead = 0;
have -= bits;
bits = bitstream_get_bits(&bs, &tmp, 64 - have);
if (bits < 0)
- return FAILED;
+ return -EIO;
look_ahead |= tmp << have;
have += bits;
}
@@ -3384,35 +4255,44 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
c->bit_offset = s;
bm_xfer_ctx_bit_to_word_offset(c);
- return (s == c->bm_bits) ? DONE : OK;
+ return (s != c->bm_bits);
}
-static enum receive_bitmap_ret
-decode_bitmap_c(struct drbd_conf *mdev,
+/**
+ * decode_bitmap_c
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+decode_bitmap_c(struct drbd_peer_device *peer_device,
struct p_compressed_bm *p,
- struct bm_xfer_ctx *c)
+ struct bm_xfer_ctx *c,
+ unsigned int len)
{
- if (DCBP_get_code(p) == RLE_VLI_Bits)
- return recv_bm_rle_bits(mdev, p, c);
+ if (dcbp_get_code(p) == RLE_VLI_Bits)
+ return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
/* other variants had been implemented for evaluation,
* but have been dropped as this one turned out to be "best"
* during all our tests. */
- dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- return FAILED;
+ drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+ conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+ return -EIO;
}
-void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+void INFO_bm_xfer_stats(struct drbd_device *device,
const char *direction, struct bm_xfer_ctx *c)
{
/* what would it take to transfer it "plaintext" */
- unsigned plain = sizeof(struct p_header80) *
- ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
- + c->bm_words * sizeof(long);
- unsigned total = c->bytes[0] + c->bytes[1];
- unsigned r;
+ unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ unsigned int plain =
+ header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
+ c->bm_words * sizeof(unsigned long);
+ unsigned int total = c->bytes[0] + c->bytes[1];
+ unsigned int r;
/* total can not be zero. but just in case: */
if (total == 0)
@@ -3430,7 +4310,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
r = 1000;
r = 1000 - r;
- dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+ drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
"total %u; compression: %u.%u%%\n",
direction,
c->bytes[1], c->packets[1],
@@ -3446,132 +4326,141 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
in order to be agnostic to the 32 vs 64 bits issue.
returns 0 on failure, 1 if we successfully received it. */
-static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
struct bm_xfer_ctx c;
- void *buffer;
- enum receive_bitmap_ret ret;
- int ok = FALSE;
- struct p_header80 *h = &mdev->data.rbuf.header.h80;
+ int err;
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
- drbd_bm_lock(mdev, "receive bitmap");
-
- /* maybe we should use some per thread scratch page,
- * and allocate that during initial device creation? */
- buffer = (unsigned long *) __get_free_page(GFP_NOIO);
- if (!buffer) {
- dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
- goto out;
- }
+ drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
+ /* you are supposed to send additional out-of-sync information
+ * if you actually set bits during this phase */
c = (struct bm_xfer_ctx) {
- .bm_bits = drbd_bm_bits(mdev),
- .bm_words = drbd_bm_words(mdev),
+ .bm_bits = drbd_bm_bits(device),
+ .bm_words = drbd_bm_words(device),
};
- do {
- if (cmd == P_BITMAP) {
- ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
- } else if (cmd == P_COMPRESSED_BITMAP) {
+ for(;;) {
+ if (pi->cmd == P_BITMAP)
+ err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
+ else if (pi->cmd == P_COMPRESSED_BITMAP) {
/* MAYBE: sanity check that we speak proto >= 90,
* and the feature is enabled! */
- struct p_compressed_bm *p;
+ struct p_compressed_bm *p = pi->data;
- if (data_size > BM_PACKET_PAYLOAD_BYTES) {
- dev_err(DEV, "ReportCBitmap packet too large\n");
+ if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
+ drbd_err(device, "ReportCBitmap packet too large\n");
+ err = -EIO;
goto out;
}
- /* use the page buff */
- p = buffer;
- memcpy(p, h, sizeof(*h));
- if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
+ if (pi->size <= sizeof(*p)) {
+ drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
+ err = -EIO;
goto out;
- if (data_size <= (sizeof(*p) - sizeof(p->head))) {
- dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
- return FAILED;
}
- ret = decode_bitmap_c(mdev, p, &c);
+ err = drbd_recv_all(peer_device->connection, p, pi->size);
+ if (err)
+ goto out;
+ err = decode_bitmap_c(peer_device, p, &c, pi->size);
} else {
- dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
+ drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
+ err = -EIO;
goto out;
}
- c.packets[cmd == P_BITMAP]++;
- c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
+ c.packets[pi->cmd == P_BITMAP]++;
+ c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
- if (ret != OK)
+ if (err <= 0) {
+ if (err < 0)
+ goto out;
break;
-
- if (!drbd_recv_header(mdev, &cmd, &data_size))
+ }
+ err = drbd_recv_header(peer_device->connection, pi);
+ if (err)
goto out;
- } while (ret == OK);
- if (ret == FAILED)
- goto out;
+ }
+
+ INFO_bm_xfer_stats(device, "receive", &c);
- INFO_bm_xfer_stats(mdev, "receive", &c);
+ if (device->state.conn == C_WF_BITMAP_T) {
+ enum drbd_state_rv rv;
- if (mdev->state.conn == C_WF_BITMAP_T) {
- ok = !drbd_send_bitmap(mdev);
- if (!ok)
+ err = drbd_send_bitmap(device);
+ if (err)
goto out;
/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
- ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
- D_ASSERT(ok == SS_SUCCESS);
- } else if (mdev->state.conn != C_WF_BITMAP_S) {
+ rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+ D_ASSERT(device, rv == SS_SUCCESS);
+ } else if (device->state.conn != C_WF_BITMAP_S) {
/* admin may have requested C_DISCONNECTING,
* other threads may have noticed network errors */
- dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
- drbd_conn_str(mdev->state.conn));
+ drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
+ drbd_conn_str(device->state.conn));
}
+ err = 0;
- ok = TRUE;
out:
- drbd_bm_unlock(mdev);
- if (ok && mdev->state.conn == C_WF_BITMAP_S)
- drbd_start_resync(mdev, C_SYNC_SOURCE);
- free_page((unsigned long) buffer);
- return ok;
+ drbd_bm_unlock(device);
+ if (!err && device->state.conn == C_WF_BITMAP_S)
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ return err;
}
-static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
{
- /* TODO zero copy sink :) */
- static char sink[128];
- int size, want, r;
+ drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
+ pi->cmd, pi->size);
- dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
- cmd, data_size);
-
- size = data_size;
- while (size > 0) {
- want = min_t(int, size, sizeof(sink));
- r = drbd_recv(mdev, sink, want);
- ERR_IF(r <= 0) break;
- size -= r;
- }
- return size == 0;
+ return ignore_remaining_packet(connection, pi);
}
-static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
{
- if (mdev->state.disk >= D_INCONSISTENT)
- drbd_kick_lo(mdev);
-
/* Make sure we've acked all the TCP data associated
* with the data requests being unplugged */
- drbd_tcp_quickack(mdev->data.socket);
+ drbd_tcp_quickack(connection->data.socket);
- return TRUE;
+ return 0;
}
-typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
+static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_desc *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ switch (device->state.conn) {
+ case C_WF_SYNC_UUID:
+ case C_WF_BITMAP_T:
+ case C_BEHIND:
+ break;
+ default:
+ drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
+ drbd_conn_str(device->state.conn));
+ }
+
+ drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
+
+ return 0;
+}
struct data_cmd {
int expect_payload;
size_t pkt_size;
- drbd_cmd_handler_f function;
+ int (*fn)(struct drbd_connection *, struct packet_info *);
};
static struct data_cmd drbd_cmd_handler[] = {
@@ -3579,13 +4468,13 @@ static struct data_cmd drbd_cmd_handler[] = {
[P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
[P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
[P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
- [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
- [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
- [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
+ [P_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
[P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
- [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
- [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
+ [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
+ [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
[P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
[P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
[P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
@@ -3596,131 +4485,123 @@ static struct data_cmd drbd_cmd_handler[] = {
[P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
- /* anything missing from this table is in
- * the asender_tbl, see get_asender_cmd */
- [P_MAX_CMD] = { 0, 0, NULL },
+ [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
+ [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
+ [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
};
-/* All handler functions that expect a sub-header get that sub-heder in
- mdev->data.rbuf.header.head.payload.
-
- Usually in mdev->data.rbuf.header.head the callback can find the usual
- p_header, but they may not rely on that. Since there is also p_header95 !
- */
-
-static void drbdd(struct drbd_conf *mdev)
+static void drbdd(struct drbd_connection *connection)
{
- union p_header *header = &mdev->data.rbuf.header;
- unsigned int packet_size;
- enum drbd_packets cmd;
+ struct packet_info pi;
size_t shs; /* sub header size */
- int rv;
+ int err;
+
+ while (get_t_state(&connection->receiver) == RUNNING) {
+ struct data_cmd *cmd;
- while (get_t_state(&mdev->receiver) == Running) {
- drbd_thread_current_set_cpu(mdev);
- if (!drbd_recv_header(mdev, &cmd, &packet_size))
+ drbd_thread_current_set_cpu(&connection->receiver);
+ if (drbd_recv_header(connection, &pi))
goto err_out;
- if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
- dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
+ cmd = &drbd_cmd_handler[pi.cmd];
+ if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
+ drbd_err(connection, "Unexpected data packet %s (0x%04x)",
+ cmdname(pi.cmd), pi.cmd);
goto err_out;
}
- shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
- if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
- dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
+ shs = cmd->pkt_size;
+ if (pi.size > shs && !cmd->expect_payload) {
+ drbd_err(connection, "No payload expected %s l:%d\n",
+ cmdname(pi.cmd), pi.size);
goto err_out;
}
if (shs) {
- rv = drbd_recv(mdev, &header->h80.payload, shs);
- if (unlikely(rv != shs)) {
- dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
+ err = drbd_recv_all_warn(connection, pi.data, shs);
+ if (err)
goto err_out;
- }
+ pi.size -= shs;
}
- rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
-
- if (unlikely(!rv)) {
- dev_err(DEV, "error receiving %s, l: %d!\n",
- cmdname(cmd), packet_size);
+ err = cmd->fn(connection, &pi);
+ if (err) {
+ drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
+ cmdname(pi.cmd), err, pi.size);
goto err_out;
}
}
+ return;
- if (0) {
- err_out:
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- }
- /* If we leave here, we probably want to update at least the
- * "Connected" indicator on stable storage. Do so explicitly here. */
- drbd_md_sync(mdev);
+ err_out:
+ conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}
-void drbd_flush_workqueue(struct drbd_conf *mdev)
+static void conn_disconnect(struct drbd_connection *connection)
{
- struct drbd_wq_barrier barr;
+ struct drbd_peer_device *peer_device;
+ enum drbd_conns oc;
+ int vnr;
- barr.w.cb = w_prev_work_done;
- init_completion(&barr.done);
- drbd_queue_work(&mdev->data.work, &barr.w);
- wait_for_completion(&barr.done);
-}
+ if (connection->cstate == C_STANDALONE)
+ return;
-void drbd_free_tl_hash(struct drbd_conf *mdev)
-{
- struct hlist_head *h;
+ /* We are about to start the cleanup after connection loss.
+ * Make sure drbd_make_request knows about that.
+ * Usually we should be in some network failure state already,
+ * but just in case we are not, we fix it up here.
+ */
+ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+
+ /* asender does not clean up anything. it must not interfere, either */
+ drbd_thread_stop(&connection->asender);
+ drbd_free_sock(connection);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_disconnected(peer_device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ if (!list_empty(&connection->current_epoch->list))
+ drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
+ /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+ atomic_set(&connection->current_epoch->epoch_size, 0);
+ connection->send.seen_any_write_yet = false;
- spin_lock_irq(&mdev->req_lock);
+ drbd_info(connection, "Connection closed\n");
- if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
- spin_unlock_irq(&mdev->req_lock);
- return;
- }
- /* paranoia code */
- for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
- (int)(h - mdev->ee_hash), h->first);
- kfree(mdev->ee_hash);
- mdev->ee_hash = NULL;
- mdev->ee_hash_s = 0;
-
- /* paranoia code */
- for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
- (int)(h - mdev->tl_hash), h->first);
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
- spin_unlock_irq(&mdev->req_lock);
+ if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
+ conn_try_outdate_peer_async(connection);
+
+ spin_lock_irq(&connection->resource->req_lock);
+ oc = connection->cstate;
+ if (oc >= C_UNCONNECTED)
+ _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ if (oc == C_DISCONNECTING)
+ conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
}
-static void drbd_disconnect(struct drbd_conf *mdev)
+static int drbd_disconnected(struct drbd_peer_device *peer_device)
{
- enum drbd_fencing_p fp;
- union drbd_state os, ns;
- int rv = SS_UNKNOWN_ERROR;
+ struct drbd_device *device = peer_device->device;
unsigned int i;
- if (mdev->state.conn == C_STANDALONE)
- return;
- if (mdev->state.conn >= C_WF_CONNECTION)
- dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
- drbd_conn_str(mdev->state.conn));
-
- /* asender does not clean up anything. it must not interfere, either */
- drbd_thread_stop(&mdev->asender);
- drbd_free_sock(mdev);
-
/* wait for current activity to cease. */
- spin_lock_irq(&mdev->req_lock);
- _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
- _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
- _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&device->resource->req_lock);
+ _drbd_wait_ee_list_empty(device, &device->active_ee);
+ _drbd_wait_ee_list_empty(device, &device->sync_ee);
+ _drbd_wait_ee_list_empty(device, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
/* We do not have data structures that would allow us to
* get the rs_pending_cnt down to 0 again.
@@ -3732,71 +4613,42 @@ static void drbd_disconnect(struct drbd_conf *mdev)
* resync_LRU. The resync_LRU tracks the whole operation including
* the disk-IO, while the rs_pending_cnt only tracks the blocks
* on the fly. */
- drbd_rs_cancel_all(mdev);
- mdev->rs_total = 0;
- mdev->rs_failed = 0;
- atomic_set(&mdev->rs_pending_cnt, 0);
- wake_up(&mdev->misc_wait);
+ drbd_rs_cancel_all(device);
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ atomic_set(&device->rs_pending_cnt, 0);
+ wake_up(&device->misc_wait);
- /* make sure syncer is stopped and w_resume_next_sg queued */
- del_timer_sync(&mdev->resync_timer);
- resync_timer_fn((unsigned long)mdev);
+ del_timer_sync(&device->resync_timer);
+ resync_timer_fn((unsigned long)device);
/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
* w_make_resync_request etc. which may still be on the worker queue
* to be "canceled" */
- drbd_flush_workqueue(mdev);
-
- /* This also does reclaim_net_ee(). If we do this too early, we might
- * miss some resync ee and pages.*/
- drbd_process_done_ee(mdev);
+ drbd_flush_workqueue(&peer_device->connection->sender_work);
- kfree(mdev->p_uuid);
- mdev->p_uuid = NULL;
+ drbd_finish_peer_reqs(device);
- if (!is_susp(mdev->state))
- tl_clear(mdev);
+ /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
+ might have issued a work again. The one before drbd_finish_peer_reqs() is
+ necessary to reclain net_ee in drbd_finish_peer_reqs(). */
+ drbd_flush_workqueue(&peer_device->connection->sender_work);
- dev_info(DEV, "Connection closed\n");
+ /* need to do it again, drbd_finish_peer_reqs() may have populated it
+ * again via drbd_try_clear_on_disk_bm(). */
+ drbd_rs_cancel_all(device);
- drbd_md_sync(mdev);
+ kfree(device->p_uuid);
+ device->p_uuid = NULL;
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
- drbd_try_outdate_peer_async(mdev);
+ if (!drbd_suspended(device))
+ tl_clear(peer_device->connection);
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
- if (os.conn >= C_UNCONNECTED) {
- /* Do not restart in case we are C_DISCONNECTING */
- ns = os;
- ns.conn = C_UNCONNECTED;
- rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- }
- spin_unlock_irq(&mdev->req_lock);
+ drbd_md_sync(device);
- if (os.conn == C_DISCONNECTING) {
- wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
-
- if (!is_susp(mdev->state)) {
- /* we must not free the tl_hash
- * while application io is still on the fly */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
- drbd_free_tl_hash(mdev);
- }
-
- crypto_free_hash(mdev->cram_hmac_tfm);
- mdev->cram_hmac_tfm = NULL;
-
- kfree(mdev->net_conf);
- mdev->net_conf = NULL;
- drbd_request_state(mdev, NS(conn, C_STANDALONE));
- }
+ /* serialize with bitmap writeout triggered by the state change,
+ * if any. */
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
/* tcp_close and release of sendpage pages can be deferred. I don't
* want to use SO_LINGER, because apparently it can be deferred for
@@ -3805,24 +4657,22 @@ static void drbd_disconnect(struct drbd_conf *mdev)
* Actually we don't care for exactly when the network stack does its
* put_page(), but release our reference on these pages right here.
*/
- i = drbd_release_ee(mdev, &mdev->net_ee);
+ i = drbd_free_peer_reqs(device, &device->net_ee);
if (i)
- dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
- i = atomic_read(&mdev->pp_in_use_by_net);
+ drbd_info(device, "net_ee not empty, killed %u entries\n", i);
+ i = atomic_read(&device->pp_in_use_by_net);
if (i)
- dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
- i = atomic_read(&mdev->pp_in_use);
+ drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
+ i = atomic_read(&device->pp_in_use);
if (i)
- dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
+ drbd_info(device, "pp_in_use = %d, expected 0\n", i);
- D_ASSERT(list_empty(&mdev->read_ee));
- D_ASSERT(list_empty(&mdev->active_ee));
- D_ASSERT(list_empty(&mdev->sync_ee));
- D_ASSERT(list_empty(&mdev->done_ee));
+ D_ASSERT(device, list_empty(&device->read_ee));
+ D_ASSERT(device, list_empty(&device->active_ee));
+ D_ASSERT(device, list_empty(&device->sync_ee));
+ D_ASSERT(device, list_empty(&device->done_ee));
- /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
- atomic_set(&mdev->current_epoch->epoch_size, 0);
- D_ASSERT(list_empty(&mdev->current_epoch->list));
+ return 0;
}
/*
@@ -3834,29 +4684,20 @@ static void drbd_disconnect(struct drbd_conf *mdev)
*
* for now, they are expected to be zero, but ignored.
*/
-static int drbd_send_handshake(struct drbd_conf *mdev)
+static int drbd_send_features(struct drbd_connection *connection)
{
- /* ASSERT current == mdev->receiver ... */
- struct p_handshake *p = &mdev->data.sbuf.handshake;
- int ok;
-
- if (mutex_lock_interruptible(&mdev->data.mutex)) {
- dev_err(DEV, "interrupted during initial handshake\n");
- return 0; /* interrupted. not ok. */
- }
-
- if (mdev->data.socket == NULL) {
- mutex_unlock(&mdev->data.mutex);
- return 0;
- }
+ struct drbd_socket *sock;
+ struct p_connection_features *p;
+ sock = &connection->data;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
memset(p, 0, sizeof(*p));
p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
- ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
- (struct p_header80 *)p, sizeof(*p), 0 );
- mutex_unlock(&mdev->data.mutex);
- return ok;
+ p->feature_flags = cpu_to_be32(PRO_FEATURES);
+ return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
}
/*
@@ -3866,41 +4707,38 @@ static int drbd_send_handshake(struct drbd_conf *mdev)
* -1 peer talks different language,
* no point in trying again, please go standalone.
*/
-static int drbd_do_handshake(struct drbd_conf *mdev)
+static int drbd_do_features(struct drbd_connection *connection)
{
- /* ASSERT current == mdev->receiver ... */
- struct p_handshake *p = &mdev->data.rbuf.handshake;
- const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
- unsigned int length;
- enum drbd_packets cmd;
- int rv;
+ /* ASSERT current == connection->receiver ... */
+ struct p_connection_features *p;
+ const int expect = sizeof(struct p_connection_features);
+ struct packet_info pi;
+ int err;
- rv = drbd_send_handshake(mdev);
- if (!rv)
+ err = drbd_send_features(connection);
+ if (err)
return 0;
- rv = drbd_recv_header(mdev, &cmd, &length);
- if (!rv)
+ err = drbd_recv_header(connection, &pi);
+ if (err)
return 0;
- if (cmd != P_HAND_SHAKE) {
- dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
- cmdname(cmd), cmd);
+ if (pi.cmd != P_CONNECTION_FEATURES) {
+ drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
return -1;
}
- if (length != expect) {
- dev_err(DEV, "expected HandShake length: %u, received: %u\n",
- expect, length);
+ if (pi.size != expect) {
+ drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
+ expect, pi.size);
return -1;
}
- rv = drbd_recv(mdev, &p->head.payload, expect);
-
- if (rv != expect) {
- dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
+ p = pi.data;
+ err = drbd_recv_all_warn(connection, p, expect);
+ if (err)
return 0;
- }
p->protocol_min = be32_to_cpu(p->protocol_min);
p->protocol_max = be32_to_cpu(p->protocol_max);
@@ -3911,15 +4749,19 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
PRO_VERSION_MIN > p->protocol_max)
goto incompat;
- mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+ connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+ connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
+
+ drbd_info(connection, "Handshake successful: "
+ "Agreed network protocol version %d\n", connection->agreed_pro_version);
- dev_info(DEV, "Handshake successful: "
- "Agreed network protocol version %d\n", mdev->agreed_pro_version);
+ drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
+ connection->agreed_features & FF_TRIM ? " " : " not ");
return 1;
incompat:
- dev_err(DEV, "incompatible DRBD dialects: "
+ drbd_err(connection, "incompatible DRBD dialects: "
"I support %d-%d, peer supports %d-%d\n",
PRO_VERSION_MIN, PRO_VERSION_MAX,
p->protocol_min, p->protocol_max);
@@ -3927,10 +4769,10 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
}
#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
-static int drbd_do_auth(struct drbd_conf *mdev)
+static int drbd_do_auth(struct drbd_connection *connection)
{
- dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
- dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+ drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
+ drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
return -1;
}
#else
@@ -3942,119 +4784,151 @@ static int drbd_do_auth(struct drbd_conf *mdev)
-1 - auth failed, don't try again.
*/
-static int drbd_do_auth(struct drbd_conf *mdev)
+static int drbd_do_auth(struct drbd_connection *connection)
{
+ struct drbd_socket *sock;
char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
struct scatterlist sg;
char *response = NULL;
char *right_response = NULL;
char *peers_ch = NULL;
- unsigned int key_len = strlen(mdev->net_conf->shared_secret);
+ unsigned int key_len;
+ char secret[SHARED_SECRET_MAX]; /* 64 byte */
unsigned int resp_size;
struct hash_desc desc;
- enum drbd_packets cmd;
- unsigned int length;
- int rv;
+ struct packet_info pi;
+ struct net_conf *nc;
+ int err, rv;
+
+ /* FIXME: Put the challenge/response into the preallocated socket buffer. */
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ key_len = strlen(nc->shared_secret);
+ memcpy(secret, nc->shared_secret, key_len);
+ rcu_read_unlock();
- desc.tfm = mdev->cram_hmac_tfm;
+ desc.tfm = connection->cram_hmac_tfm;
desc.flags = 0;
- rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
- (u8 *)mdev->net_conf->shared_secret, key_len);
+ rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
if (rv) {
- dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
+ drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv);
rv = -1;
goto fail;
}
get_random_bytes(my_challenge, CHALLENGE_LEN);
- rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
+ sock = &connection->data;
+ if (!conn_prepare_command(connection, sock)) {
+ rv = 0;
+ goto fail;
+ }
+ rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
+ my_challenge, CHALLENGE_LEN);
if (!rv)
goto fail;
- rv = drbd_recv_header(mdev, &cmd, &length);
- if (!rv)
+ err = drbd_recv_header(connection, &pi);
+ if (err) {
+ rv = 0;
goto fail;
+ }
- if (cmd != P_AUTH_CHALLENGE) {
- dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
- cmdname(cmd), cmd);
+ if (pi.cmd != P_AUTH_CHALLENGE) {
+ drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
rv = 0;
goto fail;
}
- if (length > CHALLENGE_LEN * 2) {
- dev_err(DEV, "expected AuthChallenge payload too big.\n");
+ if (pi.size > CHALLENGE_LEN * 2) {
+ drbd_err(connection, "expected AuthChallenge payload too big.\n");
rv = -1;
goto fail;
}
- peers_ch = kmalloc(length, GFP_NOIO);
- if (peers_ch == NULL) {
- dev_err(DEV, "kmalloc of peers_ch failed\n");
+ if (pi.size < CHALLENGE_LEN) {
+ drbd_err(connection, "AuthChallenge payload too small.\n");
rv = -1;
goto fail;
}
- rv = drbd_recv(mdev, peers_ch, length);
+ peers_ch = kmalloc(pi.size, GFP_NOIO);
+ if (peers_ch == NULL) {
+ drbd_err(connection, "kmalloc of peers_ch failed\n");
+ rv = -1;
+ goto fail;
+ }
- if (rv != length) {
- dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
+ err = drbd_recv_all_warn(connection, peers_ch, pi.size);
+ if (err) {
rv = 0;
goto fail;
}
- resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
+ if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
+ drbd_err(connection, "Peer presented the same challenge!\n");
+ rv = -1;
+ goto fail;
+ }
+
+ resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
response = kmalloc(resp_size, GFP_NOIO);
if (response == NULL) {
- dev_err(DEV, "kmalloc of response failed\n");
+ drbd_err(connection, "kmalloc of response failed\n");
rv = -1;
goto fail;
}
sg_init_table(&sg, 1);
- sg_set_buf(&sg, peers_ch, length);
+ sg_set_buf(&sg, peers_ch, pi.size);
rv = crypto_hash_digest(&desc, &sg, sg.length, response);
if (rv) {
- dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+ drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
rv = -1;
goto fail;
}
- rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
- if (!rv)
+ if (!conn_prepare_command(connection, sock)) {
+ rv = 0;
goto fail;
-
- rv = drbd_recv_header(mdev, &cmd, &length);
+ }
+ rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
+ response, resp_size);
if (!rv)
goto fail;
- if (cmd != P_AUTH_RESPONSE) {
- dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
- cmdname(cmd), cmd);
+ err = drbd_recv_header(connection, &pi);
+ if (err) {
rv = 0;
goto fail;
}
- if (length != resp_size) {
- dev_err(DEV, "expected AuthResponse payload of wrong size\n");
+ if (pi.cmd != P_AUTH_RESPONSE) {
+ drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
rv = 0;
goto fail;
}
- rv = drbd_recv(mdev, response , resp_size);
+ if (pi.size != resp_size) {
+ drbd_err(connection, "expected AuthResponse payload of wrong size\n");
+ rv = 0;
+ goto fail;
+ }
- if (rv != resp_size) {
- dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
+ err = drbd_recv_all_warn(connection, response , resp_size);
+ if (err) {
rv = 0;
goto fail;
}
right_response = kmalloc(resp_size, GFP_NOIO);
if (right_response == NULL) {
- dev_err(DEV, "kmalloc of right_response failed\n");
+ drbd_err(connection, "kmalloc of right_response failed\n");
rv = -1;
goto fail;
}
@@ -4063,7 +4937,7 @@ static int drbd_do_auth(struct drbd_conf *mdev)
rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
if (rv) {
- dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+ drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
rv = -1;
goto fail;
}
@@ -4071,8 +4945,8 @@ static int drbd_do_auth(struct drbd_conf *mdev)
rv = !memcmp(response, right_response, resp_size);
if (rv)
- dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
- resp_size, mdev->net_conf->cram_hmac_alg);
+ drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
+ resp_size);
else
rv = -1;
@@ -4085,388 +4959,489 @@ static int drbd_do_auth(struct drbd_conf *mdev)
}
#endif
-int drbdd_init(struct drbd_thread *thi)
+int drbd_receiver(struct drbd_thread *thi)
{
- struct drbd_conf *mdev = thi->mdev;
- unsigned int minor = mdev_to_minor(mdev);
+ struct drbd_connection *connection = thi->connection;
int h;
- sprintf(current->comm, "drbd%d_receiver", minor);
-
- dev_info(DEV, "receiver (re)started\n");
+ drbd_info(connection, "receiver (re)started\n");
do {
- h = drbd_connect(mdev);
+ h = conn_connect(connection);
if (h == 0) {
- drbd_disconnect(mdev);
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ);
+ conn_disconnect(connection);
+ schedule_timeout_interruptible(HZ);
}
if (h == -1) {
- dev_warn(DEV, "Discarding network configuration.\n");
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ drbd_warn(connection, "Discarding network configuration.\n");
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
}
} while (h == 0);
- if (h > 0) {
- if (get_net_conf(mdev)) {
- drbdd(mdev);
- put_net_conf(mdev);
- }
- }
+ if (h > 0)
+ drbdd(connection);
- drbd_disconnect(mdev);
+ conn_disconnect(connection);
- dev_info(DEV, "receiver terminated\n");
+ drbd_info(connection, "receiver terminated\n");
return 0;
}
/* ********* acknowledge sender ******** */
-static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_req_state_reply *p = (struct p_req_state_reply *)h;
+ struct p_req_state_reply *p = pi->data;
+ int retcode = be32_to_cpu(p->retcode);
+
+ if (retcode >= SS_SUCCESS) {
+ set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
+ } else {
+ set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
+ drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
+ drbd_set_st_err_str(retcode), retcode);
+ }
+ wake_up(&connection->ping_wait);
+
+ return 0;
+}
+static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_req_state_reply *p = pi->data;
int retcode = be32_to_cpu(p->retcode);
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
+ D_ASSERT(device, connection->agreed_pro_version < 100);
+ return got_conn_RqSReply(connection, pi);
+ }
+
if (retcode >= SS_SUCCESS) {
- set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
+ set_bit(CL_ST_CHG_SUCCESS, &device->flags);
} else {
- set_bit(CL_ST_CHG_FAIL, &mdev->flags);
- dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
- drbd_set_st_err_str(retcode), retcode);
+ set_bit(CL_ST_CHG_FAIL, &device->flags);
+ drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
+ drbd_set_st_err_str(retcode), retcode);
}
- wake_up(&mdev->state_wait);
+ wake_up(&device->state_wait);
- return TRUE;
+ return 0;
}
-static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
{
- return drbd_send_ping_ack(mdev);
+ return drbd_send_ping_ack(connection);
}
-static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
{
/* restore idle timeout */
- mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
- if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
- wake_up(&mdev->misc_wait);
+ connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
+ if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
+ wake_up(&connection->ping_wait);
- return TRUE;
+ return 0;
}
-static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
sector_t sector = be64_to_cpu(p->sector);
int blksize = be32_to_cpu(p->blksize);
- D_ASSERT(mdev->agreed_pro_version >= 89);
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
- update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+ D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
- if (get_ldev(mdev)) {
- drbd_rs_complete_io(mdev, sector);
- drbd_set_in_sync(mdev, sector, blksize);
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ if (get_ldev(device)) {
+ drbd_rs_complete_io(device, sector);
+ drbd_set_in_sync(device, sector, blksize);
/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
- mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
- put_ldev(mdev);
+ device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ put_ldev(device);
}
- dec_rs_pending(mdev);
- atomic_add(blksize >> 9, &mdev->rs_sect_in);
-
- return TRUE;
-}
-
-/* when we receive the ACK for a write request,
- * verify that we actually know about it */
-static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
- u64 id, sector_t sector)
-{
- struct hlist_head *slot = tl_hash_slot(mdev, sector);
- struct hlist_node *n;
- struct drbd_request *req;
+ dec_rs_pending(device);
+ atomic_add(blksize >> 9, &device->rs_sect_in);
- hlist_for_each_entry(req, n, slot, colision) {
- if ((unsigned long)req == (unsigned long)id) {
- if (req->sector != sector) {
- dev_err(DEV, "_ack_id_to_req: found req %p but it has "
- "wrong sector (%llus versus %llus)\n", req,
- (unsigned long long)req->sector,
- (unsigned long long)sector);
- break;
- }
- return req;
- }
- }
- dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
- (void *)(unsigned long)id, (unsigned long long)sector);
- return NULL;
+ return 0;
}
-typedef struct drbd_request *(req_validator_fn)
- (struct drbd_conf *mdev, u64 id, sector_t sector);
-
-static int validate_req_change_req_state(struct drbd_conf *mdev,
- u64 id, sector_t sector, req_validator_fn validator,
- const char *func, enum drbd_req_event what)
+static int
+validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
+ struct rb_root *root, const char *func,
+ enum drbd_req_event what, bool missing_ok)
{
struct drbd_request *req;
struct bio_and_error m;
- spin_lock_irq(&mdev->req_lock);
- req = validator(mdev, id, sector);
+ spin_lock_irq(&device->resource->req_lock);
+ req = find_request(device, root, id, sector, missing_ok, func);
if (unlikely(!req)) {
- spin_unlock_irq(&mdev->req_lock);
- dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
- return FALSE;
+ spin_unlock_irq(&device->resource->req_lock);
+ return -EIO;
}
__req_mod(req, what, &m);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&device->resource->req_lock);
if (m.bio)
- complete_master_bio(mdev, &m);
- return TRUE;
+ complete_master_bio(device, &m);
+ return 0;
}
-static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
sector_t sector = be64_to_cpu(p->sector);
int blksize = be32_to_cpu(p->blksize);
enum drbd_req_event what;
- update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
- if (is_syncer_block_id(p->block_id)) {
- drbd_set_in_sync(mdev, sector, blksize);
- dec_rs_pending(mdev);
- return TRUE;
+ if (p->block_id == ID_SYNCER) {
+ drbd_set_in_sync(device, sector, blksize);
+ dec_rs_pending(device);
+ return 0;
}
- switch (be16_to_cpu(h->command)) {
+ switch (pi->cmd) {
case P_RS_WRITE_ACK:
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- what = write_acked_by_peer_and_sis;
+ what = WRITE_ACKED_BY_PEER_AND_SIS;
break;
case P_WRITE_ACK:
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- what = write_acked_by_peer;
+ what = WRITE_ACKED_BY_PEER;
break;
case P_RECV_ACK:
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
- what = recv_acked_by_peer;
+ what = RECV_ACKED_BY_PEER;
+ break;
+ case P_SUPERSEDED:
+ what = CONFLICT_RESOLVED;
break;
- case P_DISCARD_ACK:
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- what = conflict_discarded_by_peer;
+ case P_RETRY_WRITE:
+ what = POSTPONE_WRITE;
break;
default:
- D_ASSERT(0);
- return FALSE;
+ BUG();
}
- return validate_req_change_req_state(mdev, p->block_id, sector,
- _ack_id_to_req, __func__ , what);
+ return validate_req_change_req_state(device, p->block_id, sector,
+ &device->write_requests, __func__,
+ what, false);
}
-static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
sector_t sector = be64_to_cpu(p->sector);
+ int size = be32_to_cpu(p->blksize);
+ int err;
- if (__ratelimit(&drbd_ratelimit_state))
- dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
- update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
- if (is_syncer_block_id(p->block_id)) {
- int size = be32_to_cpu(p->blksize);
- dec_rs_pending(mdev);
- drbd_rs_failed_io(mdev, sector, size);
- return TRUE;
+ if (p->block_id == ID_SYNCER) {
+ dec_rs_pending(device);
+ drbd_rs_failed_io(device, sector, size);
+ return 0;
+ }
+
+ err = validate_req_change_req_state(device, p->block_id, sector,
+ &device->write_requests, __func__,
+ NEG_ACKED, true);
+ if (err) {
+ /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+ The master bio might already be completed, therefore the
+ request is no longer in the collision hash. */
+ /* In Protocol B we might already have got a P_RECV_ACK
+ but then get a P_NEG_ACK afterwards. */
+ drbd_set_out_of_sync(device, sector, size);
}
- return validate_req_change_req_state(mdev, p->block_id, sector,
- _ack_id_to_req, __func__ , neg_acked);
+ return 0;
}
-static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
sector_t sector = be64_to_cpu(p->sector);
- update_peer_seq(mdev, be32_to_cpu(p->seq_num));
- dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
(unsigned long long)sector, be32_to_cpu(p->blksize));
- return validate_req_change_req_state(mdev, p->block_id, sector,
- _ar_id_to_req, __func__ , neg_acked);
+ return validate_req_change_req_state(device, p->block_id, sector,
+ &device->read_requests, __func__,
+ NEG_ACKED, false);
}
-static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
sector_t sector;
int size;
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct p_block_ack *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
- update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
- dec_rs_pending(mdev);
+ dec_rs_pending(device);
- if (get_ldev_if_state(mdev, D_FAILED)) {
- drbd_rs_complete_io(mdev, sector);
- drbd_rs_failed_io(mdev, sector, size);
- put_ldev(mdev);
+ if (get_ldev_if_state(device, D_FAILED)) {
+ drbd_rs_complete_io(device, sector);
+ switch (pi->cmd) {
+ case P_NEG_RS_DREPLY:
+ drbd_rs_failed_io(device, sector, size);
+ case P_RS_CANCEL:
+ break;
+ default:
+ BUG();
+ }
+ put_ldev(device);
}
- return TRUE;
+ return 0;
}
-static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_barrier_ack *p = (struct p_barrier_ack *)h;
+ struct p_barrier_ack *p = pi->data;
+ struct drbd_peer_device *peer_device;
+ int vnr;
- tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
+ tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
- return TRUE;
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ if (device->state.conn == C_AHEAD &&
+ atomic_read(&device->ap_in_flight) == 0 &&
+ !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
+ device->start_resync_timer.expires = jiffies + HZ;
+ add_timer(&device->start_resync_timer);
+ }
+ }
+ rcu_read_unlock();
+
+ return 0;
}
-static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
- struct drbd_work *w;
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ struct drbd_device_work *dw;
sector_t sector;
int size;
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
- update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
- drbd_ov_oos_found(mdev, sector, size);
+ drbd_ov_out_of_sync_found(device, sector, size);
else
- ov_oos_print(mdev);
+ ov_out_of_sync_print(device);
- if (!get_ldev(mdev))
- return TRUE;
+ if (!get_ldev(device))
+ return 0;
- drbd_rs_complete_io(mdev, sector);
- dec_rs_pending(mdev);
+ drbd_rs_complete_io(device, sector);
+ dec_rs_pending(device);
- if (--mdev->ov_left == 0) {
- w = kmalloc(sizeof(*w), GFP_NOIO);
- if (w) {
- w->cb = w_ov_finished;
- drbd_queue_work_front(&mdev->data.work, w);
+ --device->ov_left;
+
+ /* let's advance progress step marks only for every other megabyte */
+ if ((device->ov_left & 0x200) == 0x200)
+ drbd_advance_rs_marks(device, device->ov_left);
+
+ if (device->ov_left == 0) {
+ dw = kmalloc(sizeof(*dw), GFP_NOIO);
+ if (dw) {
+ dw->w.cb = w_ov_finished;
+ dw->device = device;
+ drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
} else {
- dev_err(DEV, "kmalloc(w) failed.");
- ov_oos_print(mdev);
- drbd_resync_finished(mdev);
+ drbd_err(device, "kmalloc(dw) failed.");
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
}
}
- put_ldev(mdev);
- return TRUE;
+ put_ldev(device);
+ return 0;
}
-static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
{
- return TRUE;
+ return 0;
+}
+
+static int connection_finish_peer_reqs(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr, not_empty = 0;
+
+ do {
+ clear_bit(SIGNAL_ASENDER, &connection->flags);
+ flush_signals(current);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ if (drbd_finish_peer_reqs(device)) {
+ kref_put(&device->kref, drbd_destroy_device);
+ return 1;
+ }
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ set_bit(SIGNAL_ASENDER, &connection->flags);
+
+ spin_lock_irq(&connection->resource->req_lock);
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ not_empty = !list_empty(&device->done_ee);
+ if (not_empty)
+ break;
+ }
+ spin_unlock_irq(&connection->resource->req_lock);
+ rcu_read_unlock();
+ } while (not_empty);
+
+ return 0;
}
struct asender_cmd {
size_t pkt_size;
- int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
+ int (*fn)(struct drbd_connection *connection, struct packet_info *);
};
-static struct asender_cmd *get_asender_cmd(int cmd)
-{
- static struct asender_cmd asender_tbl[] = {
- /* anything missing from this table is in
- * the drbd_cmd_handler (drbd_default_handler) table,
- * see the beginning of drbdd() */
- [P_PING] = { sizeof(struct p_header80), got_Ping },
- [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
+static struct asender_cmd asender_tbl[] = {
+ [P_PING] = { 0, got_Ping },
+ [P_PING_ACK] = { 0, got_PingAck },
[P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
- [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck },
[P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
[P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
- [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
+ [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
[P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
[P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
[P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
[P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
- [P_MAX_CMD] = { 0, NULL },
- };
- if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
- return NULL;
- return &asender_tbl[cmd];
-}
+ [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
+ [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
+ [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
+};
int drbd_asender(struct drbd_thread *thi)
{
- struct drbd_conf *mdev = thi->mdev;
- struct p_header80 *h = &mdev->meta.rbuf.header.h80;
+ struct drbd_connection *connection = thi->connection;
struct asender_cmd *cmd = NULL;
-
- int rv, len;
- void *buf = h;
+ struct packet_info pi;
+ int rv;
+ void *buf = connection->meta.rbuf;
int received = 0;
- int expect = sizeof(struct p_header80);
- int empty;
-
- sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
-
- current->policy = SCHED_RR; /* Make this a realtime task! */
- current->rt_priority = 2; /* more important than all other tasks */
-
- while (get_t_state(thi) == Running) {
- drbd_thread_current_set_cpu(mdev);
- if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
- ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
- mdev->meta.socket->sk->sk_rcvtimeo =
- mdev->net_conf->ping_timeo*HZ/10;
+ unsigned int header_size = drbd_header_size(connection);
+ int expect = header_size;
+ bool ping_timeout_active = false;
+ struct net_conf *nc;
+ int ping_timeo, tcp_cork, ping_int;
+ struct sched_param param = { .sched_priority = 2 };
+
+ rv = sched_setscheduler(current, SCHED_RR, &param);
+ if (rv < 0)
+ drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv);
+
+ while (get_t_state(thi) == RUNNING) {
+ drbd_thread_current_set_cpu(thi);
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ ping_timeo = nc->ping_timeo;
+ tcp_cork = nc->tcp_cork;
+ ping_int = nc->ping_int;
+ rcu_read_unlock();
+
+ if (test_and_clear_bit(SEND_PING, &connection->flags)) {
+ if (drbd_send_ping(connection)) {
+ drbd_err(connection, "drbd_send_ping has failed\n");
+ goto reconnect;
+ }
+ connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
+ ping_timeout_active = true;
}
- /* conditionally cork;
- * it may hurt latency if we cork without much to send */
- if (!mdev->net_conf->no_cork &&
- 3 < atomic_read(&mdev->unacked_cnt))
- drbd_tcp_cork(mdev->meta.socket);
- while (1) {
- clear_bit(SIGNAL_ASENDER, &mdev->flags);
- flush_signals(current);
- if (!drbd_process_done_ee(mdev))
- goto reconnect;
- /* to avoid race with newly queued ACKs */
- set_bit(SIGNAL_ASENDER, &mdev->flags);
- spin_lock_irq(&mdev->req_lock);
- empty = list_empty(&mdev->done_ee);
- spin_unlock_irq(&mdev->req_lock);
- /* new ack may have been queued right here,
- * but then there is also a signal pending,
- * and we start over... */
- if (empty)
- break;
+ /* TODO: conditionally cork; it may hurt latency if we cork without
+ much to send */
+ if (tcp_cork)
+ drbd_tcp_cork(connection->meta.socket);
+ if (connection_finish_peer_reqs(connection)) {
+ drbd_err(connection, "connection_finish_peer_reqs() failed\n");
+ goto reconnect;
}
/* but unconditionally uncork unless disabled */
- if (!mdev->net_conf->no_cork)
- drbd_tcp_uncork(mdev->meta.socket);
+ if (tcp_cork)
+ drbd_tcp_uncork(connection->meta.socket);
/* short circuit, recv_msg would return EINTR anyways. */
if (signal_pending(current))
continue;
- rv = drbd_recv_short(mdev, mdev->meta.socket,
- buf, expect-received, 0);
- clear_bit(SIGNAL_ASENDER, &mdev->flags);
+ rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
+ clear_bit(SIGNAL_ASENDER, &connection->flags);
flush_signals(current);
@@ -4484,70 +5459,91 @@ int drbd_asender(struct drbd_thread *thi)
received += rv;
buf += rv;
} else if (rv == 0) {
- dev_err(DEV, "meta connection shut down by peer.\n");
+ if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+ long t;
+ rcu_read_lock();
+ t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+ rcu_read_unlock();
+
+ t = wait_event_timeout(connection->ping_wait,
+ connection->cstate < C_WF_REPORT_PARAMS,
+ t);
+ if (t)
+ break;
+ }
+ drbd_err(connection, "meta connection shut down by peer.\n");
goto reconnect;
} else if (rv == -EAGAIN) {
- if (mdev->meta.socket->sk->sk_rcvtimeo ==
- mdev->net_conf->ping_timeo*HZ/10) {
- dev_err(DEV, "PingAck did not arrive in time.\n");
+ /* If the data socket received something meanwhile,
+ * that is good enough: peer is still alive. */
+ if (time_after(connection->last_received,
+ jiffies - connection->meta.socket->sk->sk_rcvtimeo))
+ continue;
+ if (ping_timeout_active) {
+ drbd_err(connection, "PingAck did not arrive in time.\n");
goto reconnect;
}
- set_bit(SEND_PING, &mdev->flags);
+ set_bit(SEND_PING, &connection->flags);
continue;
} else if (rv == -EINTR) {
continue;
} else {
- dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+ drbd_err(connection, "sock_recvmsg returned %d\n", rv);
goto reconnect;
}
if (received == expect && cmd == NULL) {
- if (unlikely(h->magic != BE_DRBD_MAGIC)) {
- dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
- be32_to_cpu(h->magic),
- be16_to_cpu(h->command),
- be16_to_cpu(h->length));
+ if (decode_header(connection, connection->meta.rbuf, &pi))
goto reconnect;
- }
- cmd = get_asender_cmd(be16_to_cpu(h->command));
- len = be16_to_cpu(h->length);
- if (unlikely(cmd == NULL)) {
- dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
- be32_to_cpu(h->magic),
- be16_to_cpu(h->command),
- be16_to_cpu(h->length));
+ cmd = &asender_tbl[pi.cmd];
+ if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
+ drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
goto disconnect;
}
- expect = cmd->pkt_size;
- ERR_IF(len != expect-sizeof(struct p_header80))
+ expect = header_size + cmd->pkt_size;
+ if (pi.size != expect - header_size) {
+ drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
+ pi.cmd, pi.size);
goto reconnect;
+ }
}
if (received == expect) {
- D_ASSERT(cmd != NULL);
- if (!cmd->process(mdev, h))
+ bool err;
+
+ err = cmd->fn(connection, &pi);
+ if (err) {
+ drbd_err(connection, "%pf failed\n", cmd->fn);
goto reconnect;
+ }
+
+ connection->last_received = jiffies;
+
+ if (cmd == &asender_tbl[P_PING_ACK]) {
+ /* restore idle timeout */
+ connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
+ ping_timeout_active = false;
+ }
- buf = h;
+ buf = connection->meta.rbuf;
received = 0;
- expect = sizeof(struct p_header80);
+ expect = header_size;
cmd = NULL;
}
}
if (0) {
reconnect:
- drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
- drbd_md_sync(mdev);
+ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+ conn_md_sync(connection);
}
if (0) {
disconnect:
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- drbd_md_sync(mdev);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
}
- clear_bit(SIGNAL_ASENDER, &mdev->flags);
+ clear_bit(SIGNAL_ASENDER, &connection->flags);
- D_ASSERT(mdev->state.conn < C_CONNECTED);
- dev_info(DEV, "asender terminated\n");
+ drbd_info(connection, "asender terminated\n");
return 0;
}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 11a75d32a2e..09803d0d520 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -31,34 +31,81 @@
#include "drbd_req.h"
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size);
+
/* Update disk stats at start of I/O request */
-static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
+static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req)
{
- const int rw = bio_data_dir(bio);
+ const int rw = bio_data_dir(req->master_bio);
int cpu;
cpu = part_stat_lock();
- part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
- part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
- part_inc_in_flight(&mdev->vdisk->part0, rw);
+ part_round_stats(cpu, &device->vdisk->part0);
+ part_stat_inc(cpu, &device->vdisk->part0, ios[rw]);
+ part_stat_add(cpu, &device->vdisk->part0, sectors[rw], req->i.size >> 9);
+ (void) cpu; /* The macro invocations above want the cpu argument, I do not like
+ the compiler warning about cpu only assigned but never used... */
+ part_inc_in_flight(&device->vdisk->part0, rw);
part_stat_unlock();
}
/* Update disk stats when completing request upwards */
-static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
+static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
{
int rw = bio_data_dir(req->master_bio);
unsigned long duration = jiffies - req->start_time;
int cpu;
cpu = part_stat_lock();
- part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
- part_round_stats(cpu, &mdev->vdisk->part0);
- part_dec_in_flight(&mdev->vdisk->part0, rw);
+ part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration);
+ part_round_stats(cpu, &device->vdisk->part0);
+ part_dec_in_flight(&device->vdisk->part0, rw);
part_stat_unlock();
}
-static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
+static struct drbd_request *drbd_req_new(struct drbd_device *device,
+ struct bio *bio_src)
+{
+ struct drbd_request *req;
+
+ req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+ if (!req)
+ return NULL;
+
+ drbd_req_make_private_bio(req, bio_src);
+ req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
+ req->device = device;
+ req->master_bio = bio_src;
+ req->epoch = 0;
+
+ drbd_clear_interval(&req->i);
+ req->i.sector = bio_src->bi_iter.bi_sector;
+ req->i.size = bio_src->bi_iter.bi_size;
+ req->i.local = true;
+ req->i.waiting = false;
+
+ INIT_LIST_HEAD(&req->tl_requests);
+ INIT_LIST_HEAD(&req->w.list);
+
+ /* one reference to be put by __drbd_make_request */
+ atomic_set(&req->completion_ref, 1);
+ /* one kref as long as completion_ref > 0 */
+ kref_init(&req->kref);
+ return req;
+}
+
+void drbd_req_destroy(struct kref *kref)
{
- const unsigned long s = req->rq_state;
+ struct drbd_request *req = container_of(kref, struct drbd_request, kref);
+ struct drbd_device *device = req->device;
+ const unsigned s = req->rq_state;
+
+ if ((req->master_bio && !(s & RQ_POSTPONED)) ||
+ atomic_read(&req->completion_ref) ||
+ (s & RQ_LOCAL_PENDING) ||
+ ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) {
+ drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n",
+ s, atomic_read(&req->completion_ref));
+ return;
+ }
/* remove it from the transfer log.
* well, only if it had been there in the first
@@ -66,24 +113,33 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
* and never sent), it should still be "empty" as
* initialized in drbd_req_new(), so we can list_del() it
* here unconditionally */
- list_del(&req->tl_requests);
+ list_del_init(&req->tl_requests);
/* if it was a write, we may have to set the corresponding
* bit(s) out-of-sync first. If it had a local part, we need to
* release the reference to the activity log. */
- if (rw == WRITE) {
+ if (s & RQ_WRITE) {
/* Set out-of-sync unless both OK flags are set
* (local only or remote failed).
* Other places where we set out-of-sync:
* READ with local io-error */
- if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
- drbd_set_out_of_sync(mdev, req->sector, req->size);
- if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
- drbd_set_in_sync(mdev, req->sector, req->size);
+ /* There is a special case:
+ * we may notice late that IO was suspended,
+ * and postpone, or schedule for retry, a write,
+ * before it even was submitted or sent.
+ * In that case we do not want to touch the bitmap at all.
+ */
+ if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) {
+ if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+ drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+
+ if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+ drbd_set_in_sync(device, req->i.sector, req->i.size);
+ }
/* one might be tempted to move the drbd_al_complete_io
- * to the local io completion callback drbd_endio_pri.
+ * to the local io completion callback drbd_request_endio.
* but, if this was a mirror write, we may only
* drbd_al_complete_io after this is RQ_NET_DONE,
* otherwise the extent could be dropped from the al
@@ -92,111 +148,57 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
* but after the extent has been dropped from the al,
* we would forget to resync the corresponding extent.
*/
- if (s & RQ_LOCAL_MASK) {
- if (get_ldev_if_state(mdev, D_FAILED)) {
- if (s & RQ_IN_ACT_LOG)
- drbd_al_complete_io(mdev, req->sector);
- put_ldev(mdev);
+ if (s & RQ_IN_ACT_LOG) {
+ if (get_ldev_if_state(device, D_FAILED)) {
+ drbd_al_complete_io(device, &req->i);
+ put_ldev(device);
} else if (__ratelimit(&drbd_ratelimit_state)) {
- dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
- "but my Disk seems to have failed :(\n",
- (unsigned long long) req->sector);
+ drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), "
+ "but my Disk seems to have failed :(\n",
+ (unsigned long long) req->i.sector, req->i.size);
}
}
}
- drbd_req_free(req);
+ mempool_free(req, drbd_request_mempool);
}
-static void queue_barrier(struct drbd_conf *mdev)
+static void wake_all_senders(struct drbd_connection *connection)
{
- struct drbd_tl_epoch *b;
-
- /* We are within the req_lock. Once we queued the barrier for sending,
- * we set the CREATE_BARRIER bit. It is cleared as soon as a new
- * barrier/epoch object is added. This is the only place this bit is
- * set. It indicates that the barrier for this epoch is already queued,
- * and no new epoch has been created yet. */
- if (test_bit(CREATE_BARRIER, &mdev->flags))
- return;
-
- b = mdev->newest_tle;
- b->w.cb = w_send_barrier;
- /* inc_ap_pending done here, so we won't
- * get imbalanced on connection loss.
- * dec_ap_pending will be done in got_BarrierAck
- * or (on connection loss) in tl_clear. */
- inc_ap_pending(mdev);
- drbd_queue_work(&mdev->data.work, &b->w);
- set_bit(CREATE_BARRIER, &mdev->flags);
+ wake_up(&connection->sender_work.q_wait);
}
-static void _about_to_complete_local_write(struct drbd_conf *mdev,
- struct drbd_request *req)
+/* must hold resource->req_lock */
+void start_new_tl_epoch(struct drbd_connection *connection)
{
- const unsigned long s = req->rq_state;
- struct drbd_request *i;
- struct drbd_epoch_entry *e;
- struct hlist_node *n;
- struct hlist_head *slot;
-
- /* before we can signal completion to the upper layers,
- * we may need to close the current epoch */
- if (mdev->state.conn >= C_CONNECTED &&
- req->epoch == mdev->newest_tle->br_number)
- queue_barrier(mdev);
-
- /* we need to do the conflict detection stuff,
- * if we have the ee_hash (two_primaries) and
- * this has been on the network */
- if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
- const sector_t sector = req->sector;
- const int size = req->size;
-
- /* ASSERT:
- * there must be no conflicting requests, since
- * they must have been failed on the spot */
-#define OVERLAPS overlaps(sector, size, i->sector, i->size)
- slot = tl_hash_slot(mdev, sector);
- hlist_for_each_entry(i, n, slot, colision) {
- if (OVERLAPS) {
- dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
- "other: %p %llus +%u\n",
- req, (unsigned long long)sector, size,
- i, (unsigned long long)i->sector, i->size);
- }
- }
+ /* no point closing an epoch, if it is empty, anyways. */
+ if (connection->current_tle_writes == 0)
+ return;
- /* maybe "wake" those conflicting epoch entries
- * that wait for this request to finish.
- *
- * currently, there can be only _one_ such ee
- * (well, or some more, which would be pending
- * P_DISCARD_ACK not yet sent by the asender...),
- * since we block the receiver thread upon the
- * first conflict detection, which will wait on
- * misc_wait. maybe we want to assert that?
- *
- * anyways, if we found one,
- * we just have to do a wake_up. */
-#undef OVERLAPS
-#define OVERLAPS overlaps(sector, size, e->sector, e->size)
- slot = ee_hash_slot(mdev, req->sector);
- hlist_for_each_entry(e, n, slot, colision) {
- if (OVERLAPS) {
- wake_up(&mdev->misc_wait);
- break;
- }
- }
- }
-#undef OVERLAPS
+ connection->current_tle_writes = 0;
+ atomic_inc(&connection->current_tle_nr);
+ wake_all_senders(connection);
}
-void complete_master_bio(struct drbd_conf *mdev,
+void complete_master_bio(struct drbd_device *device,
struct bio_and_error *m)
{
bio_endio(m->bio, m->error);
- dec_ap_bio(mdev);
+ dec_ap_bio(device);
+}
+
+
+static void drbd_remove_request_interval(struct rb_root *root,
+ struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ struct drbd_interval *i = &req->i;
+
+ drbd_remove_interval(root, i);
+
+ /* Wake up any processes waiting for this request to complete. */
+ if (i->waiting)
+ wake_up(&device->misc_wait);
}
/* Helper for __req_mod().
@@ -205,12 +207,13 @@ void complete_master_bio(struct drbd_conf *mdev,
* if it has already been completed, or cannot be completed yet.
* If m->bio is set, the error status to be returned is placed in m->error.
*/
-void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
+static
+void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
{
- const unsigned long s = req->rq_state;
- struct drbd_conf *mdev = req->mdev;
- /* only WRITES may end up here without a master bio (on barrier ack) */
- int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
+ const unsigned s = req->rq_state;
+ struct drbd_device *device = req->device;
+ int rw;
+ int error, ok;
/* we must not complete the master bio, while it is
* still being processed by _drbd_send_zc_bio (drbd_send_dblock)
@@ -221,162 +224,219 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
* the receiver,
* the bio_endio completion callbacks.
*/
- if (s & RQ_NET_QUEUED)
+ if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) ||
+ (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) ||
+ (s & RQ_COMPLETION_SUSP)) {
+ drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s);
return;
- if (s & RQ_NET_PENDING)
- return;
- if (s & RQ_LOCAL_PENDING)
+ }
+
+ if (!req->master_bio) {
+ drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n");
return;
+ }
- if (req->master_bio) {
- /* this is data_received (remote read)
- * or protocol C P_WRITE_ACK
- * or protocol B P_RECV_ACK
- * or protocol A "handed_over_to_network" (SendAck)
- * or canceled or failed,
- * or killed from the transfer log due to connection loss.
- */
+ rw = bio_rw(req->master_bio);
- /*
- * figure out whether to report success or failure.
- *
- * report success when at least one of the operations succeeded.
- * or, to put the other way,
- * only report failure, when both operations failed.
- *
- * what to do about the failures is handled elsewhere.
- * what we need to do here is just: complete the master_bio.
- *
- * local completion error, if any, has been stored as ERR_PTR
- * in private_bio within drbd_endio_pri.
- */
- int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
- int error = PTR_ERR(req->private_bio);
+ /*
+ * figure out whether to report success or failure.
+ *
+ * report success when at least one of the operations succeeded.
+ * or, to put the other way,
+ * only report failure, when both operations failed.
+ *
+ * what to do about the failures is handled elsewhere.
+ * what we need to do here is just: complete the master_bio.
+ *
+ * local completion error, if any, has been stored as ERR_PTR
+ * in private_bio within drbd_request_endio.
+ */
+ ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+ error = PTR_ERR(req->private_bio);
- /* remove the request from the conflict detection
- * respective block_id verification hash */
- if (!hlist_unhashed(&req->colision))
- hlist_del(&req->colision);
- else
- D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
+ /* remove the request from the conflict detection
+ * respective block_id verification hash */
+ if (!drbd_interval_empty(&req->i)) {
+ struct rb_root *root;
- /* for writes we need to do some extra housekeeping */
if (rw == WRITE)
- _about_to_complete_local_write(mdev, req);
+ root = &device->write_requests;
+ else
+ root = &device->read_requests;
+ drbd_remove_request_interval(root, req);
+ }
+
+ /* Before we can signal completion to the upper layers,
+ * we may need to close the current transfer log epoch.
+ * We are within the request lock, so we can simply compare
+ * the request epoch number with the current transfer log
+ * epoch number. If they match, increase the current_tle_nr,
+ * and reset the transfer log epoch write_cnt.
+ */
+ if (rw == WRITE &&
+ req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
+ start_new_tl_epoch(first_peer_device(device)->connection);
- /* Update disk stats */
- _drbd_end_io_acct(mdev, req);
+ /* Update disk stats */
+ _drbd_end_io_acct(device, req);
+
+ /* If READ failed,
+ * have it be pushed back to the retry work queue,
+ * so it will re-enter __drbd_make_request(),
+ * and be re-assigned to a suitable local or remote path,
+ * or failed if we do not have access to good data anymore.
+ *
+ * Unless it was failed early by __drbd_make_request(),
+ * because no path was available, in which case
+ * it was not even added to the transfer_log.
+ *
+ * READA may fail, and will not be retried.
+ *
+ * WRITE should have used all available paths already.
+ */
+ if (!ok && rw == READ && !list_empty(&req->tl_requests))
+ req->rq_state |= RQ_POSTPONED;
+ if (!(req->rq_state & RQ_POSTPONED)) {
m->error = ok ? 0 : (error ?: -EIO);
m->bio = req->master_bio;
req->master_bio = NULL;
}
-
- if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
- /* this is disconnected (local only) operation,
- * or protocol C P_WRITE_ACK,
- * or protocol A or B P_BARRIER_ACK,
- * or killed from the transfer log due to connection loss. */
- _req_is_done(mdev, req, rw);
- }
- /* else: network part and not DONE yet. that is
- * protocol A or B, barrier ack still pending... */
}
-static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
+static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
{
- struct drbd_conf *mdev = req->mdev;
+ struct drbd_device *device = req->device;
+ D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED));
+
+ if (!atomic_sub_and_test(put, &req->completion_ref))
+ return 0;
- if (!is_susp(mdev->state))
- _req_may_be_done(req, m);
+ drbd_req_complete(req, m);
+
+ if (req->rq_state & RQ_POSTPONED) {
+ /* don't destroy the req object just yet,
+ * but queue it for retry */
+ drbd_restart_request(req);
+ return 0;
+ }
+
+ return 1;
}
-/*
- * checks whether there was an overlapping request
- * or ee already registered.
- *
- * if so, return 1, in which case this request is completed on the spot,
- * without ever being submitted or send.
- *
- * return 0 if it is ok to submit this request.
- *
- * NOTE:
- * paranoia: assume something above us is broken, and issues different write
- * requests for the same block simultaneously...
- *
- * To ensure these won't be reordered differently on both nodes, resulting in
- * diverging data sets, we discard the later one(s). Not that this is supposed
- * to happen, but this is the rationale why we also have to check for
- * conflicting requests with local origin, and why we have to do so regardless
- * of whether we allowed multiple primaries.
- *
- * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
- * second hlist_for_each_entry becomes a noop. This is even simpler than to
- * grab a reference on the net_conf, and check for the two_primaries flag...
- */
-static int _req_conflicts(struct drbd_request *req)
+/* I'd like this to be the only place that manipulates
+ * req->completion_ref and req->kref. */
+static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
+ int clear, int set)
{
- struct drbd_conf *mdev = req->mdev;
- const sector_t sector = req->sector;
- const int size = req->size;
- struct drbd_request *i;
- struct drbd_epoch_entry *e;
- struct hlist_node *n;
- struct hlist_head *slot;
+ struct drbd_device *device = req->device;
+ unsigned s = req->rq_state;
+ int c_put = 0;
+ int k_put = 0;
- D_ASSERT(hlist_unhashed(&req->colision));
+ if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP))
+ set |= RQ_COMPLETION_SUSP;
- if (!get_net_conf(mdev))
- return 0;
+ /* apply */
- /* BUG_ON */
- ERR_IF (mdev->tl_hash_s == 0)
- goto out_no_conflict;
- BUG_ON(mdev->tl_hash == NULL);
-
-#define OVERLAPS overlaps(i->sector, i->size, sector, size)
- slot = tl_hash_slot(mdev, sector);
- hlist_for_each_entry(i, n, slot, colision) {
- if (OVERLAPS) {
- dev_alert(DEV, "%s[%u] Concurrent local write detected! "
- "[DISCARD L] new: %llus +%u; "
- "pending: %llus +%u\n",
- current->comm, current->pid,
- (unsigned long long)sector, size,
- (unsigned long long)i->sector, i->size);
- goto out_conflict;
- }
+ req->rq_state &= ~clear;
+ req->rq_state |= set;
+
+ /* no change? */
+ if (req->rq_state == s)
+ return;
+
+ /* intent: get references */
+
+ if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
+ atomic_inc(&req->completion_ref);
+
+ if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) {
+ inc_ap_pending(device);
+ atomic_inc(&req->completion_ref);
}
- if (mdev->ee_hash_s) {
- /* now, check for overlapping requests with remote origin */
- BUG_ON(mdev->ee_hash == NULL);
-#undef OVERLAPS
-#define OVERLAPS overlaps(e->sector, e->size, sector, size)
- slot = ee_hash_slot(mdev, sector);
- hlist_for_each_entry(e, n, slot, colision) {
- if (OVERLAPS) {
- dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
- " [DISCARD L] new: %llus +%u; "
- "pending: %llus +%u\n",
- current->comm, current->pid,
- (unsigned long long)sector, size,
- (unsigned long long)e->sector, e->size);
- goto out_conflict;
- }
- }
+ if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
+ atomic_inc(&req->completion_ref);
+
+ if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
+ kref_get(&req->kref); /* wait for the DONE */
+
+ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
+ atomic_add(req->i.size >> 9, &device->ap_in_flight);
+
+ if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
+ atomic_inc(&req->completion_ref);
+
+ /* progress: put references */
+
+ if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP))
+ ++c_put;
+
+ if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
+ D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING);
+ /* local completion may still come in later,
+ * we need to keep the req object around. */
+ kref_get(&req->kref);
+ ++c_put;
+ }
+
+ if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
+ if (req->rq_state & RQ_LOCAL_ABORTED)
+ ++k_put;
+ else
+ ++c_put;
}
-#undef OVERLAPS
-out_no_conflict:
- /* this is like it should be, and what we expected.
- * our users do behave after all... */
- put_net_conf(mdev);
- return 0;
+ if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
+ dec_ap_pending(device);
+ ++c_put;
+ }
-out_conflict:
- put_net_conf(mdev);
- return 1;
+ if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
+ ++c_put;
+
+ if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+ if (req->rq_state & RQ_NET_SENT)
+ atomic_sub(req->i.size >> 9, &device->ap_in_flight);
+ ++k_put;
+ }
+
+ /* potentially complete and destroy */
+
+ if (k_put || c_put) {
+ /* Completion does it's own kref_put. If we are going to
+ * kref_sub below, we need req to be still around then. */
+ int at_least = k_put + !!c_put;
+ int refcount = atomic_read(&req->kref.refcount);
+ if (refcount < at_least)
+ drbd_err(device,
+ "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
+ s, req->rq_state, refcount, at_least);
+ }
+
+ /* If we made progress, retry conflicting peer requests, if any. */
+ if (req->i.waiting)
+ wake_up(&device->misc_wait);
+
+ if (c_put)
+ k_put += drbd_req_put_completion_ref(req, m, c_put);
+ if (k_put)
+ kref_sub(&req->kref, k_put, drbd_req_destroy);
+}
+
+static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
+{
+ char b[BDEVNAME_SIZE];
+
+ if (!__ratelimit(&drbd_ratelimit_state))
+ return;
+
+ drbd_warn(device, "local %s IO error sector %llu+%u on %s\n",
+ (req->rq_state & RQ_WRITE) ? "WRITE" : "READ",
+ (unsigned long long)req->i.sector,
+ req->i.size >> 9,
+ bdevname(device->ldev->backing_bdev, b));
}
/* obviously this could be coded as many single functions
@@ -394,116 +454,113 @@ out_conflict:
int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m)
{
- struct drbd_conf *mdev = req->mdev;
- int rv = 0;
- m->bio = NULL;
+ struct drbd_device *device = req->device;
+ struct net_conf *nc;
+ int p, rv = 0;
+
+ if (m)
+ m->bio = NULL;
switch (what) {
default:
- dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
+ drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
break;
/* does not happen...
* initialization done in drbd_req_new
- case created:
+ case CREATED:
break;
*/
- case to_be_send: /* via network */
- /* reached via drbd_make_request_common
+ case TO_BE_SENT: /* via network */
+ /* reached via __drbd_make_request
* and from w_read_retry_remote */
- D_ASSERT(!(req->rq_state & RQ_NET_MASK));
- req->rq_state |= RQ_NET_PENDING;
- inc_ap_pending(mdev);
+ D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ p = nc->wire_protocol;
+ rcu_read_unlock();
+ req->rq_state |=
+ p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
+ p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
+ mod_rq_state(req, m, 0, RQ_NET_PENDING);
break;
- case to_be_submitted: /* locally */
- /* reached via drbd_make_request_common */
- D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
- req->rq_state |= RQ_LOCAL_PENDING;
+ case TO_BE_SUBMITTED: /* locally */
+ /* reached via __drbd_make_request */
+ D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK));
+ mod_rq_state(req, m, 0, RQ_LOCAL_PENDING);
break;
- case completed_ok:
- if (bio_data_dir(req->master_bio) == WRITE)
- mdev->writ_cnt += req->size>>9;
+ case COMPLETED_OK:
+ if (req->rq_state & RQ_WRITE)
+ device->writ_cnt += req->i.size >> 9;
else
- mdev->read_cnt += req->size>>9;
-
- req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
- req->rq_state &= ~RQ_LOCAL_PENDING;
+ device->read_cnt += req->i.size >> 9;
- _req_may_be_done_not_susp(req, m);
- put_ldev(mdev);
+ mod_rq_state(req, m, RQ_LOCAL_PENDING,
+ RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
break;
- case write_completed_with_error:
- req->rq_state |= RQ_LOCAL_COMPLETED;
- req->rq_state &= ~RQ_LOCAL_PENDING;
-
- __drbd_chk_io_error(mdev, FALSE);
- _req_may_be_done_not_susp(req, m);
- put_ldev(mdev);
+ case ABORT_DISK_IO:
+ mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
break;
- case read_ahead_completed_with_error:
- /* it is legal to fail READA */
- req->rq_state |= RQ_LOCAL_COMPLETED;
- req->rq_state &= ~RQ_LOCAL_PENDING;
- _req_may_be_done_not_susp(req, m);
- put_ldev(mdev);
+ case WRITE_COMPLETED_WITH_ERROR:
+ drbd_report_io_error(device, req);
+ __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
break;
- case read_completed_with_error:
- drbd_set_out_of_sync(mdev, req->sector, req->size);
-
- req->rq_state |= RQ_LOCAL_COMPLETED;
- req->rq_state &= ~RQ_LOCAL_PENDING;
-
- D_ASSERT(!(req->rq_state & RQ_NET_MASK));
-
- __drbd_chk_io_error(mdev, FALSE);
- put_ldev(mdev);
-
- /* no point in retrying if there is no good remote data,
- * or we have no connection. */
- if (mdev->state.pdsk != D_UP_TO_DATE) {
- _req_may_be_done_not_susp(req, m);
- break;
- }
+ case READ_COMPLETED_WITH_ERROR:
+ drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+ drbd_report_io_error(device, req);
+ __drbd_chk_io_error(device, DRBD_READ_ERROR);
+ /* fall through. */
+ case READ_AHEAD_COMPLETED_WITH_ERROR:
+ /* it is legal to fail READA, no __drbd_chk_io_error in that case. */
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+ break;
- /* _req_mod(req,to_be_send); oops, recursion... */
- req->rq_state |= RQ_NET_PENDING;
- inc_ap_pending(mdev);
- /* fall through: _req_mod(req,queue_for_net_read); */
+ case DISCARD_COMPLETED_NOTSUPP:
+ case DISCARD_COMPLETED_WITH_ERROR:
+ /* I'd rather not detach from local disk just because it
+ * failed a REQ_DISCARD. */
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+ break;
- case queue_for_net_read:
+ case QUEUE_FOR_NET_READ:
/* READ or READA, and
* no local disk,
* or target area marked as invalid,
* or just got an io-error. */
- /* from drbd_make_request_common
+ /* from __drbd_make_request
* or from bio_endio during read io-error recovery */
- /* so we can verify the handle in the answer packet
- * corresponding hlist_del is in _req_may_be_done() */
- hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector));
+ /* So we can verify the handle in the answer packet.
+ * Corresponding drbd_remove_request_interval is in
+ * drbd_req_complete() */
+ D_ASSERT(device, drbd_interval_empty(&req->i));
+ drbd_insert_interval(&device->read_requests, &req->i);
- set_bit(UNPLUG_REMOTE, &mdev->flags);
+ set_bit(UNPLUG_REMOTE, &device->flags);
- D_ASSERT(req->rq_state & RQ_NET_PENDING);
- req->rq_state |= RQ_NET_QUEUED;
- req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
- ? w_read_retry_remote
- : w_send_read_req;
- drbd_queue_work(&mdev->data.work, &req->w);
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+ req->w.cb = w_send_read_req;
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &req->w);
break;
- case queue_for_net_write:
+ case QUEUE_FOR_NET_WRITE:
/* assert something? */
- /* from drbd_make_request_common only */
+ /* from __drbd_make_request only */
- hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector));
- /* corresponding hlist_del is in _req_may_be_done() */
+ /* Corresponding drbd_remove_request_interval is in
+ * drbd_req_complete() */
+ D_ASSERT(device, drbd_interval_empty(&req->i));
+ drbd_insert_interval(&device->write_requests, &req->i);
/* NOTE
* In case the req ended up on the transfer log before being
@@ -514,7 +571,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
*
* _req_add_to_epoch(req); this has to be after the
* _maybe_start_new_epoch(req); which happened in
- * drbd_make_request_common, because we now may set the bit
+ * __drbd_make_request, because we now may set the bit
* again ourselves to close the current epoch.
*
* Add req to the (now) current epoch (barrier). */
@@ -522,185 +579,198 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* otherwise we may lose an unplug, which may cause some remote
* io-scheduler timeout to expire, increasing maximum latency,
* hurting performance. */
- set_bit(UNPLUG_REMOTE, &mdev->flags);
-
- /* see drbd_make_request_common,
- * just after it grabs the req_lock */
- D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
-
- req->epoch = mdev->newest_tle->br_number;
-
- /* increment size of current epoch */
- mdev->newest_tle->n_writes++;
+ set_bit(UNPLUG_REMOTE, &device->flags);
/* queue work item to send data */
- D_ASSERT(req->rq_state & RQ_NET_PENDING);
- req->rq_state |= RQ_NET_QUEUED;
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
req->w.cb = w_send_dblock;
- drbd_queue_work(&mdev->data.work, &req->w);
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &req->w);
/* close the epoch, in case it outgrew the limit */
- if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
- queue_barrier(mdev);
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ p = nc->max_epoch_size;
+ rcu_read_unlock();
+ if (first_peer_device(device)->connection->current_tle_writes >= p)
+ start_new_tl_epoch(first_peer_device(device)->connection);
+
+ break;
+ case QUEUE_FOR_SEND_OOS:
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+ req->w.cb = w_send_out_of_sync;
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &req->w);
break;
- case send_canceled:
- /* treat it the same */
- case send_failed:
+ case READ_RETRY_REMOTE_CANCELED:
+ case SEND_CANCELED:
+ case SEND_FAILED:
/* real cleanup will be done from tl_clear. just update flags
* so it is no longer marked as on the worker queue */
- req->rq_state &= ~RQ_NET_QUEUED;
- /* if we did it right, tl_clear should be scheduled only after
- * this, so this should not be necessary! */
- _req_may_be_done_not_susp(req, m);
+ mod_rq_state(req, m, RQ_NET_QUEUED, 0);
break;
- case handed_over_to_network:
+ case HANDED_OVER_TO_NETWORK:
/* assert something? */
if (bio_data_dir(req->master_bio) == WRITE &&
- mdev->net_conf->wire_protocol == DRBD_PROT_A) {
+ !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
/* this is what is dangerous about protocol A:
* pretend it was successfully written on the peer. */
- if (req->rq_state & RQ_NET_PENDING) {
- dec_ap_pending(mdev);
- req->rq_state &= ~RQ_NET_PENDING;
- req->rq_state |= RQ_NET_OK;
- } /* else: neg-ack was faster... */
+ if (req->rq_state & RQ_NET_PENDING)
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
+ /* else: neg-ack was faster... */
/* it is still not yet RQ_NET_DONE until the
* corresponding epoch barrier got acked as well,
* so we know what to dirty on connection loss */
}
- req->rq_state &= ~RQ_NET_QUEUED;
- req->rq_state |= RQ_NET_SENT;
- /* because _drbd_send_zc_bio could sleep, and may want to
- * dereference the bio even after the "write_acked_by_peer" and
- * "completed_ok" events came in, once we return from
- * _drbd_send_zc_bio (drbd_send_dblock), we have to check
- * whether it is done already, and end it. */
- _req_may_be_done_not_susp(req, m);
+ mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+ break;
+
+ case OOS_HANDED_TO_NETWORK:
+ /* Was not set PENDING, no longer QUEUED, so is now DONE
+ * as far as this connection is concerned. */
+ mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE);
break;
- case read_retry_remote_canceled:
- req->rq_state &= ~RQ_NET_QUEUED;
- /* fall through, in case we raced with drbd_disconnect */
- case connection_lost_while_pending:
+ case CONNECTION_LOST_WHILE_PENDING:
/* transfer log cleanup after connection loss */
- /* assert something? */
- if (req->rq_state & RQ_NET_PENDING)
- dec_ap_pending(mdev);
- req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
- req->rq_state |= RQ_NET_DONE;
- /* if it is still queued, we may not complete it here.
- * it will be canceled soon. */
- if (!(req->rq_state & RQ_NET_QUEUED))
- _req_may_be_done(req, m); /* Allowed while state.susp */
+ mod_rq_state(req, m,
+ RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP,
+ RQ_NET_DONE);
break;
- case write_acked_by_peer_and_sis:
- req->rq_state |= RQ_NET_SIS;
- case conflict_discarded_by_peer:
- /* for discarded conflicting writes of multiple primaries,
+ case CONFLICT_RESOLVED:
+ /* for superseded conflicting writes of multiple primaries,
* there is no need to keep anything in the tl, potential
- * node crashes are covered by the activity log. */
- if (what == conflict_discarded_by_peer)
- dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
- " DRBD is not a random data generator!\n",
- (unsigned long long)req->sector, req->size);
- req->rq_state |= RQ_NET_DONE;
- /* fall through */
- case write_acked_by_peer:
+ * node crashes are covered by the activity log.
+ *
+ * If this request had been marked as RQ_POSTPONED before,
+ * it will actually not be completed, but "restarted",
+ * resubmitted from the retry worker context. */
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK);
+ break;
+
+ case WRITE_ACKED_BY_PEER_AND_SIS:
+ req->rq_state |= RQ_NET_SIS;
+ case WRITE_ACKED_BY_PEER:
+ D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
/* protocol C; successfully written on peer.
- * Nothing to do here.
+ * Nothing more to do here.
* We want to keep the tl in place for all protocols, to cater
- * for volatile write-back caches on lower level devices.
- *
- * A barrier request is expected to have forced all prior
- * requests onto stable storage, so completion of a barrier
- * request could set NET_DONE right here, and not wait for the
- * P_BARRIER_ACK, but that is an unnecessary optimization. */
+ * for volatile write-back caches on lower level devices. */
- /* this makes it effectively the same as for: */
- case recv_acked_by_peer:
+ goto ack_common;
+ case RECV_ACKED_BY_PEER:
+ D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
/* protocol B; pretends to be successfully written on peer.
- * see also notes above in handed_over_to_network about
+ * see also notes above in HANDED_OVER_TO_NETWORK about
* protocol != C */
- req->rq_state |= RQ_NET_OK;
- D_ASSERT(req->rq_state & RQ_NET_PENDING);
- dec_ap_pending(mdev);
- req->rq_state &= ~RQ_NET_PENDING;
- _req_may_be_done_not_susp(req, m);
+ ack_common:
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
break;
- case neg_acked:
- /* assert something? */
- if (req->rq_state & RQ_NET_PENDING)
- dec_ap_pending(mdev);
- req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+ case POSTPONE_WRITE:
+ D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+ /* If this node has already detected the write conflict, the
+ * worker will be waiting on misc_wait. Wake it up once this
+ * request has completed locally.
+ */
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_POSTPONED;
+ if (req->i.waiting)
+ wake_up(&device->misc_wait);
+ /* Do not clear RQ_NET_PENDING. This request will make further
+ * progress via restart_conflicting_writes() or
+ * fail_postponed_requests(). Hopefully. */
+ break;
- req->rq_state |= RQ_NET_DONE;
- _req_may_be_done_not_susp(req, m);
- /* else: done by handed_over_to_network */
+ case NEG_ACKED:
+ mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0);
break;
- case fail_frozen_disk_io:
+ case FAIL_FROZEN_DISK_IO:
if (!(req->rq_state & RQ_LOCAL_COMPLETED))
break;
-
- _req_may_be_done(req, m); /* Allowed while state.susp */
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
break;
- case restart_frozen_disk_io:
+ case RESTART_FROZEN_DISK_IO:
if (!(req->rq_state & RQ_LOCAL_COMPLETED))
break;
- req->rq_state &= ~RQ_LOCAL_COMPLETED;
+ mod_rq_state(req, m,
+ RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED,
+ RQ_LOCAL_PENDING);
rv = MR_READ;
if (bio_data_dir(req->master_bio) == WRITE)
rv = MR_WRITE;
- get_ldev(mdev);
+ get_ldev(device); /* always succeeds in this call path */
req->w.cb = w_restart_disk_io;
- drbd_queue_work(&mdev->data.work, &req->w);
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &req->w);
break;
- case resend:
+ case RESEND:
+ /* Simply complete (local only) READs. */
+ if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+ break;
+ }
+
/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
- before the connection loss (B&C only); only P_BARRIER_ACK was missing.
- Trowing them out of the TL here by pretending we got a BARRIER_ACK
- We ensure that the peer was not rebooted */
+ before the connection loss (B&C only); only P_BARRIER_ACK
+ (or the local completion?) was missing when we suspended.
+ Throwing them out of the TL here by pretending we got a BARRIER_ACK.
+ During connection handshake, we ensure that the peer was not rebooted. */
if (!(req->rq_state & RQ_NET_OK)) {
+ /* FIXME could this possibly be a req->dw.cb == w_send_out_of_sync?
+ * in that case we must not set RQ_NET_PENDING. */
+
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
if (req->w.cb) {
- drbd_queue_work(&mdev->data.work, &req->w);
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &req->w);
rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
- }
+ } /* else: FIXME can this happen? */
break;
}
- /* else, fall through to barrier_acked */
+ /* else, fall through to BARRIER_ACKED */
- case barrier_acked:
+ case BARRIER_ACKED:
+ /* barrier ack for READ requests does not make sense */
if (!(req->rq_state & RQ_WRITE))
break;
if (req->rq_state & RQ_NET_PENDING) {
- /* barrier came in before all requests have been acked.
+ /* barrier came in before all requests were acked.
* this is bad, because if the connection is lost now,
* we won't be able to clean them up... */
- dev_err(DEV, "FIXME (barrier_acked but pending)\n");
- list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
+ drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n");
}
- D_ASSERT(req->rq_state & RQ_NET_SENT);
- req->rq_state |= RQ_NET_DONE;
- _req_may_be_done(req, m); /* Allowed while state.susp */
+ /* Allowed to complete requests, even while suspended.
+ * As this is called for all requests within a matching epoch,
+ * we need to filter, and only set RQ_NET_DONE for those that
+ * have actually been on the wire. */
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP,
+ (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0);
break;
- case data_received:
- D_ASSERT(req->rq_state & RQ_NET_PENDING);
- dec_ap_pending(mdev);
- req->rq_state &= ~RQ_NET_PENDING;
- req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
- _req_may_be_done_not_susp(req, m);
+ case DATA_RECEIVED:
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
+ break;
+
+ case QUEUE_AS_DRBD_BARRIER:
+ start_new_tl_epoch(first_peer_device(device)->connection);
+ mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
break;
};
@@ -714,429 +784,670 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
* since size may be bigger than BM_BLOCK_SIZE,
* we may need to check several bits.
*/
-static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size)
{
unsigned long sbnr, ebnr;
sector_t esector, nr_sectors;
- if (mdev->state.disk == D_UP_TO_DATE)
- return 1;
- if (mdev->state.disk >= D_OUTDATED)
- return 0;
- if (mdev->state.disk < D_INCONSISTENT)
- return 0;
- /* state.disk == D_INCONSISTENT We will have a look at the BitMap */
- nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ if (device->state.disk == D_UP_TO_DATE)
+ return true;
+ if (device->state.disk != D_INCONSISTENT)
+ return false;
esector = sector + (size >> 9) - 1;
-
- D_ASSERT(sector < nr_sectors);
- D_ASSERT(esector < nr_sectors);
+ nr_sectors = drbd_get_capacity(device->this_bdev);
+ D_ASSERT(device, sector < nr_sectors);
+ D_ASSERT(device, esector < nr_sectors);
sbnr = BM_SECT_TO_BIT(sector);
ebnr = BM_SECT_TO_BIT(esector);
- return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
+ return drbd_bm_count_bits(device, sbnr, ebnr) == 0;
}
-static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
+static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector,
+ enum drbd_read_balancing rbm)
{
+ struct backing_dev_info *bdi;
+ int stripe_shift;
+
+ switch (rbm) {
+ case RB_CONGESTED_REMOTE:
+ bdi = &device->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
+ return bdi_read_congested(bdi);
+ case RB_LEAST_PENDING:
+ return atomic_read(&device->local_cnt) >
+ atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt);
+ case RB_32K_STRIPING: /* stripe_shift = 15 */
+ case RB_64K_STRIPING:
+ case RB_128K_STRIPING:
+ case RB_256K_STRIPING:
+ case RB_512K_STRIPING:
+ case RB_1M_STRIPING: /* stripe_shift = 20 */
+ stripe_shift = (rbm - RB_32K_STRIPING + 15);
+ return (sector >> (stripe_shift - 9)) & 1;
+ case RB_ROUND_ROBIN:
+ return test_and_change_bit(READ_BALANCE_RR, &device->flags);
+ case RB_PREFER_REMOTE:
+ return true;
+ case RB_PREFER_LOCAL:
+ default:
+ return false;
+ }
+}
+
+/*
+ * complete_conflicting_writes - wait for any conflicting write requests
+ *
+ * The write_requests tree contains all active write requests which we
+ * currently know about. Wait for any requests to complete which conflict with
+ * the new one.
+ *
+ * Only way out: remove the conflicting intervals from the tree.
+ */
+static void complete_conflicting_writes(struct drbd_request *req)
+{
+ DEFINE_WAIT(wait);
+ struct drbd_device *device = req->device;
+ struct drbd_interval *i;
+ sector_t sector = req->i.sector;
+ int size = req->i.size;
+
+ i = drbd_find_overlap(&device->write_requests, sector, size);
+ if (!i)
+ return;
+
+ for (;;) {
+ prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+ i = drbd_find_overlap(&device->write_requests, sector, size);
+ if (!i)
+ break;
+ /* Indicate to wake up device->misc_wait on progress. */
+ i->waiting = true;
+ spin_unlock_irq(&device->resource->req_lock);
+ schedule();
+ spin_lock_irq(&device->resource->req_lock);
+ }
+ finish_wait(&device->misc_wait, &wait);
+}
+
+/* called within req_lock and rcu_read_lock() */
+static void maybe_pull_ahead(struct drbd_device *device)
+{
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct net_conf *nc;
+ bool congested = false;
+ enum drbd_on_congestion on_congestion;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+ rcu_read_unlock();
+ if (on_congestion == OC_BLOCK ||
+ connection->agreed_pro_version < 96)
+ return;
+
+ /* If I don't even have good local storage, we can not reasonably try
+ * to pull ahead of the peer. We also need the local reference to make
+ * sure device->act_log is there.
+ */
+ if (!get_ldev_if_state(device, D_UP_TO_DATE))
+ return;
+
+ if (nc->cong_fill &&
+ atomic_read(&device->ap_in_flight) >= nc->cong_fill) {
+ drbd_info(device, "Congestion-fill threshold reached\n");
+ congested = true;
+ }
+
+ if (device->act_log->used >= nc->cong_extents) {
+ drbd_info(device, "Congestion-extents threshold reached\n");
+ congested = true;
+ }
+
+ if (congested) {
+ /* start a new epoch for non-mirrored writes */
+ start_new_tl_epoch(first_peer_device(device)->connection);
+
+ if (on_congestion == OC_PULL_AHEAD)
+ _drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL);
+ else /*nc->on_congestion == OC_DISCONNECT */
+ _drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL);
+ }
+ put_ldev(device);
+}
+
+/* If this returns false, and req->private_bio is still set,
+ * this should be submitted locally.
+ *
+ * If it returns false, but req->private_bio is not set,
+ * we do not have access to good data :(
+ *
+ * Otherwise, this destroys req->private_bio, if any,
+ * and returns true.
+ */
+static bool do_remote_read(struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ enum drbd_read_balancing rbm;
+
+ if (req->private_bio) {
+ if (!drbd_may_do_local_read(device,
+ req->i.sector, req->i.size)) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
+ }
+ }
+
+ if (device->state.pdsk != D_UP_TO_DATE)
+ return false;
+
+ if (req->private_bio == NULL)
+ return true;
+
+ /* TODO: improve read balancing decisions, take into account drbd
+ * protocol, pending requests etc. */
+
+ rcu_read_lock();
+ rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing;
+ rcu_read_unlock();
+
+ if (rbm == RB_PREFER_LOCAL && req->private_bio)
+ return false; /* submit locally */
+
+ if (remote_due_to_read_balancing(device, req->i.sector, rbm)) {
+ if (req->private_bio) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/* returns number of connections (== 1, for drbd 8.4)
+ * expected to actually write this data,
+ * which does NOT include those that we are L_AHEAD for. */
+static int drbd_process_write_request(struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ int remote, send_oos;
+
+ remote = drbd_should_do_remote(device->state);
+ send_oos = drbd_should_send_out_of_sync(device->state);
+
+ /* Need to replicate writes. Unless it is an empty flush,
+ * which is better mapped to a DRBD P_BARRIER packet,
+ * also for drbd wire protocol compatibility reasons.
+ * If this was a flush, just start a new epoch.
+ * Unless the current epoch was empty anyways, or we are not currently
+ * replicating, in which case there is no point. */
+ if (unlikely(req->i.size == 0)) {
+ /* The only size==0 bios we expect are empty flushes. */
+ D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH);
+ if (remote)
+ _req_mod(req, QUEUE_AS_DRBD_BARRIER);
+ return remote;
+ }
+
+ if (!remote && !send_oos)
+ return 0;
+
+ D_ASSERT(device, !(remote && send_oos));
+
+ if (remote) {
+ _req_mod(req, TO_BE_SENT);
+ _req_mod(req, QUEUE_FOR_NET_WRITE);
+ } else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size))
+ _req_mod(req, QUEUE_FOR_SEND_OOS);
+
+ return remote;
+}
+
+static void
+drbd_submit_req_private_bio(struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ struct bio *bio = req->private_bio;
const int rw = bio_rw(bio);
- const int size = bio->bi_size;
- const sector_t sector = bio->bi_sector;
- struct drbd_tl_epoch *b = NULL;
+
+ bio->bi_bdev = device->ldev->backing_bdev;
+
+ /* State may have changed since we grabbed our reference on the
+ * ->ldev member. Double check, and short-circuit to endio.
+ * In case the last activity log transaction failed to get on
+ * stable storage, and this is a WRITE, we may not even submit
+ * this bio. */
+ if (get_ldev(device)) {
+ if (drbd_insert_fault(device,
+ rw == WRITE ? DRBD_FAULT_DT_WR
+ : rw == READ ? DRBD_FAULT_DT_RD
+ : DRBD_FAULT_DT_RA))
+ bio_endio(bio, -EIO);
+ else
+ generic_make_request(bio);
+ put_ldev(device);
+ } else
+ bio_endio(bio, -EIO);
+}
+
+static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
+{
+ spin_lock(&device->submit.lock);
+ list_add_tail(&req->tl_requests, &device->submit.writes);
+ spin_unlock(&device->submit.lock);
+ queue_work(device->submit.wq, &device->submit.worker);
+}
+
+/* returns the new drbd_request pointer, if the caller is expected to
+ * drbd_send_and_submit() it (to save latency), or NULL if we queued the
+ * request on the submitter thread.
+ * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
+ */
+static struct drbd_request *
+drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+{
+ const int rw = bio_data_dir(bio);
struct drbd_request *req;
- int local, remote;
- int err = -EIO;
- int ret = 0;
/* allocate outside of all locks; */
- req = drbd_req_new(mdev, bio);
+ req = drbd_req_new(device, bio);
if (!req) {
- dec_ap_bio(mdev);
+ dec_ap_bio(device);
/* only pass the error to the upper layers.
* if user cannot handle io errors, that's not our business. */
- dev_err(DEV, "could not kmalloc() req\n");
+ drbd_err(device, "could not kmalloc() req\n");
bio_endio(bio, -ENOMEM);
- return 0;
+ return ERR_PTR(-ENOMEM);
}
+ req->start_time = start_time;
- local = get_ldev(mdev);
- if (!local) {
- bio_put(req->private_bio); /* or we get a bio leak */
+ if (!get_ldev(device)) {
+ bio_put(req->private_bio);
req->private_bio = NULL;
}
- if (rw == WRITE) {
- remote = 1;
- } else {
- /* READ || READA */
- if (local) {
- if (!drbd_may_do_local_read(mdev, sector, size)) {
- /* we could kick the syncer to
- * sync this extent asap, wait for
- * it, then continue locally.
- * Or just issue the request remotely.
- */
- local = 0;
- bio_put(req->private_bio);
- req->private_bio = NULL;
- put_ldev(mdev);
- }
- }
- remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
- }
- /* If we have a disk, but a READA request is mapped to remote,
- * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
- * Just fail that READA request right here.
- *
- * THINK: maybe fail all READA when not local?
- * or make this configurable...
- * if network is slow, READA won't do any good.
- */
- if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
- err = -EWOULDBLOCK;
- goto fail_and_free_req;
- }
+ /* Update disk stats */
+ _drbd_start_io_acct(device, req);
- /* For WRITES going to the local disk, grab a reference on the target
- * extent. This waits for any resync activity in the corresponding
- * resync extent to finish, and, if necessary, pulls in the target
- * extent into the activity log, which involves further disk io because
- * of transactional on-disk meta data updates. */
- if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+ if (rw == WRITE && req->private_bio && req->i.size
+ && !test_bit(AL_SUSPENDED, &device->flags)) {
+ if (!drbd_al_begin_io_fastpath(device, &req->i)) {
+ drbd_queue_write(device, req);
+ return NULL;
+ }
req->rq_state |= RQ_IN_ACT_LOG;
- drbd_al_begin_io(mdev, sector);
}
- remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
- (mdev->state.pdsk == D_INCONSISTENT &&
- mdev->state.conn >= C_CONNECTED));
+ return req;
+}
- if (!(local || remote) && !is_susp(mdev->state)) {
- if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
- goto fail_free_complete;
- }
+static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
+{
+ const int rw = bio_rw(req->master_bio);
+ struct bio_and_error m = { NULL, };
+ bool no_remote = false;
- /* For WRITE request, we have to make sure that we have an
- * unused_spare_tle, in case we need to start a new epoch.
- * I try to be smart and avoid to pre-allocate always "just in case",
- * but there is a race between testing the bit and pointer outside the
- * spinlock, and grabbing the spinlock.
- * if we lost that race, we retry. */
- if (rw == WRITE && remote &&
- mdev->unused_spare_tle == NULL &&
- test_bit(CREATE_BARRIER, &mdev->flags)) {
-allocate_barrier:
- b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
- if (!b) {
- dev_err(DEV, "Failed to alloc barrier.\n");
- err = -ENOMEM;
- goto fail_free_complete;
- }
+ spin_lock_irq(&device->resource->req_lock);
+ if (rw == WRITE) {
+ /* This may temporarily give up the req_lock,
+ * but will re-aquire it before it returns here.
+ * Needs to be before the check on drbd_suspended() */
+ complete_conflicting_writes(req);
+ /* no more giving up req_lock from now on! */
+
+ /* check for congestion, and potentially stop sending
+ * full data updates, but start sending "dirty bits" only. */
+ maybe_pull_ahead(device);
}
- /* GOOD, everything prepared, grab the spin_lock */
- spin_lock_irq(&mdev->req_lock);
-
- if (is_susp(mdev->state)) {
- /* If we got suspended, use the retry mechanism of
- generic_make_request() to restart processing of this
- bio. In the next call to drbd_make_request_26
- we sleep in inc_ap_bio() */
- ret = 1;
- spin_unlock_irq(&mdev->req_lock);
- goto fail_free_complete;
- }
- if (remote) {
- remote = (mdev->state.pdsk == D_UP_TO_DATE ||
- (mdev->state.pdsk == D_INCONSISTENT &&
- mdev->state.conn >= C_CONNECTED));
- if (!remote)
- dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
- if (!(local || remote)) {
- dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
- spin_unlock_irq(&mdev->req_lock);
- goto fail_free_complete;
+ if (drbd_suspended(device)) {
+ /* push back and retry: */
+ req->rq_state |= RQ_POSTPONED;
+ if (req->private_bio) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
}
+ goto out;
}
- if (b && mdev->unused_spare_tle == NULL) {
- mdev->unused_spare_tle = b;
- b = NULL;
- }
- if (rw == WRITE && remote &&
- mdev->unused_spare_tle == NULL &&
- test_bit(CREATE_BARRIER, &mdev->flags)) {
- /* someone closed the current epoch
- * while we were grabbing the spinlock */
- spin_unlock_irq(&mdev->req_lock);
- goto allocate_barrier;
+ /* We fail READ/READA early, if we can not serve it.
+ * We must do this before req is registered on any lists.
+ * Otherwise, drbd_req_complete() will queue failed READ for retry. */
+ if (rw != WRITE) {
+ if (!do_remote_read(req) && !req->private_bio)
+ goto nodata;
}
+ /* which transfer log epoch does this belong to? */
+ req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr);
- /* Update disk stats */
- _drbd_start_io_acct(mdev, req, bio);
+ /* no point in adding empty flushes to the transfer log,
+ * they are mapped to drbd barriers already. */
+ if (likely(req->i.size!=0)) {
+ if (rw == WRITE)
+ first_peer_device(device)->connection->current_tle_writes++;
- /* _maybe_start_new_epoch(mdev);
- * If we need to generate a write barrier packet, we have to add the
- * new epoch (barrier) object, and queue the barrier packet for sending,
- * and queue the req's data after it _within the same lock_, otherwise
- * we have race conditions were the reorder domains could be mixed up.
- *
- * Even read requests may start a new epoch and queue the corresponding
- * barrier packet. To get the write ordering right, we only have to
- * make sure that, if this is a write request and it triggered a
- * barrier packet, this request is queued within the same spinlock. */
- if (remote && mdev->unused_spare_tle &&
- test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
- _tl_add_barrier(mdev, mdev->unused_spare_tle);
- mdev->unused_spare_tle = NULL;
+ list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log);
+ }
+
+ if (rw == WRITE) {
+ if (!drbd_process_write_request(req))
+ no_remote = true;
} else {
- D_ASSERT(!(remote && rw == WRITE &&
- test_bit(CREATE_BARRIER, &mdev->flags)));
+ /* We either have a private_bio, or we can read from remote.
+ * Otherwise we had done the goto nodata above. */
+ if (req->private_bio == NULL) {
+ _req_mod(req, TO_BE_SENT);
+ _req_mod(req, QUEUE_FOR_NET_READ);
+ } else
+ no_remote = true;
}
- /* NOTE
- * Actually, 'local' may be wrong here already, since we may have failed
- * to write to the meta data, and may become wrong anytime because of
- * local io-error for some other request, which would lead to us
- * "detaching" the local disk.
- *
- * 'remote' may become wrong any time because the network could fail.
- *
- * This is a harmless race condition, though, since it is handled
- * correctly at the appropriate places; so it just defers the failure
- * of the respective operation.
- */
+ if (req->private_bio) {
+ /* needs to be marked within the same spinlock */
+ _req_mod(req, TO_BE_SUBMITTED);
+ /* but we need to give up the spinlock to submit */
+ spin_unlock_irq(&device->resource->req_lock);
+ drbd_submit_req_private_bio(req);
+ spin_lock_irq(&device->resource->req_lock);
+ } else if (no_remote) {
+nodata:
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
+ (unsigned long long)req->i.sector, req->i.size >> 9);
+ /* A write may have been queued for send_oos, however.
+ * So we can not simply free it, we must go through drbd_req_put_completion_ref() */
+ }
- /* mark them early for readability.
- * this just sets some state flags. */
- if (remote)
- _req_mod(req, to_be_send);
- if (local)
- _req_mod(req, to_be_submitted);
+out:
+ if (drbd_req_put_completion_ref(req, &m, 1))
+ kref_put(&req->kref, drbd_req_destroy);
+ spin_unlock_irq(&device->resource->req_lock);
- /* check this request on the collision detection hash tables.
- * if we have a conflict, just complete it here.
- * THINK do we want to check reads, too? (I don't think so...) */
- if (rw == WRITE && _req_conflicts(req))
- goto fail_conflicting;
+ if (m.bio)
+ complete_master_bio(device, &m);
+}
- list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+{
+ struct drbd_request *req = drbd_request_prepare(device, bio, start_time);
+ if (IS_ERR_OR_NULL(req))
+ return;
+ drbd_send_and_submit(device, req);
+}
- /* NOTE remote first: to get the concurrent write detection right,
- * we must register the request before start of local IO. */
- if (remote) {
- /* either WRITE and C_CONNECTED,
- * or READ, and no local disk,
- * or READ, but not in sync.
- */
- _req_mod(req, (rw == WRITE)
- ? queue_for_net_write
- : queue_for_net_read);
- }
- spin_unlock_irq(&mdev->req_lock);
- kfree(b); /* if someone else has beaten us to it... */
-
- if (local) {
- req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
-
- /* State may have changed since we grabbed our reference on the
- * mdev->ldev member. Double check, and short-circuit to endio.
- * In case the last activity log transaction failed to get on
- * stable storage, and this is a WRITE, we may not even submit
- * this bio. */
- if (get_ldev(mdev)) {
- if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
- : rw == READ ? DRBD_FAULT_DT_RD
- : DRBD_FAULT_DT_RA))
- bio_endio(req->private_bio, -EIO);
- else
- generic_make_request(req->private_bio);
- put_ldev(mdev);
- } else
- bio_endio(req->private_bio, -EIO);
- }
+static void submit_fast_path(struct drbd_device *device, struct list_head *incoming)
+{
+ struct drbd_request *req, *tmp;
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ const int rw = bio_data_dir(req->master_bio);
- /* we need to plug ALWAYS since we possibly need to kick lo_dev.
- * we plug after submit, so we won't miss an unplug event */
- drbd_plug_device(mdev);
+ if (rw == WRITE /* rw != WRITE should not even end up here! */
+ && req->private_bio && req->i.size
+ && !test_bit(AL_SUSPENDED, &device->flags)) {
+ if (!drbd_al_begin_io_fastpath(device, &req->i))
+ continue;
- return 0;
+ req->rq_state |= RQ_IN_ACT_LOG;
+ }
-fail_conflicting:
- /* this is a conflicting request.
- * even though it may have been only _partially_
- * overlapping with one of the currently pending requests,
- * without even submitting or sending it, we will
- * pretend that it was successfully served right now.
- */
- _drbd_end_io_acct(mdev, req);
- spin_unlock_irq(&mdev->req_lock);
- if (remote)
- dec_ap_pending(mdev);
- /* THINK: do we want to fail it (-EIO), or pretend success?
- * this pretends success. */
- err = 0;
-
-fail_free_complete:
- if (rw == WRITE && local)
- drbd_al_complete_io(mdev, sector);
-fail_and_free_req:
- if (local) {
- bio_put(req->private_bio);
- req->private_bio = NULL;
- put_ldev(mdev);
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(device, req);
}
- if (!ret)
- bio_endio(bio, err);
+}
- drbd_req_free(req);
- dec_ap_bio(mdev);
- kfree(b);
+static bool prepare_al_transaction_nonblock(struct drbd_device *device,
+ struct list_head *incoming,
+ struct list_head *pending)
+{
+ struct drbd_request *req, *tmp;
+ int wake = 0;
+ int err;
+
+ spin_lock_irq(&device->al_lock);
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ err = drbd_al_begin_io_nonblock(device, &req->i);
+ if (err == -EBUSY)
+ wake = 1;
+ if (err)
+ continue;
+ req->rq_state |= RQ_IN_ACT_LOG;
+ list_move_tail(&req->tl_requests, pending);
+ }
+ spin_unlock_irq(&device->al_lock);
+ if (wake)
+ wake_up(&device->al_wait);
- return ret;
+ return !list_empty(pending);
}
-/* helper function for drbd_make_request
- * if we can determine just by the mdev (state) that this request will fail,
- * return 1
- * otherwise return 0
- */
-static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
+void do_submit(struct work_struct *ws)
{
- if (mdev->state.role != R_PRIMARY &&
- (!allow_oos || is_write)) {
- if (__ratelimit(&drbd_ratelimit_state)) {
- dev_err(DEV, "Process %s[%u] tried to %s; "
- "since we are not in Primary state, "
- "we cannot allow this\n",
- current->comm, current->pid,
- is_write ? "WRITE" : "READ");
+ struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
+ LIST_HEAD(incoming);
+ LIST_HEAD(pending);
+ struct drbd_request *req, *tmp;
+
+ for (;;) {
+ spin_lock(&device->submit.lock);
+ list_splice_tail_init(&device->submit.writes, &incoming);
+ spin_unlock(&device->submit.lock);
+
+ submit_fast_path(device, &incoming);
+ if (list_empty(&incoming))
+ break;
+
+skip_fast_path:
+ wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
+ /* Maybe more was queued, while we prepared the transaction?
+ * Try to stuff them into this transaction as well.
+ * Be strictly non-blocking here, no wait_event, we already
+ * have something to commit.
+ * Stop if we don't make any more progres.
+ */
+ for (;;) {
+ LIST_HEAD(more_pending);
+ LIST_HEAD(more_incoming);
+ bool made_progress;
+
+ /* It is ok to look outside the lock,
+ * it's only an optimization anyways */
+ if (list_empty(&device->submit.writes))
+ break;
+
+ spin_lock(&device->submit.lock);
+ list_splice_tail_init(&device->submit.writes, &more_incoming);
+ spin_unlock(&device->submit.lock);
+
+ if (list_empty(&more_incoming))
+ break;
+
+ made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending);
+
+ list_splice_tail_init(&more_pending, &pending);
+ list_splice_tail_init(&more_incoming, &incoming);
+
+ if (!made_progress)
+ break;
+ }
+ drbd_al_begin_io_commit(device, false);
+
+ list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(device, req);
}
- return 1;
- }
- return 0;
+ /* If all currently hot activity log extents are kept busy by
+ * incoming requests, we still must not totally starve new
+ * requests to cold extents. In that case, prepare one request
+ * in blocking mode. */
+ list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
+ list_del_init(&req->tl_requests);
+ req->rq_state |= RQ_IN_ACT_LOG;
+ if (!drbd_al_begin_io_prepare(device, &req->i)) {
+ /* Corresponding extent was hot after all? */
+ drbd_send_and_submit(device, req);
+ } else {
+ /* Found a request to a cold extent.
+ * Put on "pending" list,
+ * and try to cumulate with more. */
+ list_add(&req->tl_requests, &pending);
+ goto skip_fast_path;
+ }
+ }
+ }
}
-int drbd_make_request_26(struct request_queue *q, struct bio *bio)
+void drbd_make_request(struct request_queue *q, struct bio *bio)
{
- unsigned int s_enr, e_enr;
- struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+ struct drbd_device *device = (struct drbd_device *) q->queuedata;
+ unsigned long start_time;
- if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
- bio_endio(bio, -EPERM);
- return 0;
- }
+ start_time = jiffies;
/*
* what we "blindly" assume:
*/
- D_ASSERT(bio->bi_size > 0);
- D_ASSERT((bio->bi_size & 0x1ff) == 0);
- D_ASSERT(bio->bi_idx == 0);
-
- /* to make some things easier, force alignment of requests within the
- * granularity of our hash tables */
- s_enr = bio->bi_sector >> HT_SHIFT;
- e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
-
- if (likely(s_enr == e_enr)) {
- inc_ap_bio(mdev, 1);
- return drbd_make_request_common(mdev, bio);
- }
+ D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
- /* can this bio be split generically?
- * Maybe add our own split-arbitrary-bios function. */
- if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
- /* rather error out here than BUG in bio_split */
- dev_err(DEV, "bio would need to, but cannot, be split: "
- "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
- bio->bi_vcnt, bio->bi_idx, bio->bi_size,
- (unsigned long long)bio->bi_sector);
- bio_endio(bio, -EINVAL);
- } else {
- /* This bio crosses some boundary, so we have to split it. */
- struct bio_pair *bp;
- /* works for the "do not cross hash slot boundaries" case
- * e.g. sector 262269, size 4096
- * s_enr = 262269 >> 6 = 4097
- * e_enr = (262269+8-1) >> 6 = 4098
- * HT_SHIFT = 6
- * sps = 64, mask = 63
- * first_sectors = 64 - (262269 & 63) = 3
- */
- const sector_t sect = bio->bi_sector;
- const int sps = 1 << HT_SHIFT; /* sectors per slot */
- const int mask = sps - 1;
- const sector_t first_sectors = sps - (sect & mask);
- bp = bio_split(bio,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
- bio_split_pool,
-#endif
- first_sectors);
-
- /* we need to get a "reference count" (ap_bio_cnt)
- * to avoid races with the disconnect/reconnect/suspend code.
- * In case we need to split the bio here, we need to get three references
- * atomically, otherwise we might deadlock when trying to submit the
- * second one! */
- inc_ap_bio(mdev, 3);
-
- D_ASSERT(e_enr == s_enr + 1);
-
- while (drbd_make_request_common(mdev, &bp->bio1))
- inc_ap_bio(mdev, 1);
-
- while (drbd_make_request_common(mdev, &bp->bio2))
- inc_ap_bio(mdev, 1);
-
- dec_ap_bio(mdev);
-
- bio_pair_release(bp);
- }
- return 0;
+ inc_ap_bio(device);
+ __drbd_make_request(device, bio, start_time);
}
-/* This is called by bio_add_page(). With this function we reduce
- * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
- * units (was AL_EXTENTs).
+/* This is called by bio_add_page().
+ *
+ * q->max_hw_sectors and other global limits are already enforced there.
*
- * we do the calculation within the lower 32bit of the byte offsets,
- * since we don't care for actual offset, but only check whether it
- * would cross "activity log extent" boundaries.
+ * We need to call down to our lower level device,
+ * in case it has special restrictions.
+ *
+ * We also may need to enforce configured max-bio-bvecs limits.
*
* As long as the BIO is empty we have to allow at least one bvec,
- * regardless of size and offset. so the resulting bio may still
- * cross extent boundaries. those are dealt with (bio_split) in
- * drbd_make_request_26.
+ * regardless of size and offset, so no need to ask lower levels.
*/
int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
{
- struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
- unsigned int bio_offset =
- (unsigned int)bvm->bi_sector << 9; /* 32 bit */
+ struct drbd_device *device = (struct drbd_device *) q->queuedata;
unsigned int bio_size = bvm->bi_size;
- int limit, backing_limit;
-
- limit = DRBD_MAX_SEGMENT_SIZE
- - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
- if (limit < 0)
- limit = 0;
- if (bio_size == 0) {
- if (limit <= bvec->bv_len)
- limit = bvec->bv_len;
- } else if (limit && get_ldev(mdev)) {
+ int limit = DRBD_MAX_BIO_SIZE;
+ int backing_limit;
+
+ if (bio_size && get_ldev(device)) {
+ unsigned int max_hw_sectors = queue_max_hw_sectors(q);
struct request_queue * const b =
- mdev->ldev->backing_bdev->bd_disk->queue;
+ device->ldev->backing_bdev->bd_disk->queue;
if (b->merge_bvec_fn) {
backing_limit = b->merge_bvec_fn(b, bvm, bvec);
limit = min(limit, backing_limit);
}
- put_ldev(mdev);
+ put_ldev(device);
+ if ((limit >> 9) > max_hw_sectors)
+ limit = max_hw_sectors << 9;
}
return limit;
}
+
+static void find_oldest_requests(
+ struct drbd_connection *connection,
+ struct drbd_device *device,
+ struct drbd_request **oldest_req_waiting_for_peer,
+ struct drbd_request **oldest_req_waiting_for_disk)
+{
+ struct drbd_request *r;
+ *oldest_req_waiting_for_peer = NULL;
+ *oldest_req_waiting_for_disk = NULL;
+ list_for_each_entry(r, &connection->transfer_log, tl_requests) {
+ const unsigned s = r->rq_state;
+ if (!*oldest_req_waiting_for_peer
+ && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
+ *oldest_req_waiting_for_peer = r;
+
+ if (!*oldest_req_waiting_for_disk
+ && (s & RQ_LOCAL_PENDING) && r->device == device)
+ *oldest_req_waiting_for_disk = r;
+
+ if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
+ break;
+ }
+}
+
+void request_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct drbd_request *req_disk, *req_peer; /* oldest request */
+ struct net_conf *nc;
+ unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+ unsigned long now;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (nc && device->state.conn >= C_WF_REPORT_PARAMS)
+ ent = nc->timeout * HZ/10 * nc->ko_count;
+
+ if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
+ dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
+ put_ldev(device);
+ }
+ rcu_read_unlock();
+
+ et = min_not_zero(dt, ent);
+
+ if (!et)
+ return; /* Recurring timer stopped */
+
+ now = jiffies;
+
+ spin_lock_irq(&device->resource->req_lock);
+ find_oldest_requests(connection, device, &req_peer, &req_disk);
+ if (req_peer == NULL && req_disk == NULL) {
+ spin_unlock_irq(&device->resource->req_lock);
+ mod_timer(&device->request_timer, now + et);
+ return;
+ }
+
+ /* The request is considered timed out, if
+ * - we have some effective timeout from the configuration,
+ * with above state restrictions applied,
+ * - the oldest request is waiting for a response from the network
+ * resp. the local disk,
+ * - the oldest request is in fact older than the effective timeout,
+ * - the connection was established (resp. disk was attached)
+ * for longer than the timeout already.
+ * Note that for 32bit jiffies and very stable connections/disks,
+ * we may have a wrap around, which is catched by
+ * !time_in_range(now, last_..._jif, last_..._jif + timeout).
+ *
+ * Side effect: once per 32bit wrap-around interval, which means every
+ * ~198 days with 250 HZ, we have a window where the timeout would need
+ * to expire twice (worst case) to become effective. Good enough.
+ */
+ if (ent && req_peer &&
+ time_after(now, req_peer->start_time + ent) &&
+ !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
+ drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
+ _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
+ }
+ if (dt && req_disk &&
+ time_after(now, req_disk->start_time + dt) &&
+ !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
+ drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
+ __drbd_chk_io_error(device, DRBD_FORCE_DETACH);
+ }
+
+ /* Reschedule timer for the nearest not already expired timeout.
+ * Fallback to now + min(effective network timeout, disk timeout). */
+ ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
+ ? req_peer->start_time + ent : now + et;
+ dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
+ ? req_disk->start_time + dt : now + et;
+ nt = time_before(ent, dt) ? ent : dt;
+ spin_unlock_irq(&connection->resource->req_lock);
+ mod_timer(&device->request_timer, nt);
+}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index ab2bd09d54b..8566cd5866b 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -30,7 +30,6 @@
#include <linux/slab.h>
#include <linux/drbd.h>
#include "drbd_int.h"
-#include "drbd_wrappers.h"
/* The request callbacks will be called in irq context by the IDE drivers,
and in Softirqs/Tasklets/BH context by the SCSI drivers,
@@ -77,37 +76,52 @@
*/
enum drbd_req_event {
- created,
- to_be_send,
- to_be_submitted,
+ CREATED,
+ TO_BE_SENT,
+ TO_BE_SUBMITTED,
/* XXX yes, now I am inconsistent...
- * these two are not "events" but "actions"
+ * these are not "events" but "actions"
* oh, well... */
- queue_for_net_write,
- queue_for_net_read,
-
- send_canceled,
- send_failed,
- handed_over_to_network,
- connection_lost_while_pending,
- read_retry_remote_canceled,
- recv_acked_by_peer,
- write_acked_by_peer,
- write_acked_by_peer_and_sis, /* and set_in_sync */
- conflict_discarded_by_peer,
- neg_acked,
- barrier_acked, /* in protocol A and B */
- data_received, /* (remote read) */
-
- read_completed_with_error,
- read_ahead_completed_with_error,
- write_completed_with_error,
- completed_ok,
- resend,
- fail_frozen_disk_io,
- restart_frozen_disk_io,
- nothing, /* for tracing only */
+ QUEUE_FOR_NET_WRITE,
+ QUEUE_FOR_NET_READ,
+ QUEUE_FOR_SEND_OOS,
+
+ /* An empty flush is queued as P_BARRIER,
+ * which will cause it to complete "successfully",
+ * even if the local disk flush failed.
+ *
+ * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
+ * only see an error if neither local nor remote data is reachable. */
+ QUEUE_AS_DRBD_BARRIER,
+
+ SEND_CANCELED,
+ SEND_FAILED,
+ HANDED_OVER_TO_NETWORK,
+ OOS_HANDED_TO_NETWORK,
+ CONNECTION_LOST_WHILE_PENDING,
+ READ_RETRY_REMOTE_CANCELED,
+ RECV_ACKED_BY_PEER,
+ WRITE_ACKED_BY_PEER,
+ WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
+ CONFLICT_RESOLVED,
+ POSTPONE_WRITE,
+ NEG_ACKED,
+ BARRIER_ACKED, /* in protocol A and B */
+ DATA_RECEIVED, /* (remote read) */
+
+ COMPLETED_OK,
+ READ_COMPLETED_WITH_ERROR,
+ READ_AHEAD_COMPLETED_WITH_ERROR,
+ WRITE_COMPLETED_WITH_ERROR,
+ DISCARD_COMPLETED_NOTSUPP,
+ DISCARD_COMPLETED_WITH_ERROR,
+
+ ABORT_DISK_IO,
+ RESEND,
+ FAIL_FROZEN_DISK_IO,
+ RESTART_FROZEN_DISK_IO,
+ NOTHING,
};
/* encoding of request states for now. we don't actually need that many bits.
@@ -116,18 +130,21 @@ enum drbd_req_event {
* same time, so we should hold the request lock anyways.
*/
enum drbd_req_state_bits {
- /* 210
- * 000: no local possible
- * 001: to be submitted
+ /* 3210
+ * 0000: no local possible
+ * 0001: to be submitted
* UNUSED, we could map: 011: submitted, completion still pending
- * 110: completed ok
- * 010: completed with error
+ * 0110: completed ok
+ * 0010: completed with error
+ * 1001: Aborted (before completion)
+ * 1x10: Aborted and completed -> free
*/
__RQ_LOCAL_PENDING,
__RQ_LOCAL_COMPLETED,
__RQ_LOCAL_OK,
+ __RQ_LOCAL_ABORTED,
- /* 76543
+ /* 87654
* 00000: no network possible
* 00001: to be send
* 00011: to be send, on worker queue
@@ -136,8 +153,8 @@ enum drbd_req_state_bits {
* recv_ack (B) or implicit "ack" (A),
* still waiting for the barrier ack.
* master_bio may already be completed and invalidated.
- * 11100: write_acked (C),
- * data_received (for remote read, any protocol)
+ * 11100: write acked (C),
+ * data received (for remote read, any protocol)
* or finally the barrier ack has arrived (B,A)...
* request can be freed
* 01100: neg-acked (write, protocol C)
@@ -192,13 +209,30 @@ enum drbd_req_state_bits {
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
+
+ /* The peer has sent a retry ACK */
+ __RQ_POSTPONED,
+
+ /* would have been completed,
+ * but was not, because of drbd_suspended() */
+ __RQ_COMPLETION_SUSP,
+
+ /* We expect a receive ACK (wire proto B) */
+ __RQ_EXP_RECEIVE_ACK,
+
+ /* We expect a write ACK (wite proto C) */
+ __RQ_EXP_WRITE_ACK,
+
+ /* waiting for a barrier ack, did an extra kref_get */
+ __RQ_EXP_BARR_ACK,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED)
-#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
+#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1)
#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
@@ -212,56 +246,16 @@ enum drbd_req_state_bits {
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
+#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
+#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
+#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
+#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
+#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
/* For waking up the frozen transfer log mod_req() has to return if the request
should be counted in the epoch object*/
-#define MR_WRITE_SHIFT 0
-#define MR_WRITE (1 << MR_WRITE_SHIFT)
-#define MR_READ_SHIFT 1
-#define MR_READ (1 << MR_READ_SHIFT)
-
-/* epoch entries */
-static inline
-struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
-{
- BUG_ON(mdev->ee_hash_s == 0);
- return mdev->ee_hash +
- ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
-}
-
-/* transfer log (drbd_request objects) */
-static inline
-struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
-{
- BUG_ON(mdev->tl_hash_s == 0);
- return mdev->tl_hash +
- ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
-}
-
-/* application reads (drbd_request objects) */
-static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
-{
- return mdev->app_reads_hash
- + ((unsigned int)(sector) % APP_R_HSIZE);
-}
-
-/* when we receive the answer for a read request,
- * verify that we actually know about it */
-static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
- u64 id, sector_t sector)
-{
- struct hlist_head *slot = ar_hash_slot(mdev, sector);
- struct hlist_node *n;
- struct drbd_request *req;
-
- hlist_for_each_entry(req, n, slot, colision) {
- if ((unsigned long)req == (unsigned long)id) {
- D_ASSERT(req->sector == sector);
- return req;
- }
- }
- return NULL;
-}
+#define MR_WRITE 1
+#define MR_READ 2
static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
{
@@ -271,90 +265,86 @@ static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bi
req->private_bio = bio;
bio->bi_private = req;
- bio->bi_end_io = drbd_endio_pri;
+ bio->bi_end_io = drbd_request_endio;
bio->bi_next = NULL;
}
-static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
- struct bio *bio_src)
-{
- struct drbd_request *req =
- mempool_alloc(drbd_request_mempool, GFP_NOIO);
- if (likely(req)) {
- drbd_req_make_private_bio(req, bio_src);
-
- req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
- req->mdev = mdev;
- req->master_bio = bio_src;
- req->epoch = 0;
- req->sector = bio_src->bi_sector;
- req->size = bio_src->bi_size;
- req->start_time = jiffies;
- INIT_HLIST_NODE(&req->colision);
- INIT_LIST_HEAD(&req->tl_requests);
- INIT_LIST_HEAD(&req->w.list);
- }
- return req;
-}
-
-static inline void drbd_req_free(struct drbd_request *req)
-{
- mempool_free(req, drbd_request_mempool);
-}
-
-static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
-{
- return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
-}
-
/* Short lived temporary struct on the stack.
* We could squirrel the error to be returned into
- * bio->bi_size, or similar. But that would be too ugly. */
+ * bio->bi_iter.bi_size, or similar. But that would be too ugly. */
struct bio_and_error {
struct bio *bio;
int error;
};
+extern void start_new_tl_epoch(struct drbd_connection *connection);
+extern void drbd_req_destroy(struct kref *kref);
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m);
-extern void complete_master_bio(struct drbd_conf *mdev,
+extern void complete_master_bio(struct drbd_device *device,
struct bio_and_error *m);
+extern void request_timer_fn(unsigned long data);
+extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+
+/* this is in drbd_main.c */
+extern void drbd_restart_request(struct drbd_request *req);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
- struct drbd_conf *mdev = req->mdev;
+ struct drbd_device *device = req->device;
struct bio_and_error m;
int rv;
/* __req_mod possibly frees req, do not touch req after that! */
rv = __req_mod(req, what, &m);
if (m.bio)
- complete_master_bio(mdev, &m);
+ complete_master_bio(device, &m);
return rv;
}
-/* completion of master bio is outside of spinlock.
- * If you need it irqsave, do it your self!
- * Which means: don't use from bio endio callback. */
+/* completion of master bio is outside of our spinlock.
+ * We still may or may not be inside some irqs disabled section
+ * of the lower level driver completion callback, so we need to
+ * spin_lock_irqsave here. */
static inline int req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
- struct drbd_conf *mdev = req->mdev;
+ unsigned long flags;
+ struct drbd_device *device = req->device;
struct bio_and_error m;
int rv;
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irqsave(&device->resource->req_lock, flags);
rv = __req_mod(req, what, &m);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
if (m.bio)
- complete_master_bio(mdev, &m);
+ complete_master_bio(device, &m);
return rv;
}
+
+static inline bool drbd_should_do_remote(union drbd_dev_state s)
+{
+ return s.pdsk == D_UP_TO_DATE ||
+ (s.pdsk >= D_INCONSISTENT &&
+ s.conn >= C_WF_BITMAP_T &&
+ s.conn < C_AHEAD);
+ /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
+ That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
+ states. */
+}
+static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
+{
+ return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
+ /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
+ since we enter state C_AHEAD only if proto >= 96 */
+}
+
#endif
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
new file mode 100644
index 00000000000..a5d8aae00e0
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.c
@@ -0,0 +1,1884 @@
+/*
+ drbd_state.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+ from Logicworks, Inc. for making SDP replication support possible.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+
+struct after_state_chg_work {
+ struct drbd_work w;
+ struct drbd_device *device;
+ union drbd_state os;
+ union drbd_state ns;
+ enum chg_state_flags flags;
+ struct completion *done;
+};
+
+enum sanitize_state_warnings {
+ NO_WARNING,
+ ABORTED_ONLINE_VERIFY,
+ ABORTED_RESYNC,
+ CONNECTION_LOST_NEGOTIATING,
+ IMPLICITLY_UPGRADED_DISK,
+ IMPLICITLY_UPGRADED_PDSK,
+};
+
+static int w_after_state_ch(struct drbd_work *w, int unused);
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags);
+static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
+static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
+static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum sanitize_state_warnings *warn);
+
+static inline bool is_susp(union drbd_state s)
+{
+ return s.susp || s.susp_nod || s.susp_fen;
+}
+
+bool conn_all_vols_unconf(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ bool rv = true;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (device->state.disk != D_DISKLESS ||
+ device->state.conn != C_STANDALONE ||
+ device->state.role != R_SECONDARY) {
+ rv = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/* Unfortunately the states where not correctly ordered, when
+ they where defined. therefore can not use max_t() here. */
+static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
+{
+ if (role1 == R_PRIMARY || role2 == R_PRIMARY)
+ return R_PRIMARY;
+ if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+ return R_SECONDARY;
+ return R_UNKNOWN;
+}
+static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
+{
+ if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
+ return R_UNKNOWN;
+ if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+ return R_SECONDARY;
+ return R_PRIMARY;
+}
+
+enum drbd_role conn_highest_role(struct drbd_connection *connection)
+{
+ enum drbd_role role = R_UNKNOWN;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ role = max_role(role, device->state.role);
+ }
+ rcu_read_unlock();
+
+ return role;
+}
+
+enum drbd_role conn_highest_peer(struct drbd_connection *connection)
+{
+ enum drbd_role peer = R_UNKNOWN;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ peer = max_role(peer, device->state.peer);
+ }
+ rcu_read_unlock();
+
+ return peer;
+}
+
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection)
+{
+ enum drbd_disk_state ds = D_DISKLESS;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ ds = max_t(enum drbd_disk_state, ds, device->state.disk);
+ }
+ rcu_read_unlock();
+
+ return ds;
+}
+
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection)
+{
+ enum drbd_disk_state ds = D_MASK;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ ds = min_t(enum drbd_disk_state, ds, device->state.disk);
+ }
+ rcu_read_unlock();
+
+ return ds;
+}
+
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection)
+{
+ enum drbd_disk_state ds = D_DISKLESS;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ ds = max_t(enum drbd_disk_state, ds, device->state.pdsk);
+ }
+ rcu_read_unlock();
+
+ return ds;
+}
+
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection)
+{
+ enum drbd_conns conn = C_MASK;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ conn = min_t(enum drbd_conns, conn, device->state.conn);
+ }
+ rcu_read_unlock();
+
+ return conn;
+}
+
+static bool no_peer_wf_report_params(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+ bool rv = true;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ if (peer_device->device->state.conn == C_WF_REPORT_PARAMS) {
+ rv = false;
+ break;
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+
+/**
+ * cl_wide_st_chg() - true if the state change is a cluster wide one
+ * @device: DRBD device.
+ * @os: old (current) state.
+ * @ns: new (wanted) state.
+ */
+static int cl_wide_st_chg(struct drbd_device *device,
+ union drbd_state os, union drbd_state ns)
+{
+ return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+ ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+ (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+ (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
+ (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+ (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) ||
+ (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS);
+}
+
+static union drbd_state
+apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val)
+{
+ union drbd_state ns;
+ ns.i = (os.i & ~mask.i) | val.i;
+ return ns;
+}
+
+enum drbd_state_rv
+drbd_change_state(struct drbd_device *device, enum chg_state_flags f,
+ union drbd_state mask, union drbd_state val)
+{
+ unsigned long flags;
+ union drbd_state ns;
+ enum drbd_state_rv rv;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ ns = apply_mask_val(drbd_read_state(device), mask, val);
+ rv = _drbd_set_state(device, ns, f, NULL);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ */
+void drbd_force_state(struct drbd_device *device,
+ union drbd_state mask, union drbd_state val)
+{
+ drbd_change_state(device, CS_HARD, mask, val);
+}
+
+static enum drbd_state_rv
+_req_st_cond(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val)
+{
+ union drbd_state os, ns;
+ unsigned long flags;
+ enum drbd_state_rv rv;
+
+ if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags))
+ return SS_CW_SUCCESS;
+
+ if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags))
+ return SS_CW_FAILED_BY_PEER;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ os = drbd_read_state(device);
+ ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+ rv = is_valid_transition(os, ns);
+ if (rv >= SS_SUCCESS)
+ rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+
+ if (!cl_wide_st_chg(device, os, ns))
+ rv = SS_CW_NO_NEED;
+ if (rv == SS_UNKNOWN_ERROR) {
+ rv = is_valid_state(device, ns);
+ if (rv >= SS_SUCCESS) {
+ rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+ if (rv >= SS_SUCCESS)
+ rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+ }
+ }
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_req_state() - Perform an eventually cluster wide state change
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+static enum drbd_state_rv
+drbd_req_state(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ struct completion done;
+ unsigned long flags;
+ union drbd_state os, ns;
+ enum drbd_state_rv rv;
+
+ init_completion(&done);
+
+ if (f & CS_SERIALIZE)
+ mutex_lock(device->state_mutex);
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ os = drbd_read_state(device);
+ ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+ rv = is_valid_transition(os, ns);
+ if (rv < SS_SUCCESS) {
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ goto abort;
+ }
+
+ if (cl_wide_st_chg(device, os, ns)) {
+ rv = is_valid_state(device, ns);
+ if (rv == SS_SUCCESS)
+ rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (rv < SS_SUCCESS) {
+ if (f & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ goto abort;
+ }
+
+ if (drbd_send_state_req(first_peer_device(device), mask, val)) {
+ rv = SS_CW_FAILED_BY_PEER;
+ if (f & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ goto abort;
+ }
+
+ wait_event(device->state_wait,
+ (rv = _req_st_cond(device, mask, val)));
+
+ if (rv < SS_SUCCESS) {
+ if (f & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ goto abort;
+ }
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ ns = apply_mask_val(drbd_read_state(device), mask, val);
+ rv = _drbd_set_state(device, ns, f, &done);
+ } else {
+ rv = _drbd_set_state(device, ns, f, &done);
+ }
+
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+ D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
+ wait_for_completion(&done);
+ }
+
+abort:
+ if (f & CS_SERIALIZE)
+ mutex_unlock(device->state_mutex);
+
+ return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
+ * flag, or when logging of failed state change requests is not desired.
+ */
+enum drbd_state_rv
+_drbd_request_state(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ enum drbd_state_rv rv;
+
+ wait_event(device->state_wait,
+ (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+ return rv;
+}
+
+static void print_st(struct drbd_device *device, char *name, union drbd_state ns)
+{
+ drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
+ name,
+ drbd_conn_str(ns.conn),
+ drbd_role_str(ns.role),
+ drbd_role_str(ns.peer),
+ drbd_disk_str(ns.disk),
+ drbd_disk_str(ns.pdsk),
+ is_susp(ns) ? 's' : 'r',
+ ns.aftr_isp ? 'a' : '-',
+ ns.peer_isp ? 'p' : '-',
+ ns.user_isp ? 'u' : '-',
+ ns.susp_fen ? 'F' : '-',
+ ns.susp_nod ? 'N' : '-'
+ );
+}
+
+void print_st_err(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum drbd_state_rv err)
+{
+ if (err == SS_IN_TRANSIENT_STATE)
+ return;
+ drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err));
+ print_st(device, " state", os);
+ print_st(device, "wanted", ns);
+}
+
+static long print_state_change(char *pb, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char *pbp;
+ pbp = pb;
+ *pbp = 0;
+
+ if (ns.role != os.role && flags & CS_DC_ROLE)
+ pbp += sprintf(pbp, "role( %s -> %s ) ",
+ drbd_role_str(os.role),
+ drbd_role_str(ns.role));
+ if (ns.peer != os.peer && flags & CS_DC_PEER)
+ pbp += sprintf(pbp, "peer( %s -> %s ) ",
+ drbd_role_str(os.peer),
+ drbd_role_str(ns.peer));
+ if (ns.conn != os.conn && flags & CS_DC_CONN)
+ pbp += sprintf(pbp, "conn( %s -> %s ) ",
+ drbd_conn_str(os.conn),
+ drbd_conn_str(ns.conn));
+ if (ns.disk != os.disk && flags & CS_DC_DISK)
+ pbp += sprintf(pbp, "disk( %s -> %s ) ",
+ drbd_disk_str(os.disk),
+ drbd_disk_str(ns.disk));
+ if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK)
+ pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
+ drbd_disk_str(os.pdsk),
+ drbd_disk_str(ns.pdsk));
+
+ return pbp - pb;
+}
+
+static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char pb[300];
+ char *pbp = pb;
+
+ pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK);
+
+ if (ns.aftr_isp != os.aftr_isp)
+ pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
+ os.aftr_isp,
+ ns.aftr_isp);
+ if (ns.peer_isp != os.peer_isp)
+ pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
+ os.peer_isp,
+ ns.peer_isp);
+ if (ns.user_isp != os.user_isp)
+ pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
+ os.user_isp,
+ ns.user_isp);
+
+ if (pbp != pb)
+ drbd_info(device, "%s\n", pb);
+}
+
+static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char pb[300];
+ char *pbp = pb;
+
+ pbp += print_state_change(pbp, os, ns, flags);
+
+ if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP)
+ pbp += sprintf(pbp, "susp( %d -> %d ) ",
+ is_susp(os),
+ is_susp(ns));
+
+ if (pbp != pb)
+ drbd_info(connection, "%s\n", pb);
+}
+
+
+/**
+ * is_valid_state() - Returns an SS_ error code if ns is not valid
+ * @device: DRBD device.
+ * @ns: State to consider.
+ */
+static enum drbd_state_rv
+is_valid_state(struct drbd_device *device, union drbd_state ns)
+{
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ enum drbd_fencing_p fp;
+ enum drbd_state_rv rv = SS_SUCCESS;
+ struct net_conf *nc;
+
+ rcu_read_lock();
+ fp = FP_DONT_CARE;
+ if (get_ldev(device)) {
+ fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ put_ldev(device);
+ }
+
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (nc) {
+ if (!nc->two_primaries && ns.role == R_PRIMARY) {
+ if (ns.peer == R_PRIMARY)
+ rv = SS_TWO_PRIMARIES;
+ else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY)
+ rv = SS_O_VOL_PEER_PRI;
+ }
+ }
+
+ if (rv <= 0)
+ /* already found a reason to abort */;
+ else if (ns.role == R_SECONDARY && device->open_cnt)
+ rv = SS_DEVICE_IN_USE;
+
+ else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (fp >= FP_RESOURCE &&
+ ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+ rv = SS_PRIMARY_NOP;
+
+ else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+ rv = SS_NO_LOCAL_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+ rv = SS_NO_REMOTE_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if ((ns.conn == C_CONNECTED ||
+ ns.conn == C_WF_BITMAP_S ||
+ ns.conn == C_SYNC_SOURCE ||
+ ns.conn == C_PAUSED_SYNC_S) &&
+ ns.disk == D_OUTDATED)
+ rv = SS_CONNECTED_OUTDATES;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ (nc->verify_alg[0] == 0))
+ rv = SS_NO_VERIFY_ALG;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ first_peer_device(device)->connection->agreed_pro_version < 88)
+ rv = SS_NOT_SUPPORTED;
+
+ else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+ ns.pdsk == D_UNKNOWN)
+ rv = SS_NEED_CONNECTION;
+
+ else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
+ rv = SS_CONNECTED_OUTDATES;
+
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/**
+ * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible
+ * This function limits state transitions that may be declined by DRBD. I.e.
+ * user requests (aka soft transitions).
+ * @device: DRBD device.
+ * @ns: new state.
+ * @os: old state.
+ */
+static enum drbd_state_rv
+is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+
+ if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+ os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+ rv = SS_ALREADY_STANDALONE;
+
+ if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+ rv = SS_IS_DISKLESS;
+
+ if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+ rv = SS_NO_NET_CONFIG;
+
+ if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+ rv = SS_LOWER_THAN_OUTDATED;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+ rv = SS_IN_TRANSIENT_STATE;
+
+ /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
+ rv = SS_IN_TRANSIENT_STATE; */
+
+ /* While establishing a connection only allow cstate to change.
+ Delay/refuse role changes, detach attach etc... */
+ if (test_bit(STATE_SENT, &connection->flags) &&
+ !(os.conn == C_WF_REPORT_PARAMS ||
+ (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
+ rv = SS_IN_TRANSIENT_STATE;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ ns.conn != os.conn && os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+ os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
+ && os.conn < C_WF_REPORT_PARAMS)
+ rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+
+ if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
+ os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
+ rv = SS_OUTDATE_WO_CONN;
+
+ return rv;
+}
+
+static enum drbd_state_rv
+is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc)
+{
+ /* no change -> nothing to do, at least for the connection part */
+ if (oc == nc)
+ return SS_NOTHING_TO_DO;
+
+ /* disconnect of an unconfigured connection does not make sense */
+ if (oc == C_STANDALONE && nc == C_DISCONNECTING)
+ return SS_ALREADY_STANDALONE;
+
+ /* from C_STANDALONE, we start with C_UNCONNECTED */
+ if (oc == C_STANDALONE && nc != C_UNCONNECTED)
+ return SS_NEED_CONNECTION;
+
+ /* When establishing a connection we need to go through WF_REPORT_PARAMS!
+ Necessary to do the right thing upon invalidate-remote on a disconnected resource */
+ if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED)
+ return SS_NEED_CONNECTION;
+
+ /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */
+ if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING)
+ return SS_IN_TRANSIENT_STATE;
+
+ /* After C_DISCONNECTING only C_STANDALONE may follow */
+ if (oc == C_DISCONNECTING && nc != C_STANDALONE)
+ return SS_IN_TRANSIENT_STATE;
+
+ return SS_SUCCESS;
+}
+
+
+/**
+ * is_valid_transition() - Returns an SS_ error code if the state transition is not possible
+ * This limits hard state transitions. Hard state transitions are facts there are
+ * imposed on DRBD by the environment. E.g. disk broke or network broke down.
+ * But those hard state transitions are still not allowed to do everything.
+ * @ns: new state.
+ * @os: old state.
+ */
+static enum drbd_state_rv
+is_valid_transition(union drbd_state os, union drbd_state ns)
+{
+ enum drbd_state_rv rv;
+
+ rv = is_valid_conn_transition(os.conn, ns.conn);
+
+ /* we cannot fail (again) if we already detached */
+ if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
+ rv = SS_IS_DISKLESS;
+
+ return rv;
+}
+
+static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn)
+{
+ static const char *msg_table[] = {
+ [NO_WARNING] = "",
+ [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+ [ABORTED_RESYNC] = "Resync aborted.",
+ [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+ [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+ [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+ };
+
+ if (warn != NO_WARNING)
+ drbd_warn(device, "%s\n", msg_table[warn]);
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @device: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @warn_sync_abort:
+ *
+ * When we loose connection, we have to set the state of the peers disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum sanitize_state_warnings *warn)
+{
+ enum drbd_fencing_p fp;
+ enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
+
+ if (warn)
+ *warn = NO_WARNING;
+
+ fp = FP_DONT_CARE;
+ if (get_ldev(device)) {
+ rcu_read_lock();
+ fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ rcu_read_unlock();
+ put_ldev(device);
+ }
+
+ /* Implications from connection to peer and peer_isp */
+ if (ns.conn < C_CONNECTED) {
+ ns.peer_isp = 0;
+ ns.peer = R_UNKNOWN;
+ if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+ ns.pdsk = D_UNKNOWN;
+ }
+
+ /* Clear the aftr_isp when becoming unconfigured */
+ if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+ ns.aftr_isp = 0;
+
+ /* An implication of the disk states onto the connection state */
+ /* Abort resync if a disk fails/detaches */
+ if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+ if (warn)
+ *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ?
+ ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
+ ns.conn = C_CONNECTED;
+ }
+
+ /* Connection breaks down before we finished "Negotiating" */
+ if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+ get_ldev_if_state(device, D_NEGOTIATING)) {
+ if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) {
+ ns.disk = device->new_state_tmp.disk;
+ ns.pdsk = device->new_state_tmp.pdsk;
+ } else {
+ if (warn)
+ *warn = CONNECTION_LOST_NEGOTIATING;
+ ns.disk = D_DISKLESS;
+ ns.pdsk = D_UNKNOWN;
+ }
+ put_ldev(device);
+ }
+
+ /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
+ if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
+ if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
+ ns.disk = D_UP_TO_DATE;
+ if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
+ ns.pdsk = D_UP_TO_DATE;
+ }
+
+ /* Implications of the connection stat on the disk states */
+ disk_min = D_DISKLESS;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_UNKNOWN;
+ switch ((enum drbd_conns)ns.conn) {
+ case C_WF_BITMAP_T:
+ case C_PAUSED_SYNC_T:
+ case C_STARTING_SYNC_T:
+ case C_WF_SYNC_UUID:
+ case C_BEHIND:
+ disk_min = D_INCONSISTENT;
+ disk_max = D_OUTDATED;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_VERIFY_S:
+ case C_VERIFY_T:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_CONNECTED:
+ disk_min = D_DISKLESS;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_DISKLESS;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_WF_BITMAP_S:
+ case C_PAUSED_SYNC_S:
+ case C_STARTING_SYNC_S:
+ case C_AHEAD:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
+ break;
+ case C_SYNC_TARGET:
+ disk_min = D_INCONSISTENT;
+ disk_max = D_INCONSISTENT;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_SYNC_SOURCE:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_INCONSISTENT;
+ break;
+ case C_STANDALONE:
+ case C_DISCONNECTING:
+ case C_UNCONNECTED:
+ case C_TIMEOUT:
+ case C_BROKEN_PIPE:
+ case C_NETWORK_FAILURE:
+ case C_PROTOCOL_ERROR:
+ case C_TEAR_DOWN:
+ case C_WF_CONNECTION:
+ case C_WF_REPORT_PARAMS:
+ case C_MASK:
+ break;
+ }
+ if (ns.disk > disk_max)
+ ns.disk = disk_max;
+
+ if (ns.disk < disk_min) {
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_DISK;
+ ns.disk = disk_min;
+ }
+ if (ns.pdsk > pdsk_max)
+ ns.pdsk = pdsk_max;
+
+ if (ns.pdsk < pdsk_min) {
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_PDSK;
+ ns.pdsk = pdsk_min;
+ }
+
+ if (fp == FP_STONITH &&
+ (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+ !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
+ ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+ if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+ !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
+ ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
+
+ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+ if (ns.conn == C_SYNC_SOURCE)
+ ns.conn = C_PAUSED_SYNC_S;
+ if (ns.conn == C_SYNC_TARGET)
+ ns.conn = C_PAUSED_SYNC_T;
+ } else {
+ if (ns.conn == C_PAUSED_SYNC_S)
+ ns.conn = C_SYNC_SOURCE;
+ if (ns.conn == C_PAUSED_SYNC_T)
+ ns.conn = C_SYNC_TARGET;
+ }
+
+ return ns;
+}
+
+void drbd_resume_al(struct drbd_device *device)
+{
+ if (test_and_clear_bit(AL_SUSPENDED, &device->flags))
+ drbd_info(device, "Resumed AL updates\n");
+}
+
+/* helper for __drbd_set_state */
+static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
+{
+ if (first_peer_device(device)->connection->agreed_pro_version < 90)
+ device->ov_start_sector = 0;
+ device->rs_total = drbd_bm_bits(device);
+ device->ov_position = 0;
+ if (cs == C_VERIFY_T) {
+ /* starting online verify from an arbitrary position
+ * does not fit well into the existing protocol.
+ * on C_VERIFY_T, we initialize ov_left and friends
+ * implicitly in receive_DataRequest once the
+ * first P_OV_REQUEST is received */
+ device->ov_start_sector = ~(sector_t)0;
+ } else {
+ unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector);
+ if (bit >= device->rs_total) {
+ device->ov_start_sector =
+ BM_BIT_TO_SECT(device->rs_total - 1);
+ device->rs_total = 1;
+ } else
+ device->rs_total -= bit;
+ device->ov_position = device->ov_start_sector;
+ }
+ device->ov_left = device->rs_total;
+}
+
+/**
+ * __drbd_set_state() - Set a new DRBD state
+ * @device: DRBD device.
+ * @ns: new state.
+ * @flags: Flags
+ * @done: Optional completion, that will get completed after the after_state_ch() finished
+ *
+ * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ */
+enum drbd_state_rv
+__drbd_set_state(struct drbd_device *device, union drbd_state ns,
+ enum chg_state_flags flags, struct completion *done)
+{
+ union drbd_state os;
+ enum drbd_state_rv rv = SS_SUCCESS;
+ enum sanitize_state_warnings ssw;
+ struct after_state_chg_work *ascw;
+ bool did_remote, should_do_remote;
+
+ os = drbd_read_state(device);
+
+ ns = sanitize_state(device, os, ns, &ssw);
+ if (ns.i == os.i)
+ return SS_NOTHING_TO_DO;
+
+ rv = is_valid_transition(os, ns);
+ if (rv < SS_SUCCESS)
+ return rv;
+
+ if (!(flags & CS_HARD)) {
+ /* pre-state-change checks ; only look at ns */
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ rv = is_valid_state(device, ns);
+ if (rv < SS_SUCCESS) {
+ /* If the old state was illegal as well, then let
+ this happen...*/
+
+ if (is_valid_state(device, os) == rv)
+ rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+ } else
+ rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+ }
+
+ if (rv < SS_SUCCESS) {
+ if (flags & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ return rv;
+ }
+
+ print_sanitize_warnings(device, ssw);
+
+ drbd_pr_state_change(device, os, ns, flags);
+
+ /* Display changes to the susp* flags that where caused by the call to
+ sanitize_state(). Only display it here if we where not called from
+ _conn_request_state() */
+ if (!(flags & CS_DC_SUSP))
+ conn_pr_state_change(first_peer_device(device)->connection, os, ns,
+ (flags & ~CS_DC_MASK) | CS_DC_SUSP);
+
+ /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+ * on the ldev here, to be sure the transition -> D_DISKLESS resp.
+ * drbd_ldev_destroy() won't happen before our corresponding
+ * after_state_ch works run, where we put_ldev again. */
+ if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
+ (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
+ atomic_inc(&device->local_cnt);
+
+ did_remote = drbd_should_do_remote(device->state);
+ device->state.i = ns.i;
+ should_do_remote = drbd_should_do_remote(device->state);
+ device->resource->susp = ns.susp;
+ device->resource->susp_nod = ns.susp_nod;
+ device->resource->susp_fen = ns.susp_fen;
+
+ /* put replicated vs not-replicated requests in seperate epochs */
+ if (did_remote != should_do_remote)
+ start_new_tl_epoch(first_peer_device(device)->connection);
+
+ if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
+ drbd_print_uuids(device, "attached to UUIDs");
+
+ /* Wake up role changes, that were delayed because of connection establishing */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
+ no_peer_wf_report_params(first_peer_device(device)->connection))
+ clear_bit(STATE_SENT, &first_peer_device(device)->connection->flags);
+
+ wake_up(&device->misc_wait);
+ wake_up(&device->state_wait);
+ wake_up(&first_peer_device(device)->connection->ping_wait);
+
+ /* Aborted verify run, or we reached the stop sector.
+ * Log the last position, unless end-of-device. */
+ if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+ ns.conn <= C_CONNECTED) {
+ device->ov_start_sector =
+ BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left);
+ if (device->ov_left)
+ drbd_info(device, "Online Verify reached sector %llu\n",
+ (unsigned long long)device->ov_start_sector);
+ }
+
+ if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+ (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
+ drbd_info(device, "Syncer continues.\n");
+ device->rs_paused += (long)jiffies
+ -(long)device->rs_mark_time[device->rs_last_mark];
+ if (ns.conn == C_SYNC_TARGET)
+ mod_timer(&device->resync_timer, jiffies);
+ }
+
+ if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
+ (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+ drbd_info(device, "Resync suspended\n");
+ device->rs_mark_time[device->rs_last_mark] = jiffies;
+ }
+
+ if (os.conn == C_CONNECTED &&
+ (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+ unsigned long now = jiffies;
+ int i;
+
+ set_ov_position(device, ns.conn);
+ device->rs_start = now;
+ device->rs_last_events = 0;
+ device->rs_last_sect_ev = 0;
+ device->ov_last_oos_size = 0;
+ device->ov_last_oos_start = 0;
+
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = device->ov_left;
+ device->rs_mark_time[i] = now;
+ }
+
+ drbd_rs_controller_reset(device);
+
+ if (ns.conn == C_VERIFY_S) {
+ drbd_info(device, "Starting Online Verify from sector %llu\n",
+ (unsigned long long)device->ov_position);
+ mod_timer(&device->resync_timer, jiffies);
+ }
+ }
+
+ if (get_ldev(device)) {
+ u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+ MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+ MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+ mdf &= ~MDF_AL_CLEAN;
+ if (test_bit(CRASHED_PRIMARY, &device->flags))
+ mdf |= MDF_CRASHED_PRIMARY;
+ if (device->state.role == R_PRIMARY ||
+ (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY))
+ mdf |= MDF_PRIMARY_IND;
+ if (device->state.conn > C_WF_REPORT_PARAMS)
+ mdf |= MDF_CONNECTED_IND;
+ if (device->state.disk > D_INCONSISTENT)
+ mdf |= MDF_CONSISTENT;
+ if (device->state.disk > D_OUTDATED)
+ mdf |= MDF_WAS_UP_TO_DATE;
+ if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT)
+ mdf |= MDF_PEER_OUT_DATED;
+ if (mdf != device->ldev->md.flags) {
+ device->ldev->md.flags = mdf;
+ drbd_md_mark_dirty(device);
+ }
+ if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+ drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]);
+ put_ldev(device);
+ }
+
+ /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+ if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+ os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+ set_bit(CONSIDER_RESYNC, &device->flags);
+
+ /* Receiver should clean up itself */
+ if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+ drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+
+ /* Now the receiver finished cleaning up itself, it should die */
+ if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+ drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+
+ /* Upon network failure, we need to restart the receiver. */
+ if (os.conn > C_WF_CONNECTION &&
+ ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+ drbd_thread_restart_nowait(&first_peer_device(device)->connection->receiver);
+
+ /* Resume AL writing if we get a connection */
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+ drbd_resume_al(device);
+ first_peer_device(device)->connection->connect_cnt++;
+ }
+
+ /* remember last attach time so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ ns.disk > D_NEGOTIATING)
+ device->last_reattach_jif = jiffies;
+
+ ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+ if (ascw) {
+ ascw->os = os;
+ ascw->ns = ns;
+ ascw->flags = flags;
+ ascw->w.cb = w_after_state_ch;
+ ascw->device = device;
+ ascw->done = done;
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &ascw->w);
+ } else {
+ drbd_err(device, "Could not kmalloc an ascw\n");
+ }
+
+ return rv;
+}
+
+static int w_after_state_ch(struct drbd_work *w, int unused)
+{
+ struct after_state_chg_work *ascw =
+ container_of(w, struct after_state_chg_work, w);
+ struct drbd_device *device = ascw->device;
+
+ after_state_ch(device, ascw->os, ascw->ns, ascw->flags);
+ if (ascw->flags & CS_WAIT_COMPLETE)
+ complete(ascw->done);
+ kfree(ascw);
+
+ return 0;
+}
+
+static void abw_start_sync(struct drbd_device *device, int rv)
+{
+ if (rv) {
+ drbd_err(device, "Writing the bitmap failed not starting resync.\n");
+ _drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE);
+ return;
+ }
+
+ switch (device->state.conn) {
+ case C_STARTING_SYNC_T:
+ _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+ break;
+ case C_STARTING_SYNC_S:
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ break;
+ }
+}
+
+int drbd_bitmap_io_from_worker(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags)
+{
+ int rv;
+
+ D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+ /* open coded non-blocking drbd_suspend_io(device); */
+ set_bit(SUSPEND_IO, &device->flags);
+
+ drbd_bm_lock(device, why, flags);
+ rv = io_fn(device);
+ drbd_bm_unlock(device);
+
+ drbd_resume_io(device);
+
+ return rv;
+}
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @device: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @flags: Flags
+ */
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags)
+{
+ struct drbd_resource *resource = device->resource;
+ struct sib_info sib;
+
+ sib.sib_reason = SIB_STATE_CHANGE;
+ sib.os = os;
+ sib.ns = ns;
+
+ if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+ clear_bit(CRASHED_PRIMARY, &device->flags);
+ if (device->p_uuid)
+ device->p_uuid[UI_FLAGS] &= ~((u64)2);
+ }
+
+ /* Inform userspace about the change... */
+ drbd_bcast_event(device, &sib);
+
+ if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+ drbd_khelper(device, "pri-on-incon-degr");
+
+ /* Here we have the actions that are performed after a
+ state change. This function might sleep */
+
+ if (ns.susp_nod) {
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ enum drbd_req_event what = NOTHING;
+
+ spin_lock_irq(&device->resource->req_lock);
+ if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED)
+ what = RESEND;
+
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ conn_lowest_disk(connection) > D_NEGOTIATING)
+ what = RESTART_FROZEN_DISK_IO;
+
+ if (resource->susp_nod && what != NOTHING) {
+ _tl_restart(connection, what);
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_nod = 1 } },
+ (union drbd_state) { { .susp_nod = 0 } },
+ CS_VERBOSE);
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+
+ if (ns.susp_fen) {
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+
+ spin_lock_irq(&device->resource->req_lock);
+ if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
+ /* case2: The connection was established again: */
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
+ rcu_read_unlock();
+ _tl_restart(connection, RESEND);
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_fen = 1 } },
+ (union drbd_state) { { .susp_fen = 0 } },
+ CS_VERBOSE);
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+
+ /* Became sync source. With protocol >= 96, we still need to send out
+ * the sync uuid now. Need to do that before any drbd_send_state, or
+ * the other side may go "paused sync" before receiving the sync uuids,
+ * which is unexpected. */
+ if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
+ (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
+ first_peer_device(device)->connection->agreed_pro_version >= 96 && get_ldev(device)) {
+ drbd_gen_and_send_sync_uuid(first_peer_device(device));
+ put_ldev(device);
+ }
+
+ /* Do not change the order of the if above and the two below... */
+ if (os.pdsk == D_DISKLESS &&
+ ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */
+ /* we probably will start a resync soon.
+ * make sure those things are properly reset. */
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ atomic_set(&device->rs_pending_cnt, 0);
+ drbd_rs_cancel_all(device);
+
+ drbd_send_uuids(first_peer_device(device));
+ drbd_send_state(first_peer_device(device), ns);
+ }
+ /* No point in queuing send_bitmap if we don't have a connection
+ * anymore, so check also the _current_ state, not only the new state
+ * at the time this work was queued. */
+ if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
+ device->state.conn == C_WF_BITMAP_S)
+ drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL,
+ "send_bitmap (WFBitMapS)",
+ BM_LOCKED_TEST_ALLOWED);
+
+ /* Lost contact to peer's copy of the data */
+ if ((os.pdsk >= D_INCONSISTENT &&
+ os.pdsk != D_UNKNOWN &&
+ os.pdsk != D_OUTDATED)
+ && (ns.pdsk < D_INCONSISTENT ||
+ ns.pdsk == D_UNKNOWN ||
+ ns.pdsk == D_OUTDATED)) {
+ if (get_ldev(device)) {
+ if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+ device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ if (drbd_suspended(device)) {
+ set_bit(NEW_CUR_UUID, &device->flags);
+ } else {
+ drbd_uuid_new_current(device);
+ drbd_send_uuids(first_peer_device(device));
+ }
+ }
+ put_ldev(device);
+ }
+ }
+
+ if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
+ if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+ device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ drbd_uuid_new_current(device);
+ drbd_send_uuids(first_peer_device(device));
+ }
+ /* D_DISKLESS Peer becomes secondary */
+ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+ /* We may still be Primary ourselves.
+ * No harm done if the bitmap still changes,
+ * redirtied pages will follow later. */
+ drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+ "demote diskless peer", BM_LOCKED_SET_ALLOWED);
+ put_ldev(device);
+ }
+
+ /* Write out all changed bits on demote.
+ * Though, no need to da that just yet
+ * if there is a resync going on still */
+ if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
+ device->state.conn <= C_CONNECTED && get_ldev(device)) {
+ /* No changes to the bitmap expected this time, so assert that,
+ * even though no harm was done if it did change. */
+ drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+ "demote", BM_LOCKED_TEST_ALLOWED);
+ put_ldev(device);
+ }
+
+ /* Last part of the attaching process ... */
+ if (ns.conn >= C_CONNECTED &&
+ os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+ drbd_send_sizes(first_peer_device(device), 0, 0); /* to start sync... */
+ drbd_send_uuids(first_peer_device(device));
+ drbd_send_state(first_peer_device(device), ns);
+ }
+
+ /* We want to pause/continue resync, tell peer. */
+ if (ns.conn >= C_CONNECTED &&
+ ((os.aftr_isp != ns.aftr_isp) ||
+ (os.user_isp != ns.user_isp)))
+ drbd_send_state(first_peer_device(device), ns);
+
+ /* In case one of the isp bits got set, suspend other devices. */
+ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+ (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+ suspend_other_sg(device);
+
+ /* Make sure the peer gets informed about eventual state
+ changes (ISP bits) while we were in WFReportParams. */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+ drbd_send_state(first_peer_device(device), ns);
+
+ if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
+ drbd_send_state(first_peer_device(device), ns);
+
+ /* We are in the progress to start a full sync... */
+ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+ /* no other bitmap changes expected during this phase */
+ drbd_queue_bitmap_io(device,
+ &drbd_bmio_set_n_write, &abw_start_sync,
+ "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
+
+ /* first half of local IO error, failure to attach,
+ * or administrative detach */
+ if (os.disk != D_FAILED && ns.disk == D_FAILED) {
+ enum drbd_io_error_p eh = EP_PASS_ON;
+ int was_io_error = 0;
+ /* corresponding get_ldev was in __drbd_set_state, to serialize
+ * our cleanup here with the transition to D_DISKLESS.
+ * But is is still not save to dreference ldev here, since
+ * we might come from an failed Attach before ldev was set. */
+ if (device->ldev) {
+ rcu_read_lock();
+ eh = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+ rcu_read_unlock();
+
+ was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
+
+ if (was_io_error && eh == EP_CALL_HELPER)
+ drbd_khelper(device, "local-io-error");
+
+ /* Immediately allow completion of all application IO,
+ * that waits for completion from the local disk,
+ * if this was a force-detach due to disk_timeout
+ * or administrator request (drbdsetup detach --force).
+ * Do NOT abort otherwise.
+ * Aborting local requests may cause serious problems,
+ * if requests are completed to upper layers already,
+ * and then later the already submitted local bio completes.
+ * This can cause DMA into former bio pages that meanwhile
+ * have been re-used for other things.
+ * So aborting local requests may cause crashes,
+ * or even worse, silent data corruption.
+ */
+ if (test_and_clear_bit(FORCE_DETACH, &device->flags))
+ tl_abort_disk_io(device);
+
+ /* current state still has to be D_FAILED,
+ * there is only one way out: to D_DISKLESS,
+ * and that may only happen after our put_ldev below. */
+ if (device->state.disk != D_FAILED)
+ drbd_err(device,
+ "ASSERT FAILED: disk is %s during detach\n",
+ drbd_disk_str(device->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(first_peer_device(device), ns);
+
+ drbd_rs_cancel_all(device);
+
+ /* In case we want to get something to stable storage still,
+ * this may be the last chance.
+ * Following put_ldev may transition to D_DISKLESS. */
+ drbd_md_sync(device);
+ }
+ put_ldev(device);
+ }
+
+ /* second half of local IO error, failure to attach,
+ * or administrative detach,
+ * after local_cnt references have reached zero again */
+ if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
+ /* We must still be diskless,
+ * re-attach has to be serialized with this! */
+ if (device->state.disk != D_DISKLESS)
+ drbd_err(device,
+ "ASSERT FAILED: disk is %s while going diskless\n",
+ drbd_disk_str(device->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(first_peer_device(device), ns);
+ /* corresponding get_ldev in __drbd_set_state
+ * this may finally trigger drbd_ldev_destroy. */
+ put_ldev(device);
+ }
+
+ /* Notify peer that I had a local IO error, and did not detached.. */
+ if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+ drbd_send_state(first_peer_device(device), ns);
+
+ /* Disks got bigger while they were detached */
+ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+ test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) {
+ if (ns.conn == C_CONNECTED)
+ resync_after_online_grow(device);
+ }
+
+ /* A resync finished or aborted, wake paused devices... */
+ if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+ (os.peer_isp && !ns.peer_isp) ||
+ (os.user_isp && !ns.user_isp))
+ resume_next_sg(device);
+
+ /* sync target done with resync. Explicitly notify peer, even though
+ * it should (at least for non-empty resyncs) already know itself. */
+ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+ drbd_send_state(first_peer_device(device), ns);
+
+ /* Verify finished, or reached stop sector. Peer did not know about
+ * the stop sector, and we may even have changed the stop sector during
+ * verify to interrupt/stop early. Send the new state. */
+ if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
+ && verify_can_do_stop_sector(device))
+ drbd_send_state(first_peer_device(device), ns);
+
+ /* This triggers bitmap writeout of potentially still unwritten pages
+ * if the resync finished cleanly, or aborted because of peer disk
+ * failure, or because of connection loss.
+ * For resync aborted because of local disk failure, we cannot do
+ * any bitmap writeout anymore.
+ * No harm done if some bits change during this phase.
+ */
+ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) {
+ drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
+ "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
+ put_ldev(device);
+ }
+
+ if (ns.disk == D_DISKLESS &&
+ ns.conn == C_STANDALONE &&
+ ns.role == R_SECONDARY) {
+ if (os.aftr_isp != ns.aftr_isp)
+ resume_next_sg(device);
+ }
+
+ drbd_md_sync(device);
+}
+
+struct after_conn_state_chg_work {
+ struct drbd_work w;
+ enum drbd_conns oc;
+ union drbd_state ns_min;
+ union drbd_state ns_max; /* new, max state, over all devices */
+ enum chg_state_flags flags;
+ struct drbd_connection *connection;
+};
+
+static int w_after_conn_state_ch(struct drbd_work *w, int unused)
+{
+ struct after_conn_state_chg_work *acscw =
+ container_of(w, struct after_conn_state_chg_work, w);
+ struct drbd_connection *connection = acscw->connection;
+ enum drbd_conns oc = acscw->oc;
+ union drbd_state ns_max = acscw->ns_max;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ kfree(acscw);
+
+ /* Upon network configuration, we need to start the receiver */
+ if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED)
+ drbd_thread_start(&connection->receiver);
+
+ if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
+ struct net_conf *old_conf;
+
+ mutex_lock(&connection->resource->conf_update);
+ old_conf = connection->net_conf;
+ connection->my_addr_len = 0;
+ connection->peer_addr_len = 0;
+ rcu_assign_pointer(connection->net_conf, NULL);
+ conn_free_crypto(connection);
+ mutex_unlock(&connection->resource->conf_update);
+
+ synchronize_rcu();
+ kfree(old_conf);
+ }
+
+ if (ns_max.susp_fen) {
+ /* case1: The outdate peer handler is successful: */
+ if (ns_max.pdsk <= D_OUTDATED) {
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (test_bit(NEW_CUR_UUID, &device->flags)) {
+ drbd_uuid_new_current(device);
+ clear_bit(NEW_CUR_UUID, &device->flags);
+ }
+ }
+ rcu_read_unlock();
+ spin_lock_irq(&connection->resource->req_lock);
+ _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_fen = 1 } },
+ (union drbd_state) { { .susp_fen = 0 } },
+ CS_VERBOSE);
+ spin_unlock_irq(&connection->resource->req_lock);
+ }
+ }
+ kref_put(&connection->kref, drbd_destroy_connection);
+
+ conn_md_sync(connection);
+
+ return 0;
+}
+
+void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+{
+ enum chg_state_flags flags = ~0;
+ struct drbd_peer_device *peer_device;
+ int vnr, first_vol = 1;
+ union drbd_dev_state os, cs = {
+ { .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = connection->cstate,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN,
+ } };
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ os = device->state;
+
+ if (first_vol) {
+ cs = os;
+ first_vol = 0;
+ continue;
+ }
+
+ if (cs.role != os.role)
+ flags &= ~CS_DC_ROLE;
+
+ if (cs.peer != os.peer)
+ flags &= ~CS_DC_PEER;
+
+ if (cs.conn != os.conn)
+ flags &= ~CS_DC_CONN;
+
+ if (cs.disk != os.disk)
+ flags &= ~CS_DC_DISK;
+
+ if (cs.pdsk != os.pdsk)
+ flags &= ~CS_DC_PDSK;
+ }
+ rcu_read_unlock();
+
+ *pf |= CS_DC_MASK;
+ *pf &= flags;
+ (*pcs).i = cs.i;
+}
+
+static enum drbd_state_rv
+conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+ union drbd_state ns, os;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ os = drbd_read_state(device);
+ ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+
+ if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+ ns.disk = os.disk;
+
+ if (ns.i == os.i)
+ continue;
+
+ rv = is_valid_transition(os, ns);
+
+ if (rv >= SS_SUCCESS && !(flags & CS_HARD)) {
+ rv = is_valid_state(device, ns);
+ if (rv < SS_SUCCESS) {
+ if (is_valid_state(device, os) == rv)
+ rv = is_valid_soft_transition(os, ns, connection);
+ } else
+ rv = is_valid_soft_transition(os, ns, connection);
+ }
+
+ if (rv < SS_SUCCESS) {
+ if (flags & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+void
+conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
+{
+ union drbd_state ns, os, ns_max = { };
+ union drbd_state ns_min = {
+ { .role = R_MASK,
+ .peer = R_MASK,
+ .conn = val.conn,
+ .disk = D_MASK,
+ .pdsk = D_MASK
+ } };
+ struct drbd_peer_device *peer_device;
+ enum drbd_state_rv rv;
+ int vnr, number_of_volumes = 0;
+
+ if (mask.conn == C_MASK) {
+ /* remember last connect time so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if (connection->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS)
+ connection->last_reconnect_jif = jiffies;
+
+ connection->cstate = val.conn;
+ }
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ number_of_volumes++;
+ os = drbd_read_state(device);
+ ns = apply_mask_val(os, mask, val);
+ ns = sanitize_state(device, os, ns, NULL);
+
+ if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+ ns.disk = os.disk;
+
+ rv = __drbd_set_state(device, ns, flags, NULL);
+ if (rv < SS_SUCCESS)
+ BUG();
+
+ ns.i = device->state.i;
+ ns_max.role = max_role(ns.role, ns_max.role);
+ ns_max.peer = max_role(ns.peer, ns_max.peer);
+ ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn);
+ ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk);
+ ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk);
+
+ ns_min.role = min_role(ns.role, ns_min.role);
+ ns_min.peer = min_role(ns.peer, ns_min.peer);
+ ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn);
+ ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk);
+ ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk);
+ }
+ rcu_read_unlock();
+
+ if (number_of_volumes == 0) {
+ ns_min = ns_max = (union drbd_state) { {
+ .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = val.conn,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN
+ } };
+ }
+
+ ns_min.susp = ns_max.susp = connection->resource->susp;
+ ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod;
+ ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen;
+
+ *pns_min = ns_min;
+ *pns_max = ns_max;
+}
+
+static enum drbd_state_rv
+_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+ enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */;
+
+ if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags))
+ rv = SS_CW_SUCCESS;
+
+ if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags))
+ rv = SS_CW_FAILED_BY_PEER;
+
+ err = conn_is_valid_transition(connection, mask, val, 0);
+ if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
+ return rv;
+
+ return err;
+}
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+ struct after_conn_state_chg_work *acscw;
+ enum drbd_conns oc = connection->cstate;
+ union drbd_state ns_max, ns_min, os;
+ bool have_mutex = false;
+
+ if (mask.conn) {
+ rv = is_valid_conn_transition(oc, val.conn);
+ if (rv < SS_SUCCESS)
+ goto abort;
+ }
+
+ rv = conn_is_valid_transition(connection, mask, val, flags);
+ if (rv < SS_SUCCESS)
+ goto abort;
+
+ if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING &&
+ !(flags & (CS_LOCAL_ONLY | CS_HARD))) {
+
+ /* This will be a cluster-wide state change.
+ * Need to give up the spinlock, grab the mutex,
+ * then send the state change request, ... */
+ spin_unlock_irq(&connection->resource->req_lock);
+ mutex_lock(&connection->cstate_mutex);
+ have_mutex = true;
+
+ set_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+ if (conn_send_state_req(connection, mask, val)) {
+ /* sending failed. */
+ clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+ rv = SS_CW_FAILED_BY_PEER;
+ /* need to re-aquire the spin lock, though */
+ goto abort_unlocked;
+ }
+
+ if (val.conn == C_DISCONNECTING)
+ set_bit(DISCONNECT_SENT, &connection->flags);
+
+ /* ... and re-aquire the spinlock.
+ * If _conn_rq_cond() returned >= SS_SUCCESS, we must call
+ * conn_set_state() within the same spinlock. */
+ spin_lock_irq(&connection->resource->req_lock);
+ wait_event_lock_irq(connection->ping_wait,
+ (rv = _conn_rq_cond(connection, mask, val)),
+ connection->resource->req_lock);
+ clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+ if (rv < SS_SUCCESS)
+ goto abort;
+ }
+
+ conn_old_common_state(connection, &os, &flags);
+ flags |= CS_DC_SUSP;
+ conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
+ conn_pr_state_change(connection, os, ns_max, flags);
+
+ acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
+ if (acscw) {
+ acscw->oc = os.conn;
+ acscw->ns_min = ns_min;
+ acscw->ns_max = ns_max;
+ acscw->flags = flags;
+ acscw->w.cb = w_after_conn_state_ch;
+ kref_get(&connection->kref);
+ acscw->connection = connection;
+ drbd_queue_work(&connection->sender_work, &acscw->w);
+ } else {
+ drbd_err(connection, "Could not kmalloc an acscw\n");
+ }
+
+ abort:
+ if (have_mutex) {
+ /* mutex_unlock() "... must not be used in interrupt context.",
+ * so give up the spinlock, then re-aquire it */
+ spin_unlock_irq(&connection->resource->req_lock);
+ abort_unlocked:
+ mutex_unlock(&connection->cstate_mutex);
+ spin_lock_irq(&connection->resource->req_lock);
+ }
+ if (rv < SS_SUCCESS && flags & CS_VERBOSE) {
+ drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv));
+ drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i);
+ drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn));
+ }
+ return rv;
+}
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv;
+
+ spin_lock_irq(&connection->resource->req_lock);
+ rv = _conn_request_state(connection, mask, val, flags);
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ return rv;
+}
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
new file mode 100644
index 00000000000..cc41605ba21
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.h
@@ -0,0 +1,161 @@
+#ifndef DRBD_STATE_H
+#define DRBD_STATE_H
+
+struct drbd_device;
+struct drbd_connection;
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value, that can be bit ored onto the
+ * current state as soon as the spinlock (req_lock) was taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * Means that the network connection was established and that the peer
+ * is in secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
+
+#define NS(T, S) \
+ ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val.T3 = (S3); val; })
+
+#define _NS(D, T, S) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+
+enum chg_state_flags {
+ CS_HARD = 1 << 0,
+ CS_VERBOSE = 1 << 1,
+ CS_WAIT_COMPLETE = 1 << 2,
+ CS_SERIALIZE = 1 << 3,
+ CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
+ CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */
+ CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */
+ CS_DC_PEER = 1 << 6,
+ CS_DC_CONN = 1 << 7,
+ CS_DC_DISK = 1 << 8,
+ CS_DC_PDSK = 1 << 9,
+ CS_DC_SUSP = 1 << 10,
+ CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
+ CS_IGN_OUTD_FAIL = 1 << 11,
+};
+
+/* drbd_dev_state and drbd_state are different types. This is to stress the
+ small difference. There is no suspended flag (.susp), and no suspended
+ while fence handler runs flas (susp_fen). */
+union drbd_dev_state {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned _unused:1 ;
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned peer_isp:1 ;
+ unsigned user_isp:1 ;
+ unsigned _pad:11; /* 0 unused */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ unsigned _pad:11;
+ unsigned user_isp:1 ;
+ unsigned peer_isp:1 ;
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned _unused:1 ;
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+#else
+# error "this endianess is not supported"
+#endif
+ };
+ unsigned int i;
+};
+
+extern enum drbd_state_rv drbd_change_state(struct drbd_device *device,
+ enum chg_state_flags f,
+ union drbd_state mask,
+ union drbd_state val);
+extern void drbd_force_state(struct drbd_device *, union drbd_state,
+ union drbd_state);
+extern enum drbd_state_rv _drbd_request_state(struct drbd_device *,
+ union drbd_state,
+ union drbd_state,
+ enum chg_state_flags);
+extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state,
+ enum chg_state_flags,
+ struct completion *done);
+extern void print_st_err(struct drbd_device *, union drbd_state,
+ union drbd_state, int);
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags);
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags);
+
+extern void drbd_resume_al(struct drbd_device *device);
+extern bool conn_all_vols_unconf(struct drbd_connection *connection);
+
+/**
+ * drbd_request_state() - Reqest a state change
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It is verbose
+ * quite verbose in case the state change is not possible, and all those
+ * state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_device *device,
+ union drbd_state mask,
+ union drbd_state val)
+{
+ return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED);
+}
+
+enum drbd_role conn_highest_role(struct drbd_connection *connection);
+enum drbd_role conn_highest_peer(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection);
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection);
+
+#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 85179e1fb50..80b0f63c707 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -24,6 +24,7 @@
*/
#include <linux/drbd.h>
+#include "drbd_strings.h"
static const char *drbd_conn_s_names[] = {
[C_STANDALONE] = "StandAlone",
@@ -48,6 +49,8 @@ static const char *drbd_conn_s_names[] = {
[C_PAUSED_SYNC_T] = "PausedSyncT",
[C_VERIFY_S] = "VerifyS",
[C_VERIFY_T] = "VerifyT",
+ [C_AHEAD] = "Ahead",
+ [C_BEHIND] = "Behind",
};
static const char *drbd_role_s_names[] = {
@@ -87,12 +90,14 @@ static const char *drbd_state_sw_errors[] = {
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+ [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
+ [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
};
const char *drbd_conn_str(enum drbd_conns s)
{
/* enums are unsigned... */
- return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
+ return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s];
}
const char *drbd_role_str(enum drbd_role s)
@@ -105,7 +110,7 @@ const char *drbd_disk_str(enum drbd_disk_state s)
return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
}
-const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
+const char *drbd_set_st_err_str(enum drbd_state_rv err)
{
return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
err > SS_TWO_PRIMARIES ? "TOO_LARGE"
diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h
new file mode 100644
index 00000000000..f9923cc88af
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.h
@@ -0,0 +1,9 @@
+#ifndef __DRBD_STRINGS_H
+#define __DRBD_STRINGS_H
+
+extern const char *drbd_conn_str(enum drbd_conns);
+extern const char *drbd_role_str(enum drbd_role);
+extern const char *drbd_disk_str(enum drbd_disk_state);
+extern const char *drbd_set_st_err_str(enum drbd_state_rv);
+
+#endif /* __DRBD_STRINGS_H */
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
index fc824006e72..8cb1532a381 100644
--- a/drivers/block/drbd/drbd_vli.h
+++ b/drivers/block/drbd/drbd_vli.h
@@ -32,7 +32,7 @@
* the bitmap transfer time can take much too long,
* if transmitted in plain text.
*
- * We try to reduce the transfered bitmap information
+ * We try to reduce the transferred bitmap information
* by encoding runlengths of bit polarity.
*
* We never actually need to encode a "zero" (runlengths are positive).
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 34f224b018b..d8f57b6305c 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -21,7 +21,7 @@
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
+*/
#include <linux/module.h>
#include <linux/drbd.h>
@@ -36,21 +36,18 @@
#include <linux/scatterlist.h>
#include "drbd_int.h"
+#include "drbd_protocol.h"
#include "drbd_req.h"
-static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
-
-
-
-/* defined here:
- drbd_md_io_complete
- drbd_endio_sec
- drbd_endio_pri
-
- * more endio handlers:
- atodb_endio in drbd_actlog.c
- drbd_bm_async_io_complete in drbd_bitmap.c
+static int make_ov_request(struct drbd_device *, int);
+static int make_resync_request(struct drbd_device *, int);
+/* endio handlers:
+ * drbd_md_io_complete (defined here)
+ * drbd_request_endio (defined here)
+ * drbd_peer_request_endio (defined here)
+ * bm_async_io_complete (defined in drbd_bitmap.c)
+ *
* For all these callbacks, note the following:
* The callbacks will be called in irq context by the IDE drivers,
* and in Softirqs/Tasklets/BH context by the SCSI drivers.
@@ -61,7 +58,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
/* About the global_state_lock
Each state transition on an device holds a read lock. In case we have
- to evaluate the sync after dependencies, we grab a write lock, because
+ to evaluate the resync after dependencies, we grab a write lock, because
we need stable states on all devices for that. */
rwlock_t global_state_lock;
@@ -71,106 +68,128 @@ rwlock_t global_state_lock;
void drbd_md_io_complete(struct bio *bio, int error)
{
struct drbd_md_io *md_io;
+ struct drbd_device *device;
md_io = (struct drbd_md_io *)bio->bi_private;
+ device = container_of(md_io, struct drbd_device, md_io);
+
md_io->error = error;
- complete(&md_io->event);
+ /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+ * to timeout on the lower level device, and eventually detach from it.
+ * If this io completion runs after that timeout expired, this
+ * drbd_md_put_buffer() may allow us to finally try and re-attach.
+ * During normal operation, this only puts that extra reference
+ * down to 1 again.
+ * Make sure we first drop the reference, and only then signal
+ * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+ * next drbd_md_sync_page_io(), that we trigger the
+ * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
+ */
+ drbd_md_put_buffer(device);
+ md_io->done = 1;
+ wake_up(&device->misc_wait);
+ bio_put(bio);
+ if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
+ put_ldev(device);
}
/* reads on behalf of the partner,
* "submitted" by the receiver
*/
-void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
+static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
unsigned long flags = 0;
- struct drbd_conf *mdev = e->mdev;
-
- D_ASSERT(e->block_id != ID_VACANT);
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- mdev->read_cnt += e->size >> 9;
- list_del(&e->w.list);
- if (list_empty(&mdev->read_ee))
- wake_up(&mdev->ee_wait);
- if (test_bit(__EE_WAS_ERROR, &e->flags))
- __drbd_chk_io_error(mdev, FALSE);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- drbd_queue_work(&mdev->data.work, &e->w);
- put_ldev(mdev);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ device->read_cnt += peer_req->i.size >> 9;
+ list_del(&peer_req->w.list);
+ if (list_empty(&device->read_ee))
+ wake_up(&device->ee_wait);
+ if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+ __drbd_chk_io_error(device, DRBD_READ_ERROR);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
+ put_ldev(device);
}
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver, final stage. */
-static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
unsigned long flags = 0;
- struct drbd_conf *mdev = e->mdev;
- sector_t e_sector;
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_interval i;
int do_wake;
- int is_syncer_req;
+ u64 block_id;
int do_al_complete_io;
- D_ASSERT(e->block_id != ID_VACANT);
-
- /* after we moved e to done_ee,
+ /* after we moved peer_req to done_ee,
* we may no longer access it,
* it may be freed/reused already!
* (as soon as we release the req_lock) */
- e_sector = e->sector;
- do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
- is_syncer_req = is_syncer_block_id(e->block_id);
+ i = peer_req->i;
+ do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
+ block_id = peer_req->block_id;
- spin_lock_irqsave(&mdev->req_lock, flags);
- mdev->writ_cnt += e->size >> 9;
- list_del(&e->w.list); /* has been on active_ee or sync_ee */
- list_add_tail(&e->w.list, &mdev->done_ee);
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ device->writ_cnt += peer_req->i.size >> 9;
+ list_move_tail(&peer_req->w.list, &device->done_ee);
- /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
- * neither did we wake possibly waiting conflicting requests.
- * done from "drbd_process_done_ee" within the appropriate w.cb
- * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
+ /*
+ * Do not remove from the write_requests tree here: we did not send the
+ * Ack yet and did not wake possibly waiting conflicting requests.
+ * Removed from the tree from "drbd_process_done_ee" within the
+ * appropriate dw.cb (e_end_block/e_end_resync_block) or from
+ * _drbd_clear_done_ee.
+ */
- do_wake = is_syncer_req
- ? list_empty(&mdev->sync_ee)
- : list_empty(&mdev->active_ee);
+ do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
- if (test_bit(__EE_WAS_ERROR, &e->flags))
- __drbd_chk_io_error(mdev, FALSE);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ /* FIXME do we want to detach for failed REQ_DISCARD?
+ * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
+ if (peer_req->flags & EE_WAS_ERROR)
+ __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
- if (is_syncer_req)
- drbd_rs_complete_io(mdev, e_sector);
+ if (block_id == ID_SYNCER)
+ drbd_rs_complete_io(device, i.sector);
if (do_wake)
- wake_up(&mdev->ee_wait);
+ wake_up(&device->ee_wait);
if (do_al_complete_io)
- drbd_al_complete_io(mdev, e_sector);
+ drbd_al_complete_io(device, &i);
- wake_asender(mdev);
- put_ldev(mdev);
+ wake_asender(peer_device->connection);
+ put_ldev(device);
}
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver.
*/
-void drbd_endio_sec(struct bio *bio, int error)
+void drbd_peer_request_endio(struct bio *bio, int error)
{
- struct drbd_epoch_entry *e = bio->bi_private;
- struct drbd_conf *mdev = e->mdev;
+ struct drbd_peer_request *peer_req = bio->bi_private;
+ struct drbd_device *device = peer_req->peer_device->device;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
int is_write = bio_data_dir(bio) == WRITE;
+ int is_discard = !!(bio->bi_rw & REQ_DISCARD);
- if (error)
- dev_warn(DEV, "%s: error=%d s=%llus\n",
- is_write ? "write" : "read", error,
- (unsigned long long)e->sector);
+ if (error && __ratelimit(&drbd_ratelimit_state))
+ drbd_warn(device, "%s: error=%d s=%llus\n",
+ is_write ? (is_discard ? "discard" : "write")
+ : "read", error,
+ (unsigned long long)peer_req->i.sector);
if (!error && !uptodate) {
- dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
- is_write ? "write" : "read",
- (unsigned long long)e->sector);
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
+ is_write ? "write" : "read",
+ (unsigned long long)peer_req->i.sector);
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
* but do not return any error?! */
@@ -178,30 +197,30 @@ void drbd_endio_sec(struct bio *bio, int error)
}
if (error)
- set_bit(__EE_WAS_ERROR, &e->flags);
+ set_bit(__EE_WAS_ERROR, &peer_req->flags);
bio_put(bio); /* no need for the bio anymore */
- if (atomic_dec_and_test(&e->pending_bios)) {
+ if (atomic_dec_and_test(&peer_req->pending_bios)) {
if (is_write)
- drbd_endio_write_sec_final(e);
+ drbd_endio_write_sec_final(peer_req);
else
- drbd_endio_read_sec_final(e);
+ drbd_endio_read_sec_final(peer_req);
}
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
*/
-void drbd_endio_pri(struct bio *bio, int error)
+void drbd_request_endio(struct bio *bio, int error)
{
unsigned long flags;
struct drbd_request *req = bio->bi_private;
- struct drbd_conf *mdev = req->mdev;
+ struct drbd_device *device = req->device;
struct bio_and_error m;
enum drbd_req_event what;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
if (!error && !uptodate) {
- dev_warn(DEV, "p %s: setting error to -EIO\n",
+ drbd_warn(device, "p %s: setting error to -EIO\n",
bio_data_dir(bio) == WRITE ? "write" : "read");
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
@@ -209,59 +228,76 @@ void drbd_endio_pri(struct bio *bio, int error)
error = -EIO;
}
+
+ /* If this request was aborted locally before,
+ * but now was completed "successfully",
+ * chances are that this caused arbitrary data corruption.
+ *
+ * "aborting" requests, or force-detaching the disk, is intended for
+ * completely blocked/hung local backing devices which do no longer
+ * complete requests at all, not even do error completions. In this
+ * situation, usually a hard-reset and failover is the only way out.
+ *
+ * By "aborting", basically faking a local error-completion,
+ * we allow for a more graceful swichover by cleanly migrating services.
+ * Still the affected node has to be rebooted "soon".
+ *
+ * By completing these requests, we allow the upper layers to re-use
+ * the associated data pages.
+ *
+ * If later the local backing device "recovers", and now DMAs some data
+ * from disk into the original request pages, in the best case it will
+ * just put random data into unused pages; but typically it will corrupt
+ * meanwhile completely unrelated data, causing all sorts of damage.
+ *
+ * Which means delayed successful completion,
+ * especially for READ requests,
+ * is a reason to panic().
+ *
+ * We assume that a delayed *error* completion is OK,
+ * though we still will complain noisily about it.
+ */
+ if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
+
+ if (!error)
+ panic("possible random memory corruption caused by delayed completion of aborted local request\n");
+ }
+
/* to avoid recursion in __req_mod */
if (unlikely(error)) {
- what = (bio_data_dir(bio) == WRITE)
- ? write_completed_with_error
+ if (bio->bi_rw & REQ_DISCARD)
+ what = (error == -EOPNOTSUPP)
+ ? DISCARD_COMPLETED_NOTSUPP
+ : DISCARD_COMPLETED_WITH_ERROR;
+ else
+ what = (bio_data_dir(bio) == WRITE)
+ ? WRITE_COMPLETED_WITH_ERROR
: (bio_rw(bio) == READ)
- ? read_completed_with_error
- : read_ahead_completed_with_error;
+ ? READ_COMPLETED_WITH_ERROR
+ : READ_AHEAD_COMPLETED_WITH_ERROR;
} else
- what = completed_ok;
+ what = COMPLETED_OK;
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
/* not req_mod(), we need irqsave here! */
- spin_lock_irqsave(&mdev->req_lock, flags);
+ spin_lock_irqsave(&device->resource->req_lock, flags);
__req_mod(req, what, &m);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ put_ldev(device);
if (m.bio)
- complete_master_bio(mdev, &m);
-}
-
-int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
- struct drbd_request *req = container_of(w, struct drbd_request, w);
-
- /* We should not detach for read io-error,
- * but try to WRITE the P_DATA_REPLY to the failed location,
- * to give the disk the chance to relocate that block */
-
- spin_lock_irq(&mdev->req_lock);
- if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
- _req_mod(req, read_retry_remote_canceled);
- spin_unlock_irq(&mdev->req_lock);
- return 1;
- }
- spin_unlock_irq(&mdev->req_lock);
-
- return w_send_read_req(mdev, w, 0);
-}
-
-int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
- ERR_IF(cancel) return 1;
- dev_err(DEV, "resync inactive, but callback triggered??\n");
- return 1; /* Simply ignore this! */
+ complete_master_bio(device, &m);
}
-void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
+void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
{
struct hash_desc desc;
struct scatterlist sg;
- struct page *page = e->pages;
+ struct page *page = peer_req->pages;
struct page *tmp;
unsigned len;
@@ -278,18 +314,18 @@ void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_e
page = tmp;
}
/* and now the last, possibly only partially used page */
- len = e->size & (PAGE_SIZE - 1);
+ len = peer_req->i.size & (PAGE_SIZE - 1);
sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
crypto_hash_update(&desc, &sg, sg.length);
crypto_hash_final(&desc, digest);
}
-void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
+void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
{
struct hash_desc desc;
struct scatterlist sg;
- struct bio_vec *bvec;
- int i;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
desc.tfm = tfm;
desc.flags = 0;
@@ -297,116 +333,128 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
sg_init_table(&sg, 1);
crypto_hash_init(&desc);
- __bio_for_each_segment(bvec, bio, i, 0) {
- sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
+ bio_for_each_segment(bvec, bio, iter) {
+ sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
crypto_hash_update(&desc, &sg, sg.length);
}
crypto_hash_final(&desc, digest);
}
-static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+/* MAYBE merge common code with w_e_end_ov_req */
+static int w_e_send_csum(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
int digest_size;
void *digest;
- int ok;
+ int err = 0;
- D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
+ if (unlikely(cancel))
+ goto out;
- if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
- return 1;
- }
+ if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
+ goto out;
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- digest_size = crypto_hash_digestsize(mdev->csums_tfm);
- digest = kmalloc(digest_size, GFP_NOIO);
- if (digest) {
- drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
-
- inc_rs_pending(mdev);
- ok = drbd_send_drequest_csum(mdev,
- e->sector,
- e->size,
- digest,
- digest_size,
- P_CSUM_RS_REQUEST);
- kfree(digest);
- } else {
- dev_err(DEV, "kmalloc() of digest failed.\n");
- ok = 0;
- }
- } else
- ok = 1;
+ digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ if (digest) {
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+ /* Free peer_req and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(device, peer_req);
+ peer_req = NULL;
+ inc_rs_pending(device);
+ err = drbd_send_drequest_csum(peer_device, sector, size,
+ digest, digest_size,
+ P_CSUM_RS_REQUEST);
+ kfree(digest);
+ } else {
+ drbd_err(device, "kmalloc() of digest failed.\n");
+ err = -ENOMEM;
+ }
- drbd_free_ee(mdev, e);
+out:
+ if (peer_req)
+ drbd_free_peer_req(device, peer_req);
- if (unlikely(!ok))
- dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
- return ok;
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
+ return err;
}
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
-static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
+static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
{
- struct drbd_epoch_entry *e;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_request *peer_req;
- if (!get_ldev(mdev))
+ if (!get_ldev(device))
return -EIO;
- if (drbd_rs_should_slow_down(mdev))
+ if (drbd_rs_should_slow_down(device, sector))
goto defer;
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
- e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
- if (!e)
+ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
+ size, true /* has real payload */, GFP_TRY);
+ if (!peer_req)
goto defer;
- e->w.cb = w_e_send_csum;
- spin_lock_irq(&mdev->req_lock);
- list_add(&e->w.list, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ peer_req->w.cb = w_e_send_csum;
+ spin_lock_irq(&device->resource->req_lock);
+ list_add(&peer_req->w.list, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
- atomic_add(size >> 9, &mdev->rs_sect_ev);
- if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
+ atomic_add(size >> 9, &device->rs_sect_ev);
+ if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
return 0;
- /* drbd_submit_ee currently fails for one reason only:
- * not being able to allocate enough bios.
- * Is dropping the connection going to help? */
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- spin_unlock_irq(&mdev->req_lock);
+ /* If it failed because of ENOMEM, retry should help. If it failed
+ * because bio_add_page failed (probably broken lower level driver),
+ * retry may or may not help.
+ * If it does not, you may need to force disconnect. */
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(device, peer_req);
defer:
- put_ldev(mdev);
+ put_ldev(device);
return -EAGAIN;
}
-void resync_timer_fn(unsigned long data)
+int w_resync_timer(struct drbd_work *w, int cancel)
{
- struct drbd_conf *mdev = (struct drbd_conf *) data;
- int queue;
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, resync_work);
- queue = 1;
- switch (mdev->state.conn) {
+ switch (device->state.conn) {
case C_VERIFY_S:
- mdev->resync_work.cb = w_make_ov_request;
+ make_ov_request(device, cancel);
break;
case C_SYNC_TARGET:
- mdev->resync_work.cb = w_make_resync_request;
+ make_resync_request(device, cancel);
break;
- default:
- queue = 0;
- mdev->resync_work.cb = w_resync_inactive;
}
- /* harmless race: list_empty outside data.work.q_lock */
- if (list_empty(&mdev->resync_work.list) && queue)
- drbd_queue_work(&mdev->data.work, &mdev->resync_work);
+ return 0;
+}
+
+void resync_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+
+ if (list_empty(&device->resync_work.list))
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &device->resync_work);
}
static void fifo_set(struct fifo_buffer *fb, int value)
@@ -438,9 +486,24 @@ static void fifo_add_val(struct fifo_buffer *fb, int value)
fb->values[i] += value;
}
-int drbd_rs_controller(struct drbd_conf *mdev)
+struct fifo_buffer *fifo_alloc(int fifo_size)
{
- unsigned int sect_in; /* Number of sectors that came in since the last turn */
+ struct fifo_buffer *fb;
+
+ fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
+ if (!fb)
+ return NULL;
+
+ fb->head_index = 0;
+ fb->size = fifo_size;
+ fb->total = 0;
+
+ return fb;
+}
+
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
+{
+ struct disk_conf *dc;
unsigned int want; /* The number of sectors we want in the proxy */
int req_sect; /* Number of sectors to request in this turn */
int correction; /* Number of sectors more we need in the proxy*/
@@ -448,167 +511,150 @@ int drbd_rs_controller(struct drbd_conf *mdev)
int steps; /* Number of time steps to plan ahead */
int curr_corr;
int max_sect;
+ struct fifo_buffer *plan;
- sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
- mdev->rs_in_flight -= sect_in;
-
- spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
+ dc = rcu_dereference(device->ldev->disk_conf);
+ plan = rcu_dereference(device->rs_plan_s);
- steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+ steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
- if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
- want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
+ if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
+ want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
} else { /* normal path */
- want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
- sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
+ want = dc->c_fill_target ? dc->c_fill_target :
+ sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
}
- correction = want - mdev->rs_in_flight - mdev->rs_planed;
+ correction = want - device->rs_in_flight - plan->total;
/* Plan ahead */
cps = correction / steps;
- fifo_add_val(&mdev->rs_plan_s, cps);
- mdev->rs_planed += cps * steps;
+ fifo_add_val(plan, cps);
+ plan->total += cps * steps;
/* What we do in this step */
- curr_corr = fifo_push(&mdev->rs_plan_s, 0);
- spin_unlock(&mdev->peer_seq_lock);
- mdev->rs_planed -= curr_corr;
+ curr_corr = fifo_push(plan, 0);
+ plan->total -= curr_corr;
req_sect = sect_in + curr_corr;
if (req_sect < 0)
req_sect = 0;
- max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
+ max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
if (req_sect > max_sect)
req_sect = max_sect;
/*
- dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
- sect_in, mdev->rs_in_flight, want, correction,
- steps, cps, mdev->rs_planed, curr_corr, req_sect);
+ drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+ sect_in, device->rs_in_flight, want, correction,
+ steps, cps, device->rs_planed, curr_corr, req_sect);
*/
return req_sect;
}
-int w_make_resync_request(struct drbd_conf *mdev,
- struct drbd_work *w, int cancel)
+static int drbd_rs_number_requests(struct drbd_device *device)
+{
+ unsigned int sect_in; /* Number of sectors that came in since the last turn */
+ int number, mxb;
+
+ sect_in = atomic_xchg(&device->rs_sect_in, 0);
+ device->rs_in_flight -= sect_in;
+
+ rcu_read_lock();
+ mxb = drbd_get_max_buffers(device) / 2;
+ if (rcu_dereference(device->rs_plan_s)->size) {
+ number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
+ device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+ } else {
+ device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
+ number = SLEEP_TIME * device->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
+ }
+ rcu_read_unlock();
+
+ /* Don't have more than "max-buffers"/2 in-flight.
+ * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+ * potentially causing a distributed deadlock on congestion during
+ * online-verify or (checksum-based) resync, if max-buffers,
+ * socket buffer sizes and resync rate settings are mis-configured. */
+ if (mxb - device->rs_in_flight < number)
+ number = mxb - device->rs_in_flight;
+
+ return number;
+}
+
+static int make_resync_request(struct drbd_device *device, int cancel)
{
unsigned long bit;
sector_t sector;
- const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
- int max_segment_size;
- int number, rollback_i, size, pe, mx;
+ const sector_t capacity = drbd_get_capacity(device->this_bdev);
+ int max_bio_size;
+ int number, rollback_i, size;
int align, queued, sndbuf;
int i = 0;
if (unlikely(cancel))
- return 1;
-
- if (unlikely(mdev->state.conn < C_CONNECTED)) {
- dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
return 0;
- }
-
- if (mdev->state.conn != C_SYNC_TARGET)
- dev_err(DEV, "%s in w_make_resync_request\n",
- drbd_conn_str(mdev->state.conn));
- if (mdev->rs_total == 0) {
+ if (device->rs_total == 0) {
/* empty resync? */
- drbd_resync_finished(mdev);
- return 1;
+ drbd_resync_finished(device);
+ return 0;
}
- if (!get_ldev(mdev)) {
- /* Since we only need to access mdev->rsync a
- get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
+ if (!get_ldev(device)) {
+ /* Since we only need to access device->rsync a
+ get_ldev_if_state(device,D_FAILED) would be sufficient, but
to continue resync with a broken disk makes no sense at
all */
- dev_err(DEV, "Disk broke down during resync!\n");
- mdev->resync_work.cb = w_resync_inactive;
- return 1;
- }
-
- /* starting with drbd 8.3.8, we can handle multi-bio EEs,
- * if it should be necessary */
- max_segment_size =
- mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) :
- mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE;
-
- if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
- number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
- mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
- } else {
- mdev->c_sync_rate = mdev->sync_conf.rate;
- number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
+ drbd_err(device, "Disk broke down during resync!\n");
+ return 0;
}
- /* Throttle resync on lower level disk activity, which may also be
- * caused by application IO on Primary/SyncTarget.
- * Keep this after the call to drbd_rs_controller, as that assumes
- * to be called as precisely as possible every SLEEP_TIME,
- * and would be confused otherwise. */
- if (drbd_rs_should_slow_down(mdev))
+ max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
+ number = drbd_rs_number_requests(device);
+ if (number <= 0)
goto requeue;
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket)
- mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
- else
- mx = 1;
- mutex_unlock(&mdev->data.mutex);
-
- /* For resync rates >160MB/sec, allow more pending RS requests */
- if (number > mx)
- mx = number;
-
- /* Limit the number of pending RS requests to no more than the peer's receive buffer */
- pe = atomic_read(&mdev->rs_pending_cnt);
- if ((pe + number) > mx) {
- number = mx - pe;
- }
-
for (i = 0; i < number; i++) {
/* Stop generating RS requests, when half of the send buffer is filled */
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket) {
- queued = mdev->data.socket->sk->sk_wmem_queued;
- sndbuf = mdev->data.socket->sk->sk_sndbuf;
+ mutex_lock(&first_peer_device(device)->connection->data.mutex);
+ if (first_peer_device(device)->connection->data.socket) {
+ queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued;
+ sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf;
} else {
queued = 1;
sndbuf = 0;
}
- mutex_unlock(&mdev->data.mutex);
+ mutex_unlock(&first_peer_device(device)->connection->data.mutex);
if (queued > sndbuf / 2)
goto requeue;
next_sector:
size = BM_BLOCK_SIZE;
- bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
+ bit = drbd_bm_find_next(device, device->bm_resync_fo);
- if (bit == -1UL) {
- mdev->bm_resync_fo = drbd_bm_bits(mdev);
- mdev->resync_work.cb = w_resync_inactive;
- put_ldev(mdev);
- return 1;
+ if (bit == DRBD_END_OF_BITMAP) {
+ device->bm_resync_fo = drbd_bm_bits(device);
+ put_ldev(device);
+ return 0;
}
sector = BM_BIT_TO_SECT(bit);
- if (drbd_try_rs_begin_io(mdev, sector)) {
- mdev->bm_resync_fo = bit;
+ if (drbd_rs_should_slow_down(device, sector) ||
+ drbd_try_rs_begin_io(device, sector)) {
+ device->bm_resync_fo = bit;
goto requeue;
}
- mdev->bm_resync_fo = bit + 1;
+ device->bm_resync_fo = bit + 1;
- if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
- drbd_rs_complete_io(mdev, sector);
+ if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
+ drbd_rs_complete_io(device, sector);
goto next_sector;
}
-#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
+#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
/* try to find some adjacent bits.
* we stop if we have already the maximum req size.
*
@@ -617,8 +663,8 @@ next_sector:
*/
align = 1;
rollback_i = i;
- for (;;) {
- if (size + BM_BLOCK_SIZE > max_segment_size)
+ while (i < number) {
+ if (size + BM_BLOCK_SIZE > max_bio_size)
break;
/* Be always aligned */
@@ -633,7 +679,7 @@ next_sector:
* obscure reason; ( b == 0 ) would get the out-of-band
* only accidentally right because of the "oddly sized"
* adjustment below */
- if (drbd_bm_test_bit(mdev, bit+1) != 1)
+ if (drbd_bm_test_bit(device, bit+1) != 1)
break;
bit++;
size += BM_BLOCK_SIZE;
@@ -644,20 +690,21 @@ next_sector:
/* if we merged some,
* reset the offset to start the next drbd_bm_find_next from */
if (size > BM_BLOCK_SIZE)
- mdev->bm_resync_fo = bit + 1;
+ device->bm_resync_fo = bit + 1;
#endif
/* adjust very last sectors, in case we are oddly sized */
if (sector + (size>>9) > capacity)
size = (capacity-sector)<<9;
- if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
- switch (read_for_csum(mdev, sector, size)) {
+ if (first_peer_device(device)->connection->agreed_pro_version >= 89 &&
+ first_peer_device(device)->connection->csums_tfm) {
+ switch (read_for_csum(first_peer_device(device), sector, size)) {
case -EIO: /* Disk failure */
- put_ldev(mdev);
- return 0;
+ put_ldev(device);
+ return -EIO;
case -EAGAIN: /* allocation failed, or ldev busy */
- drbd_rs_complete_io(mdev, sector);
- mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ drbd_rs_complete_io(device, sector);
+ device->bm_resync_fo = BM_SECT_TO_BIT(sector);
i = rollback_i;
goto requeue;
case 0:
@@ -667,157 +714,176 @@ next_sector:
BUG();
}
} else {
- inc_rs_pending(mdev);
- if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
- sector, size, ID_SYNCER)) {
- dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
- dec_rs_pending(mdev);
- put_ldev(mdev);
- return 0;
+ int err;
+
+ inc_rs_pending(device);
+ err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST,
+ sector, size, ID_SYNCER);
+ if (err) {
+ drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
+ dec_rs_pending(device);
+ put_ldev(device);
+ return err;
}
}
}
- if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
+ if (device->bm_resync_fo >= drbd_bm_bits(device)) {
/* last syncer _request_ was sent,
* but the P_RS_DATA_REPLY not yet received. sync will end (and
* next sync group will resume), as soon as we receive the last
* resync data block, and the last bit is cleared.
* until then resync "work" is "inactive" ...
*/
- mdev->resync_work.cb = w_resync_inactive;
- put_ldev(mdev);
- return 1;
+ put_ldev(device);
+ return 0;
}
requeue:
- mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
- mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
- put_ldev(mdev);
- return 1;
+ device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+ mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+ put_ldev(device);
+ return 0;
}
-static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static int make_ov_request(struct drbd_device *device, int cancel)
{
int number, i, size;
sector_t sector;
- const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+ const sector_t capacity = drbd_get_capacity(device->this_bdev);
+ bool stop_sector_reached = false;
if (unlikely(cancel))
return 1;
- if (unlikely(mdev->state.conn < C_CONNECTED)) {
- dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
- return 0;
- }
-
- number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
- if (atomic_read(&mdev->rs_pending_cnt) > number)
- goto requeue;
-
- number -= atomic_read(&mdev->rs_pending_cnt);
+ number = drbd_rs_number_requests(device);
- sector = mdev->ov_position;
+ sector = device->ov_position;
for (i = 0; i < number; i++) {
- if (sector >= capacity) {
- mdev->resync_work.cb = w_resync_inactive;
+ if (sector >= capacity)
return 1;
- }
+
+ /* We check for "finished" only in the reply path:
+ * w_e_end_ov_reply().
+ * We need to send at least one request out. */
+ stop_sector_reached = i > 0
+ && verify_can_do_stop_sector(device)
+ && sector >= device->ov_stop_sector;
+ if (stop_sector_reached)
+ break;
size = BM_BLOCK_SIZE;
- if (drbd_try_rs_begin_io(mdev, sector)) {
- mdev->ov_position = sector;
+ if (drbd_rs_should_slow_down(device, sector) ||
+ drbd_try_rs_begin_io(device, sector)) {
+ device->ov_position = sector;
goto requeue;
}
if (sector + (size>>9) > capacity)
size = (capacity-sector)<<9;
- inc_rs_pending(mdev);
- if (!drbd_send_ov_request(mdev, sector, size)) {
- dec_rs_pending(mdev);
+ inc_rs_pending(device);
+ if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
+ dec_rs_pending(device);
return 0;
}
sector += BM_SECT_PER_BIT;
}
- mdev->ov_position = sector;
+ device->ov_position = sector;
requeue:
- mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
+ device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+ if (i == 0 || !stop_sector_reached)
+ mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
return 1;
}
-
-int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_ov_finished(struct drbd_work *w, int cancel)
{
- kfree(w);
- ov_oos_print(mdev);
- drbd_resync_finished(mdev);
+ struct drbd_device_work *dw =
+ container_of(w, struct drbd_device_work, w);
+ struct drbd_device *device = dw->device;
+ kfree(dw);
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
- return 1;
+ return 0;
}
-static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static int w_resync_finished(struct drbd_work *w, int cancel)
{
- kfree(w);
+ struct drbd_device_work *dw =
+ container_of(w, struct drbd_device_work, w);
+ struct drbd_device *device = dw->device;
+ kfree(dw);
- drbd_resync_finished(mdev);
+ drbd_resync_finished(device);
- return 1;
+ return 0;
}
-static void ping_peer(struct drbd_conf *mdev)
+static void ping_peer(struct drbd_device *device)
{
- clear_bit(GOT_PING_ACK, &mdev->flags);
- request_ping(mdev);
- wait_event(mdev->misc_wait,
- test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+
+ clear_bit(GOT_PING_ACK, &connection->flags);
+ request_ping(connection);
+ wait_event(connection->ping_wait,
+ test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
}
-int drbd_resync_finished(struct drbd_conf *mdev)
+int drbd_resync_finished(struct drbd_device *device)
{
unsigned long db, dt, dbdt;
unsigned long n_oos;
union drbd_state os, ns;
- struct drbd_work *w;
+ struct drbd_device_work *dw;
char *khelper_cmd = NULL;
+ int verify_done = 0;
/* Remove all elements from the resync LRU. Since future actions
* might set bits in the (main) bitmap, then the entries in the
* resync LRU would be wrong. */
- if (drbd_rs_del_all(mdev)) {
+ if (drbd_rs_del_all(device)) {
/* In case this is not possible now, most probably because
* there are P_RS_DATA_REPLY Packets lingering on the worker's
* queue (or even the read operations for those packets
* is not finished by now). Retry in 100ms. */
- drbd_kick_lo(mdev);
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ / 10);
- w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
- if (w) {
- w->cb = w_resync_finished;
- drbd_queue_work(&mdev->data.work, w);
+ schedule_timeout_interruptible(HZ / 10);
+ dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
+ if (dw) {
+ dw->w.cb = w_resync_finished;
+ dw->device = device;
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &dw->w);
return 1;
}
- dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
+ drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
}
- dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+ dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
if (dt <= 0)
dt = 1;
- db = mdev->rs_total;
+
+ db = device->rs_total;
+ /* adjust for verify start and stop sectors, respective reached position */
+ if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+ db -= device->ov_left;
+
dbdt = Bit2KB(db/dt);
- mdev->rs_paused /= HZ;
+ device->rs_paused /= HZ;
- if (!get_ldev(mdev))
+ if (!get_ldev(device))
goto out;
- ping_peer(mdev);
+ ping_peer(device);
+
+ spin_lock_irq(&device->resource->req_lock);
+ os = drbd_read_state(device);
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
+ verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
/* This protects us against multiple calls (that can happen in the presence
of application IO), and against connectivity loss just before we arrive here. */
@@ -827,42 +893,41 @@ int drbd_resync_finished(struct drbd_conf *mdev)
ns = os;
ns.conn = C_CONNECTED;
- dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
- (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
- "Online verify " : "Resync",
- dt + mdev->rs_paused, mdev->rs_paused, dbdt);
+ drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+ verify_done ? "Online verify" : "Resync",
+ dt + device->rs_paused, device->rs_paused, dbdt);
- n_oos = drbd_bm_total_weight(mdev);
+ n_oos = drbd_bm_total_weight(device);
if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
if (n_oos) {
- dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
+ drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
n_oos, Bit2KB(1));
khelper_cmd = "out-of-sync";
}
} else {
- D_ASSERT((n_oos - mdev->rs_failed) == 0);
+ D_ASSERT(device, (n_oos - device->rs_failed) == 0);
if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
khelper_cmd = "after-resync-target";
- if (mdev->csums_tfm && mdev->rs_total) {
- const unsigned long s = mdev->rs_same_csum;
- const unsigned long t = mdev->rs_total;
+ if (first_peer_device(device)->connection->csums_tfm && device->rs_total) {
+ const unsigned long s = device->rs_same_csum;
+ const unsigned long t = device->rs_total;
const int ratio =
(t == 0) ? 0 :
(t < 100000) ? ((s*100)/t) : (s/(t/100));
- dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
+ drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
"transferred %luK total %luK\n",
ratio,
- Bit2KB(mdev->rs_same_csum),
- Bit2KB(mdev->rs_total - mdev->rs_same_csum),
- Bit2KB(mdev->rs_total));
+ Bit2KB(device->rs_same_csum),
+ Bit2KB(device->rs_total - device->rs_same_csum),
+ Bit2KB(device->rs_total));
}
}
- if (mdev->rs_failed) {
- dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
+ if (device->rs_failed) {
+ drbd_info(device, " %lu failed blocks\n", device->rs_failed);
if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
ns.disk = D_INCONSISTENT;
@@ -876,445 +941,523 @@ int drbd_resync_finished(struct drbd_conf *mdev)
ns.pdsk = D_UP_TO_DATE;
if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
- if (mdev->p_uuid) {
+ if (device->p_uuid) {
int i;
for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
- _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
- drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
- _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
+ _drbd_uuid_set(device, i, device->p_uuid[i]);
+ drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
+ _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
} else {
- dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
+ drbd_err(device, "device->p_uuid is NULL! BUG\n");
}
}
- drbd_uuid_set_bm(mdev, 0UL);
-
- if (mdev->p_uuid) {
- /* Now the two UUID sets are equal, update what we
- * know of the peer. */
- int i;
- for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
- mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
+ if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
+ /* for verify runs, we don't update uuids here,
+ * so there would be nothing to report. */
+ drbd_uuid_set_bm(device, 0UL);
+ drbd_print_uuids(device, "updated UUIDs");
+ if (device->p_uuid) {
+ /* Now the two UUID sets are equal, update what we
+ * know of the peer. */
+ int i;
+ for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+ device->p_uuid[i] = device->ldev->md.uuid[i];
+ }
}
}
- _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+ _drbd_set_state(device, ns, CS_VERBOSE, NULL);
out_unlock:
- spin_unlock_irq(&mdev->req_lock);
- put_ldev(mdev);
+ spin_unlock_irq(&device->resource->req_lock);
+ put_ldev(device);
out:
- mdev->rs_total = 0;
- mdev->rs_failed = 0;
- mdev->rs_paused = 0;
- mdev->ov_start_sector = 0;
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ device->rs_paused = 0;
- drbd_md_sync(mdev);
+ /* reset start sector, if we reached end of device */
+ if (verify_done && device->ov_left == 0)
+ device->ov_start_sector = 0;
- if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
- dev_info(DEV, "Writing the whole bitmap\n");
- drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
- }
+ drbd_md_sync(device);
if (khelper_cmd)
- drbd_khelper(mdev, khelper_cmd);
+ drbd_khelper(device, khelper_cmd);
return 1;
}
/* helper */
-static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
- if (drbd_ee_has_active_page(e)) {
+ if (drbd_peer_req_has_active_page(peer_req)) {
/* This might happen if sendpage() has not finished */
- int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
- atomic_add(i, &mdev->pp_in_use_by_net);
- atomic_sub(i, &mdev->pp_in_use);
- spin_lock_irq(&mdev->req_lock);
- list_add_tail(&e->w.list, &mdev->net_ee);
- spin_unlock_irq(&mdev->req_lock);
+ int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ atomic_add(i, &device->pp_in_use_by_net);
+ atomic_sub(i, &device->pp_in_use);
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->net_ee);
+ spin_unlock_irq(&device->resource->req_lock);
wake_up(&drbd_pp_wait);
} else
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(device, peer_req);
}
/**
* w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_data_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
- int ok;
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ int err;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
- dec_unacked(mdev);
- return 1;
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
}
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- ok = drbd_send_block(mdev, P_DATA_REPLY, e);
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
- (unsigned long long)e->sector);
+ drbd_err(device, "Sending NegDReply. sector=%llus.\n",
+ (unsigned long long)peer_req->i.sector);
- ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
+ err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
}
- dec_unacked(mdev);
+ dec_unacked(device);
- move_to_net_ee_or_free(mdev, e);
+ move_to_net_ee_or_free(device, peer_req);
- if (unlikely(!ok))
- dev_err(DEV, "drbd_send_block() failed\n");
- return ok;
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_block() failed\n");
+ return err;
}
/**
- * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
- * @mdev: DRBD device.
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
- int ok;
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ int err;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
- dec_unacked(mdev);
- return 1;
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
}
- if (get_ldev_if_state(mdev, D_FAILED)) {
- drbd_rs_complete_io(mdev, e->sector);
- put_ldev(mdev);
+ if (get_ldev_if_state(device, D_FAILED)) {
+ drbd_rs_complete_io(device, peer_req->i.sector);
+ put_ldev(device);
}
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
- inc_rs_pending(mdev);
- ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+ if (device->state.conn == C_AHEAD) {
+ err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
+ } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ if (likely(device->state.pdsk >= D_INCONSISTENT)) {
+ inc_rs_pending(device);
+ err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Not sending RSDataReply, "
+ drbd_err(device, "Not sending RSDataReply, "
"partner DISKLESS!\n");
- ok = 1;
+ err = 0;
}
} else {
if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
- (unsigned long long)e->sector);
+ drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
+ (unsigned long long)peer_req->i.sector);
- ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+ err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
/* update resync data with failure */
- drbd_rs_failed_io(mdev, e->sector, e->size);
+ drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
}
- dec_unacked(mdev);
+ dec_unacked(device);
- move_to_net_ee_or_free(mdev, e);
+ move_to_net_ee_or_free(device, peer_req);
- if (unlikely(!ok))
- dev_err(DEV, "drbd_send_block() failed\n");
- return ok;
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_block() failed\n");
+ return err;
}
-int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
struct digest_info *di;
int digest_size;
void *digest = NULL;
- int ok, eq = 0;
+ int err, eq = 0;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
- dec_unacked(mdev);
- return 1;
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
}
- if (get_ldev(mdev)) {
- drbd_rs_complete_io(mdev, e->sector);
- put_ldev(mdev);
+ if (get_ldev(device)) {
+ drbd_rs_complete_io(device, peer_req->i.sector);
+ put_ldev(device);
}
- di = e->digest;
+ di = peer_req->digest;
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
/* quick hack to try to avoid a race against reconfiguration.
* a real fix would be much more involved,
* introducing more locking mechanisms */
- if (mdev->csums_tfm) {
- digest_size = crypto_hash_digestsize(mdev->csums_tfm);
- D_ASSERT(digest_size == di->digest_size);
+ if (peer_device->connection->csums_tfm) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+ D_ASSERT(device, digest_size == di->digest_size);
digest = kmalloc(digest_size, GFP_NOIO);
}
if (digest) {
- drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
+ drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
eq = !memcmp(digest, di->digest, digest_size);
kfree(digest);
}
if (eq) {
- drbd_set_in_sync(mdev, e->sector, e->size);
+ drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
/* rs_same_csums unit is BM_BLOCK_SIZE */
- mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
- ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
+ device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
+ err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
} else {
- inc_rs_pending(mdev);
- e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
- e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
+ inc_rs_pending(device);
+ peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+ peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
kfree(di);
- ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+ err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
}
} else {
- ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+ err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
+ drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
}
- dec_unacked(mdev);
- move_to_net_ee_or_free(mdev, e);
+ dec_unacked(device);
+ move_to_net_ee_or_free(device, peer_req);
- if (unlikely(!ok))
- dev_err(DEV, "drbd_send_block/ack() failed\n");
- return ok;
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_block/ack() failed\n");
+ return err;
}
-int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_ov_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
int digest_size;
void *digest;
- int ok = 1;
+ int err = 0;
if (unlikely(cancel))
goto out;
- if (unlikely((e->flags & EE_WAS_ERROR) != 0))
- goto out;
-
- digest_size = crypto_hash_digestsize(mdev->verify_tfm);
- /* FIXME if this allocation fails, online verify will not terminate! */
+ digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
- if (digest) {
- drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
- inc_rs_pending(mdev);
- ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
- digest, digest_size, P_OV_REPLY);
- if (!ok)
- dec_rs_pending(mdev);
- kfree(digest);
+ if (!digest) {
+ err = 1; /* terminate the connection in case the allocation failed */
+ goto out;
}
-out:
- drbd_free_ee(mdev, e);
-
- dec_unacked(mdev);
+ if (likely(!(peer_req->flags & EE_WAS_ERROR)))
+ drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+ else
+ memset(digest, 0, digest_size);
+
+ /* Free e and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(device, peer_req);
+ peer_req = NULL;
+ inc_rs_pending(device);
+ err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
+ if (err)
+ dec_rs_pending(device);
+ kfree(digest);
- return ok;
+out:
+ if (peer_req)
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return err;
}
-void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
+void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
{
- if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
- mdev->ov_last_oos_size += size>>9;
+ if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
+ device->ov_last_oos_size += size>>9;
} else {
- mdev->ov_last_oos_start = sector;
- mdev->ov_last_oos_size = size>>9;
+ device->ov_last_oos_start = sector;
+ device->ov_last_oos_size = size>>9;
}
- drbd_set_out_of_sync(mdev, sector, size);
- set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+ drbd_set_out_of_sync(device, sector, size);
}
-int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_ov_reply(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
struct digest_info *di;
- int digest_size;
void *digest;
- int ok, eq = 0;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ int digest_size;
+ int err, eq = 0;
+ bool stop_sector_reached = false;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
- dec_unacked(mdev);
- return 1;
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
}
/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
* the resync lru has been cleaned up already */
- if (get_ldev(mdev)) {
- drbd_rs_complete_io(mdev, e->sector);
- put_ldev(mdev);
+ if (get_ldev(device)) {
+ drbd_rs_complete_io(device, peer_req->i.sector);
+ put_ldev(device);
}
- di = e->digest;
+ di = peer_req->digest;
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
- drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
+ drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
- D_ASSERT(digest_size == di->digest_size);
+ D_ASSERT(device, digest_size == di->digest_size);
eq = !memcmp(digest, di->digest, digest_size);
kfree(digest);
}
- } else {
- ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
- if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
}
- dec_unacked(mdev);
+ /* Free peer_req and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(device, peer_req);
if (!eq)
- drbd_ov_oos_found(mdev, e->sector, e->size);
+ drbd_ov_out_of_sync_found(device, sector, size);
else
- ov_oos_print(mdev);
+ ov_out_of_sync_print(device);
- ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
- eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+ err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
+ eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
- drbd_free_ee(mdev, e);
+ dec_unacked(device);
- if (--mdev->ov_left == 0) {
- ov_oos_print(mdev);
- drbd_resync_finished(mdev);
+ --device->ov_left;
+
+ /* let's advance progress step marks only for every other megabyte */
+ if ((device->ov_left & 0x200) == 0x200)
+ drbd_advance_rs_marks(device, device->ov_left);
+
+ stop_sector_reached = verify_can_do_stop_sector(device) &&
+ (sector + (size>>9)) >= device->ov_stop_sector;
+
+ if (device->ov_left == 0 || stop_sector_reached) {
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
}
- return ok;
+ return err;
}
-int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+/* FIXME
+ * We need to track the number of pending barrier acks,
+ * and to be able to wait for them.
+ * See also comment in drbd_adm_attach before drbd_suspend_io.
+ */
+static int drbd_send_barrier(struct drbd_connection *connection)
{
- struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
- complete(&b->done);
- return 1;
+ struct p_barrier *p;
+ struct drbd_socket *sock;
+
+ sock = &connection->data;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+ p->barrier = connection->send.current_epoch_nr;
+ p->pad = 0;
+ connection->send.current_epoch_writes = 0;
+
+ return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
}
-int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_send_write_hint(struct drbd_work *w, int cancel)
{
- struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
- struct p_barrier *p = &mdev->data.sbuf.barrier;
- int ok = 1;
-
- /* really avoid racing with tl_clear. w.cb may have been referenced
- * just before it was reassigned and re-queued, so double check that.
- * actually, this race was harmless, since we only try to send the
- * barrier packet here, and otherwise do nothing with the object.
- * but compare with the head of w_clear_epoch */
- spin_lock_irq(&mdev->req_lock);
- if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
- cancel = 1;
- spin_unlock_irq(&mdev->req_lock);
- if (cancel)
- return 1;
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, unplug_work);
+ struct drbd_socket *sock;
- if (!drbd_get_data_sock(mdev))
+ if (cancel)
return 0;
- p->barrier = b->br_number;
- /* inc_ap_pending was done where this was queued.
- * dec_ap_pending will be done in got_BarrierAck
- * or (on connection loss) in w_clear_epoch. */
- ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
- (struct p_header80 *)p, sizeof(*p), 0);
- drbd_put_data_sock(mdev);
-
- return ok;
+ sock = &first_peer_device(device)->connection->data;
+ if (!drbd_prepare_command(first_peer_device(device), sock))
+ return -EIO;
+ return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);
}
-int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
{
- if (cancel)
- return 1;
- return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
+ if (!connection->send.seen_any_write_yet) {
+ connection->send.seen_any_write_yet = true;
+ connection->send.current_epoch_nr = epoch;
+ connection->send.current_epoch_writes = 0;
+ }
+}
+
+static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
+{
+ /* re-init if first write on this connection */
+ if (!connection->send.seen_any_write_yet)
+ return;
+ if (connection->send.current_epoch_nr != epoch) {
+ if (connection->send.current_epoch_writes)
+ drbd_send_barrier(connection);
+ connection->send.current_epoch_nr = epoch;
+ }
+}
+
+int w_send_out_of_sync(struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ int err;
+
+ if (unlikely(cancel)) {
+ req_mod(req, SEND_CANCELED);
+ return 0;
+ }
+
+ /* this time, no connection->send.current_epoch_writes++;
+ * If it was sent, it was the closing barrier for the last
+ * replicated epoch, before we went into AHEAD mode.
+ * No more barriers will be sent, until we leave AHEAD mode again. */
+ maybe_send_barrier(connection, req->epoch);
+
+ err = drbd_send_out_of_sync(first_peer_device(device), req);
+ req_mod(req, OOS_HANDED_TO_NETWORK);
+
+ return err;
}
/**
* w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
- * @mdev: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_send_dblock(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
- int ok;
+ struct drbd_device *device = req->device;
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ int err;
if (unlikely(cancel)) {
- req_mod(req, send_canceled);
- return 1;
+ req_mod(req, SEND_CANCELED);
+ return 0;
}
- ok = drbd_send_dblock(mdev, req);
- req_mod(req, ok ? handed_over_to_network : send_failed);
+ re_init_if_first_write(connection, req->epoch);
+ maybe_send_barrier(connection, req->epoch);
+ connection->send.current_epoch_writes++;
+
+ err = drbd_send_dblock(first_peer_device(device), req);
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
- return ok;
+ return err;
}
/**
* w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
- * @mdev: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_send_read_req(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
- int ok;
+ struct drbd_device *device = req->device;
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ int err;
if (unlikely(cancel)) {
- req_mod(req, send_canceled);
- return 1;
+ req_mod(req, SEND_CANCELED);
+ return 0;
}
- ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
- (unsigned long)req);
+ /* Even read requests may close a write epoch,
+ * if there was any yet. */
+ maybe_send_barrier(connection, req->epoch);
- if (!ok) {
- /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
- * so this is probably redundant */
- if (mdev->state.conn >= C_CONNECTED)
- drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
- }
- req_mod(req, ok ? handed_over_to_network : send_failed);
+ err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size,
+ (unsigned long)req);
+
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
- return ok;
+ return err;
}
-int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_restart_disk_io(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
- drbd_al_begin_io(mdev, req->sector);
- /* Calling drbd_al_begin_io() out of the worker might deadlocks
- theoretically. Practically it can not deadlock, since this is
- only used when unfreezing IOs. All the extents of the requests
- that made it into the TL are already active */
+ drbd_al_begin_io(device, &req->i, false);
drbd_req_make_private_bio(req, req->master_bio);
- req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+ req->private_bio->bi_bdev = device->ldev->backing_bdev;
generic_make_request(req->private_bio);
- return 1;
+ return 0;
}
-static int _drbd_may_sync_now(struct drbd_conf *mdev)
+static int _drbd_may_sync_now(struct drbd_device *device)
{
- struct drbd_conf *odev = mdev;
+ struct drbd_device *odev = device;
+ int resync_after;
while (1) {
- if (odev->sync_conf.after == -1)
+ if (!odev->ldev || odev->state.disk == D_DISKLESS)
+ return 1;
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
+ if (resync_after == -1)
+ return 1;
+ odev = minor_to_device(resync_after);
+ if (!odev)
return 1;
- odev = minor_to_mdev(odev->sync_conf.after);
- ERR_IF(!odev) return 1;
if ((odev->state.conn >= C_SYNC_SOURCE &&
odev->state.conn <= C_PAUSED_SYNC_T) ||
odev->state.aftr_isp || odev->state.peer_isp ||
@@ -1325,44 +1468,41 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
/**
* _drbd_pause_after() - Pause resync on all devices that may not resync now
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Called from process context only (admin command and after_state_ch).
*/
-static int _drbd_pause_after(struct drbd_conf *mdev)
+static int _drbd_pause_after(struct drbd_device *device)
{
- struct drbd_conf *odev;
+ struct drbd_device *odev;
int i, rv = 0;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev)
- continue;
+ rcu_read_lock();
+ idr_for_each_entry(&drbd_devices, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (!_drbd_may_sync_now(odev))
rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
!= SS_NOTHING_TO_DO);
}
+ rcu_read_unlock();
return rv;
}
/**
* _drbd_resume_next() - Resume resync on all devices that may resync now
- * @mdev: DRBD device.
+ * @device: DRBD device.
*
* Called from process context only (admin command and worker).
*/
-static int _drbd_resume_next(struct drbd_conf *mdev)
+static int _drbd_resume_next(struct drbd_device *device)
{
- struct drbd_conf *odev;
+ struct drbd_device *odev;
int i, rv = 0;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev)
- continue;
+ rcu_read_lock();
+ idr_for_each_entry(&drbd_devices, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (odev->state.aftr_isp) {
@@ -1372,123 +1512,196 @@ static int _drbd_resume_next(struct drbd_conf *mdev)
!= SS_NOTHING_TO_DO) ;
}
}
+ rcu_read_unlock();
return rv;
}
-void resume_next_sg(struct drbd_conf *mdev)
+void resume_next_sg(struct drbd_device *device)
{
write_lock_irq(&global_state_lock);
- _drbd_resume_next(mdev);
+ _drbd_resume_next(device);
write_unlock_irq(&global_state_lock);
}
-void suspend_other_sg(struct drbd_conf *mdev)
+void suspend_other_sg(struct drbd_device *device)
{
write_lock_irq(&global_state_lock);
- _drbd_pause_after(mdev);
+ _drbd_pause_after(device);
write_unlock_irq(&global_state_lock);
}
-static int sync_after_error(struct drbd_conf *mdev, int o_minor)
+/* caller must hold global_state_lock */
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
{
- struct drbd_conf *odev;
+ struct drbd_device *odev;
+ int resync_after;
if (o_minor == -1)
return NO_ERROR;
- if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
- return ERR_SYNC_AFTER;
+ if (o_minor < -1 || o_minor > MINORMASK)
+ return ERR_RESYNC_AFTER;
/* check for loops */
- odev = minor_to_mdev(o_minor);
+ odev = minor_to_device(o_minor);
while (1) {
- if (odev == mdev)
- return ERR_SYNC_AFTER_CYCLE;
+ if (odev == device)
+ return ERR_RESYNC_AFTER_CYCLE;
+
+ /* You are free to depend on diskless, non-existing,
+ * or not yet/no longer existing minors.
+ * We only reject dependency loops.
+ * We cannot follow the dependency chain beyond a detached or
+ * missing minor.
+ */
+ if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
+ return NO_ERROR;
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
/* dependency chain ends here, no cycles. */
- if (odev->sync_conf.after == -1)
+ if (resync_after == -1)
return NO_ERROR;
/* follow the dependency chain */
- odev = minor_to_mdev(odev->sync_conf.after);
+ odev = minor_to_device(resync_after);
}
}
-int drbd_alter_sa(struct drbd_conf *mdev, int na)
+/* caller must hold global_state_lock */
+void drbd_resync_after_changed(struct drbd_device *device)
{
int changes;
- int retcode;
- write_lock_irq(&global_state_lock);
- retcode = sync_after_error(mdev, na);
- if (retcode == NO_ERROR) {
- mdev->sync_conf.after = na;
- do {
- changes = _drbd_pause_after(mdev);
- changes |= _drbd_resume_next(mdev);
- } while (changes);
+ do {
+ changes = _drbd_pause_after(device);
+ changes |= _drbd_resume_next(device);
+ } while (changes);
+}
+
+void drbd_rs_controller_reset(struct drbd_device *device)
+{
+ struct fifo_buffer *plan;
+
+ atomic_set(&device->rs_sect_in, 0);
+ atomic_set(&device->rs_sect_ev, 0);
+ device->rs_in_flight = 0;
+
+ /* Updating the RCU protected object in place is necessary since
+ this function gets called from atomic context.
+ It is valid since all other updates also lead to an completely
+ empty fifo */
+ rcu_read_lock();
+ plan = rcu_dereference(device->rs_plan_s);
+ plan->total = 0;
+ fifo_set(plan, 0);
+ rcu_read_unlock();
+}
+
+void start_resync_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &device->start_resync_work);
+}
+
+int w_start_resync(struct drbd_work *w, int cancel)
+{
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, start_resync_work);
+
+ if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
+ drbd_warn(device, "w_start_resync later...\n");
+ device->start_resync_timer.expires = jiffies + HZ/10;
+ add_timer(&device->start_resync_timer);
+ return 0;
}
- write_unlock_irq(&global_state_lock);
- return retcode;
+
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
+ return 0;
}
/**
* drbd_start_resync() - Start the resync process
- * @mdev: DRBD device.
+ * @device: DRBD device.
* @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
*
* This function might bring you directly into one of the
* C_PAUSED_SYNC_* states.
*/
-void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
+void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
{
union drbd_state ns;
int r;
- if (mdev->state.conn >= C_SYNC_SOURCE) {
- dev_err(DEV, "Resync already running!\n");
+ if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
+ drbd_err(device, "Resync already running!\n");
return;
}
- /* In case a previous resync run was aborted by an IO error/detach on the peer. */
- drbd_rs_cancel_all(mdev);
-
- if (side == C_SYNC_TARGET) {
- /* Since application IO was locked out during C_WF_BITMAP_T and
- C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
- we check that we might make the data inconsistent. */
- r = drbd_khelper(mdev, "before-resync-target");
- r = (r >> 8) & 0xff;
- if (r > 0) {
- dev_info(DEV, "before-resync-target handler returned %d, "
- "dropping connection.\n", r);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return;
+ if (!test_bit(B_RS_H_DONE, &device->flags)) {
+ if (side == C_SYNC_TARGET) {
+ /* Since application IO was locked out during C_WF_BITMAP_T and
+ C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
+ we check that we might make the data inconsistent. */
+ r = drbd_khelper(device, "before-resync-target");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ drbd_info(device, "before-resync-target handler returned %d, "
+ "dropping connection.\n", r);
+ conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return;
+ }
+ } else /* C_SYNC_SOURCE */ {
+ r = drbd_khelper(device, "before-resync-source");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ if (r == 3) {
+ drbd_info(device, "before-resync-source handler returned %d, "
+ "ignoring. Old userland tools?", r);
+ } else {
+ drbd_info(device, "before-resync-source handler returned %d, "
+ "dropping connection.\n", r);
+ conn_request_state(first_peer_device(device)->connection,
+ NS(conn, C_DISCONNECTING), CS_HARD);
+ return;
+ }
+ }
}
}
- drbd_state_lock(mdev);
-
- if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
- drbd_state_unlock(mdev);
- return;
+ if (current == first_peer_device(device)->connection->worker.task) {
+ /* The worker should not sleep waiting for state_mutex,
+ that can take long */
+ if (!mutex_trylock(device->state_mutex)) {
+ set_bit(B_RS_H_DONE, &device->flags);
+ device->start_resync_timer.expires = jiffies + HZ/5;
+ add_timer(&device->start_resync_timer);
+ return;
+ }
+ } else {
+ mutex_lock(device->state_mutex);
}
-
- if (side == C_SYNC_TARGET) {
- mdev->bm_resync_fo = 0;
- } else /* side == C_SYNC_SOURCE */ {
- u64 uuid;
-
- get_random_bytes(&uuid, sizeof(u64));
- drbd_uuid_set(mdev, UI_BITMAP, uuid);
- drbd_send_sync_uuid(mdev, uuid);
-
- D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
+ clear_bit(B_RS_H_DONE, &device->flags);
+
+ /* req_lock: serialize with drbd_send_and_submit() and others
+ * global_state_lock: for stable sync-after dependencies */
+ spin_lock_irq(&device->resource->req_lock);
+ write_lock(&global_state_lock);
+ /* Did some connection breakage or IO error race with us? */
+ if (device->state.conn < C_CONNECTED
+ || !get_ldev_if_state(device, D_NEGOTIATING)) {
+ write_unlock(&global_state_lock);
+ spin_unlock_irq(&device->resource->req_lock);
+ mutex_unlock(device->state_mutex);
+ return;
}
- write_lock_irq(&global_state_lock);
- ns = mdev->state;
+ ns = drbd_read_state(device);
- ns.aftr_isp = !_drbd_may_sync_now(mdev);
+ ns.aftr_isp = !_drbd_may_sync_now(device);
ns.conn = side;
@@ -1497,40 +1710,58 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
else /* side == C_SYNC_SOURCE */
ns.pdsk = D_INCONSISTENT;
- r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- ns = mdev->state;
+ r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
+ ns = drbd_read_state(device);
if (ns.conn < C_CONNECTED)
r = SS_UNKNOWN_ERROR;
if (r == SS_SUCCESS) {
- unsigned long tw = drbd_bm_total_weight(mdev);
+ unsigned long tw = drbd_bm_total_weight(device);
unsigned long now = jiffies;
int i;
- mdev->rs_failed = 0;
- mdev->rs_paused = 0;
- mdev->rs_same_csum = 0;
- mdev->rs_last_events = 0;
- mdev->rs_last_sect_ev = 0;
- mdev->rs_total = tw;
- mdev->rs_start = now;
+ device->rs_failed = 0;
+ device->rs_paused = 0;
+ device->rs_same_csum = 0;
+ device->rs_last_events = 0;
+ device->rs_last_sect_ev = 0;
+ device->rs_total = tw;
+ device->rs_start = now;
for (i = 0; i < DRBD_SYNC_MARKS; i++) {
- mdev->rs_mark_left[i] = tw;
- mdev->rs_mark_time[i] = now;
+ device->rs_mark_left[i] = tw;
+ device->rs_mark_time[i] = now;
}
- _drbd_pause_after(mdev);
+ _drbd_pause_after(device);
}
- write_unlock_irq(&global_state_lock);
- put_ldev(mdev);
+ write_unlock(&global_state_lock);
+ spin_unlock_irq(&device->resource->req_lock);
if (r == SS_SUCCESS) {
- dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
- drbd_conn_str(ns.conn),
- (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
- (unsigned long) mdev->rs_total);
+ /* reset rs_last_bcast when a resync or verify is started,
+ * to deal with potential jiffies wrap. */
+ device->rs_last_bcast = jiffies - HZ;
- if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
+ drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+ drbd_conn_str(ns.conn),
+ (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
+ (unsigned long) device->rs_total);
+ if (side == C_SYNC_TARGET)
+ device->bm_resync_fo = 0;
+
+ /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
+ * with w_send_oos, or the sync target will get confused as to
+ * how much bits to resync. We cannot do that always, because for an
+ * empty resync and protocol < 95, we need to do it here, as we call
+ * drbd_resync_finished from here in that case.
+ * We drbd_gen_and_send_sync_uuid here for protocol < 96,
+ * and from after_state_ch otherwise. */
+ if (side == C_SYNC_SOURCE &&
+ first_peer_device(device)->connection->agreed_pro_version < 96)
+ drbd_gen_and_send_sync_uuid(first_peer_device(device));
+
+ if (first_peer_device(device)->connection->agreed_pro_version < 95 &&
+ device->rs_total == 0) {
/* This still has a race (about when exactly the peers
* detect connection loss) that can lead to a full sync
* on next handshake. In 8.3.9 we fixed this with explicit
@@ -1541,137 +1772,187 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
* detect connection loss, then waiting for a ping
* response (implicit in drbd_resync_finished) reduces
* the race considerably, but does not solve it. */
- if (side == C_SYNC_SOURCE)
- schedule_timeout_interruptible(
- mdev->net_conf->ping_int * HZ +
- mdev->net_conf->ping_timeo*HZ/9);
- drbd_resync_finished(mdev);
+ if (side == C_SYNC_SOURCE) {
+ struct net_conf *nc;
+ int timeo;
+
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
+ }
+ drbd_resync_finished(device);
}
- atomic_set(&mdev->rs_sect_in, 0);
- atomic_set(&mdev->rs_sect_ev, 0);
- mdev->rs_in_flight = 0;
- mdev->rs_planed = 0;
- spin_lock(&mdev->peer_seq_lock);
- fifo_set(&mdev->rs_plan_s, 0);
- spin_unlock(&mdev->peer_seq_lock);
- /* ns.conn may already be != mdev->state.conn,
+ drbd_rs_controller_reset(device);
+ /* ns.conn may already be != device->state.conn,
* we may have been paused in between, or become paused until
* the timer triggers.
* No matter, that is handled in resync_timer_fn() */
if (ns.conn == C_SYNC_TARGET)
- mod_timer(&mdev->resync_timer, jiffies);
+ mod_timer(&device->resync_timer, jiffies);
- drbd_md_sync(mdev);
+ drbd_md_sync(device);
}
- drbd_state_unlock(mdev);
+ put_ldev(device);
+ mutex_unlock(device->state_mutex);
}
-int drbd_worker(struct drbd_thread *thi)
+static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
{
- struct drbd_conf *mdev = thi->mdev;
- struct drbd_work *w = NULL;
- LIST_HEAD(work_list);
- int intr = 0, i;
+ spin_lock_irq(&queue->q_lock);
+ list_splice_init(&queue->q, work_list);
+ spin_unlock_irq(&queue->q_lock);
+ return !list_empty(work_list);
+}
- sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
+static bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+ spin_lock_irq(&queue->q_lock);
+ if (!list_empty(&queue->q))
+ list_move(queue->q.next, work_list);
+ spin_unlock_irq(&queue->q_lock);
+ return !list_empty(work_list);
+}
- while (get_t_state(thi) == Running) {
- drbd_thread_current_set_cpu(mdev);
+static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
+{
+ DEFINE_WAIT(wait);
+ struct net_conf *nc;
+ int uncork, cork;
- if (down_trylock(&mdev->data.work.s)) {
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket && !mdev->net_conf->no_cork)
- drbd_tcp_uncork(mdev->data.socket);
- mutex_unlock(&mdev->data.mutex);
+ dequeue_work_item(&connection->sender_work, work_list);
+ if (!list_empty(work_list))
+ return;
- intr = down_interruptible(&mdev->data.work.s);
+ /* Still nothing to do?
+ * Maybe we still need to close the current epoch,
+ * even if no new requests are queued yet.
+ *
+ * Also, poke TCP, just in case.
+ * Then wait for new work (or signal). */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ uncork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ if (uncork) {
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket)
+ drbd_tcp_uncork(connection->data.socket);
+ mutex_unlock(&connection->data.mutex);
+ }
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket && !mdev->net_conf->no_cork)
- drbd_tcp_cork(mdev->data.socket);
- mutex_unlock(&mdev->data.mutex);
+ for (;;) {
+ int send_barrier;
+ prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_lock_irq(&connection->resource->req_lock);
+ spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ /* dequeue single item only,
+ * we still use drbd_queue_work_front() in some places */
+ if (!list_empty(&connection->sender_work.q))
+ list_move(connection->sender_work.q.next, work_list);
+ spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ if (!list_empty(work_list) || signal_pending(current)) {
+ spin_unlock_irq(&connection->resource->req_lock);
+ break;
}
- if (intr) {
- D_ASSERT(intr == -EINTR);
+ /* We found nothing new to do, no to-be-communicated request,
+ * no other work item. We may still need to close the last
+ * epoch. Next incoming request epoch will be connection ->
+ * current transfer log epoch number. If that is different
+ * from the epoch of the last request we communicated, it is
+ * safe to send the epoch separating barrier now.
+ */
+ send_barrier =
+ atomic_read(&connection->current_tle_nr) !=
+ connection->send.current_epoch_nr;
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ if (send_barrier)
+ maybe_send_barrier(connection,
+ connection->send.current_epoch_nr + 1);
+ schedule();
+ /* may be woken up for other things but new work, too,
+ * e.g. if the current epoch got closed.
+ * In which case we send the barrier above. */
+ }
+ finish_wait(&connection->sender_work.q_wait, &wait);
+
+ /* someone may have changed the config while we have been waiting above. */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ cork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket) {
+ if (cork)
+ drbd_tcp_cork(connection->data.socket);
+ else if (!uncork)
+ drbd_tcp_uncork(connection->data.socket);
+ }
+ mutex_unlock(&connection->data.mutex);
+}
+
+int drbd_worker(struct drbd_thread *thi)
+{
+ struct drbd_connection *connection = thi->connection;
+ struct drbd_work *w = NULL;
+ struct drbd_peer_device *peer_device;
+ LIST_HEAD(work_list);
+ int vnr;
+
+ while (get_t_state(thi) == RUNNING) {
+ drbd_thread_current_set_cpu(thi);
+
+ /* as long as we use drbd_queue_work_front(),
+ * we may only dequeue single work items here, not batches. */
+ if (list_empty(&work_list))
+ wait_for_work(connection, &work_list);
+
+ if (signal_pending(current)) {
flush_signals(current);
- ERR_IF (get_t_state(thi) == Running)
+ if (get_t_state(thi) == RUNNING) {
+ drbd_warn(connection, "Worker got an unexpected signal\n");
continue;
+ }
break;
}
- if (get_t_state(thi) != Running)
+ if (get_t_state(thi) != RUNNING)
break;
- /* With this break, we have done a down() but not consumed
- the entry from the list. The cleanup code takes care of
- this... */
-
- w = NULL;
- spin_lock_irq(&mdev->data.work.q_lock);
- ERR_IF(list_empty(&mdev->data.work.q)) {
- /* something terribly wrong in our logic.
- * we were able to down() the semaphore,
- * but the list is empty... doh.
- *
- * what is the best thing to do now?
- * try again from scratch, restarting the receiver,
- * asender, whatnot? could break even more ugly,
- * e.g. when we are primary, but no good local data.
- *
- * I'll try to get away just starting over this loop.
- */
- spin_unlock_irq(&mdev->data.work.q_lock);
- continue;
- }
- w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
- list_del_init(&w->list);
- spin_unlock_irq(&mdev->data.work.q_lock);
-
- if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
- /* dev_warn(DEV, "worker: a callback failed! \n"); */
- if (mdev->state.conn >= C_CONNECTED)
- drbd_force_state(mdev,
- NS(conn, C_NETWORK_FAILURE));
+
+ while (!list_empty(&work_list)) {
+ w = list_first_entry(&work_list, struct drbd_work, list);
+ list_del_init(&w->list);
+ if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
+ continue;
+ if (connection->cstate >= C_WF_REPORT_PARAMS)
+ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
}
}
- D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
- D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
-
- spin_lock_irq(&mdev->data.work.q_lock);
- i = 0;
- while (!list_empty(&mdev->data.work.q)) {
- list_splice_init(&mdev->data.work.q, &work_list);
- spin_unlock_irq(&mdev->data.work.q_lock);
+ do {
while (!list_empty(&work_list)) {
- w = list_entry(work_list.next, struct drbd_work, list);
+ w = list_first_entry(&work_list, struct drbd_work, list);
list_del_init(&w->list);
- w->cb(mdev, w, 1);
- i++; /* dead debugging code */
+ w->cb(w, 1);
}
-
- spin_lock_irq(&mdev->data.work.q_lock);
+ dequeue_work_batch(&connection->sender_work, &work_list);
+ } while (!list_empty(&work_list));
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_device_cleanup(device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
}
- sema_init(&mdev->data.work.s, 0);
- /* DANGEROUS race: if someone did queue his work within the spinlock,
- * but up() ed outside the spinlock, we could get an up() on the
- * semaphore without corresponding list entry.
- * So don't do that.
- */
- spin_unlock_irq(&mdev->data.work.q_lock);
-
- D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
- /* _drbd_set_state only uses stop_nowait.
- * wait here for the Exiting receiver. */
- drbd_thread_stop(&mdev->receiver);
- drbd_mdev_cleanup(mdev);
-
- dev_info(DEV, "worker terminated\n");
-
- clear_bit(DEVICE_DYING, &mdev->flags);
- clear_bit(CONFIG_PENDING, &mdev->flags);
- wake_up(&mdev->state_wait);
+ rcu_read_unlock();
return 0;
}
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
deleted file mode 100644
index defdb5013ea..00000000000
--- a/drivers/block/drbd/drbd_wrappers.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef _DRBD_WRAPPERS_H
-#define _DRBD_WRAPPERS_H
-
-#include <linux/ctype.h>
-#include <linux/mm.h>
-
-/* see get_sb_bdev and bd_claim */
-extern char *drbd_sec_holder;
-
-/* sets the number of 512 byte sectors of our virtual device */
-static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
- sector_t size)
-{
- /* set_capacity(mdev->this_bdev->bd_disk, size); */
- set_capacity(mdev->vdisk, size);
- mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
-}
-
-#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
-
-/* bi_end_io handlers */
-extern void drbd_md_io_complete(struct bio *bio, int error);
-extern void drbd_endio_sec(struct bio *bio, int error);
-extern void drbd_endio_pri(struct bio *bio, int error);
-
-/*
- * used to submit our private bio
- */
-static inline void drbd_generic_make_request(struct drbd_conf *mdev,
- int fault_type, struct bio *bio)
-{
- __release(local);
- if (!bio->bi_bdev) {
- printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
- "bio->bi_bdev == NULL\n",
- mdev_to_minor(mdev));
- dump_stack();
- bio_endio(bio, -ENODEV);
- return;
- }
-
- if (FAULT_ACTIVE(mdev, fault_type))
- bio_endio(bio, -EIO);
- else
- generic_make_request(bio);
-}
-
-static inline void drbd_plug_device(struct drbd_conf *mdev)
-{
- struct request_queue *q;
- q = bdev_get_queue(mdev->this_bdev);
-
- spin_lock_irq(q->queue_lock);
-
-/* XXX the check on !blk_queue_plugged is redundant,
- * implicitly checked in blk_plug_device */
-
- if (!blk_queue_plugged(q)) {
- blk_plug_device(q);
- del_timer(&q->unplug_timer);
- /* unplugging should not happen automatically... */
- }
- spin_unlock_irq(q->queue_lock);
-}
-
-static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
-{
- return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
- == CRYPTO_ALG_TYPE_HASH;
-}
-
-#ifndef __CHECKER__
-# undef __cond_lock
-# define __cond_lock(x,c) (c)
-#endif
-
-#endif