From 383606e0dea6a380097dbcb0c319b09ca372f36b Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 14 Jun 2012 14:21:32 +0200
Subject: drbd: differentiate between normal and forced detach

Aborting local requests (not waiting for completion from the lower
level disk) is dangerous: if the master bio has been completed to
upper layers, data pages may already be re-used for other things.
If local IO is still pending and completes later, this may cause
crashes or corrupt unrelated data.

Only abort local IO if explicitly requested.
The intended use case is a lower level device that has turned into a
tarpit, not completing IO requests, not even doing error completion.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/block/drbd/drbd_worker.c')

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 620c70ff223..a35393f2fd1 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -111,7 +111,7 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
 	if (list_empty(&mdev->read_ee))
 		wake_up(&mdev->ee_wait);
 	if (test_bit(__EE_WAS_ERROR, &e->flags))
-		__drbd_chk_io_error(mdev, false);
+		__drbd_chk_io_error(mdev, DRBD_IO_ERROR);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 	drbd_queue_work(&mdev->data.work, &e->w);
@@ -154,7 +154,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
 		: list_empty(&mdev->active_ee);
 
 	if (test_bit(__EE_WAS_ERROR, &e->flags))
-		__drbd_chk_io_error(mdev, false);
+		__drbd_chk_io_error(mdev, DRBD_IO_ERROR);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 	if (is_syncer_req)
-- 
cgit v1.2.3-18-g5258
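The hunks above are only the call-site half of this commit: the bool
argument "false" becomes DRBD_IO_ERROR, the value of a new enum. The
enum itself and the error handler that consumes it live in drbd_int.h,
outside this filtered view. Below is a minimal sketch of the
distinction the enum encodes; DRBD_FORCE_DETACH is the counterpart
implied by the subject line (only DRBD_IO_ERROR appears in this diff),
and drbd_handle_io_error()/drbd_abort_local_io() are hypothetical
stand-ins for the real policy and detach logic:

	/* Sketch only: the handler body and helper names are
	 * illustrative, not the actual drbd_int.h code. */
	enum drbd_force_detach_flags {
		DRBD_IO_ERROR,		/* lower level disk completed IO with an error */
		DRBD_FORCE_DETACH,	/* explicit request to give up on an unresponsive disk */
	};

	static void __drbd_chk_io_error(struct drbd_conf *mdev,
					enum drbd_force_detach_flags forcedetach)
	{
		/* A normal IO error follows the configured error handling
		 * policy and still waits for pending local IO to complete. */
		drbd_handle_io_error(mdev);		/* hypothetical helper */

		/* Only an explicit force-detach may abort IO still in flight
		 * on the lower level device: once the master bio has been
		 * completed to upper layers, its data pages may be re-used,
		 * so a late local completion would corrupt unrelated memory. */
		if (forcedetach == DRBD_FORCE_DETACH)
			drbd_abort_local_io(mdev);	/* hypothetical helper */
	}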
From 0029d62434d9045bc3e8b2eb48ae696e30336e92 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 14 Jun 2012 18:02:52 +0200
Subject: drbd: do not reset rs_pending_cnt too early

Fix asserts like
  block drbd0: in got_BlockAck:4634: rs_pending_cnt = -35 < 0 !

We reset the resync lru cache and related information (rs_pending_cnt)
once we successfully finished a resync or online verify, or if the
replication connection is lost.

We also need to reset it if a resync or online verify is aborted
because a lower level disk failed.

In that case the replication link is still established, and we may
still have packets queued in the network buffers which want to touch
rs_pending_cnt.

We do not have any synchronization mechanism to know for sure when
all such pending resync related packets have been drained.

To avoid this counter going negative (and violating the ASSERT that
it is always >= 0), just do not reset it when we lose a disk. It is
good enough to make sure it is re-initialized before the next resync
can start: reset it when we re-attach a disk.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'drivers/block/drbd/drbd_worker.c')

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index a35393f2fd1..6bce2cc179d 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1501,14 +1501,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 		return;
 	}
 
-	if (mdev->state.conn < C_AHEAD) {
-		/* In case a previous resync run was aborted by an IO error/detach on the peer. */
-		drbd_rs_cancel_all(mdev);
-		/* This should be done when we abort the resync. We definitely do not
-		   want to have this for connections going back and forth between
-		   Ahead/Behind and SyncSource/SyncTarget */
-	}
-
 	if (side == C_SYNC_TARGET) {
 		/* Since application IO was locked out during C_WF_BITMAP_T and
 		   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
-- 
cgit v1.2.3-18-g5258
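For context on the quoted assert: rs_pending_cnt counts resync
requests that have been sent to the peer but not yet acknowledged; it
is incremented when a request goes out and decremented when the
corresponding ack arrives (got_BlockAck() in the assert above). A
paraphrased sketch of the accounting pattern that emits that message
(the real macros live in drbd_int.h and may differ in detail):

	/* Paraphrased sketch; DEV is DRBD's device-logging shorthand,
	 * which produces the "block drbd0:" prefix seen above. */
	#define dec_rs_pending(mdev) do {					\
		atomic_dec(&(mdev)->rs_pending_cnt);				\
		if (atomic_read(&(mdev)->rs_pending_cnt) < 0)			\
			dev_err(DEV, "in %s:%d: rs_pending_cnt = %d < 0 !\n",	\
				__func__, __LINE__,				\
				atomic_read(&(mdev)->rs_pending_cnt));		\
	} while (0)

If the counter is reset while acks are still queued on the intact
replication link, each late ack decrements from zero and the counter
goes negative (hence the -35). Note that the hunk shown here is only
the drbd_worker.c part of the commit: it drops a drbd_rs_cancel_all()
call at resync start, while moving the reset from detach to the next
attach happens in files outside this filtered view.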