diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 104 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 146 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 90 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 357 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 48 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 95 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 132 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 19 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 31 |
10 files changed, 741 insertions, 283 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index cf0e63dd97d..e54e31b02b8 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -65,39 +65,80 @@ struct drbd_atodb_wait { int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); +void *drbd_md_get_buffer(struct drbd_conf *mdev) +{ + int r; + + wait_event(mdev->misc_wait, + (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 || + mdev->state.disk <= D_FAILED); + + return r ? NULL : page_address(mdev->md_io_page); +} + +void drbd_md_put_buffer(struct drbd_conf *mdev) +{ + if (atomic_dec_and_test(&mdev->md_io_in_use)) + wake_up(&mdev->misc_wait); +} + +static bool md_io_allowed(struct drbd_conf *mdev) +{ + enum drbd_disk_state ds = mdev->state.disk; + return ds >= D_NEGOTIATING || ds == D_ATTACHING; +} + +void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done) +{ + long dt = bdev->dc.disk_timeout * HZ / 10; + if (dt == 0) + dt = MAX_SCHEDULE_TIMEOUT; + + dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt); + if (dt == 0) + dev_err(DEV, "meta-data IO operation timed out\n"); +} + static int _drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, struct page *page, sector_t sector, int rw, int size) { struct bio *bio; - struct drbd_md_io md_io; int ok; - md_io.mdev = mdev; - init_completion(&md_io.event); - md_io.error = 0; + mdev->md_io.done = 0; + mdev->md_io.error = -ENODEV; if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) rw |= REQ_FUA | REQ_FLUSH; rw |= REQ_SYNC; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; bio->bi_sector = sector; ok = (bio_add_page(bio, page, size, 0) == size); if (!ok) goto out; - bio->bi_private = &md_io; + bio->bi_private = &mdev->md_io; bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); + ok = 0; + goto out; + } + + bio_get(bio); /* one bio_put() is in the completion handler */ + atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_for_completion(&md_io.event); - ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; + wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done); + ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; out: bio_put(bio); @@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int offset = 0; struct page *iop = mdev->md_io_page; - D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); + D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); BUG_ON(!bdev->md_bdev); @@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) return 1; } - mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = (struct al_transaction *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ + if (!buffer) { + dev_err(DEV, "disk failed while waiting for md_io buffer\n"); + complete(&((struct update_al_work *)w)->event); + put_ldev(mdev); + return 1; + } buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); buffer->tr_number = cpu_to_be32(mdev->al_tr_number); @@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); mdev->al_tr_number++; - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); complete(&((struct update_al_work *)w)->event); put_ldev(mdev); @@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) /* lock out all other meta data io for now, * and make sure the page is mapped. */ - mutex_lock(&mdev->md_io_mutex); - buffer = page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + return 0; /* Find the valid transaction in the log */ for (i = 0; i <= mx; i++) { @@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (rv == 0) continue; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } cnr = be32_to_cpu(buffer->tr_number); @@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!found_valid) { dev_warn(DEV, "No usable activity log found.\n"); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 1; } @@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = drbd_al_read_tr(mdev, bdev, buffer, i); ERR_IF(rv == 0) goto cancel; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } @@ -534,7 +581,7 @@ cancel: mdev->al_tr_pos = 0; /* ok, we are done with it */ - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", transactions, active_extents); @@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, else ext->rs_failed += count; if (ext->rs_left < ext->rs_failed) { - dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " - "rs_failed=%d count=%d\n", + dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d " + "rs_failed=%d count=%d cstate=%s\n", (unsigned long long)sector, ext->lce.lc_number, ext->rs_left, - ext->rs_failed, count); - dump_stack(); - - lc_put(mdev->resync, &ext->lce); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return; + ext->rs_failed, count, + drbd_conn_str(mdev->state.conn)); + + /* We don't expect to be able to clear more bits + * than have been set when we originally counted + * the set bits to cache that value in ext->rs_left. + * Whatever the reason (disconnect during resync, + * delayed local completion of an application write), + * try to fix it up by recounting here. */ + ext->rs_left = drbd_bm_e_weight(mdev, enr); } } else { /* Normally this element should be in the cache, @@ -1192,6 +1243,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev) put_ldev(mdev); } spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); return 0; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 3030201c69d..b5c5ff53cb5 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) static void bm_store_page_idx(struct page *page, unsigned long idx) { BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); - page_private(page) |= idx; + set_page_private(page, idx); } static unsigned long bm_page_to_idx(struct page *page) @@ -886,12 +886,21 @@ void drbd_bm_clear_all(struct drbd_conf *mdev) struct bm_aio_ctx { struct drbd_conf *mdev; atomic_t in_flight; - struct completion done; + unsigned int done; unsigned flags; #define BM_AIO_COPY_PAGES 1 int error; + struct kref kref; }; +static void bm_aio_ctx_destroy(struct kref *kref) +{ + struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); + + put_ldev(ctx->mdev); + kfree(ctx); +} + /* bv_page may be a copy, or may be the original */ static void bm_async_io_complete(struct bio *bio, int error) { @@ -930,20 +939,21 @@ static void bm_async_io_complete(struct bio *bio, int error) bm_page_unlock_io(mdev, idx); - /* FIXME give back to page pool */ if (ctx->flags & BM_AIO_COPY_PAGES) - put_page(bio->bi_io_vec[0].bv_page); + mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); bio_put(bio); - if (atomic_dec_and_test(&ctx->in_flight)) - complete(&ctx->done); + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&mdev->misc_wait); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + } } static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) { - /* we are process context. we always get a bio */ - struct bio *bio = bio_alloc(GFP_KERNEL, 1); + struct bio *bio = bio_alloc_drbd(GFP_NOIO); struct drbd_conf *mdev = ctx->mdev; struct drbd_bitmap *b = mdev->bitmap; struct page *page; @@ -966,10 +976,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - /* FIXME alloc_page is good enough for now, but actually needs - * to use pre-allocated page pool */ void *src, *dest; - page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT); + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); dest = kmap_atomic(page); src = kmap_atomic(b->bm_pages[page_nr]); memcpy(dest, src, PAGE_SIZE); @@ -981,6 +989,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bio->bi_bdev = mdev->ldev->md_bdev; bio->bi_sector = on_disk_sector; + /* bio_add_page of a single page to an empty bio will always succeed, + * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = bm_async_io_complete; @@ -999,14 +1009,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must /* * bm_rw: read/write the whole bitmap from/to its on disk location. */ -static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local) +static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) { - struct bm_aio_ctx ctx = { - .mdev = mdev, - .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), - .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0, - }; + struct bm_aio_ctx *ctx; struct drbd_bitmap *b = mdev->bitmap; int num_pages, i, count = 0; unsigned long now; @@ -1021,7 +1026,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id * For lazy writeout, we don't care for ongoing changes to the bitmap, * as we submit copies of pages anyways. */ - if (!ctx.flags) + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { + .mdev = mdev, + .in_flight = ATOMIC_INIT(1), + .done = 0, + .flags = flags, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); + kfree(ctx); + return -ENODEV; + } + + if (!ctx->flags) WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); num_pages = b->bm_number_of_pages; @@ -1046,29 +1071,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id continue; } } - atomic_inc(&ctx.in_flight); - bm_page_io_async(&ctx, i, rw); + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i, rw); ++count; cond_resched(); } /* - * We initialize ctx.in_flight to one to make sure bm_async_io_complete - * will not complete() early, and decrement / test it here. If there + * We initialize ctx->in_flight to one to make sure bm_async_io_complete + * will not set ctx->done early, and decrement / test it here. If there * are still some bios in flight, we need to wait for them here. + * If all IO is done already (or nothing had been submitted), there is + * no need to wait. Still, we need to put the kref associated with the + * "in_flight reached zero, all done" event. */ - if (!atomic_dec_and_test(&ctx.in_flight)) - wait_for_completion(&ctx.done); + if (!atomic_dec_and_test(&ctx->in_flight)) + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); + else + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", rw == WRITE ? "WRITE" : "READ", count, jiffies - now); - if (ctx.error) { + if (ctx->error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); drbd_chk_io_error(mdev, 1, true); - err = -EIO; /* ctx.error ? */ + err = -EIO; /* ctx->error ? */ } + if (atomic_read(&ctx->in_flight)) + err = -EIO; /* Disk failed during IO... */ + now = jiffies; if (rw == WRITE) { drbd_md_flush(mdev); @@ -1082,6 +1116,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); return err; } @@ -1091,7 +1126,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id */ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, READ, 0); + return bm_rw(mdev, READ, 0, 0); } /** @@ -1102,7 +1137,7 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, WRITE, 0); + return bm_rw(mdev, WRITE, 0, 0); } /** @@ -1112,7 +1147,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local) { - return bm_rw(mdev, WRITE, upper_idx); + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx); +} + +/** + * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + * + * Will only write pages that have changed since last IO. + * In contrast to drbd_bm_write(), this will copy the bitmap pages + * to temporary writeout pages. It is intended to trigger a full write-out + * while still allowing the bitmap to change, for example if a resync or online + * verify is aborted due to a failed peer disk, while local IO continues, or + * pending resync acks are still being processed. + */ +int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); } @@ -1130,28 +1181,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l */ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local) { - struct bm_aio_ctx ctx = { + struct bm_aio_ctx *ctx; + int err; + + if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { + dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); + return 0; + } + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { .mdev = mdev, .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), + .done = 0, .flags = BM_AIO_COPY_PAGES, + .error = 0, + .kref = { ATOMIC_INIT(2) }, }; - if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { - dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); - return 0; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n"); + kfree(ctx); + return -ENODEV; } - bm_page_io_async(&ctx, idx, WRITE_SYNC); - wait_for_completion(&ctx.done); + bm_page_io_async(ctx, idx, WRITE_SYNC); + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); - if (ctx.error) + if (ctx->error) drbd_chk_io_error(mdev, 1, true); /* that should force detach, so the in memory bitmap will be * gone in a moment as well. */ mdev->bm_writ_cnt++; - return ctx.error; + err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error; + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + return err; } /* NOTE diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8d680562ba7..02f013a073a 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -712,7 +712,6 @@ struct drbd_request { struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */ unsigned long rq_state; /* see comments above _req_mod() */ - int seq_num; unsigned long start_time; }; @@ -851,6 +850,7 @@ enum { NEW_CUR_UUID, /* Create new current UUID when thawing IO */ AL_SUSPENDED, /* Activity logging is currently suspended. */ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ + STATE_SENT, /* Do not change state/UUIDs while this is set */ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -862,31 +862,30 @@ enum bm_flag { BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ /* currently locked for bulk operation */ - BM_LOCKED_MASK = 0x7, + BM_LOCKED_MASK = 0xf, /* in detail, that is: */ BM_DONT_CLEAR = 0x1, BM_DONT_SET = 0x2, BM_DONT_TEST = 0x4, + /* so we can mark it locked for bulk operation, + * and still allow all non-bulk operations */ + BM_IS_LOCKED = 0x8, + /* (test bit, count bit) allowed (common case) */ - BM_LOCKED_TEST_ALLOWED = 0x3, + BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED, /* testing bits, as well as setting new bits allowed, but clearing bits * would be unexpected. Used during bitmap receive. Setting new bits * requires sending of "out-of-sync" information, though. */ - BM_LOCKED_SET_ALLOWED = 0x1, + BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED, - /* clear is not expected while bitmap is locked for bulk operation */ + /* for drbd_bm_write_copy_pages, everything is allowed, + * only concurrent bulk operations are locked out. */ + BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED, }; - -/* TODO sort members for performance - * MAYBE group them further */ - -/* THINK maybe we actually want to use the default "event/%s" worker threads - * or similar in linux 2.6, which uses per cpu data and threads. - */ struct drbd_work_queue { struct list_head q; struct semaphore s; /* producers up it, worker down()s it */ @@ -938,8 +937,7 @@ struct drbd_backing_dev { }; struct drbd_md_io { - struct drbd_conf *mdev; - struct completion event; + unsigned int done; int error; }; @@ -1022,6 +1020,7 @@ struct drbd_conf { struct drbd_tl_epoch *newest_tle; struct drbd_tl_epoch *oldest_tle; struct list_head out_of_sequence_requests; + struct list_head barrier_acked_requests; struct hlist_head *tl_hash; unsigned int tl_hash_s; @@ -1056,6 +1055,8 @@ struct drbd_conf { struct crypto_hash *csums_tfm; struct crypto_hash *verify_tfm; + unsigned long last_reattach_jif; + unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; struct drbd_thread asender; @@ -1094,7 +1095,8 @@ struct drbd_conf { wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ - struct mutex md_io_mutex; /* protects the md_io_buffer */ + struct drbd_md_io md_io; + atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ spinlock_t al_lock; wait_queue_head_t al_wait; struct lru_cache *act_log; /* activity log */ @@ -1228,8 +1230,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); -extern int _drbd_send_state(struct drbd_conf *mdev); -extern int drbd_send_state(struct drbd_conf *mdev); +extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); +extern int drbd_send_current_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags); @@ -1461,6 +1463,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr); extern size_t drbd_bm_words(struct drbd_conf *mdev); @@ -1493,11 +1496,38 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t *drbd_request_mempool; extern mempool_t *drbd_ee_mempool; -extern struct page *drbd_pp_pool; /* drbd's page pool */ +/* drbd's page pool, used to buffer data received from the peer, + * or data requested by the peer. + * + * This does not have an emergency reserve. + * + * When allocating from this pool, it first takes pages from the pool. + * Only if the pool is depleted will try to allocate from the system. + * + * The assumption is that pages taken from this pool will be processed, + * and given back, "quickly", and then can be recycled, so we can avoid + * frequent calls to alloc_page(), and still will be able to make progress even + * under memory pressure. + */ +extern struct page *drbd_pp_pool; extern spinlock_t drbd_pp_lock; extern int drbd_pp_vacant; extern wait_queue_head_t drbd_pp_wait; +/* We also need a standard (emergency-reserve backed) page pool + * for meta data IO (activity log, bitmap). + * We can keep it global, as long as it is used as "N pages at a time". + * 128 should be plenty, currently we probably can get away with as few as 1. + */ +#define DRBD_MIN_POOL_PAGES 128 +extern mempool_t *drbd_md_io_page_pool; + +/* We also need to make sure we get a bio + * when we need it for housekeeping purposes */ +extern struct bio_set *drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); + extern rwlock_t global_state_lock; extern struct drbd_conf *drbd_new_device(unsigned int minor); @@ -1536,8 +1566,12 @@ extern void resume_next_sg(struct drbd_conf *mdev); extern void suspend_other_sg(struct drbd_conf *mdev); extern int drbd_resync_finished(struct drbd_conf *mdev); /* maybe rather drbd_main.c ? */ +extern void *drbd_md_get_buffer(struct drbd_conf *mdev); +extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev, sector_t sector, int rw); + struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done); extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); extern void drbd_rs_controller_reset(struct drbd_conf *mdev); @@ -1754,19 +1788,6 @@ static inline struct page *page_chain_next(struct page *page) #define page_chain_for_each_safe(page, n) \ for (; page && ({ n = page_chain_next(page); 1; }); page = n) -static inline int drbd_bio_has_active_page(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - __bio_for_each_segment(bvec, bio, i, 0) { - if (page_count(bvec->bv_page) > 1) - return 1; - } - - return 0; -} - static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) { struct page *page = e->pages; @@ -1777,7 +1798,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) return 0; } - static inline void drbd_state_lock(struct drbd_conf *mdev) { wait_event(mdev->misc_wait, @@ -2230,7 +2250,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, * Note: currently we don't support such large bitmaps on 32bit * arch anyways, but no harm done to be prepared for it here. */ - unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10; + unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10; unsigned long left = *bits_left >> shift; unsigned long total = 1UL + (mdev->rs_total >> shift); unsigned long tmp = 1000UL - left * 1000UL/total; @@ -2306,12 +2326,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) case D_OUTDATED: case D_CONSISTENT: case D_UP_TO_DATE: + case D_FAILED: /* disk state is stable as well. */ break; /* no new io accepted during tansitional states */ case D_ATTACHING: - case D_FAILED: case D_NEGOTIATING: case D_UNKNOWN: case D_MASK: diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 211fc44f84b..920ede2829d 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -139,6 +139,8 @@ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; +mempool_t *drbd_md_io_page_pool; +struct bio_set *drbd_md_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. @@ -159,7 +161,24 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; -#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) +static void bio_destructor_drbd(struct bio *bio) +{ + bio_free(bio, drbd_md_io_bio_set); +} + +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + struct bio *bio; + + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; + bio->bi_destructor = bio_destructor_drbd; + return bio; +} #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will @@ -208,6 +227,7 @@ static int tl_init(struct drbd_conf *mdev) mdev->oldest_tle = b; mdev->newest_tle = b; INIT_LIST_HEAD(&mdev->out_of_sequence_requests); + INIT_LIST_HEAD(&mdev->barrier_acked_requests); mdev->tl_hash = NULL; mdev->tl_hash_s = 0; @@ -246,9 +266,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) new->n_writes = 0; newest_before = mdev->newest_tle; - /* never send a barrier number == 0, because that is special-cased - * when using TCQ for our write ordering code */ - new->br_number = (newest_before->br_number+1) ?: 1; + new->br_number = newest_before->br_number+1; if (mdev->newest_tle != new) { mdev->newest_tle->next = new; mdev->newest_tle = new; @@ -311,7 +329,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, These have been list_move'd to the out_of_sequence_requests list in _req_mod(, barrier_acked) above. */ - list_del_init(&b->requests); + list_splice_init(&b->requests, &mdev->barrier_acked_requests); nob = b->next; if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { @@ -411,6 +429,23 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) b = tmp; list_splice(&carry_reads, &b->requests); } + + /* Actions operating on the disk state, also want to work on + requests that got barrier acked. */ + switch (what) { + case fail_frozen_disk_io: + case restart_frozen_disk_io: + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + _req_mod(req, what); + } + + case connection_lost_while_pending: + case resend: + break; + default: + dev_err(DEV, "what = %d in _tl_restart()\n", what); + } } @@ -458,6 +493,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) } /** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL + * @mdev: DRBD device. + */ +void tl_abort_disk_io(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + struct list_head *le, *tle; + struct drbd_request *req; + + spin_lock_irq(&mdev->req_lock); + b = mdev->oldest_tle; + while (b) { + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + b = b->next; + } + + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + + spin_unlock_irq(&mdev->req_lock); +} + +/** * cl_wide_st_chg() - true if the state change is a cluster wide one * @mdev: DRBD device. * @os: old (current) state. @@ -470,7 +537,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev, ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || + (os.disk != D_FAILED && ns.disk == D_FAILED))) || (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); } @@ -509,8 +576,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, union drbd_state, union drbd_state); +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort); + union drbd_state ns, enum sanitize_state_warnings *warn); int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); @@ -785,6 +860,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) rv = SS_IN_TRANSIENT_STATE; + /* While establishing a connection only allow cstate to change. + Delay/refuse role changes, detach attach etc... */ + if (test_bit(STATE_SENT, &mdev->flags) && + !(os.conn == C_WF_REPORT_PARAMS || + (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) + rv = SS_IN_TRANSIENT_STATE; + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) rv = SS_NEED_CONNECTION; @@ -803,6 +885,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, return rv; } +static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + dev_warn(DEV, "%s\n", msg_table[warn]); +} + /** * sanitize_state() - Resolves implicitly necessary additional changes to a state transition * @mdev: DRBD device. @@ -814,11 +911,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, * to D_UNKNOWN. This rule and many more along those lines are in this function. */ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort) + union drbd_state ns, enum sanitize_state_warnings *warn) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + if (warn) + *warn = NO_WARNING; + fp = FP_DONT_CARE; if (get_ldev(mdev)) { fp = mdev->ldev->dc.fencing; @@ -833,18 +933,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. * If you try to go into some Sync* state, that shall fail (elsewhere). */ if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) ns.conn = os.conn; /* we cannot fail (again) if we already detached */ if (ns.disk == D_FAILED && os.disk == D_DISKLESS) ns.disk = D_DISKLESS; - /* if we are only D_ATTACHING yet, - * we can (and should) go directly to D_DISKLESS. */ - if (ns.disk == D_FAILED && os.disk == D_ATTACHING) - ns.disk = D_DISKLESS; - /* After C_DISCONNECTING only C_STANDALONE may follow */ if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) ns.conn = os.conn; @@ -863,10 +958,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* Abort resync if a disk fails/detaches */ if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn_sync_abort) - *warn_sync_abort = - os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? - "Online-verify" : "Resync"; + if (warn) + *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? + ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; ns.conn = C_CONNECTED; } @@ -877,7 +971,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = mdev->new_state_tmp.disk; ns.pdsk = mdev->new_state_tmp.pdsk; } else { - dev_alert(DEV, "Connection lost while negotiating, no data!\n"); + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; ns.disk = D_DISKLESS; ns.pdsk = D_UNKNOWN; } @@ -959,16 +1054,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = disk_max; if (ns.disk < disk_min) { - dev_warn(DEV, "Implicitly set disk from %s to %s\n", - drbd_disk_str(ns.disk), drbd_disk_str(disk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; ns.disk = disk_min; } if (ns.pdsk > pdsk_max) ns.pdsk = pdsk_max; if (ns.pdsk < pdsk_min) { - dev_warn(DEV, "Implicitly set pdsk from %s to %s\n", - drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; ns.pdsk = pdsk_min; } @@ -1045,12 +1140,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, { union drbd_state os; enum drbd_state_rv rv = SS_SUCCESS; - const char *warn_sync_abort = NULL; + enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; os = mdev->state; - ns = sanitize_state(mdev, os, ns, &warn_sync_abort); + ns = sanitize_state(mdev, os, ns, &ssw); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1076,8 +1171,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, return rv; } - if (warn_sync_abort) - dev_warn(DEV, "%s aborted.\n", warn_sync_abort); + print_sanitize_warnings(mdev, ssw); { char *pbp, pb[300]; @@ -1243,7 +1337,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, drbd_thread_stop_nowait(&mdev->receiver); /* Upon network failure, we need to restart the receiver. */ - if (os.conn > C_TEAR_DOWN && + if (os.conn > C_WF_CONNECTION && ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) drbd_thread_restart_nowait(&mdev->receiver); @@ -1251,6 +1345,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) drbd_resume_al(mdev); + /* remember last connect and attach times so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) + mdev->last_reconnect_jif = jiffies; + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + mdev->last_reattach_jif = jiffies; + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); if (ascw) { ascw->os = os; @@ -1354,12 +1457,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Here we have the actions that are performed after a state change. This function might sleep */ + if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) + mod_timer(&mdev->request_timer, jiffies + HZ); + nsm.i = -1; if (ns.susp_nod) { if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) what = resend; - if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) what = restart_frozen_disk_io; if (what != nothing) @@ -1408,7 +1515,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Do not change the order of the if above and the two below... */ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* No point in queuing send_bitmap if we don't have a connection * anymore, so check also the _current_ state, not only the new state @@ -1441,11 +1548,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(mdev); drbd_send_uuids(mdev); } - /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) /* We may still be Primary ourselves. @@ -1473,14 +1580,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { drbd_send_sizes(mdev, 0, 0); /* to start sync... */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* We want to pause/continue resync, tell peer. */ if (ns.conn >= C_CONNECTED && ((os.aftr_isp != ns.aftr_isp) || (os.user_isp != ns.user_isp))) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* In case one of the isp bits got set, suspend other devices. */ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && @@ -1490,10 +1597,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Make sure the peer gets informed about eventual state changes (ISP bits) while we were in WFReportParams. */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* We are in the progress to start a full sync... */ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || @@ -1513,33 +1620,38 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* first half of local IO error, failure to attach, * or administrative detach */ if (os.disk != D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh; - int was_io_error; + enum drbd_io_error_p eh = EP_PASS_ON; + int was_io_error = 0; /* corresponding get_ldev was in __drbd_set_state, to serialize - * our cleanup here with the transition to D_DISKLESS, - * so it is safe to dreference ldev here. */ - eh = mdev->ldev->dc.on_io_error; - was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); - - /* current state still has to be D_FAILED, - * there is only one way out: to D_DISKLESS, - * and that may only happen after our put_ldev below. */ - if (mdev->state.disk != D_FAILED) - dev_err(DEV, - "ASSERT FAILED: disk is %s during detach\n", - drbd_disk_str(mdev->state.disk)); - - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I am detaching my disk\n"); - else - dev_err(DEV, "Sending state for detaching disk failed\n"); - - drbd_rs_cancel_all(mdev); - - /* In case we want to get something to stable storage still, - * this may be the last chance. - * Following put_ldev may transition to D_DISKLESS. */ - drbd_md_sync(mdev); + * our cleanup here with the transition to D_DISKLESS. + * But is is still not save to dreference ldev here, since + * we might come from an failed Attach before ldev was set. */ + if (mdev->ldev) { + eh = mdev->ldev->dc.on_io_error; + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + + /* Immediately allow completion of all application IO, that waits + for completion from the local disk. */ + tl_abort_disk_io(mdev); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. */ + if (mdev->state.disk != D_FAILED) + dev_err(DEV, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(mdev->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + + drbd_rs_cancel_all(mdev); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(mdev); + } put_ldev(mdev); if (was_io_error && eh == EP_CALL_HELPER) @@ -1561,16 +1673,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, mdev->rs_failed = 0; atomic_set(&mdev->rs_pending_cnt, 0); - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I'm now diskless.\n"); + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + /* corresponding get_ldev in __drbd_set_state * this may finally trigger drbd_ldev_destroy. */ put_ldev(mdev); } /* Notify peer that I had a local IO error, and did not detached.. */ - if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) - drbd_send_state(mdev); + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); /* Disks got bigger while they were detached */ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && @@ -1588,7 +1701,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* sync target done with resync. Explicitly notify peer, even though * it should (at least for non-empty resyncs) already know itself. */ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); + + /* Wake up role changes, that were delayed because of connection establishing */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { + clear_bit(STATE_SENT, &mdev->flags); + wake_up(&mdev->state_wait); + } /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk @@ -1598,8 +1717,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, * No harm done if some bits change during this phase. */ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { - drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, - "write from resync_finished", BM_LOCKED_SET_ALLOWED); + drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); put_ldev(mdev); } @@ -2057,7 +2176,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) D_ASSERT(mdev->state.disk == D_UP_TO_DATE); - uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET; + uuid = mdev->ldev->md.uuid[UI_BITMAP]; + if (uuid && uuid != UUID_JUST_CREATED) + uuid = uuid + UUID_NEW_BM_OFFSET; + else + get_random_bytes(&uuid, sizeof(u64)); drbd_uuid_set(mdev, UI_BITMAP, uuid); drbd_print_uuids(mdev, "updated sync UUID"); drbd_md_sync(mdev); @@ -2089,6 +2212,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } + /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ + if (mdev->agreed_pro_version <= 94) + max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); @@ -2102,10 +2229,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl } /** - * drbd_send_state() - Sends the drbd state to the peer + * drbd_send_current_state() - Sends the drbd state to the peer * @mdev: DRBD device. */ -int drbd_send_state(struct drbd_conf *mdev) +int drbd_send_current_state(struct drbd_conf *mdev) { struct socket *sock; struct p_state p; @@ -2131,6 +2258,37 @@ int drbd_send_state(struct drbd_conf *mdev) return ok; } +/** + * drbd_send_state() - After a state change, sends the new state to the peer + * @mdev: DRBD device. + * @state: the state to send, not necessarily the current state. + * + * Each state change queues an "after_state_ch" work, which will eventually + * send the resulting new state to the peer. If more state changes happen + * between queuing and processing of the after_state_ch work, we still + * want to send each intermediary state in the order it occurred. + */ +int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) +{ + struct socket *sock; + struct p_state p; + int ok = 0; + + mutex_lock(&mdev->data.mutex); + + p.state = cpu_to_be32(state.i); + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, P_STATE, + (struct p_header80 *)&p, sizeof(p), 0); + } + + mutex_unlock(&mdev->data.mutex); + + return ok; +} + int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) { @@ -2615,7 +2773,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_no_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) @@ -2629,7 +2787,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) @@ -2695,8 +2853,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) p.sector = cpu_to_be64(req->sector); p.block_id = (unsigned long)req; - p.seq_num = cpu_to_be32(req->seq_num = - atomic_add_return(1, &mdev->packet_seq)); + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); @@ -2987,8 +3144,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->rs_sect_in, 0); atomic_set(&mdev->rs_sect_ev, 0); atomic_set(&mdev->ap_in_flight, 0); + atomic_set(&mdev->md_io_in_use, 0); - mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); mutex_init(&mdev->meta.mutex); sema_init(&mdev->data.work.s, 0); @@ -3126,6 +3283,10 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + if (drbd_md_io_bio_set) + bioset_free(drbd_md_io_bio_set); + if (drbd_md_io_page_pool) + mempool_destroy(drbd_md_io_page_pool); if (drbd_ee_mempool) mempool_destroy(drbd_ee_mempool); if (drbd_request_mempool) @@ -3139,6 +3300,8 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); + drbd_md_io_bio_set = NULL; + drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; drbd_request_mempool = NULL; drbd_ee_cache = NULL; @@ -3162,6 +3325,8 @@ static int drbd_create_mempools(void) drbd_bm_ext_cache = NULL; drbd_al_ext_cache = NULL; drbd_pp_pool = NULL; + drbd_md_io_page_pool = NULL; + drbd_md_io_bio_set = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -3185,6 +3350,16 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ +#ifdef COMPAT_HAVE_BIOSET_CREATE + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_bio_set == NULL) + goto Enomem; +#endif + + drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_page_pool == NULL) + goto Enomem; + drbd_request_mempool = mempool_create(number, mempool_alloc_slab, mempool_free_slab, drbd_request_cache); if (drbd_request_mempool == NULL) @@ -3262,6 +3437,8 @@ static void drbd_delete_device(unsigned int minor) if (!mdev) return; + del_timer_sync(&mdev->request_timer); + /* paranoia asserts */ if (mdev->open_cnt != 0) dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, @@ -3666,8 +3843,10 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!get_ldev_if_state(mdev, D_FAILED)) return; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; + memset(buffer, 0, 512); buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); @@ -3698,7 +3877,8 @@ void drbd_md_sync(struct drbd_conf *mdev) * since we updated it on metadata. */ mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); +out: put_ldev(mdev); } @@ -3718,8 +3898,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!get_ldev_if_state(mdev, D_ATTACHING)) return ERR_IO_MD_DISK; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3780,7 +3961,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) mdev->sync_conf.al_extents = 127; err: - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); + out: put_ldev(mdev); return rv; @@ -4183,12 +4365,11 @@ const char *drbd_buildtag(void) static char buildtag[38] = "\0uilt-in"; if (buildtag[0] == 0) { -#ifdef CONFIG_MODULES - if (THIS_MODULE != NULL) - sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); - else +#ifdef MODULE + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); +#else + buildtag[0] = 'b'; #endif - buildtag[0] = 'b'; } return buildtag; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index abfaacaaf34..867bf1d8298 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -289,7 +289,7 @@ static int _try_outdate_peer_async(void *data) */ spin_lock_irq(&mdev->req_lock); ns = mdev->state; - if (ns.conn < C_WF_REPORT_PARAMS) { + if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) { ns.pdsk = nps; _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); } @@ -432,7 +432,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) /* if this was forced, we should consider sync */ if (forced) drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } drbd_md_sync(mdev); @@ -845,9 +845,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { - if (mdev->agreed_pro_version < 94) - peer = mdev->peer_max_bio_size; - else if (mdev->agreed_pro_version == 94) + if (mdev->agreed_pro_version < 94) { + peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ + } else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; else /* drbd 8.3.8 onwards */ peer = DRBD_MAX_BIO_SIZE; @@ -1032,7 +1033,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) nbc->dc.disk_size); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1046,7 +1047,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { - retcode = ERR_MD_DISK_TO_SMALL; + retcode = ERR_MD_DISK_TOO_SMALL; dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); @@ -1057,7 +1058,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * (we may currently be R_PRIMARY with no local disk...) */ if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1138,7 +1139,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { dev_warn(DEV, "refusing to truncate a consistent device\n"); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto force_diskless_dec; } @@ -1336,17 +1337,34 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, { enum drbd_ret_code retcode; int ret; + struct detach dt = {}; + + if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { + reply->ret_code = ERR_MANDATORY_TAG; + goto out; + } + + if (dt.detach_force) { + drbd_force_state(mdev, NS(disk, D_FAILED)); + reply->ret_code = SS_SUCCESS; + goto out; + } + drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ + drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */ retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); + drbd_md_put_buffer(mdev); /* D_FAILED will transition to DISKLESS. */ ret = wait_event_interruptible(mdev->misc_wait, mdev->state.disk != D_FAILED); drbd_resume_io(mdev); + if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) retcode = ERR_INTR; reply->ret_code = retcode; +out: return 0; } @@ -1711,7 +1729,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, if (rs.no_resync && mdev->agreed_pro_version < 93) { retcode = ERR_NEED_APV_93; - goto fail; + goto fail_ldev; } if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) @@ -1738,6 +1756,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, fail: reply->ret_code = retcode; return 0; + + fail_ldev: + put_ldev(mdev); + goto fail; } static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, @@ -1941,6 +1963,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -1959,6 +1982,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -1980,6 +2004,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); @@ -1998,6 +2023,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re } else retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -2170,11 +2196,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, /* If there is still bitmap IO pending, e.g. previous resync or verify * just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); /* w_make_ov_request expects position to be aligned */ mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + drbd_resume_io(mdev); return 0; } diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2959cdfb77f..869bada2ed0 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) if (unlikely(v >= 1000000)) { /* cool: > GiByte/s */ seq_printf(seq, "%ld,", v / 1000000); - v /= 1000000; + v %= 1000000; seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); } else if (likely(v >= 1000)) seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 43beaca5317..1d088c47815 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -466,6 +466,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what, goto out; } (*newsock)->ops = sock->ops; + __module_get((*newsock)->ops->owner); out: return err; @@ -750,6 +751,7 @@ static int drbd_connect(struct drbd_conf *mdev) { struct socket *s, *sock, *msock; int try, h, ok; + enum drbd_state_rv rv; D_ASSERT(!mdev->data.socket); @@ -888,25 +890,32 @@ retry: } } - if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) - return 0; - sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; atomic_set(&mdev->packet_seq, 0); mdev->peer_seq = 0; - drbd_thread_start(&mdev->asender); - if (drbd_send_protocol(mdev) == -1) return -1; + set_bit(STATE_SENT, &mdev->flags); drbd_send_sync_param(mdev, &mdev->sync_conf); drbd_send_sizes(mdev, 0, 0); drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); + + spin_lock_irq(&mdev->req_lock); + rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); + if (mdev->state.conn != C_WF_REPORT_PARAMS) + clear_bit(STATE_SENT, &mdev->flags); + spin_unlock_irq(&mdev->req_lock); + + if (rv < SS_SUCCESS) + return 0; + + drbd_thread_start(&mdev->asender); mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ return 1; @@ -957,7 +966,7 @@ static void drbd_flush(struct drbd_conf *mdev) rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, NULL); if (rv) { - dev_err(DEV, "local disk flush failed with status %d\n", rv); + dev_info(DEV, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. * don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ @@ -1001,13 +1010,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, if (epoch_size != 0 && atomic_read(&epoch->active) == 0 && - test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) { + (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { if (!(ev & EV_CLEANUP)) { spin_unlock(&mdev->epoch_lock); drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); spin_lock(&mdev->epoch_lock); } - dec_unacked(mdev); + if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) + dec_unacked(mdev); if (mdev->current_epoch != epoch) { next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); @@ -1096,7 +1106,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the - * request in more than one bio. */ + * request in more than one bio. + * + * Plain bio_alloc is good enough here, this is no DRBD internally + * generated bio, but a bio allocated on behalf of the peer. + */ next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { @@ -1583,6 +1597,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u return ok; } +static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) +{ + + struct drbd_epoch_entry *rs_e; + bool rv = 0; + + spin_lock_irq(&mdev->req_lock); + list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { + if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { + rv = 1; + break; + } + } + spin_unlock_irq(&mdev->req_lock); + + return rv; +} + /* Called from receive_Data. * Synchronize packets on sock with packets on msock. * @@ -1826,6 +1858,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned list_add(&e->w.list, &mdev->active_ee); spin_unlock_irq(&mdev->req_lock); + if (mdev->state.conn == C_SYNC_TARGET) + wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); + switch (mdev->net_conf->wire_protocol) { case DRBD_PROT_C: inc_unacked(mdev); @@ -2420,7 +2455,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; - dev_info(DEV, "Did not got last syncUUID packet, corrected:\n"); + dev_info(DEV, "Lost last syncUUID packet, corrected:\n"); drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); return -1; @@ -2806,10 +2841,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi if (apv >= 88) { if (apv == 88) { - if (data_size > SHARED_SECRET_MAX) { - dev_err(DEV, "verify-alg too long, " - "peer wants %u, accepting only %u byte\n", - data_size, SHARED_SECRET_MAX); + if (data_size > SHARED_SECRET_MAX || data_size == 0) { + dev_err(DEV, "verify-alg of wrong size, " + "peer wants %u, accepting only up to %u byte\n", + data_size, SHARED_SECRET_MAX); return false; } @@ -3168,9 +3203,20 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); - /* peer says his disk is uptodate, while we think it is inconsistent, - * and this happens while we think we have a sync going on. */ - if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE && + /* If some other part of the code (asender thread, timeout) + * already decided to close the connection again, + * we must not "re-establish" it here. */ + if (os.conn <= C_TEAR_DOWN) + return false; + + /* If this is the "end of sync" confirmation, usually the peer disk + * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits + * set) resync started in PausedSyncT, or if the timing of pause-/ + * unpause-sync events has been "just right", the peer disk may + * transition from D_CONSISTENT to D_UP_TO_DATE as well. + */ + if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && + real_peer_disk == D_UP_TO_DATE && os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { /* If we are (becoming) SyncSource, but peer is still in sync * preparation, ignore its uptodate-ness to avoid flapping, it @@ -3288,7 +3334,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned /* Nowadays only used when forcing a node into primary role and setting its disk to UpToDate with that */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } } @@ -3776,6 +3822,13 @@ static void drbd_disconnect(struct drbd_conf *mdev) if (mdev->state.conn == C_STANDALONE) return; + /* We are about to start the cleanup after connection loss. + * Make sure drbd_make_request knows about that. + * Usually we should be in some network failure state already, + * but just in case we are not, we fix it up here. + */ + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + /* asender does not clean up anything. it must not interfere, either */ drbd_thread_stop(&mdev->asender); drbd_free_sock(mdev); @@ -3803,8 +3856,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) atomic_set(&mdev->rs_pending_cnt, 0); wake_up(&mdev->misc_wait); - del_timer(&mdev->request_timer); - /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); resync_timer_fn((unsigned long)mdev); @@ -4433,7 +4484,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) if (mdev->state.conn == C_AHEAD && atomic_read(&mdev->ap_in_flight) == 0 && - !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) { + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { mdev->start_resync_timer.expires = jiffies + HZ; add_timer(&mdev->start_resync_timer); } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4a0f314086e..9c5c84946b0 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req const int rw = bio_data_dir(bio); int cpu; cpu = part_stat_lock(); + part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); part_inc_in_flight(&mdev->vdisk->part0, rw); @@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) { const unsigned long s = req->rq_state; struct drbd_conf *mdev = req->mdev; - /* only WRITES may end up here without a master bio (on barrier ack) */ - int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; + int rw = req->rq_state & RQ_WRITE ? WRITE : READ; /* we must not complete the master bio, while it is * still being processed by _drbd_send_zc_bio (drbd_send_dblock) @@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) return; if (s & RQ_NET_PENDING) return; - if (s & RQ_LOCAL_PENDING) + if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) return; if (req->master_bio) { @@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) req->master_bio = NULL; } + if (s & RQ_LOCAL_PENDING) + return; + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { /* this is disconnected (local only) operation, * or protocol C P_WRITE_ACK, @@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case completed_ok: - if (bio_data_dir(req->master_bio) == WRITE) + if (req->rq_state & RQ_WRITE) mdev->writ_cnt += req->size>>9; else mdev->read_cnt += req->size>>9; @@ -438,7 +441,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); + break; + + case abort_disk_io: + req->rq_state |= RQ_LOCAL_ABORTED; + if (req->rq_state & RQ_WRITE) + _req_may_be_done_not_susp(req, m); + else + goto goto_queue_for_net_read; break; case write_completed_with_error: @@ -447,7 +457,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, __drbd_chk_io_error(mdev, false); _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_ahead_completed_with_error: @@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_completed_with_error: @@ -467,7 +475,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(!(req->rq_state & RQ_NET_MASK)); __drbd_chk_io_error(mdev, false); - put_ldev(mdev); + + goto_queue_for_net_read: /* no point in retrying if there is no good remote data, * or we have no connection. */ @@ -556,10 +565,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, drbd_queue_work(&mdev->data.work, &req->w); break; - case oos_handed_to_network: - /* actually the same */ + case read_retry_remote_canceled: case send_canceled: - /* treat it the same */ case send_failed: /* real cleanup will be done from tl_clear. just update flags * so it is no longer marked as on the worker queue */ @@ -589,17 +596,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, } req->rq_state &= ~RQ_NET_QUEUED; req->rq_state |= RQ_NET_SENT; - /* because _drbd_send_zc_bio could sleep, and may want to - * dereference the bio even after the "write_acked_by_peer" and - * "completed_ok" events came in, once we return from - * _drbd_send_zc_bio (drbd_send_dblock), we have to check - * whether it is done already, and end it. */ _req_may_be_done_not_susp(req, m); break; - case read_retry_remote_canceled: + case oos_handed_to_network: + /* Was not set PENDING, no longer QUEUED, so is now DONE + * as far as this connection is concerned. */ req->rq_state &= ~RQ_NET_QUEUED; - /* fall through, in case we raced with drbd_disconnect */ + req->rq_state |= RQ_NET_DONE; + _req_may_be_done_not_susp(req, m); + break; + case connection_lost_while_pending: /* transfer log cleanup after connection loss */ /* assert something? */ @@ -616,8 +623,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done(req, m); /* Allowed while state.susp */ break; - case write_acked_by_peer_and_sis: - req->rq_state |= RQ_NET_SIS; case conflict_discarded_by_peer: /* for discarded conflicting writes of multiple primaries, * there is no need to keep anything in the tl, potential @@ -628,18 +633,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, (unsigned long long)req->sector, req->size); req->rq_state |= RQ_NET_DONE; /* fall through */ + case write_acked_by_peer_and_sis: case write_acked_by_peer: + if (what == write_acked_by_peer_and_sis) + req->rq_state |= RQ_NET_SIS; /* protocol C; successfully written on peer. - * Nothing to do here. + * Nothing more to do here. * We want to keep the tl in place for all protocols, to cater - * for volatile write-back caches on lower level devices. - * - * A barrier request is expected to have forced all prior - * requests onto stable storage, so completion of a barrier - * request could set NET_DONE right here, and not wait for the - * P_BARRIER_ACK, but that is an unnecessary optimization. */ + * for volatile write-back caches on lower level devices. */ - /* this makes it effectively the same as for: */ case recv_acked_by_peer: /* protocol B; pretends to be successfully written on peer. * see also notes above in handed_over_to_network about @@ -773,6 +775,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns int local, remote, send_oos = 0; int err = -EIO; int ret = 0; + union drbd_state s; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -834,8 +837,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns drbd_al_begin_io(mdev, sector); } - remote = remote && drbd_should_do_remote(mdev->state); - send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); + s = mdev->state; + remote = remote && drbd_should_do_remote(s); + send_oos = rw == WRITE && drbd_should_send_oos(s); D_ASSERT(!(remote && send_oos)); if (!(local || remote) && !is_susp(mdev->state)) { @@ -867,7 +871,7 @@ allocate_barrier: if (is_susp(mdev->state)) { /* If we got suspended, use the retry mechanism of - generic_make_request() to restart processing of this + drbd_make_request() to restart processing of this bio. In the next call to drbd_make_request we sleep in inc_ap_bio() */ ret = 1; @@ -1091,7 +1095,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) */ D_ASSERT(bio->bi_size > 0); D_ASSERT((bio->bi_size & 0x1ff) == 0); - D_ASSERT(bio->bi_idx == 0); /* to make some things easier, force alignment of requests within the * granularity of our hash tables */ @@ -1099,8 +1102,9 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; if (likely(s_enr == e_enr)) { - inc_ap_bio(mdev, 1); - drbd_make_request_common(mdev, bio, start_time); + do { + inc_ap_bio(mdev, 1); + } while (drbd_make_request_common(mdev, bio, start_time)); return; } @@ -1196,36 +1200,66 @@ void request_timer_fn(unsigned long data) struct drbd_conf *mdev = (struct drbd_conf *) data; struct drbd_request *req; /* oldest request */ struct list_head *le; - unsigned long et = 0; /* effective timeout = ko_count * timeout */ + unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ + unsigned long now; if (get_net_conf(mdev)) { - et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count; + if (mdev->state.conn >= C_WF_REPORT_PARAMS) + ent = mdev->net_conf->timeout*HZ/10 + * mdev->net_conf->ko_count; put_net_conf(mdev); } - if (!et || mdev->state.conn < C_WF_REPORT_PARAMS) + if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ + dt = mdev->ldev->dc.disk_timeout * HZ / 10; + put_ldev(mdev); + } + et = min_not_zero(dt, ent); + + if (!et) return; /* Recurring timer stopped */ + now = jiffies; + spin_lock_irq(&mdev->req_lock); le = &mdev->oldest_tle->requests; if (list_empty(le)) { spin_unlock_irq(&mdev->req_lock); - mod_timer(&mdev->request_timer, jiffies + et); + mod_timer(&mdev->request_timer, now + et); return; } le = le->prev; req = list_entry(le, struct drbd_request, tl_requests); - if (time_is_before_eq_jiffies(req->start_time + et)) { - if (req->rq_state & RQ_NET_PENDING) { - dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL); - } else { - dev_warn(DEV, "Local backing block device frozen?\n"); - mod_timer(&mdev->request_timer, jiffies + et); - } - } else { - mod_timer(&mdev->request_timer, req->start_time + et); - } + /* The request is considered timed out, if + * - we have some effective timeout from the configuration, + * with above state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ + if (ent && req->rq_state & RQ_NET_PENDING && + time_after(now, req->start_time + ent) && + !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { + dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); + _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); + } + if (dt && req->rq_state & RQ_LOCAL_PENDING && + time_after(now, req->start_time + dt) && + !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { + dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(mdev, 1); + } + nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; spin_unlock_irq(&mdev->req_lock); + mod_timer(&mdev->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 68a234a5fdc..3d211191948 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -105,6 +105,7 @@ enum drbd_req_event { read_completed_with_error, read_ahead_completed_with_error, write_completed_with_error, + abort_disk_io, completed_ok, resend, fail_frozen_disk_io, @@ -118,18 +119,21 @@ enum drbd_req_event { * same time, so we should hold the request lock anyways. */ enum drbd_req_state_bits { - /* 210 - * 000: no local possible - * 001: to be submitted + /* 3210 + * 0000: no local possible + * 0001: to be submitted * UNUSED, we could map: 011: submitted, completion still pending - * 110: completed ok - * 010: completed with error + * 0110: completed ok + * 0010: completed with error + * 1001: Aborted (before completion) + * 1x10: Aborted and completed -> free */ __RQ_LOCAL_PENDING, __RQ_LOCAL_COMPLETED, __RQ_LOCAL_OK, + __RQ_LOCAL_ABORTED, - /* 76543 + /* 87654 * 00000: no network possible * 00001: to be send * 00011: to be send, on worker queue @@ -199,8 +203,9 @@ enum drbd_req_state_bits { #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) +#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) -#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ +#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 4d3e6f6213b..620c70ff223 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -70,11 +70,29 @@ rwlock_t global_state_lock; void drbd_md_io_complete(struct bio *bio, int error) { struct drbd_md_io *md_io; + struct drbd_conf *mdev; md_io = (struct drbd_md_io *)bio->bi_private; + mdev = container_of(md_io, struct drbd_conf, md_io); + md_io->error = error; - complete(&md_io->event); + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able + * to timeout on the lower level device, and eventually detach from it. + * If this io completion runs after that timeout expired, this + * drbd_md_put_buffer() may allow us to finally try and re-attach. + * During normal operation, this only puts that extra reference + * down to 1 again. + * Make sure we first drop the reference, and only then signal + * completion, or we may (in drbd_al_read_log()) cycle so fast into the + * next drbd_md_sync_page_io(), that we trigger the + * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there. + */ + drbd_md_put_buffer(mdev); + md_io->done = 1; + wake_up(&mdev->misc_wait); + bio_put(bio); + put_ldev(mdev); } /* reads on behalf of the partner, @@ -226,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error) spin_lock_irqsave(&mdev->req_lock, flags); __req_mod(req, what, &m); spin_unlock_irqrestore(&mdev->req_lock, flags); + put_ldev(mdev); if (m.bio) complete_master_bio(mdev, &m); @@ -290,7 +309,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * sg_init_table(&sg, 1); crypto_hash_init(&desc); - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); crypto_hash_update(&desc, &sg, sg.length); } @@ -728,7 +747,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } drbd_start_resync(mdev, C_SYNC_SOURCE); - clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags); + clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); return 1; } @@ -1519,14 +1538,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) } drbd_state_lock(mdev); - + write_lock_irq(&global_state_lock); if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { + write_unlock_irq(&global_state_lock); drbd_state_unlock(mdev); return; } - write_lock_irq(&global_state_lock); - ns = mdev->state; + ns.i = mdev->state.i; ns.aftr_isp = !_drbd_may_sync_now(mdev); |