diff options
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 41 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 216 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 593 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 270 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 34 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 946 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 165 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 62 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 292 |
10 files changed, 1733 insertions, 888 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 9400845d602..ac04ef97eac 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -965,29 +965,30 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, * ok, (capacity & 7) != 0 sometimes, but who cares... * we count rs_{total,left} in bits, not sectors. */ - spin_lock_irqsave(&mdev->al_lock, flags); count = drbd_bm_clear_bits(mdev, sbnr, ebnr); - if (count) { - /* we need the lock for drbd_try_clear_on_disk_bm */ - if (jiffies - mdev->rs_mark_time > HZ*10) { - /* should be rolling marks, - * but we estimate only anyways. */ - if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && + if (count && get_ldev(mdev)) { + unsigned long now = jiffies; + unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark]; + int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS; + if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { + unsigned long tw = drbd_bm_total_weight(mdev); + if (mdev->rs_mark_left[mdev->rs_last_mark] != tw && mdev->state.conn != C_PAUSED_SYNC_T && mdev->state.conn != C_PAUSED_SYNC_S) { - mdev->rs_mark_time = jiffies; - mdev->rs_mark_left = drbd_bm_total_weight(mdev); + mdev->rs_mark_time[next] = now; + mdev->rs_mark_left[next] = tw; + mdev->rs_last_mark = next; } } - if (get_ldev(mdev)) { - drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); - put_ldev(mdev); - } + spin_lock_irqsave(&mdev->al_lock, flags); + drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); + spin_unlock_irqrestore(&mdev->al_lock, flags); + /* just wake_up unconditional now, various lc_chaged(), * lc_put() in drbd_try_clear_on_disk_bm(). */ wake_up = 1; + put_ldev(mdev); } - spin_unlock_irqrestore(&mdev->al_lock, flags); if (wake_up) wake_up(&mdev->al_wait); } @@ -1118,7 +1119,7 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) * @mdev: DRBD device. * @sector: The sector number. * - * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted. + * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. */ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) { @@ -1129,10 +1130,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) sig = wait_event_interruptible(mdev->al_wait, (bm_ext = _bme_get(mdev, enr))); if (sig) - return 0; + return -EINTR; if (test_bit(BME_LOCKED, &bm_ext->flags)) - return 1; + return 0; for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { sig = wait_event_interruptible(mdev->al_wait, @@ -1145,13 +1146,11 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) wake_up(&mdev->al_wait); } spin_unlock_irq(&mdev->al_lock); - return 0; + return -EINTR; } } - set_bit(BME_LOCKED, &bm_ext->flags); - - return 1; + return 0; } /** diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index e3f88d6e141..fd42832f785 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -569,7 +569,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) * * maybe bm_set should be atomic_t ? */ -static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) +unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; unsigned long s; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 352441b0f92..c07c370c4c8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -337,13 +337,25 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) * NOTE that the payload starts at a long aligned offset, * regardless of 32 or 64 bit arch! */ -struct p_header { +struct p_header80 { u32 magic; u16 command; u16 length; /* bytes of data after this header */ u8 payload[0]; } __packed; -/* 8 bytes. packet FIXED for the next century! */ + +/* Header for big packets, Used for data packets exceeding 64kB */ +struct p_header95 { + u16 magic; /* use DRBD_MAGIC_BIG here */ + u16 command; + u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */ + u8 payload[0]; +} __packed; + +union p_header { + struct p_header80 h80; + struct p_header95 h95; +}; /* * short commands, packets without payload, plain p_header: @@ -362,12 +374,16 @@ struct p_header { */ /* these defines must not be changed without changing the protocol version */ -#define DP_HARDBARRIER 1 -#define DP_RW_SYNC 2 +#define DP_HARDBARRIER 1 /* depricated */ +#define DP_RW_SYNC 2 /* equals REQ_SYNC */ #define DP_MAY_SET_IN_SYNC 4 +#define DP_UNPLUG 8 /* equals REQ_UNPLUG */ +#define DP_FUA 16 /* equals REQ_FUA */ +#define DP_FLUSH 32 /* equals REQ_FLUSH */ +#define DP_DISCARD 64 /* equals REQ_DISCARD */ struct p_data { - struct p_header head; + union p_header head; u64 sector; /* 64 bits sector number */ u64 block_id; /* to identify the request in protocol B&C */ u32 seq_num; @@ -383,7 +399,7 @@ struct p_data { * P_DATA_REQUEST, P_RS_DATA_REQUEST */ struct p_block_ack { - struct p_header head; + struct p_header80 head; u64 sector; u64 block_id; u32 blksize; @@ -392,7 +408,7 @@ struct p_block_ack { struct p_block_req { - struct p_header head; + struct p_header80 head; u64 sector; u64 block_id; u32 blksize; @@ -409,7 +425,7 @@ struct p_block_req { */ struct p_handshake { - struct p_header head; /* 8 bytes */ + struct p_header80 head; /* 8 bytes */ u32 protocol_min; u32 feature_flags; u32 protocol_max; @@ -424,19 +440,19 @@ struct p_handshake { /* 80 bytes, FIXED for the next century */ struct p_barrier { - struct p_header head; + struct p_header80 head; u32 barrier; /* barrier number _handle_ only */ u32 pad; /* to multiple of 8 Byte */ } __packed; struct p_barrier_ack { - struct p_header head; + struct p_header80 head; u32 barrier; u32 set_size; } __packed; struct p_rs_param { - struct p_header head; + struct p_header80 head; u32 rate; /* Since protocol version 88 and higher. */ @@ -444,20 +460,31 @@ struct p_rs_param { } __packed; struct p_rs_param_89 { - struct p_header head; + struct p_header80 head; u32 rate; /* protocol version 89: */ char verify_alg[SHARED_SECRET_MAX]; char csums_alg[SHARED_SECRET_MAX]; } __packed; +struct p_rs_param_95 { + struct p_header80 head; + u32 rate; + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; + u32 c_plan_ahead; + u32 c_delay_target; + u32 c_fill_target; + u32 c_max_rate; +} __packed; + enum drbd_conn_flags { CF_WANT_LOSE = 1, CF_DRY_RUN = 2, }; struct p_protocol { - struct p_header head; + struct p_header80 head; u32 protocol; u32 after_sb_0p; u32 after_sb_1p; @@ -471,17 +498,17 @@ struct p_protocol { } __packed; struct p_uuids { - struct p_header head; + struct p_header80 head; u64 uuid[UI_EXTENDED_SIZE]; } __packed; struct p_rs_uuid { - struct p_header head; + struct p_header80 head; u64 uuid; } __packed; struct p_sizes { - struct p_header head; + struct p_header80 head; u64 d_size; /* size of disk */ u64 u_size; /* user requested size */ u64 c_size; /* current exported size */ @@ -491,18 +518,18 @@ struct p_sizes { } __packed; struct p_state { - struct p_header head; + struct p_header80 head; u32 state; } __packed; struct p_req_state { - struct p_header head; + struct p_header80 head; u32 mask; u32 val; } __packed; struct p_req_state_reply { - struct p_header head; + struct p_header80 head; u32 retcode; } __packed; @@ -517,7 +544,7 @@ struct p_drbd06_param { } __packed; struct p_discard { - struct p_header head; + struct p_header80 head; u64 block_id; u32 seq_num; u32 pad; @@ -533,7 +560,7 @@ enum drbd_bitmap_code { }; struct p_compressed_bm { - struct p_header head; + struct p_header80 head; /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code * (encoding & 0x80): polarity (set/unset) of first runlength * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits @@ -544,10 +571,10 @@ struct p_compressed_bm { u8 code[0]; } __packed; -struct p_delay_probe { - struct p_header head; - u32 seq_num; /* sequence number to match the two probe packets */ - u32 offset; /* usecs the probe got sent after the reference time point */ +struct p_delay_probe93 { + struct p_header80 head; + u32 seq_num; /* sequence number to match the two probe packets */ + u32 offset; /* usecs the probe got sent after the reference time point */ } __packed; /* DCBP: Drbd Compressed Bitmap Packet ... */ @@ -594,7 +621,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n) * so we need to use the fixed size 4KiB page size * most architechtures have used for a long time. */ -#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) +#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80)) #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) #if (PAGE_SIZE < 4096) @@ -603,13 +630,14 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n) #endif union p_polymorph { - struct p_header header; + union p_header header; struct p_handshake handshake; struct p_data data; struct p_block_ack block_ack; struct p_barrier barrier; struct p_barrier_ack barrier_ack; struct p_rs_param_89 rs_param_89; + struct p_rs_param_95 rs_param_95; struct p_protocol protocol; struct p_sizes sizes; struct p_uuids uuids; @@ -617,6 +645,8 @@ union p_polymorph { struct p_req_state req_state; struct p_req_state_reply req_state_reply; struct p_block_req block_req; + struct p_delay_probe93 delay_probe93; + struct p_rs_uuid rs_uuid; } __packed; /**********************************************************************/ @@ -697,7 +727,7 @@ struct drbd_tl_epoch { struct list_head requests; /* requests before */ struct drbd_tl_epoch *next; /* pointer to the next barrier */ unsigned int br_number; /* the barriers identifier. */ - int n_req; /* number of requests attached before this barrier */ + int n_writes; /* number of requests attached before this barrier */ }; struct drbd_request; @@ -747,7 +777,7 @@ struct digest_info { struct drbd_epoch_entry { struct drbd_work w; struct hlist_node colision; - struct drbd_epoch *epoch; + struct drbd_epoch *epoch; /* for writes */ struct drbd_conf *mdev; struct page *pages; atomic_t pending_bios; @@ -755,7 +785,10 @@ struct drbd_epoch_entry { /* see comments on ee flag bits below */ unsigned long flags; sector_t sector; - u64 block_id; + union { + u64 block_id; + struct digest_info *digest; + }; }; /* ee flag bits. @@ -781,12 +814,16 @@ enum { * if any of those fail, we set this flag atomically * from the endio callback */ __EE_WAS_ERROR, + + /* This ee has a pointer to a digest instead of a block id */ + __EE_HAS_DIGEST, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) +#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) /* global flag bits */ enum { @@ -794,7 +831,6 @@ enum { SIGNAL_ASENDER, /* whether asender wants to be interrupted */ SEND_PING, /* whether asender should send a ping asap */ - STOP_SYNC_TIMER, /* tell timer to cancel itself */ UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ MD_DIRTY, /* current uuids and flags not yet on disk */ @@ -816,6 +852,7 @@ enum { BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ + GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ NET_CONGESTED, /* The data socket is congested */ @@ -829,6 +866,8 @@ enum { * the peer, if it changed there as well. */ CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ + NEW_CUR_UUID, /* Create new current UUID when thawing IO */ + AL_SUSPENDED, /* Activity logging is currently suspended. */ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -838,10 +877,6 @@ struct drbd_bitmap; /* opaque for drbd_conf */ /* THINK maybe we actually want to use the default "event/%s" worker threads * or similar in linux 2.6, which uses per cpu data and threads. - * - * To be general, this might need a spin_lock member. - * For now, please use the mdev->req_lock to protect list_head, - * see drbd_queue_work below. */ struct drbd_work_queue { struct list_head q; @@ -915,6 +950,12 @@ enum write_ordering_e { WO_bio_barrier }; +struct fifo_buffer { + int *values; + unsigned int head_index; + unsigned int size; +}; + struct drbd_conf { /* things that are stored as / read from meta data on disk */ unsigned long flags; @@ -936,9 +977,16 @@ struct drbd_conf { unsigned int ko_count; struct drbd_work resync_work, unplug_work, + go_diskless, md_sync_work; struct timer_list resync_timer; struct timer_list md_sync_timer; +#ifdef DRBD_DEBUG_MD_SYNC + struct { + unsigned int line; + const char* func; + } last_md_mark_dirty; +#endif /* Used after attach while negotiating new disk state. */ union drbd_state new_state_tmp; @@ -946,6 +994,7 @@ struct drbd_conf { union drbd_state state; wait_queue_head_t misc_wait; wait_queue_head_t state_wait; /* upon each state change. */ + wait_queue_head_t net_cnt_wait; unsigned int send_cnt; unsigned int recv_cnt; unsigned int read_cnt; @@ -974,12 +1023,16 @@ struct drbd_conf { unsigned long rs_start; /* cumulated time in PausedSyncX state [unit jiffies] */ unsigned long rs_paused; + /* skipped because csum was equal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; +#define DRBD_SYNC_MARKS 8 +#define DRBD_SYNC_MARK_STEP (3*HZ) /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ - unsigned long rs_mark_left; + unsigned long rs_mark_left[DRBD_SYNC_MARKS]; /* marks's time [unit jiffies] */ - unsigned long rs_mark_time; - /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ - unsigned long rs_same_csum; + unsigned long rs_mark_time[DRBD_SYNC_MARKS]; + /* current index into rs_mark_{left,time} */ + int rs_last_mark; /* where does the admin want us to start? (sector) */ sector_t ov_start_sector; @@ -1012,10 +1065,10 @@ struct drbd_conf { spinlock_t epoch_lock; unsigned int epochs; enum write_ordering_e write_ordering; - struct list_head active_ee; /* IO in progress */ - struct list_head sync_ee; /* IO in progress */ + struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ + struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ struct list_head done_ee; /* send ack */ - struct list_head read_ee; /* IO in progress */ + struct list_head read_ee; /* IO in progress (any read) */ struct list_head net_ee; /* zero-copy network send in progress */ struct hlist_head *ee_hash; /* is proteced by req_lock! */ unsigned int ee_hash_s; @@ -1026,7 +1079,8 @@ struct drbd_conf { int next_barrier_nr; struct hlist_head *app_reads_hash; /* is proteced by req_lock */ struct list_head resync_reads; - atomic_t pp_in_use; + atomic_t pp_in_use; /* allocated from page pool */ + atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ @@ -1054,6 +1108,15 @@ struct drbd_conf { u64 ed_uuid; /* UUID of the exposed data */ struct mutex state_mutex; char congestion_reason; /* Why we where congested... */ + atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ + atomic_t rs_sect_ev; /* for submitted resync data rate, both */ + int rs_last_sect_ev; /* counter to compare with */ + int rs_last_events; /* counter of read or write "events" (unit sectors) + * on the lower level device when we last looked. */ + int c_sync_rate; /* current resync rate after syncer throttle magic */ + struct fifo_buffer rs_plan_s; /* correction values of resync planer */ + int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ + int rs_planed; /* resync sectors already planed */ }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1138,6 +1201,8 @@ extern void drbd_free_resources(struct drbd_conf *mdev); extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, unsigned int set_size); extern void tl_clear(struct drbd_conf *mdev); +enum drbd_req_event; +extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); extern void drbd_free_sock(struct drbd_conf *mdev); extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, @@ -1150,12 +1215,12 @@ extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_f extern int _drbd_send_state(struct drbd_conf *mdev); extern int drbd_send_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, - enum drbd_packets cmd, struct p_header *h, + enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags); #define USE_DATA_SOCKET 1 #define USE_META_SOCKET 0 extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, - enum drbd_packets cmd, struct p_header *h, + enum drbd_packets cmd, struct p_header80 *h, size_t size); extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, size_t size); @@ -1167,7 +1232,7 @@ extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, struct p_block_req *rp); extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, - struct p_data *dp); + struct p_data *dp, int data_size); extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, sector_t sector, int blksize, u64 block_id); extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, @@ -1201,7 +1266,13 @@ extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); extern int drbd_md_test_flag(struct drbd_backing_dev *, int); +#ifndef DRBD_DEBUG_MD_SYNC extern void drbd_md_mark_dirty(struct drbd_conf *mdev); +#else +#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ ) +extern void drbd_md_mark_dirty_(struct drbd_conf *mdev, + unsigned int line, const char *func); +#endif extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), void (*done)(struct drbd_conf *, int), @@ -1209,6 +1280,7 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); +extern void drbd_go_diskless(struct drbd_conf *mdev); /* Meta data layout @@ -1264,6 +1336,8 @@ struct bm_extent { * Bit 1 ==> local node thinks this block needs to be synced. */ +#define SLEEP_TIME (HZ/10) + #define BM_BLOCK_SHIFT 12 /* 4k per bit */ #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) /* (9+3) : 512 bytes @ 8 bits; representing 16M storage @@ -1335,11 +1409,13 @@ struct bm_extent { #endif /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. - * With a value of 6 all IO in one 32K block make it to the same slot of the + * With a value of 8 all IO in one 128K block make it to the same slot of the * hash table. */ -#define HT_SHIFT 6 +#define HT_SHIFT 8 #define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) +#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ + /* Number of elements in the app_reads_hash */ #define APP_R_HSIZE 15 @@ -1369,6 +1445,7 @@ extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_ /* bm_find_next variants for use while you hold drbd_bm_lock() */ extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); +extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev); extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); extern int drbd_bm_rs_done(struct drbd_conf *mdev); /* for receive_bitmap */ @@ -1421,7 +1498,8 @@ extern void resync_after_online_grow(struct drbd_conf *); extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force); -enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); +extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); +extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev); extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); /* drbd_worker.c */ @@ -1467,10 +1545,12 @@ extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); +extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); /* drbd_receiver.c */ +extern int drbd_rs_should_slow_down(struct drbd_conf *mdev); extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, const unsigned rw, const int fault_type); extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); @@ -1479,7 +1559,10 @@ extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, sector_t sector, unsigned int data_size, gfp_t gfp_mask) __must_hold(local); -extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); +extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + int is_net); +#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0) +#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1) extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head); extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, @@ -1487,6 +1570,7 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); extern void drbd_flush_workqueue(struct drbd_conf *mdev); +extern void drbd_free_tl_hash(struct drbd_conf *mdev); /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ @@ -1600,6 +1684,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev, #define susp_MASK 1 #define user_isp_MASK 1 #define aftr_isp_MASK 1 +#define susp_nod_MASK 1 +#define susp_fen_MASK 1 #define NS(T, S) \ ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ @@ -1856,13 +1942,6 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, } static inline void -_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) -{ - list_add_tail(&w->list, &q->q); - up(&q->s); -} - -static inline void drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) { unsigned long flags; @@ -1899,19 +1978,19 @@ static inline void request_ping(struct drbd_conf *mdev) static inline int drbd_send_short_cmd(struct drbd_conf *mdev, enum drbd_packets cmd) { - struct p_header h; + struct p_header80 h; return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); } static inline int drbd_send_ping(struct drbd_conf *mdev) { - struct p_header h; + struct p_header80 h; return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); } static inline int drbd_send_ping_ack(struct drbd_conf *mdev) { - struct p_header h; + struct p_header80 h; return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); } @@ -2013,7 +2092,7 @@ static inline void inc_unacked(struct drbd_conf *mdev) static inline void put_net_conf(struct drbd_conf *mdev) { if (atomic_dec_and_test(&mdev->net_cnt)) - wake_up(&mdev->misc_wait); + wake_up(&mdev->net_cnt_wait); } /** @@ -2044,10 +2123,14 @@ static inline int get_net_conf(struct drbd_conf *mdev) static inline void put_ldev(struct drbd_conf *mdev) { + int i = atomic_dec_return(&mdev->local_cnt); __release(local); - if (atomic_dec_and_test(&mdev->local_cnt)) + D_ASSERT(i >= 0); + if (i == 0) { + if (mdev->state.disk == D_FAILED) + drbd_go_diskless(mdev); wake_up(&mdev->misc_wait); - D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); + } } #ifndef __CHECKER__ @@ -2179,11 +2262,16 @@ static inline int drbd_state_is_stable(union drbd_state s) return 1; } +static inline int is_susp(union drbd_state s) +{ + return s.susp || s.susp_nod || s.susp_fen; +} + static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) { int mxb = drbd_get_max_buffers(mdev); - if (mdev->state.susp) + if (is_susp(mdev->state)) return 0; if (test_bit(SUSPEND_IO, &mdev->flags)) return 0; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e4b56119866..c5dfe6486cf 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -78,6 +78,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); static void md_sync_timer_fn(unsigned long data); static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " "Lars Ellenberg <lars@linbit.com>"); @@ -200,7 +201,7 @@ static int tl_init(struct drbd_conf *mdev) INIT_LIST_HEAD(&b->w.list); b->next = NULL; b->br_number = 4711; - b->n_req = 0; + b->n_writes = 0; b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ mdev->oldest_tle = b; @@ -241,7 +242,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) INIT_LIST_HEAD(&new->w.list); new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ new->next = NULL; - new->n_req = 0; + new->n_writes = 0; newest_before = mdev->newest_tle; /* never send a barrier number == 0, because that is special-cased @@ -285,9 +286,9 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, barrier_nr, b->br_number); goto bail; } - if (b->n_req != set_size) { - dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", - barrier_nr, set_size, b->n_req); + if (b->n_writes != set_size) { + dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", + barrier_nr, set_size, b->n_writes); goto bail; } @@ -334,6 +335,82 @@ bail: drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); } +/** + * _tl_restart() - Walks the transfer log, and applies an action to all requests + * @mdev: DRBD device. + * @what: The action/event to perform with all request objects + * + * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, + * restart_frozen_disk_io. + */ +static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) +{ + struct drbd_tl_epoch *b, *tmp, **pn; + struct list_head *le, *tle, carry_reads; + struct drbd_request *req; + int rv, n_writes, n_reads; + + b = mdev->oldest_tle; + pn = &mdev->oldest_tle; + while (b) { + n_writes = 0; + n_reads = 0; + INIT_LIST_HEAD(&carry_reads); + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + rv = _req_mod(req, what); + + n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT; + n_reads += (rv & MR_READ) >> MR_READ_SHIFT; + } + tmp = b->next; + + if (n_writes) { + if (what == resend) { + b->n_writes = n_writes; + if (b->w.cb == NULL) { + b->w.cb = w_send_barrier; + inc_ap_pending(mdev); + set_bit(CREATE_BARRIER, &mdev->flags); + } + + drbd_queue_work(&mdev->data.work, &b->w); + } + pn = &b->next; + } else { + if (n_reads) + list_add(&carry_reads, &b->requests); + /* there could still be requests on that ring list, + * in case local io is still pending */ + list_del(&b->requests); + + /* dec_ap_pending corresponding to queue_barrier. + * the newest barrier may not have been queued yet, + * in which case w.cb is still NULL. */ + if (b->w.cb != NULL) + dec_ap_pending(mdev); + + if (b == mdev->newest_tle) { + /* recycle, but reinit! */ + D_ASSERT(tmp == NULL); + INIT_LIST_HEAD(&b->requests); + list_splice(&carry_reads, &b->requests); + INIT_LIST_HEAD(&b->w.list); + b->w.cb = NULL; + b->br_number = net_random(); + b->n_writes = 0; + + *pn = b; + break; + } + *pn = tmp; + kfree(b); + } + b = tmp; + list_splice(&carry_reads, &b->requests); + } +} + /** * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL @@ -345,48 +422,12 @@ bail: */ void tl_clear(struct drbd_conf *mdev) { - struct drbd_tl_epoch *b, *tmp; struct list_head *le, *tle; struct drbd_request *r; - int new_initial_bnr = net_random(); spin_lock_irq(&mdev->req_lock); - b = mdev->oldest_tle; - while (b) { - list_for_each_safe(le, tle, &b->requests) { - r = list_entry(le, struct drbd_request, tl_requests); - /* It would be nice to complete outside of spinlock. - * But this is easier for now. */ - _req_mod(r, connection_lost_while_pending); - } - tmp = b->next; - - /* there could still be requests on that ring list, - * in case local io is still pending */ - list_del(&b->requests); - - /* dec_ap_pending corresponding to queue_barrier. - * the newest barrier may not have been queued yet, - * in which case w.cb is still NULL. */ - if (b->w.cb != NULL) - dec_ap_pending(mdev); - - if (b == mdev->newest_tle) { - /* recycle, but reinit! */ - D_ASSERT(tmp == NULL); - INIT_LIST_HEAD(&b->requests); - INIT_LIST_HEAD(&b->w.list); - b->w.cb = NULL; - b->br_number = new_initial_bnr; - b->n_req = 0; - - mdev->oldest_tle = b; - break; - } - kfree(b); - b = tmp; - } + _tl_restart(mdev, connection_lost_while_pending); /* we expect this list to be empty. */ D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); @@ -402,6 +443,15 @@ void tl_clear(struct drbd_conf *mdev) /* ensure bit indicating barrier is required is clear */ clear_bit(CREATE_BARRIER, &mdev->flags); + memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); + + spin_unlock_irq(&mdev->req_lock); +} + +void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) +{ + spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, what); spin_unlock_irq(&mdev->req_lock); } @@ -456,7 +506,7 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); static int is_valid_state_transition(struct drbd_conf *, union drbd_state, union drbd_state); static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, int *warn_sync_abort); + union drbd_state ns, const char **warn_sync_abort); int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); @@ -606,7 +656,7 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) drbd_role_str(ns.peer), drbd_disk_str(ns.disk), drbd_disk_str(ns.pdsk), - ns.susp ? 's' : 'r', + is_susp(ns) ? 's' : 'r', ns.aftr_isp ? 'a' : '-', ns.peer_isp ? 'p' : '-', ns.user_isp ? 'u' : '-' @@ -764,7 +814,7 @@ static int is_valid_state_transition(struct drbd_conf *mdev, * to D_UNKNOWN. This rule and many more along those lines are in this function. */ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, int *warn_sync_abort) + union drbd_state ns, const char **warn_sync_abort) { enum drbd_fencing_p fp; @@ -779,9 +829,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state |