diff options
Diffstat (limited to 'drivers/block/drbd/drbd_int.h')
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 2252 |
1 files changed, 2252 insertions, 0 deletions
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h new file mode 100644 index 00000000000..2312d782fe9 --- /dev/null +++ b/drivers/block/drbd/drbd_int.h @@ -0,0 +1,2252 @@ +/* + drbd_int.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#ifndef _DRBD_INT_H +#define _DRBD_INT_H + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/version.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/crypto.h> +#include <linux/ratelimit.h> +#include <linux/tcp.h> +#include <linux/mutex.h> +#include <linux/major.h> +#include <linux/blkdev.h> +#include <linux/genhd.h> +#include <net/tcp.h> +#include <linux/lru_cache.h> + +#ifdef __CHECKER__ +# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) +# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) +# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) +# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) +#else +# define __protected_by(x) +# define __protected_read_by(x) +# define __protected_write_by(x) +# define __must_hold(x) +#endif + +#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0) + +/* module parameter, defined in drbd_main.c */ +extern unsigned int minor_count; +extern int disable_sendpage; +extern int allow_oos; +extern unsigned int cn_idx; + +#ifdef CONFIG_DRBD_FAULT_INJECTION +extern int enable_faults; +extern int fault_rate; +extern int fault_devs; +#endif + +extern char usermode_helper[]; + + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. + */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + */ +#define DRBD_SIGKILL SIGHUP + +/* All EEs on the free list should have ID_VACANT (== 0) + * freshly allocated EEs get !ID_VACANT (== 1) + * so if it says "cannot dereference null pointer at adress 0x00000001", + * it is most likely one of these :( */ + +#define ID_IN_SYNC (4711ULL) +#define ID_OUT_OF_SYNC (4712ULL) + +#define ID_SYNCER (-1ULL) +#define ID_VACANT 0 +#define is_syncer_block_id(id) ((id) == ID_SYNCER) + +struct drbd_conf; + + +/* to shorten dev_warn(DEV, "msg"); and relatives statements */ +#define DEV (disk_to_dev(mdev->vdisk)) + +#define D_ASSERT(exp) if (!(exp)) \ + dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) + +#define ERR_IF(exp) if (({ \ + int _b = (exp) != 0; \ + if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ + __func__, #exp, __FILE__, __LINE__); \ + _b; \ + })) + +/* Defines to control fault insertion */ +enum { + DRBD_FAULT_MD_WR = 0, /* meta data write */ + DRBD_FAULT_MD_RD = 1, /* read */ + DRBD_FAULT_RS_WR = 2, /* resync */ + DRBD_FAULT_RS_RD = 3, + DRBD_FAULT_DT_WR = 4, /* data */ + DRBD_FAULT_DT_RD = 5, + DRBD_FAULT_DT_RA = 6, /* data read ahead */ + DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ + DRBD_FAULT_AL_EE = 8, /* alloc ee */ + + DRBD_FAULT_MAX, +}; + +#ifdef CONFIG_DRBD_FAULT_INJECTION +extern unsigned int +_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); +static inline int +drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { + return fault_rate && + (enable_faults & (1<<type)) && + _drbd_insert_fault(mdev, type); +} +#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t))) + +#else +#define FAULT_ACTIVE(_m, _t) (0) +#endif + +/* integer division, round _UP_ to the next integer */ +#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0)) +/* usual integer division */ +#define div_floor(A, B) ((A)/(B)) + +/* drbd_meta-data.c (still in drbd_main.c) */ +/* 4th incarnation of the disk layout. */ +#define DRBD_MD_MAGIC (DRBD_MAGIC+4) + +extern struct drbd_conf **minor_table; +extern struct ratelimit_state drbd_ratelimit_state; + +/* on the wire */ +enum drbd_packets { + /* receiver (data socket) */ + P_DATA = 0x00, + P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ + P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */ + P_BARRIER = 0x03, + P_BITMAP = 0x04, + P_BECOME_SYNC_TARGET = 0x05, + P_BECOME_SYNC_SOURCE = 0x06, + P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */ + P_DATA_REQUEST = 0x08, /* Used to ask for a data block */ + P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */ + P_SYNC_PARAM = 0x0a, + P_PROTOCOL = 0x0b, + P_UUIDS = 0x0c, + P_SIZES = 0x0d, + P_STATE = 0x0e, + P_SYNC_UUID = 0x0f, + P_AUTH_CHALLENGE = 0x10, + P_AUTH_RESPONSE = 0x11, + P_STATE_CHG_REQ = 0x12, + + /* asender (meta socket */ + P_PING = 0x13, + P_PING_ACK = 0x14, + P_RECV_ACK = 0x15, /* Used in protocol B */ + P_WRITE_ACK = 0x16, /* Used in protocol C */ + P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ + P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */ + P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ + P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ + P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ + P_BARRIER_ACK = 0x1c, + P_STATE_CHG_REPLY = 0x1d, + + /* "new" commands, no longer fitting into the ordering scheme above */ + + P_OV_REQUEST = 0x1e, /* data socket */ + P_OV_REPLY = 0x1f, + P_OV_RESULT = 0x20, /* meta socket */ + P_CSUM_RS_REQUEST = 0x21, /* data socket */ + P_RS_IS_IN_SYNC = 0x22, /* meta socket */ + P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ + P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ + + P_MAX_CMD = 0x25, + P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ + P_MAX_OPT_CMD = 0x101, + + /* special command ids for handshake */ + + P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ + P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ + + P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */ +}; + +static inline const char *cmdname(enum drbd_packets cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [P_DATA] = "Data", + [P_DATA_REPLY] = "DataReply", + [P_RS_DATA_REPLY] = "RSDataReply", + [P_BARRIER] = "Barrier", + [P_BITMAP] = "ReportBitMap", + [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", + [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", + [P_UNPLUG_REMOTE] = "UnplugRemote", + [P_DATA_REQUEST] = "DataRequest", + [P_RS_DATA_REQUEST] = "RSDataRequest", + [P_SYNC_PARAM] = "SyncParam", + [P_SYNC_PARAM89] = "SyncParam89", + [P_PROTOCOL] = "ReportProtocol", + [P_UUIDS] = "ReportUUIDs", + [P_SIZES] = "ReportSizes", + [P_STATE] = "ReportState", + [P_SYNC_UUID] = "ReportSyncUUID", + [P_AUTH_CHALLENGE] = "AuthChallenge", + [P_AUTH_RESPONSE] = "AuthResponse", + [P_PING] = "Ping", + [P_PING_ACK] = "PingAck", + [P_RECV_ACK] = "RecvAck", + [P_WRITE_ACK] = "WriteAck", + [P_RS_WRITE_ACK] = "RSWriteAck", + [P_DISCARD_ACK] = "DiscardAck", + [P_NEG_ACK] = "NegAck", + [P_NEG_DREPLY] = "NegDReply", + [P_NEG_RS_DREPLY] = "NegRSDReply", + [P_BARRIER_ACK] = "BarrierAck", + [P_STATE_CHG_REQ] = "StateChgRequest", + [P_STATE_CHG_REPLY] = "StateChgReply", + [P_OV_REQUEST] = "OVRequest", + [P_OV_REPLY] = "OVReply", + [P_OV_RESULT] = "OVResult", + [P_MAX_CMD] = NULL, + }; + + if (cmd == P_HAND_SHAKE_M) + return "HandShakeM"; + if (cmd == P_HAND_SHAKE_S) + return "HandShakeS"; + if (cmd == P_HAND_SHAKE) + return "HandShake"; + if (cmd >= P_MAX_CMD) + return "Unknown"; + return cmdnames[cmd]; +} + +/* for sending/receiving the bitmap, + * possibly in some encoding scheme */ +struct bm_xfer_ctx { + /* "const" + * stores total bits and long words + * of the bitmap, so we don't need to + * call the accessor functions over and again. */ + unsigned long bm_bits; + unsigned long bm_words; + /* during xfer, current position within the bitmap */ + unsigned long bit_offset; + unsigned long word_offset; + + /* statistics; index: (h->command == P_BITMAP) */ + unsigned packets[2]; + unsigned bytes[2]; +}; + +extern void INFO_bm_xfer_stats(struct drbd_conf *mdev, + const char *direction, struct bm_xfer_ctx *c); + +static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) +{ + /* word_offset counts "native long words" (32 or 64 bit), + * aligned at 64 bit. + * Encoded packet may end at an unaligned bit offset. + * In case a fallback clear text packet is transmitted in + * between, we adjust this offset back to the last 64bit + * aligned "native long word", which makes coding and decoding + * the plain text bitmap much more convenient. */ +#if BITS_PER_LONG == 64 + c->word_offset = c->bit_offset >> 6; +#elif BITS_PER_LONG == 32 + c->word_offset = c->bit_offset >> 5; + c->word_offset &= ~(1UL); +#else +# error "unsupported BITS_PER_LONG" +#endif +} + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! + */ +struct p_header { + u32 magic; + u16 command; + u16 length; /* bytes of data after this header */ + u8 payload[0]; +} __packed; +/* 8 bytes. packet FIXED for the next century! */ + +/* + * short commands, packets without payload, plain p_header: + * P_PING + * P_PING_ACK + * P_BECOME_SYNC_TARGET + * P_BECOME_SYNC_SOURCE + * P_UNPLUG_REMOTE + */ + +/* + * commands with out-of-struct payload: + * P_BITMAP (no additional fields) + * P_DATA, P_DATA_REPLY (see p_data) + * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) + */ + +/* these defines must not be changed without changing the protocol version */ +#define DP_HARDBARRIER 1 +#define DP_RW_SYNC 2 +#define DP_MAY_SET_IN_SYNC 4 + +struct p_data { + struct p_header head; + u64 sector; /* 64 bits sector number */ + u64 block_id; /* to identify the request in protocol B&C */ + u32 seq_num; + u32 dp_flags; +} __packed; + +/* + * commands which share a struct: + * p_block_ack: + * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), + * P_DISCARD_ACK (proto C, two-primaries conflict detection) + * p_block_req: + * P_DATA_REQUEST, P_RS_DATA_REQUEST + */ +struct p_block_ack { + struct p_header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 seq_num; +} __packed; + + +struct p_block_req { + struct p_header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +/* + * commands with their own struct for additional fields: + * P_HAND_SHAKE + * P_BARRIER + * P_BARRIER_ACK + * P_SYNC_PARAM + * ReportParams + */ + +struct p_handshake { + struct p_header head; /* 8 bytes */ + u32 protocol_min; + u32 feature_flags; + u32 protocol_max; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserverd array shall be zero. + */ + + u32 _pad; + u64 reserverd[7]; +} __packed; +/* 80 bytes, FIXED for the next century */ + +struct p_barrier { + struct p_header head; + u32 barrier; /* barrier number _handle_ only */ + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +struct p_barrier_ack { + struct p_header head; + u32 barrier; + u32 set_size; +} __packed; + +struct p_rs_param { + struct p_header head; + u32 rate; + + /* Since protocol version 88 and higher. */ + char verify_alg[0]; +} __packed; + +struct p_rs_param_89 { + struct p_header head; + u32 rate; + /* protocol version 89: */ + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; +} __packed; + +struct p_protocol { + struct p_header head; + u32 protocol; + u32 after_sb_0p; + u32 after_sb_1p; + u32 after_sb_2p; + u32 want_lose; + u32 two_primaries; + + /* Since protocol version 87 and higher. */ + char integrity_alg[0]; + +} __packed; + +struct p_uuids { + struct p_header head; + u64 uuid[UI_EXTENDED_SIZE]; +} __packed; + +struct p_rs_uuid { + struct p_header head; + u64 uuid; +} __packed; + +struct p_sizes { + struct p_header head; + u64 d_size; /* size of disk */ + u64 u_size; /* user requested size */ + u64 c_size; /* current exported size */ + u32 max_segment_size; /* Maximal size of a BIO */ + u32 queue_order_type; +} __packed; + +struct p_state { + struct p_header head; + u32 state; +} __packed; + +struct p_req_state { + struct p_header head; + u32 mask; + u32 val; +} __packed; + +struct p_req_state_reply { + struct p_header head; + u32 retcode; +} __packed; + +struct p_drbd06_param { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __packed; + +struct p_discard { + struct p_header head; + u64 block_id; + u32 seq_num; + u32 pad; +} __packed; + +/* Valid values for the encoding field. + * Bump proto version when changing this. */ +enum drbd_bitmap_code { + /* RLE_VLI_Bytes = 0, + * and other bit variants had been defined during + * algorithm evaluation. */ + RLE_VLI_Bits = 2, +}; + +struct p_compressed_bm { + struct p_header head; + /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code + * (encoding & 0x80): polarity (set/unset) of first runlength + * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits + * used to pad up to head.length bytes + */ + u8 encoding; + + u8 code[0]; +} __packed; + +/* DCBP: Drbd Compressed Bitmap Packet ... */ +static inline enum drbd_bitmap_code +DCBP_get_code(struct p_compressed_bm *p) +{ + return (enum drbd_bitmap_code)(p->encoding & 0x0f); +} + +static inline void +DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) +{ + BUG_ON(code & ~0xf); + p->encoding = (p->encoding & ~0xf) | code; +} + +static inline int +DCBP_get_start(struct p_compressed_bm *p) +{ + return (p->encoding & 0x80) != 0; +} + +static inline void +DCBP_set_start(struct p_compressed_bm *p, int set) +{ + p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); +} + +static inline int +DCBP_get_pad_bits(struct p_compressed_bm *p) +{ + return (p->encoding >> 4) & 0x7; +} + +static inline void +DCBP_set_pad_bits(struct p_compressed_bm *p, int n) +{ + BUG_ON(n & ~0x7); + p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); +} + +/* one bitmap packet, including the p_header, + * should fit within one _architecture independend_ page. + * so we need to use the fixed size 4KiB page size + * most architechtures have used for a long time. + */ +#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) +#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) +#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) +#if (PAGE_SIZE < 4096) +/* drbd_send_bitmap / receive_bitmap would break horribly */ +#error "PAGE_SIZE too small" +#endif + +union p_polymorph { + struct p_header header; + struct p_handshake handshake; + struct p_data data; + struct p_block_ack block_ack; + struct p_barrier barrier; + struct p_barrier_ack barrier_ack; + struct p_rs_param_89 rs_param_89; + struct p_protocol protocol; + struct p_sizes sizes; + struct p_uuids uuids; + struct p_state state; + struct p_req_state req_state; + struct p_req_state_reply req_state_reply; + struct p_block_req block_req; +} __packed; + +/**********************************************************************/ +enum drbd_thread_state { + None, + Running, + Exiting, + Restarting +}; + +struct drbd_thread { + spinlock_t t_lock; + struct task_struct *task; + struct completion stop; + enum drbd_thread_state t_state; + int (*function) (struct drbd_thread *); + struct drbd_conf *mdev; + int reset_cpu_mask; +}; + +static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return thi->t_state; +} + + +/* + * Having this as the first member of a struct provides sort of "inheritance". + * "derived" structs can be "drbd_queue_work()"ed. + * The callback should know and cast back to the descendant struct. + * drbd_request and drbd_epoch_entry are descendants of drbd_work. + */ +struct drbd_work; +typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); +struct drbd_work { + struct list_head list; + drbd_work_cb cb; +}; + +struct drbd_tl_epoch; +struct drbd_request { + struct drbd_work w; + struct drbd_conf *mdev; + + /* if local IO is not allowed, will be NULL. + * if local IO _is_ allowed, holds the locally submitted bio clone, + * or, after local IO completion, the ERR_PTR(error). + * see drbd_endio_pri(). */ + struct bio *private_bio; + + struct hlist_node colision; + sector_t sector; + unsigned int size; + unsigned int epoch; /* barrier_nr */ + + /* barrier_nr: used to check on "completion" whether this req was in + * the current epoch, and we therefore have to close it, + * starting a new epoch... + */ + + /* up to here, the struct layout is identical to drbd_epoch_entry; + * we might be able to use that to our advantage... */ + + struct list_head tl_requests; /* ring list in the transfer log */ + struct bio *master_bio; /* master bio pointer */ + unsigned long rq_state; /* see comments above _req_mod() */ + int seq_num; + unsigned long start_time; +}; + +struct drbd_tl_epoch { + struct drbd_work w; + struct list_head requests; /* requests before */ + struct drbd_tl_epoch *next; /* pointer to the next barrier */ + unsigned int br_number; /* the barriers identifier. */ + int n_req; /* number of requests attached before this barrier */ +}; + +struct drbd_request; + +/* These Tl_epoch_entries may be in one of 6 lists: + active_ee .. data packet being written + sync_ee .. syncer block being written + done_ee .. block written, need to send P_WRITE_ACK + read_ee .. [RS]P_DATA_REQUEST being read +*/ + +struct drbd_epoch { + struct list_head list; + unsigned int barrier_nr; + atomic_t epoch_size; /* increased on every request added. */ + atomic_t active; /* increased on every req. added, and dec on every finished. */ + unsigned long flags; +}; + +/* drbd_epoch flag bits */ +enum { + DE_BARRIER_IN_NEXT_EPOCH_ISSUED, + DE_BARRIER_IN_NEXT_EPOCH_DONE, + DE_CONTAINS_A_BARRIER, + DE_HAVE_BARRIER_NUMBER, + DE_IS_FINISHING, +}; + +enum epoch_event { + EV_PUT, + EV_GOT_BARRIER_NR, + EV_BARRIER_DONE, + EV_BECAME_LAST, + EV_CLEANUP = 32, /* used as flag */ +}; + +struct drbd_epoch_entry { + struct drbd_work w; + struct drbd_conf *mdev; + struct bio *private_bio; + struct hlist_node colision; + sector_t sector; + unsigned int size; + struct drbd_epoch *epoch; + + /* up to here, the struct layout is identical to drbd_request; + * we might be able to use that to our advantage... */ + + unsigned int flags; + u64 block_id; +}; + +struct drbd_wq_barrier { + struct drbd_work w; + struct completion done; +}; + +struct digest_info { + int digest_size; + void *digest; +}; + +/* ee flag bits */ +enum { + __EE_CALL_AL_COMPLETE_IO, + __EE_CONFLICT_PENDING, + __EE_MAY_SET_IN_SYNC, + __EE_IS_BARRIER, +}; +#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) +#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) +#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) +#define EE_IS_BARRIER (1<<__EE_IS_BARRIER) + +/* global flag bits */ +enum { + CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */ + SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + SEND_PING, /* whether asender should send a ping asap */ + + STOP_SYNC_TIMER, /* tell timer to cancel itself */ + UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ + UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ + MD_DIRTY, /* current uuids and flags not yet on disk */ + DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ + USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ + CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ + CL_ST_CHG_SUCCESS, + CL_ST_CHG_FAIL, + CRASHED_PRIMARY, /* This node was a crashed primary. + * Gets cleared when the state.conn + * goes into C_CONNECTED state. */ + WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ + NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ + CONSIDER_RESYNC, + + MD_NO_BARRIER, /* meta data device does not support barriers, + so don't even try */ + SUSPEND_IO, /* suspend application io */ + BITMAP_IO, /* suspend application io; + once no more io in flight, start bitmap io */ + BITMAP_IO_QUEUED, /* Started bitmap IO */ + RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ + NET_CONGESTED, /* The data socket is congested */ + + CONFIG_PENDING, /* serialization of (re)configuration requests. + * if set, also prevents the device from dying */ + DEVICE_DYING, /* device became unconfigured, + * but worker thread is still handling the cleanup. + * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed, + * while this is set. */ + RESIZE_PENDING, /* Size change detected locally, waiting for the response from + * the peer, if it changed there as well. */ +}; + +struct drbd_bitmap; /* opaque for drbd_conf */ + +/* TODO sort members for performance + * MAYBE group them further */ + +/* THINK maybe we actually want to use the default "event/%s" worker threads + * or similar in linux 2.6, which uses per cpu data and threads. + * + * To be general, this might need a spin_lock member. + * For now, please use the mdev->req_lock to protect list_head, + * see drbd_queue_work below. + */ +struct drbd_work_queue { + struct list_head q; + struct semaphore s; /* producers up it, worker down()s it */ + spinlock_t q_lock; /* to protect the list. */ +}; + +struct drbd_socket { + struct drbd_work_queue work; + struct mutex mutex; + struct socket *socket; + /* this way we get our + * send/receive buffers off the stack */ + union p_polymorph sbuf; + union p_polymorph rbuf; +}; + +struct drbd_md { + u64 md_offset; /* sector offset to 'super' block */ + + u64 la_size_sect; /* last agreed size, unit sectors */ + u64 uuid[UI_SIZE]; + u64 device_uuid; + u32 flags; + u32 md_size_sect; + + s32 al_offset; /* signed relative sector offset to al area */ + s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* u32 al_nr_extents; important for restoring the AL + * is stored into sync_conf.al_extents, which in turn + * gets applied to act_log->nr_elements + */ +}; + +/* for sync_conf and other types... */ +#define NL_PACKET(name, number, fields) struct name { fields }; +#define NL_INTEGER(pn,pr,member) int member; +#define NL_INT64(pn,pr,member) __u64 member; +#define NL_BIT(pn,pr,member) unsigned member:1; +#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; +#include "linux/drbd_nl.h" + +struct drbd_backing_dev { + struct block_device *backing_bdev; + struct block_device *md_bdev; + struct file *lo_file; + struct file *md_file; + struct drbd_md md; + struct disk_conf dc; /* The user provided config... */ + sector_t known_size; /* last known size of that backing device */ +}; + +struct drbd_md_io { + struct drbd_conf *mdev; + struct completion event; + int error; +}; + +struct bm_io_work { + struct drbd_work w; + char *why; + int (*io_fn)(struct drbd_conf *mdev); + void (*done)(struct drbd_conf *mdev, int rv); +}; + +enum write_ordering_e { + WO_none, + WO_drain_io, + WO_bdev_flush, + WO_bio_barrier +}; + +struct drbd_conf { + /* things that are stored as / read from meta data on disk */ + unsigned long flags; + + /* configured by drbdsetup */ + struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ + struct syncer_conf sync_conf; + struct drbd_backing_dev *ldev __protected_by(local); + + sector_t p_size; /* partner's disk size */ + struct request_queue *rq_queue; + struct block_device *this_bdev; + struct gendisk *vdisk; + + struct drbd_socket data; /* data/barrier/cstate/parameter packets */ + struct drbd_socket meta; /* ping/ack (metadata) packets */ + int agreed_pro_version; /* actually used protocol version */ + unsigned long last_received; /* in jiffies, either socket */ + unsigned int ko_count; + struct drbd_work resync_work, + unplug_work, + md_sync_work; + struct timer_list resync_timer; + struct timer_list md_sync_timer; + + /* Used after attach while negotiating new disk state. */ + union drbd_state new_state_tmp; + + union drbd_state state; + wait_queue_head_t misc_wait; + wait_queue_head_t state_wait; /* upon each state change. */ + unsigned int send_cnt; + unsigned int recv_cnt; + unsigned int read_cnt; + unsigned int writ_cnt; + unsigned int al_writ_cnt; + unsigned int bm_writ_cnt; + atomic_t ap_bio_cnt; /* Requests we need to complete */ + atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ + atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ + atomic_t unacked_cnt; /* Need to send replys for */ + atomic_t local_cnt; /* Waiting for local completion */ + atomic_t net_cnt; /* Users of net_conf */ + spinlock_t req_lock; + struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ + struct drbd_tl_epoch *newest_tle; + struct drbd_tl_epoch *oldest_tle; + struct list_head out_of_sequence_requests; + struct hlist_head *tl_hash; + unsigned int tl_hash_s; + + /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ + unsigned long rs_total; + /* number of sync IOs that failed in this run */ + unsigned long rs_failed; + /* Syncer's start time [unit jiffies] */ + unsigned long rs_start; + /* cumulated time in PausedSyncX state [unit jiffies] */ + unsigned long rs_paused; + /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ + unsigned long rs_mark_left; + /* marks's time [unit jiffies] */ + unsigned long rs_mark_time; + /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; + + /* where does the admin want us to start? (sector) */ + sector_t ov_start_sector; + /* where are we now? (sector) */ + sector_t ov_position; + /* Start sector of out of sync range (to merge printk reporting). */ + sector_t ov_last_oos_start; + /* size of out-of-sync range in sectors. */ + sector_t ov_last_oos_size; + unsigned long ov_left; /* in bits */ + struct crypto_hash *csums_tfm; + struct crypto_hash *verify_tfm; + + struct drbd_thread receiver; + struct drbd_thread worker; + struct drbd_thread asender; + struct drbd_bitmap *bitmap; + unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ + + /* Used to track operations of resync... */ + struct lru_cache *resync; + /* Number of locked elements in resync LRU */ + unsigned int resync_locked; + /* resync extent number waiting for application requests */ + unsigned int resync_wenr; + + int open_cnt; + u64 *p_uuid; + struct drbd_epoch *current_epoch; + spinlock_t epoch_lock; + unsigned int epochs; + enum write_ordering_e write_ordering; + struct list_head active_ee; /* IO in progress */ + struct list_head sync_ee; /* IO in progress */ + struct list_head done_ee; /* send ack */ + struct list_head read_ee; /* IO in progress */ + struct list_head net_ee; /* zero-copy network send in progress */ + struct hlist_head *ee_hash; /* is proteced by req_lock! */ + unsigned int ee_hash_s; + + /* this one is protected by ee_lock, single thread */ + struct drbd_epoch_entry *last_write_w_barrier; + + int next_barrier_nr; + struct hlist_head *app_reads_hash; /* is proteced by req_lock */ + struct list_head resync_reads; + atomic_t pp_in_use; + wait_queue_head_t ee_wait; + struct page *md_io_page; /* one page buffer for md_io */ + struct page *md_io_tmpp; /* for logical_block_size != 512 */ + struct mutex md_io_mutex; /* protects the md_io_buffer */ + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache *act_log; /* activity log */ + unsigned int al_tr_number; + int al_tr_cycle; + int al_tr_pos; /* position of the next transaction in the journal */ + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ + struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ + void *int_dig_out; + void *int_dig_in; + void *int_dig_vv; + wait_queue_head_t seq_wait; + atomic_t packet_seq; + unsigned int peer_seq; + spinlock_t peer_seq_lock; + unsigned int minor; + unsigned long comm_bm_set; /* communicated number of set bits. */ + cpumask_var_t cpu_mask; + struct bm_io_work bm_io_work; + u64 ed_uuid; /* UUID of the exposed data */ + struct mutex state_mutex; + char congestion_reason; /* Why we where congested... */ +}; + +static inline struct drbd_conf *minor_to_mdev(unsigned int minor) +{ + struct drbd_conf *mdev; + + mdev = minor < minor_count ? minor_table[minor] : NULL; + + return mdev; +} + +static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) +{ + return mdev->minor; +} + +/* returns 1 if it was successfull, + * returns 0 if there was no data socket. + * so wherever you are going to use the data.socket, e.g. do + * if (!drbd_get_data_sock(mdev)) + * return 0; + * CODE(); + * drbd_put_data_sock(mdev); + */ +static inline int drbd_get_data_sock(struct drbd_conf *mdev) +{ + mutex_lock(&mdev->data.mutex); + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... */ + if (unlikely(mdev->data.socket == NULL)) { + mutex_unlock(&mdev->data.mutex); + return 0; + } + return 1; +} + +static inline void drbd_put_data_sock(struct drbd_conf *mdev) +{ + mutex_unlock(&mdev->data.mutex); +} + +/* + * function declarations + *************************/ + +/* drbd_main.c */ + +enum chg_state_flags { + CS_HARD = 1, + CS_VERBOSE = 2, + CS_WAIT_COMPLETE = 4, + CS_SERIALIZE = 8, + CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, +}; + +extern void drbd_init_set_defaults(struct drbd_conf *mdev); +extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, + union drbd_state mask, union drbd_state val); +extern void drbd_force_state(struct drbd_conf *, union drbd_state, + union drbd_state); +extern int _drbd_request_state(struct drbd_conf *, union drbd_state, + union drbd_state, enum chg_state_flags); +extern int __drbd_set_state(struct drbd_conf *, union drbd_state, + enum chg_state_flags, struct completion *done); +extern void print_st_err(struct drbd_conf *, union drbd_state, + union drbd_state, int); +extern int drbd_thread_start(struct drbd_thread *thi); +extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); +#ifdef CONFIG_SMP +extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); +extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); +#else +#define drbd_thread_current_set_cpu(A) ({}) +#define drbd_calc_cpu_mask(A) ({}) +#endif +extern void drbd_free_resources(struct drbd_conf *mdev); +extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(struct drbd_conf *mdev); +extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); +extern void drbd_free_sock(struct drbd_conf *mdev); +extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, + void *buf, size_t size, unsigned msg_flags); +extern int drbd_send_protocol(struct drbd_conf *mdev); +extern int drbd_send_uuids(struct drbd_conf *mdev); +extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); +extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); +extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); +extern int _drbd_send_state(struct drbd_conf *mdev); +extern int drbd_send_state(struct drbd_conf *mdev); +extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, + enum drbd_packets cmd, struct p_header *h, + size_t size, unsigned msg_flags); +#define USE_DATA_SOCKET 1 +#define USE_META_SOCKET 0 +extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, + enum drbd_packets cmd, struct p_header *h, + size_t size); +extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, + char *data, size_t size); +extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); +extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e); +extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_block_req *rp); +extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_data *dp); +extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, + sector_t sector, int blksiz |