Diffstat (limited to 'drivers/md/raid5.h')
-rw-r--r--  drivers/md/raid5.h | 258
1 file changed, 158 insertions, 100 deletions
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2ace0582b40..bc72cd4be5f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -6,11 +6,11 @@
 
 /*
  *
- * Each stripe contains one buffer per disc.  Each buffer can be in
+ * Each stripe contains one buffer per device.  Each buffer can be in
  * one of a number of states stored in "flags".  Changes between
- * these states happen *almost* exclusively under a per-stripe
- * spinlock.  Some very specific changes can happen in bi_end_io, and
- * these are not protected by the spin lock.
+ * these states happen *almost* exclusively under the protection of the
+ * STRIPE_ACTIVE flag.  Some very specific changes can happen in bi_end_io, and
+ * these are not protected by STRIPE_ACTIVE.
  *
  * The flag bits that are used to represent these states are:
  *   R5_UPTODATE and R5_LOCKED
@@ -27,7 +27,7 @@
  * The possible state transitions are:
  *
  *  Empty -> Want   - on read or write to get old data for  parity calc
- *  Empty -> Dirty  - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
+ *  Empty -> Dirty  - on compute_parity to satisfy write/sync request.
  *  Empty -> Clean  - on compute_block when computing a block for failed drive
  *  Want  -> Empty  - on failed read
  *  Want  -> Clean  - on successful completion of read request
@@ -49,7 +49,7 @@
  * can't distinguish between a clean block that has been generated
  * from parity calculations, and a clean block that has been
  * successfully written to the spare ( or to parity when resyncing).
- * To distingush these states we have a stripe bit STRIPE_INSYNC that
+ * To distinguish these states we have a stripe bit STRIPE_INSYNC that
  * is set whenever a write is scheduled to the spare, or to the parity
  * disc if there is no spare.  A sync request clears this bit, and
  * when we find it set with no buffers locked, we know the sync is
@@ -76,12 +76,10 @@
  * block and the cached buffer are successfully written, any buffer on
  * a written list can be returned with b_end_io.
  *
- * The write list and read list both act as fifos.  The read list is
- * protected by the device_lock.  The write and written lists are
- * protected by the stripe lock.  The device_lock, which can be
- * claimed while the stipe lock is held, is only for list
- * manipulations and will only be held for a very short time.  It can
- * be claimed from interrupts.
+ * The write list and read list both act as fifos.  The read list,
+ * write list and written list are protected by the device_lock.
+ * The device_lock is only for list manipulations and will only be
+ * held for a very short time.  It can be claimed from interrupts.
  *
  *
  * Stripes in the stripe cache can be on one of two lists (or on
@@ -96,7 +94,6 @@
  *
  * The inactive_list, handle_list and hash bucket lists are all protected by the
  * device_lock.
- *  - stripes on the inactive_list never have their stripe_lock held.
  *  - stripes have a reference counter. If count==0, they are on a list.
  *  - If a stripe might need handling, STRIPE_HANDLE is set.
  *  - When refcount reaches zero, then if STRIPE_HANDLE it is put on
@@ -116,10 +113,10 @@
  *  attach a request to an active stripe (add_stripe_bh())
  *     lockdev attach-buffer unlockdev
  *  handle a stripe (handle_stripe())
- *     lockstripe clrSTRIPE_HANDLE ...
+ *     setSTRIPE_ACTIVE,  clrSTRIPE_HANDLE ...
  *		(lockdev check-buffers unlockdev) ..
  *		change-state ..
- *		record io/ops needed unlockstripe schedule io/ops
+ *		record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
  *  release an active stripe (release_stripe())
  *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
  *
@@ -128,8 +125,7 @@
  * on a cached buffer, and plus one if the stripe is undergoing stripe
  * operations.
  *
- * Stripe operations are performed outside the stripe lock,
- * the stripe operations are:
+ * The stripe operations are:
  * -copying data between the stripe cache and user application buffers
  * -computing blocks to save a disk access, or to recover a missing block
  * -updating the parity on a write operation (reconstruct write and
@@ -159,7 +155,8 @@
  */
 
 /*
- * Operations state - intermediate states that are visible outside of sh->lock
+ * Operations state - intermediate states that are visible outside of
+ *   STRIPE_ACTIVE.
  * In general _idle indicates nothing is running, _run indicates a data
  * processing operation is active, and _result means the data processing result
  * is stable and can be acted upon.  For simple operations like biofill and
@@ -200,20 +197,24 @@ enum reconstruct_states {
 struct stripe_head {
 	struct hlist_node	hash;
 	struct list_head	lru;	      /* inactive_list or handle_list */
-	struct raid5_private_data *raid_conf;
+	struct llist_node	release_list;
+	struct r5conf		*raid_conf;
 	short			generation;	/* increments with every
 						 * reshape */
 	sector_t		sector;		/* sector of this row */
 	short			pd_idx;		/* parity disk index */
 	short			qd_idx;		/* 'Q' disk index for raid6 */
 	short			ddf_layout;/* use DDF ordering to calculate Q */
+	short			hash_lock_index;
 	unsigned long		state;		/* state flags */
 	atomic_t		count;	      /* nr of active thread/requests */
-	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;		/* disks in stripe */
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
+	spinlock_t		stripe_lock;
+	int			cpu;
+	struct r5worker_group	*group;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -224,15 +225,14 @@ struct stripe_head {
 	struct stripe_operations {
 		int 		     target, target2;
 		enum sum_check_flags zero_sum_result;
-		#ifdef CONFIG_MULTICORE_RAID456
-		unsigned long	     request;
-		wait_queue_head_t    wait_for_ops;
-		#endif
 	} ops;
 	struct r5dev {
-		struct bio	req;
-		struct bio_vec	vec;
-		struct page	*page;
+		/* rreq and rvec are used for the replacement device when
+		 * writing data to both devices.
+		 */
+		struct bio	req, rreq;
+		struct bio_vec	vec, rvec;
+		struct page	*page, *orig_page;
 		struct bio	*toread, *read, *towrite, *written;
 		sector_t	sector;			/* sector of this page */
 		unsigned long	flags;
@@ -240,81 +240,106 @@ struct stripe_head {
 };
 
 /* stripe_head_state - collects and tracks the dynamic state of a stripe_head
- *     for handle_stripe.  It is only valid under spin_lock(sh->lock);
+ *     for handle_stripe.
  */
 struct stripe_head_state {
-	int syncing, expanding, expanded;
+	/* 'syncing' means that we need to read all devices, either
+	 * to check/correct parity, or to reconstruct a missing device.
+	 * 'replacing' means we are replacing one or more drives and
+	 * the source is valid at this point so we don't need to
+	 * read all devices, just the replacement targets.
+	 */
+	int syncing, expanding, expanded, replacing;
 	int locked, uptodate, to_read, to_write, failed, written;
 	int to_fill, compute, req_compute, non_overwrite;
-	int failed_num;
+	int failed_num[2];
+	int p_failed, q_failed;
+	int dec_preread_active;
 	unsigned long ops_request;
-};
 
-/* r6_state - extra state data only relevant to r6 */
-struct r6_state {
-	int p_failed, q_failed, failed_num[2];
+	struct bio *return_bi;
+	struct md_rdev *blocked_rdev;
+	int handle_bad_blocks;
 };
 
-/* Flags */
-#define	R5_UPTODATE	0	/* page contains current data */
-#define	R5_LOCKED	1	/* IO has been submitted on "req" */
-#define	R5_OVERWRITE	2	/* towrite covers whole page */
+/* Flags for struct r5dev.flags */
+enum r5dev_flags {
+	R5_UPTODATE,	/* page contains current data */
+	R5_LOCKED,	/* IO has been submitted on "req" */
+	R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
+	R5_OVERWRITE,	/* towrite covers whole page */
 /* and some that are internal to handle_stripe */
-#define	R5_Insync	3	/* rdev && rdev->in_sync at start */
-#define	R5_Wantread	4	/* want to schedule a read */
-#define	R5_Wantwrite	5
-#define	R5_Overlap	7	/* There is a pending overlapping request on this block */
-#define	R5_ReadError	8	/* seen a read error here recently */
-#define	R5_ReWrite	9	/* have tried to over-write the readerror */
-
-#define	R5_Expanded	10	/* This block now has post-expand data */
-#define	R5_Wantcompute	11 /* compute_block in progress treat as
-				    * uptodate
-				    */
-#define	R5_Wantfill	12 /* dev->toread contains a bio that needs
-				    * filling
-				    */
-#define R5_Wantdrain	13 /* dev->towrite needs to be drained */
-#define R5_WantFUA	14	/* Write should be FUA */
-/*
- * Write method
- */
-#define RECONSTRUCT_WRITE	1
-#define READ_MODIFY_WRITE	2
-/* not a write method, but a compute_parity mode */
-#define	CHECK_PARITY		3
-/* Additional compute_parity mode -- updates the parity w/o LOCKING */
-#define UPDATE_PARITY		4
+	R5_Insync,	/* rdev && rdev->in_sync at start */
+	R5_Wantread,	/* want to schedule a read */
+	R5_Wantwrite,
+	R5_Overlap,	/* There is a pending overlapping request
+			 * on this block */
+	R5_ReadNoMerge, /* prevent bio from merging in block-layer */
+	R5_ReadError,	/* seen a read error here recently */
+	R5_ReWrite,	/* have tried to over-write the readerror */
+
+	R5_Expanded,	/* This block now has post-expand data */
+	R5_Wantcompute,	/* compute_block in progress treat as
+			 * uptodate
+			 */
+	R5_Wantfill,	/* dev->toread contains a bio that needs
+			 * filling
+			 */
+	R5_Wantdrain,	/* dev->towrite needs to be drained */
+	R5_WantFUA,	/* Write should be FUA */
+	R5_SyncIO,	/* The IO is sync */
+	R5_WriteError,	/* got a write error - need to record it */
+	R5_MadeGood,	/* A bad block has been fixed by writing to it */
+	R5_ReadRepl,	/* Will/did read from replacement rather than orig */
+	R5_MadeGoodRepl,/* A bad block on the replacement device has been
+			 * fixed by writing to it */
+	R5_NeedReplace,	/* This device has a replacement which is not
+			 * up-to-date at this stripe. */
+	R5_WantReplace, /* We need to update the replacement, we have read
+			 * data in, and now is a good time to write it out.
+			 */
+	R5_Discard,	/* Discard the stripe */
+	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
+};
 
 /*
  * Stripe state
  */
-#define STRIPE_HANDLE		2
-#define	STRIPE_SYNCING		3
-#define	STRIPE_INSYNC		4
-#define	STRIPE_PREREAD_ACTIVE	5
-#define	STRIPE_DELAYED		6
-#define	STRIPE_DEGRADED		7
-#define	STRIPE_BIT_DELAY	8
-#define	STRIPE_EXPANDING	9
-#define	STRIPE_EXPAND_SOURCE	10
-#define	STRIPE_EXPAND_READY	11
-#define	STRIPE_IO_STARTED	12 /* do not count towards 'bypass_count' */
-#define	STRIPE_FULL_WRITE	13 /* all blocks are set to be overwritten */
-#define	STRIPE_BIOFILL_RUN	14
-#define	STRIPE_COMPUTE_RUN	15
-#define	STRIPE_OPS_REQ_PENDING	16
+enum {
+	STRIPE_ACTIVE,
+	STRIPE_HANDLE,
+	STRIPE_SYNC_REQUESTED,
+	STRIPE_SYNCING,
+	STRIPE_INSYNC,
+	STRIPE_REPLACED,
+	STRIPE_PREREAD_ACTIVE,
+	STRIPE_DELAYED,
+	STRIPE_DEGRADED,
+	STRIPE_BIT_DELAY,
+	STRIPE_EXPANDING,
+	STRIPE_EXPAND_SOURCE,
+	STRIPE_EXPAND_READY,
+	STRIPE_IO_STARTED,	/* do not count towards 'bypass_count' */
+	STRIPE_FULL_WRITE,	/* all blocks are set to be overwritten */
+	STRIPE_BIOFILL_RUN,
+	STRIPE_COMPUTE_RUN,
+	STRIPE_OPS_REQ_PENDING,
+	STRIPE_ON_UNPLUG_LIST,
+	STRIPE_DISCARD,
+	STRIPE_ON_RELEASE_LIST,
+};
 
 /*
  * Operation request flags
  */
-#define STRIPE_OP_BIOFILL	0
-#define STRIPE_OP_COMPUTE_BLK	1
-#define STRIPE_OP_PREXOR	2
-#define STRIPE_OP_BIODRAIN	3
-#define STRIPE_OP_RECONSTRUCT	4
-#define STRIPE_OP_CHECK	5
-
+enum {
+	STRIPE_OP_BIOFILL,
+	STRIPE_OP_COMPUTE_BLK,
+	STRIPE_OP_PREXOR,
+	STRIPE_OP_BIODRAIN,
+	STRIPE_OP_RECONSTRUCT,
+	STRIPE_OP_CHECK,
+};
 /*
  * Plugging:
  *
@@ -336,18 +361,41 @@ struct r6_state {
  * PREREAD_ACTIVE.
  * In stripe_handle, if we find pre-reading is necessary, we do it if
  * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
- * HANDLE gets cleared if stripe_handle leave nothing locked.
+ * HANDLE gets cleared if stripe_handle leaves nothing locked.
  */
 
 struct disk_info {
-	mdk_rdev_t	*rdev;
+	struct md_rdev	*rdev, *replacement;
 };
 
-struct raid5_private_data {
+/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
+ * This is because we sometimes take all the spinlocks
+ * and creating that much locking depth can cause
+ * problems.
+ */
+#define NR_STRIPE_HASH_LOCKS 8
+#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
+
+struct r5worker {
+	struct work_struct work;
+	struct r5worker_group *group;
+	struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
+	bool working;
+};
+
+struct r5worker_group {
+	struct list_head handle_list;
+	struct r5conf *conf;
+	struct r5worker *workers;
+	int stripes_cnt;
+};
+
+struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
-	mddev_t			*mddev;
-	struct disk_info	*spare;
+	/* only protect corresponding hash list and inactive_list */
+	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS];
+	struct mddev		*mddev;
 	int			chunk_sectors;
 	int			level, algorithm;
 	int			max_degraded;
@@ -368,8 +416,15 @@ struct raid5_private_data {
 	int			prev_chunk_sectors;
 	int			prev_algo;
 	short			generation; /* increments with every reshape */
+	seqcount_t		gen_lock;	/* lock against generation changes */
 	unsigned long		reshape_checkpoint; /* Time we last updated
 						     * metadata */
+	long long		min_offset_diff; /* minimum difference between
+						  * data_offset and
+						  * new_data_offset across all
+						  * devices.  May be negative,
+						  * but is closest to zero.
+						  */
 
 	struct list_head	handle_list; /* stripes needing handling */
 	struct list_head	hold_list; /* preread ready stripes */
@@ -382,6 +437,7 @@ struct raid5_private_data {
 	atomic_t		pending_full_writes; /* full write backlog */
 	int			bypass_count; /* bypassed prereads */
 	int			bypass_threshold; /* preread nice */
+	int			skip_copy; /* Don't copy data from bio to stripe cache */
 	struct list_head	*last_hold; /* detect hold_list promotions */
 
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
@@ -399,9 +455,7 @@ struct raid5_private_data {
 					    * (fresh device added).
 					    * Cleared when a sync completes.
 					    */
-
-	struct plug_handle	plug;
-
+	int			recovery_disabled;
 	/* per cpu variables */
 	struct raid5_percpu {
 		struct page	*spare_page; /* Used when checking P/Q in raid6 */
@@ -422,7 +476,9 @@ struct raid5_private_data {
 	 * Free stripes pool
 	 */
 	atomic_t		active_stripes;
-	struct list_head	inactive_list;
+	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS];
+	atomic_t		empty_inactive_list_nr;
+	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
 	int			inactive_blocked;	/* release of inactive stripes blocked,
@@ -435,11 +491,13 @@ struct raid5_private_data {
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct mdk_thread_s	*thread;
+	struct md_thread	*thread;
+	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
+	struct r5worker_group	*worker_groups;
+	int			group_cnt;
+	int			worker_cnt_per_group;
 };
 
-typedef struct raid5_private_data raid5_conf_t;
-
 /*
  * Our supported algorithms
  */
@@ -502,7 +560,7 @@ static inline int algorithm_is_DDF(int layout)
 	return layout >= 8 && layout <= 10;
 }
 
-extern int md_raid5_congested(mddev_t *mddev, int bits);
-extern void md_raid5_unplug_device(raid5_conf_t *conf);
-extern int raid5_set_cache_size(mddev_t *mddev, int size);
+extern int md_raid5_congested(struct mddev *mddev, int bits);
+extern void md_raid5_kick_device(struct r5conf *conf);
+extern int raid5_set_cache_size(struct mddev *mddev, int size);
 #endif
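
The Empty/Want/Dirty/Clean transitions listed at the top of the header are encoded entirely in the R5_UPTODATE and R5_LOCKED bits. Below is a minimal userspace sketch of that mapping; the encoding it assumes (Empty == !UPTODATE,!LOCKED; Want == !UPTODATE,LOCKED; Dirty == UPTODATE,LOCKED; Clean == UPTODATE,!LOCKED) comes from the state definitions in a part of the header this diff elides, and the code is an illustration, not kernel code.

/* Userspace sketch of the four buffer states in raid5.h, encoded in
 * the R5_UPTODATE and R5_LOCKED bits. */
#include <stdio.h>

enum { R5_UPTODATE, R5_LOCKED };	/* bit numbers, as in the header */

static const char *buffer_state(unsigned long flags)
{
	int uptodate = !!(flags & (1UL << R5_UPTODATE));
	int locked   = !!(flags & (1UL << R5_LOCKED));

	if (!uptodate)
		return locked ? "Want" : "Empty";  /* read in flight / no data */
	return locked ? "Dirty" : "Clean";	   /* write in flight / stable */
}

int main(void)
{
	unsigned long flags = 0;

	printf("%s\n", buffer_state(flags));	/* Empty */

	/* Empty -> Want: a read is scheduled, so R5_LOCKED is set */
	flags |= 1UL << R5_LOCKED;
	printf("%s\n", buffer_state(flags));	/* Want */

	/* Want -> Clean: the read completes; UPTODATE set, LOCKED cleared */
	flags |= 1UL << R5_UPTODATE;
	flags &= ~(1UL << R5_LOCKED);
	printf("%s\n", buffer_state(flags));	/* Clean */
	return 0;
}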
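The central change in this patch is replacing the per-stripe spinlock with the STRIPE_ACTIVE bit: whoever atomically wins the bit owns handle_stripe() for that stripe, and a concurrent caller simply backs off instead of blocking. The following userspace sketch shows that ownership discipline, with the kernel's test_and_set_bit()/clear_bit() approximated by C11 atomics (the real helpers differ in signature and memory-ordering details).

/* Userspace sketch of the STRIPE_ACTIVE discipline: exactly one thread
 * may run the stripe state machine at a time, and ownership is taken by
 * setting a bit rather than holding a spinlock across handle_stripe(). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { STRIPE_ACTIVE, STRIPE_HANDLE };	/* bit numbers, as in the header */

struct stripe_head {
	atomic_ulong state;
};

/* Stand-ins for the kernel's test_and_set_bit()/clear_bit(). */
static bool test_and_set_bit(int nr, atomic_ulong *addr)
{
	unsigned long mask = 1UL << nr;

	return atomic_fetch_or(addr, mask) & mask;
}

static void clear_bit(int nr, atomic_ulong *addr)
{
	atomic_fetch_and(addr, ~(1UL << nr));
}

static void handle_stripe(struct stripe_head *sh)
{
	/* Lost the race: someone else owns the stripe.  It will be
	 * revisited later because STRIPE_HANDLE stays set. */
	if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
		printf("stripe busy\n");
		return;
	}

	clear_bit(STRIPE_HANDLE, &sh->state);
	/* ... examine buffers, change state, record io/ops needed ... */
	clear_bit(STRIPE_ACTIVE, &sh->state);
	/* ... schedule the recorded io/ops, per the outline in the header ... */
}

int main(void)
{
	struct stripe_head sh = { .state = 1UL << STRIPE_HANDLE };

	handle_stripe(&sh);
	return 0;
}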
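NR_STRIPE_HASH_LOCKS partitions the stripe hash table and the inactive lists so unrelated stripes take different locks, while keeping the lock count small enough that the "take all the spinlocks" path stays within a sane locking depth, per the NOTE in the header. A userspace sketch of that partitioning follows; stripe_hash() here is a placeholder for the kernel's hashing of sh->sector, not its actual hash function.

/* Userspace sketch of hash-partitioned locking with a small lock array. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_STRIPE_HASH_LOCKS 8
#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)

static pthread_mutex_t hash_locks[NR_STRIPE_HASH_LOCKS];

static unsigned int stripe_hash(uint64_t sector)
{
	return (unsigned int)(sector * 2654435761u);	/* any stable hash */
}

/* The value stored in sh->hash_lock_index in the patched header. */
static unsigned int stripe_hash_locks_hash(uint64_t sector)
{
	return stripe_hash(sector) & STRIPE_HASH_LOCKS_MASK;
}

/* The "take all the spinlocks" path: a fixed ascending order avoids
 * deadlock, and a small NR_STRIPE_HASH_LOCKS bounds the locking depth
 * the NOTE warns about. */
static void lock_all_hash_locks(void)
{
	for (int i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
		pthread_mutex_lock(&hash_locks[i]);
}

static void unlock_all_hash_locks(void)
{
	for (int i = NR_STRIPE_HASH_LOCKS - 1; i >= 0; i--)
		pthread_mutex_unlock(&hash_locks[i]);
}

int main(void)
{
	for (int i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
		pthread_mutex_init(&hash_locks[i], NULL);

	uint64_t sector = 123456;
	unsigned int idx = stripe_hash_locks_hash(sector);

	/* Per-stripe work only contends within one of 8 partitions. */
	pthread_mutex_lock(&hash_locks[idx]);
	printf("sector %llu -> hash_lock_index %u\n",
	       (unsigned long long)sector, idx);
	pthread_mutex_unlock(&hash_locks[idx]);

	lock_all_hash_locks();
	unlock_all_hash_locks();
	return 0;
}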
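The new release_list/released_stripes pair (llist_node/llist_head) suggests that a stripe can be queued for release without taking device_lock: a lock-free push from the releasing context, then a later batch pop that does the locked work, with STRIPE_ON_RELEASE_LIST presumably guarding against double-queueing. Below is a userspace sketch of that push/del_all pattern; llist_add() and llist_del_all() imitate the kernel's llist API with C11 atomics rather than reusing it.

/* Userspace sketch of a lock-free release list. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct llist_node {
	struct llist_node *next;
};

struct stripe_head {
	struct llist_node release_list;
	int id;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct llist_node *_Atomic released_stripes;

/* Push is a single CAS loop, safe from any context without a lock. */
static void llist_add(struct llist_node *node,
		      struct llist_node *_Atomic *head)
{
	struct llist_node *first = atomic_load(head);

	do {
		node->next = first;
	} while (!atomic_compare_exchange_weak(head, &first, node));
}

/* Grab the whole batch at once; entries come back newest-first. */
static struct llist_node *llist_del_all(struct llist_node *_Atomic *head)
{
	return atomic_exchange(head, NULL);
}

int main(void)
{
	struct stripe_head sh1 = { .id = 1 }, sh2 = { .id = 2 };

	/* e.g. from an IO completion: queue the stripe, no locks held */
	llist_add(&sh1.release_list, &released_stripes);
	llist_add(&sh2.release_list, &released_stripes);

	/* later, from a context that may take device_lock */
	struct llist_node *node = llist_del_all(&released_stripes);

	while (node) {
		struct stripe_head *sh =
			container_of(node, struct stripe_head, release_list);

		node = node->next;
		printf("releasing stripe %d\n", sh->id);
		/* ... do the real release work under device_lock ... */
	}
	return 0;
}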
