Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--  drivers/md/raid5.c | 825
1 file changed, 570 insertions, 255 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7ff4f252ca1..6234b2e8458 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)  	return &conf->stripe_hashtbl[hash];  } +static inline int stripe_hash_locks_hash(sector_t sect) +{ +	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; +} + +static inline void lock_device_hash_lock(struct r5conf *conf, int hash) +{ +	spin_lock_irq(conf->hash_locks + hash); +	spin_lock(&conf->device_lock); +} + +static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) +{ +	spin_unlock(&conf->device_lock); +	spin_unlock_irq(conf->hash_locks + hash); +} + +static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) +{ +	int i; +	local_irq_disable(); +	spin_lock(conf->hash_locks); +	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) +		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); +	spin_lock(&conf->device_lock); +} + +static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) +{ +	int i; +	spin_unlock(&conf->device_lock); +	for (i = NR_STRIPE_HASH_LOCKS; i; i--) +		spin_unlock(conf->hash_locks + i - 1); +	local_irq_enable(); +} +  /* bio's attached to a stripe+device for I/O are linked together in bi_sector   * order without overlap.  There may be several bio's per stripe+device, and   * a bio could span several devices. @@ -97,7 +133,7 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)  static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)  {  	int sectors = bio_sectors(bio); -	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) +	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)  		return bio->bi_next;  	else  		return NULL; @@ -189,7 +225,7 @@ static void return_io(struct bio *return_bi)  		return_bi = bi->bi_next;  		bi->bi_next = NULL; -		bi->bi_size = 0; +		bi->bi_iter.bi_size = 0;  		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),  					 bi, 0);  		bio_endio(bi, 0); @@ -249,15 +285,19 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)  	}  } -static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, +			      struct list_head *temp_inactive_list)  {  	BUG_ON(!list_empty(&sh->lru));  	BUG_ON(atomic_read(&conf->active_stripes)==0);  	if (test_bit(STRIPE_HANDLE, &sh->state)) {  		if (test_bit(STRIPE_DELAYED, &sh->state) && -		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) +		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {  			list_add_tail(&sh->lru, &conf->delayed_list); -		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && +			if (atomic_read(&conf->preread_active_stripes) +			    < IO_THRESHOLD) +				md_wakeup_thread(conf->mddev->thread); +		} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&  			   sh->bm_seq - conf->seq_write > 0)  			list_add_tail(&sh->lru, &conf->bitmap_list);  		else { @@ -278,37 +318,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)  			    < IO_THRESHOLD)  				md_wakeup_thread(conf->mddev->thread);  		atomic_dec(&conf->active_stripes); -		if (!test_bit(STRIPE_EXPANDING, &sh->state)) { -			list_add_tail(&sh->lru, &conf->inactive_list); -			wake_up(&conf->wait_for_stripe); -			if (conf->retry_read_aligned) -				md_wakeup_thread(conf->mddev->thread); -		} +		if (!test_bit(STRIPE_EXPANDING, &sh->state)) +			list_add_tail(&sh->lru, temp_inactive_list);  	}  
} -static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, +			     struct list_head *temp_inactive_list)  {  	if (atomic_dec_and_test(&sh->count)) -		do_release_stripe(conf, sh); +		do_release_stripe(conf, sh, temp_inactive_list);  } -static struct llist_node *llist_reverse_order(struct llist_node *head) +/* + * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list + * + * Be careful: Only one task can add/delete stripes from temp_inactive_list at + * given time. Adding stripes only takes device lock, while deleting stripes + * only takes hash lock. + */ +static void release_inactive_stripe_list(struct r5conf *conf, +					 struct list_head *temp_inactive_list, +					 int hash)  { -	struct llist_node *new_head = NULL; +	int size; +	bool do_wakeup = false; +	unsigned long flags; -	while (head) { -		struct llist_node *tmp = head; -		head = head->next; -		tmp->next = new_head; -		new_head = tmp; +	if (hash == NR_STRIPE_HASH_LOCKS) { +		size = NR_STRIPE_HASH_LOCKS; +		hash = NR_STRIPE_HASH_LOCKS - 1; +	} else +		size = 1; +	while (size) { +		struct list_head *list = &temp_inactive_list[size - 1]; + +		/* +		 * We don't hold any lock here yet, get_active_stripe() might +		 * remove stripes from the list +		 */ +		if (!list_empty_careful(list)) { +			spin_lock_irqsave(conf->hash_locks + hash, flags); +			if (list_empty(conf->inactive_list + hash) && +			    !list_empty(list)) +				atomic_dec(&conf->empty_inactive_list_nr); +			list_splice_tail_init(list, conf->inactive_list + hash); +			do_wakeup = true; +			spin_unlock_irqrestore(conf->hash_locks + hash, flags); +		} +		size--; +		hash--;  	} -	return new_head; +	if (do_wakeup) { +		wake_up(&conf->wait_for_stripe); +		if (conf->retry_read_aligned) +			md_wakeup_thread(conf->mddev->thread); +	}  }  /* should hold conf->device_lock already */ -static int release_stripe_list(struct r5conf *conf) +static int release_stripe_list(struct r5conf *conf, +			       struct list_head *temp_inactive_list)  {  	struct stripe_head *sh;  	int count = 0; @@ -317,6 +388,8 @@ static int release_stripe_list(struct r5conf *conf)  	head = llist_del_all(&conf->released_stripes);  	head = llist_reverse_order(head);  	while (head) { +		int hash; +  		sh = llist_entry(head, struct stripe_head, release_list);  		head = llist_next(head);  		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ @@ -327,7 +400,8 @@ static int release_stripe_list(struct r5conf *conf)  		 * again, the count is always > 1. This is true for  		 * STRIPE_ON_UNPLUG_LIST bit too.  		 */ -		__release_stripe(conf, sh); +		hash = sh->hash_lock_index; +		__release_stripe(conf, sh, &temp_inactive_list[hash]);  		count++;  	} @@ -338,9 +412,17 @@ static void release_stripe(struct stripe_head *sh)  {  	struct r5conf *conf = sh->raid_conf;  	unsigned long flags; +	struct list_head list; +	int hash;  	bool wakeup; -	if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) +	/* Avoid release_list until the last reference. 
+	 */ +	if (atomic_add_unless(&sh->count, -1, 1)) +		return; + +	if (unlikely(!conf->mddev->thread) || +		test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))  		goto slow_path;  	wakeup = llist_add(&sh->release_list, &conf->released_stripes);  	if (wakeup) @@ -350,8 +432,11 @@ slow_path:  	local_irq_save(flags);  	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */  	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { -		do_release_stripe(conf, sh); +		INIT_LIST_HEAD(&list); +		hash = sh->hash_lock_index; +		do_release_stripe(conf, sh, &list);  		spin_unlock(&conf->device_lock); +		release_inactive_stripe_list(conf, &list, hash);  	}  	local_irq_restore(flags);  } @@ -376,18 +461,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)  /* find an idle stripe, make sure it is unhashed, and return it. */ -static struct stripe_head *get_free_stripe(struct r5conf *conf) +static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)  {  	struct stripe_head *sh = NULL;  	struct list_head *first; -	if (list_empty(&conf->inactive_list)) +	if (list_empty(conf->inactive_list + hash))  		goto out; -	first = conf->inactive_list.next; +	first = (conf->inactive_list + hash)->next;  	sh = list_entry(first, struct stripe_head, lru);  	list_del_init(first);  	remove_hash(sh);  	atomic_inc(&conf->active_stripes); +	BUG_ON(hash != sh->hash_lock_index); +	if (list_empty(conf->inactive_list + hash)) +		atomic_inc(&conf->empty_inactive_list_nr);  out:  	return sh;  } @@ -399,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh)  	int num = sh->raid_conf->pool_size;  	for (i = 0; i < num ; i++) { +		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);  		p = sh->dev[i].page;  		if (!p)  			continue; @@ -419,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh)  			return 1;  		}  		sh->dev[i].page = page; +		sh->dev[i].orig_page = page;  	}  	return 0;  } @@ -430,7 +520,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,  static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)  {  	struct r5conf *conf = sh->raid_conf; -	int i; +	int i, seq;  	BUG_ON(atomic_read(&sh->count) != 0);  	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); @@ -440,7 +530,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)  		(unsigned long long)sh->sector);  	remove_hash(sh); - +retry: +	seq = read_seqcount_begin(&conf->gen_lock);  	sh->generation = conf->generation - previous;  	sh->disks = previous ? 
conf->previous_raid_disks : conf->raid_disks;  	sh->sector = sector; @@ -462,6 +553,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)  		dev->flags = 0;  		raid5_build_block(sh, i, previous);  	} +	if (read_seqcount_retry(&conf->gen_lock, seq)) +		goto retry;  	insert_hash(conf, sh);  	sh->cpu = smp_processor_id();  } @@ -566,57 +659,55 @@ get_active_stripe(struct r5conf *conf, sector_t sector,  		  int previous, int noblock, int noquiesce)  {  	struct stripe_head *sh; +	int hash = stripe_hash_locks_hash(sector);  	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); -	spin_lock_irq(&conf->device_lock); +	spin_lock_irq(conf->hash_locks + hash);  	do {  		wait_event_lock_irq(conf->wait_for_stripe,  				    conf->quiesce == 0 || noquiesce, -				    conf->device_lock); +				    *(conf->hash_locks + hash));  		sh = __find_stripe(conf, sector, conf->generation - previous);  		if (!sh) {  			if (!conf->inactive_blocked) -				sh = get_free_stripe(conf); +				sh = get_free_stripe(conf, hash);  			if (noblock && sh == NULL)  				break;  			if (!sh) {  				conf->inactive_blocked = 1; -				wait_event_lock_irq(conf->wait_for_stripe, -						    !list_empty(&conf->inactive_list) && -						    (atomic_read(&conf->active_stripes) -						     < (conf->max_nr_stripes *3/4) -						     || !conf->inactive_blocked), -						    conf->device_lock); +				wait_event_lock_irq( +					conf->wait_for_stripe, +					!list_empty(conf->inactive_list + hash) && +					(atomic_read(&conf->active_stripes) +					 < (conf->max_nr_stripes * 3 / 4) +					 || !conf->inactive_blocked), +					*(conf->hash_locks + hash));  				conf->inactive_blocked = 0; -			} else -				init_stripe(sh, sector, previous); -		} else { -			if (atomic_read(&sh->count)) { -				BUG_ON(!list_empty(&sh->lru) -				    && !test_bit(STRIPE_EXPANDING, &sh->state) -				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) -				    && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));  			} else { +				init_stripe(sh, sector, previous); +				atomic_inc(&sh->count); +			} +		} else if (!atomic_inc_not_zero(&sh->count)) { +			spin_lock(&conf->device_lock); +			if (!atomic_read(&sh->count)) {  				if (!test_bit(STRIPE_HANDLE, &sh->state))  					atomic_inc(&conf->active_stripes); -				if (list_empty(&sh->lru) && -				    !test_bit(STRIPE_EXPANDING, &sh->state)) -					BUG(); +				BUG_ON(list_empty(&sh->lru) && +				       !test_bit(STRIPE_EXPANDING, &sh->state));  				list_del_init(&sh->lru);  				if (sh->group) {  					sh->group->stripes_cnt--;  					sh->group = NULL;  				}  			} +			atomic_inc(&sh->count); +			spin_unlock(&conf->device_lock);  		}  	} while (sh == NULL); -	if (sh) -		atomic_inc(&sh->count); - -	spin_unlock_irq(&conf->device_lock); +	spin_unlock_irq(conf->hash_locks + hash);  	return sh;  } @@ -766,18 +857,27 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)  				bi->bi_rw, i);  			atomic_inc(&sh->count);  			if (use_new_offset(conf, sh)) -				bi->bi_sector = (sh->sector +				bi->bi_iter.bi_sector = (sh->sector  						 + rdev->new_data_offset);  			else -				bi->bi_sector = (sh->sector +				bi->bi_iter.bi_sector = (sh->sector  						 + rdev->data_offset);  			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) -				bi->bi_rw |= REQ_FLUSH; +				bi->bi_rw |= REQ_NOMERGE; +			if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) +				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); +			sh->dev[i].vec.bv_page = sh->dev[i].page;  			bi->bi_vcnt = 1;  			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;  	
		bi->bi_io_vec[0].bv_offset = 0; -			bi->bi_size = STRIPE_SIZE; +			bi->bi_iter.bi_size = STRIPE_SIZE; +			/* +			 * If this is discard request, set bi_vcnt 0. We don't +			 * want to confuse SCSI because SCSI will replace payload +			 */ +			if (rw & REQ_DISCARD) +				bi->bi_vcnt = 0;  			if (rrdev)  				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); @@ -807,15 +907,24 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)  				rbi->bi_rw, i);  			atomic_inc(&sh->count);  			if (use_new_offset(conf, sh)) -				rbi->bi_sector = (sh->sector +				rbi->bi_iter.bi_sector = (sh->sector  						  + rrdev->new_data_offset);  			else -				rbi->bi_sector = (sh->sector +				rbi->bi_iter.bi_sector = (sh->sector  						  + rrdev->data_offset); +			if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) +				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); +			sh->dev[i].rvec.bv_page = sh->dev[i].page;  			rbi->bi_vcnt = 1;  			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;  			rbi->bi_io_vec[0].bv_offset = 0; -			rbi->bi_size = STRIPE_SIZE; +			rbi->bi_iter.bi_size = STRIPE_SIZE; +			/* +			 * If this is discard request, set bi_vcnt 0. We don't +			 * want to confuse SCSI because SCSI will replace payload +			 */ +			if (rw & REQ_DISCARD) +				rbi->bi_vcnt = 0;  			if (conf->mddev->gendisk)  				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),  						      rbi, disk_devt(conf->mddev->gendisk), @@ -834,27 +943,28 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)  }  static struct dma_async_tx_descriptor * -async_copy_data(int frombio, struct bio *bio, struct page *page, -	sector_t sector, struct dma_async_tx_descriptor *tx) +async_copy_data(int frombio, struct bio *bio, struct page **page, +	sector_t sector, struct dma_async_tx_descriptor *tx, +	struct stripe_head *sh)  { -	struct bio_vec *bvl; +	struct bio_vec bvl; +	struct bvec_iter iter;  	struct page *bio_page; -	int i;  	int page_offset;  	struct async_submit_ctl submit;  	enum async_tx_flags flags = 0; -	if (bio->bi_sector >= sector) -		page_offset = (signed)(bio->bi_sector - sector) * 512; +	if (bio->bi_iter.bi_sector >= sector) +		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;  	else -		page_offset = (signed)(sector - bio->bi_sector) * -512; +		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;  	if (frombio)  		flags |= ASYNC_TX_FENCE;  	init_async_submit(&submit, flags, tx, NULL, NULL, NULL); -	bio_for_each_segment(bvl, bio, i) { -		int len = bvl->bv_len; +	bio_for_each_segment(bvl, bio, iter) { +		int len = bvl.bv_len;  		int clen;  		int b_offset = 0; @@ -870,13 +980,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,  			clen = len;  		if (clen > 0) { -			b_offset += bvl->bv_offset; -			bio_page = bvl->bv_page; -			if (frombio) -				tx = async_memcpy(page, bio_page, page_offset, +			b_offset += bvl.bv_offset; +			bio_page = bvl.bv_page; +			if (frombio) { +				if (sh->raid_conf->skip_copy && +				    b_offset == 0 && page_offset == 0 && +				    clen == STRIPE_SIZE) +					*page = bio_page; +				else +					tx = async_memcpy(*page, bio_page, page_offset,  						  b_offset, clen, &submit); -			else -				tx = async_memcpy(bio_page, page, b_offset, +			} else +				tx = async_memcpy(bio_page, *page, b_offset,  						  page_offset, clen, &submit);  		}  		/* chain the operations */ @@ -914,7 +1029,7 @@ static void ops_complete_biofill(void *stripe_head_ref)  			BUG_ON(!dev->read);  			rbi = dev->read;  			dev->read = NULL; -			while (rbi && rbi->bi_sector < +			
while (rbi && rbi->bi_iter.bi_sector <  				dev->sector + STRIPE_SECTORS) {  				rbi2 = r5_next_bio(rbi, dev->sector);  				if (!raid5_dec_bi_active_stripes(rbi)) { @@ -950,10 +1065,10 @@ static void ops_run_biofill(struct stripe_head *sh)  			dev->read = rbi = dev->toread;  			dev->toread = NULL;  			spin_unlock_irq(&sh->stripe_lock); -			while (rbi && rbi->bi_sector < +			while (rbi && rbi->bi_iter.bi_sector <  				dev->sector + STRIPE_SECTORS) { -				tx = async_copy_data(0, rbi, dev->page, -					dev->sector, tx); +				tx = async_copy_data(0, rbi, &dev->page, +					dev->sector, tx, sh);  				rbi = r5_next_bio(rbi, dev->sector);  			}  		} @@ -1291,8 +1406,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)  			BUG_ON(dev->written);  			wbi = dev->written = chosen;  			spin_unlock_irq(&sh->stripe_lock); +			WARN_ON(dev->page != dev->orig_page); -			while (wbi && wbi->bi_sector < +			while (wbi && wbi->bi_iter.bi_sector <  				dev->sector + STRIPE_SECTORS) {  				if (wbi->bi_rw & REQ_FUA)  					set_bit(R5_WantFUA, &dev->flags); @@ -1300,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)  					set_bit(R5_SyncIO, &dev->flags);  				if (wbi->bi_rw & REQ_DISCARD)  					set_bit(R5_Discard, &dev->flags); -				else -					tx = async_copy_data(1, wbi, dev->page, -						dev->sector, tx); +				else { +					tx = async_copy_data(1, wbi, &dev->page, +						dev->sector, tx, sh); +					if (dev->page != dev->orig_page) { +						set_bit(R5_SkipCopy, &dev->flags); +						clear_bit(R5_UPTODATE, &dev->flags); +						clear_bit(R5_OVERWRITE, &dev->flags); +					} +				}  				wbi = r5_next_bio(wbi, dev->sector);  			}  		} @@ -1333,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)  		struct r5dev *dev = &sh->dev[i];  		if (dev->written || i == pd_idx || i == qd_idx) { -			if (!discard) +			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))  				set_bit(R5_UPTODATE, &dev->flags);  			if (fua)  				set_bit(R5_WantFUA, &dev->flags); @@ -1584,7 +1706,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)  	put_cpu();  } -static int grow_one_stripe(struct r5conf *conf) +static int grow_one_stripe(struct r5conf *conf, int hash)  {  	struct stripe_head *sh;  	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); @@ -1600,6 +1722,7 @@ static int grow_one_stripe(struct r5conf *conf)  		kmem_cache_free(conf->slab_cache, sh);  		return 0;  	} +	sh->hash_lock_index = hash;  	/* we just created an active stripe so... 
*/  	atomic_set(&sh->count, 1);  	atomic_inc(&conf->active_stripes); @@ -1612,6 +1735,7 @@ static int grow_stripes(struct r5conf *conf, int num)  {  	struct kmem_cache *sc;  	int devs = max(conf->raid_disks, conf->previous_raid_disks); +	int hash;  	if (conf->mddev->gendisk)  		sprintf(conf->cache_name[0], @@ -1629,9 +1753,13 @@ static int grow_stripes(struct r5conf *conf, int num)  		return 1;  	conf->slab_cache = sc;  	conf->pool_size = devs; -	while (num--) -		if (!grow_one_stripe(conf)) +	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; +	while (num--) { +		if (!grow_one_stripe(conf, hash))  			return 1; +		conf->max_nr_stripes++; +		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; +	}  	return 0;  } @@ -1689,6 +1817,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)  	int err;  	struct kmem_cache *sc;  	int i; +	int hash, cnt;  	if (newsize <= conf->pool_size)  		return 0; /* never bother to shrink */ @@ -1728,19 +1857,31 @@ static int resize_stripes(struct r5conf *conf, int newsize)  	 * OK, we have enough stripes, start collecting inactive  	 * stripes and copying them over  	 */ +	hash = 0; +	cnt = 0;  	list_for_each_entry(nsh, &newstripes, lru) { -		spin_lock_irq(&conf->device_lock); -		wait_event_lock_irq(conf->wait_for_stripe, -				    !list_empty(&conf->inactive_list), -				    conf->device_lock); -		osh = get_free_stripe(conf); -		spin_unlock_irq(&conf->device_lock); +		lock_device_hash_lock(conf, hash); +		wait_event_cmd(conf->wait_for_stripe, +				    !list_empty(conf->inactive_list + hash), +				    unlock_device_hash_lock(conf, hash), +				    lock_device_hash_lock(conf, hash)); +		osh = get_free_stripe(conf, hash); +		unlock_device_hash_lock(conf, hash);  		atomic_set(&nsh->count, 1); -		for(i=0; i<conf->pool_size; i++) +		for(i=0; i<conf->pool_size; i++) {  			nsh->dev[i].page = osh->dev[i].page; +			nsh->dev[i].orig_page = osh->dev[i].page; +		}  		for( ; i<newsize; i++)  			nsh->dev[i].page = NULL; +		nsh->hash_lock_index = hash;  		kmem_cache_free(conf->slab_cache, osh); +		cnt++; +		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + +		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { +			hash++; +			cnt = 0; +		}  	}  	kmem_cache_destroy(conf->slab_cache); @@ -1786,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)  			if (nsh->dev[i].page == NULL) {  				struct page *p = alloc_page(GFP_NOIO);  				nsh->dev[i].page = p; +				nsh->dev[i].orig_page = p;  				if (!p)  					err = -ENOMEM;  			} @@ -1799,13 +1941,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)  	return err;  } -static int drop_one_stripe(struct r5conf *conf) +static int drop_one_stripe(struct r5conf *conf, int hash)  {  	struct stripe_head *sh; -	spin_lock_irq(&conf->device_lock); -	sh = get_free_stripe(conf); -	spin_unlock_irq(&conf->device_lock); +	spin_lock_irq(conf->hash_locks + hash); +	sh = get_free_stripe(conf, hash); +	spin_unlock_irq(conf->hash_locks + hash);  	if (!sh)  		return 0;  	BUG_ON(atomic_read(&sh->count)); @@ -1817,8 +1959,10 @@ static int drop_one_stripe(struct r5conf *conf)  static void shrink_stripes(struct r5conf *conf)  { -	while (drop_one_stripe(conf)) -		; +	int hash; +	for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) +		while (drop_one_stripe(conf, hash)) +			;  	if (conf->slab_cache)  		kmem_cache_destroy(conf->slab_cache); @@ -1923,6 +2067,9 @@ static void raid5_end_read_request(struct bio * bi, int error)  			       mdname(conf->mddev), bdn);  		else  			retry = 1; +		if (set_bad && test_bit(In_sync, 
&rdev->flags) +		    && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) +			retry = 1;  		if (retry)  			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {  				set_bit(R5_ReadError, &sh->dev[i].flags); @@ -1991,6 +2138,7 @@ static void raid5_end_write_request(struct bio *bi, int error)  			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);  	} else {  		if (!uptodate) { +			set_bit(STRIPE_DEGRADED, &sh->state);  			set_bit(WriteErrorSeen, &rdev->flags);  			set_bit(R5_WriteError, &sh->dev[i].flags);  			if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@ -2017,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error)  }  static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); -	 +  static void raid5_build_block(struct stripe_head *sh, int i, int previous)  {  	struct r5dev *dev = &sh->dev[i];  	bio_init(&dev->req);  	dev->req.bi_io_vec = &dev->vec; -	dev->req.bi_vcnt++; -	dev->req.bi_max_vecs++; +	dev->req.bi_max_vecs = 1;  	dev->req.bi_private = sh; -	dev->vec.bv_page = dev->page;  	bio_init(&dev->rreq);  	dev->rreq.bi_io_vec = &dev->rvec; -	dev->rreq.bi_vcnt++; -	dev->rreq.bi_max_vecs++; +	dev->rreq.bi_max_vecs = 1;  	dev->rreq.bi_private = sh; -	dev->rvec.bv_page = dev->page;  	dev->flags = 0;  	dev->sector = compute_blocknr(sh, i, previous); @@ -2494,7 +2638,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in  	int firstwrite=0;  	pr_debug("adding bi b#%llu to stripe s#%llu\n", -		(unsigned long long)bi->bi_sector, +		(unsigned long long)bi->bi_iter.bi_sector,  		(unsigned long long)sh->sector);  	/* @@ -2512,12 +2656,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in  			firstwrite = 1;  	} else  		bip = &sh->dev[dd_idx].toread; -	while (*bip && (*bip)->bi_sector < bi->bi_sector) { -		if (bio_end_sector(*bip) > bi->bi_sector) +	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { +		if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)  			goto overlap;  		bip = & (*bip)->bi_next;  	} -	if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) +	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))  		goto overlap;  	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); @@ -2531,7 +2675,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in  		sector_t sector = sh->dev[dd_idx].sector;  		for (bi=sh->dev[dd_idx].towrite;  		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && -			     bi && bi->bi_sector <= sector; +			     bi && bi->bi_iter.bi_sector <= sector;  		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {  			if (bio_end_sector(bi) >= sector)  				sector = bio_end_sector(bi); @@ -2541,7 +2685,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in  	}  	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", -		(unsigned long long)(*bip)->bi_sector, +		(unsigned long long)(*bip)->bi_iter.bi_sector,  		(unsigned long long)sh->sector, dd_idx);  	spin_unlock_irq(&sh->stripe_lock); @@ -2616,7 +2760,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,  		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))  			wake_up(&conf->wait_for_overlap); -		while (bi && bi->bi_sector < +		while (bi && bi->bi_iter.bi_sector <  			sh->dev[i].sector + STRIPE_SECTORS) {  			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);  			clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -2634,8 +2778,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,  		/* and fail all 'written' 
*/  		bi = sh->dev[i].written;  		sh->dev[i].written = NULL; +		if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { +			WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); +			sh->dev[i].page = sh->dev[i].orig_page; +		} +  		if (bi) bitmap_end = 1; -		while (bi && bi->bi_sector < +		while (bi && bi->bi_iter.bi_sector <  		       sh->dev[i].sector + STRIPE_SECTORS) {  			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);  			clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -2659,7 +2808,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,  			spin_unlock_irq(&sh->stripe_lock);  			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))  				wake_up(&conf->wait_for_overlap); -			while (bi && bi->bi_sector < +			while (bi && bi->bi_iter.bi_sector <  			       sh->dev[i].sector + STRIPE_SECTORS) {  				struct bio *nextbi =  					r5_next_bio(bi, sh->dev[i].sector); @@ -2770,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,  	     (s->failed >= 1 && fdev[0]->toread) ||  	     (s->failed >= 2 && fdev[1]->toread) ||  	     (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && +	      (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&  	      !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || -	     (sh->raid_conf->level == 6 && s->failed && s->to_write))) { +	     (sh->raid_conf->level == 6 && s->failed && s->to_write && +	      s->to_write < sh->raid_conf->raid_disks - 2 && +	      (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {  		/* we would like to get this block, possibly by computing it,  		 * otherwise read it if the backing disk is insync  		 */ @@ -2875,15 +3027,20 @@ static void handle_stripe_clean_event(struct r5conf *conf,  			dev = &sh->dev[i];  			if (!test_bit(R5_LOCKED, &dev->flags) &&  			    (test_bit(R5_UPTODATE, &dev->flags) || -			     test_bit(R5_Discard, &dev->flags))) { +			     test_bit(R5_Discard, &dev->flags) || +			     test_bit(R5_SkipCopy, &dev->flags))) {  				/* We can return any write requests */  				struct bio *wbi, *wbi2;  				pr_debug("Return write for disc %d\n", i);  				if (test_and_clear_bit(R5_Discard, &dev->flags))  					clear_bit(R5_UPTODATE, &dev->flags); +				if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { +					WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); +					dev->page = dev->orig_page; +				}  				wbi = dev->written;  				dev->written = NULL; -				while (wbi && wbi->bi_sector < +				while (wbi && wbi->bi_iter.bi_sector <  					dev->sector + STRIPE_SECTORS) {  					wbi2 = r5_next_bio(wbi, dev->sector);  					if (!raid5_dec_bi_active_stripes(wbi)) { @@ -2899,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,  						0);  			} else if (test_bit(R5_Discard, &dev->flags))  				discard_pending = 1; +			WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); +			WARN_ON(dev->page != dev->orig_page);  		}  	if (!discard_pending &&  	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { @@ -2910,6 +3069,14 @@ static void handle_stripe_clean_event(struct r5conf *conf,  		}  		/* now that discard is done we can proceed with any sync */  		clear_bit(STRIPE_DISCARD, &sh->state); +		/* +		 * SCSI discard will change some bio fields and the stripe has +		 * no updated data, so remove it from hash list and the stripe +		 * will be reinitialized +		 */ +		spin_lock_irq(&conf->device_lock); +		remove_hash(sh); +		spin_unlock_irq(&conf->device_lock);  		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))  			
set_bit(STRIPE_HANDLE, &sh->state); @@ -2962,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,  		    !test_bit(R5_LOCKED, &dev->flags) &&  		    !(test_bit(R5_UPTODATE, &dev->flags) ||  		    test_bit(R5_Wantcompute, &dev->flags))) { -			if (test_bit(R5_Insync, &dev->flags)) rcw++; +			if (test_bit(R5_Insync, &dev->flags)) +				rcw++;  			else  				rcw += 2*disks;  		} @@ -2983,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,  			    !(test_bit(R5_UPTODATE, &dev->flags) ||  			    test_bit(R5_Wantcompute, &dev->flags)) &&  			    test_bit(R5_Insync, &dev->flags)) { -				if ( -				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { -					pr_debug("Read_old block " -						 "%d for r-m-w\n", i); +				if (test_bit(STRIPE_PREREAD_ACTIVE, +					     &sh->state)) { +					pr_debug("Read_old block %d for r-m-w\n", +						 i);  					set_bit(R5_LOCKED, &dev->flags);  					set_bit(R5_Wantread, &dev->flags);  					s->locked++; @@ -3009,10 +3177,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,  			    !(test_bit(R5_UPTODATE, &dev->flags) ||  			      test_bit(R5_Wantcompute, &dev->flags))) {  				rcw++; -				if (!test_bit(R5_Insync, &dev->flags)) -					continue; /* it's a failed drive */ -				if ( -				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { +				if (test_bit(R5_Insync, &dev->flags) && +				    test_bit(STRIPE_PREREAD_ACTIVE, +					     &sh->state)) {  					pr_debug("Read_old block "  						"%d for Reconstruct\n", i);  					set_bit(R5_LOCKED, &dev->flags); @@ -3481,7 +3648,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)  			 */  			set_bit(R5_Insync, &dev->flags); -		if (rdev && test_bit(R5_WriteError, &dev->flags)) { +		if (test_bit(R5_WriteError, &dev->flags)) {  			/* This flag does not apply to '.replacement'  			 * only to .rdev, so make sure to check that*/  			struct md_rdev *rdev2 = rcu_dereference( @@ -3494,7 +3661,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)  			} else  				clear_bit(R5_WriteError, &dev->flags);  		} -		if (rdev && test_bit(R5_MadeGood, &dev->flags)) { +		if (test_bit(R5_MadeGood, &dev->flags)) {  			/* This flag does not apply to '.replacement'  			 * only to .rdev, so make sure to check that*/  			struct md_rdev *rdev2 = rcu_dereference( @@ -3894,7 +4061,8 @@ static void raid5_activate_delayed(struct r5conf *conf)  	}  } -static void activate_bit_delay(struct r5conf *conf) +static void activate_bit_delay(struct r5conf *conf, +	struct list_head *temp_inactive_list)  {  	/* device_lock is held */  	struct list_head head; @@ -3902,9 +4070,11 @@ static void activate_bit_delay(struct r5conf *conf)  	list_del_init(&conf->bitmap_list);  	while (!list_empty(&head)) {  		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); +		int hash;  		list_del_init(&sh->lru);  		atomic_inc(&sh->count); -		__release_stripe(conf, sh); +		hash = sh->hash_lock_index; +		__release_stripe(conf, sh, &temp_inactive_list[hash]);  	}  } @@ -3920,7 +4090,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)  		return 1;  	if (conf->quiesce)  		return 1; -	if (list_empty_careful(&conf->inactive_list)) +	if (atomic_read(&conf->empty_inactive_list_nr))  		return 1;  	return 0; @@ -3964,7 +4134,7 @@ static int raid5_mergeable_bvec(struct request_queue *q,  static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)  { -	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); +	sector_t sector = bio->bi_iter.bi_sector + 
get_start_sect(bio->bi_bdev);  	unsigned int chunk_sectors = mddev->chunk_sectors;  	unsigned int bio_sectors = bio_sectors(bio); @@ -4101,9 +4271,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)  	/*  	 *	compute position  	 */ -	align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector, -						    0, -						    &dd_idx, NULL); +	align_bi->bi_iter.bi_sector = +		raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, +				     0, &dd_idx, NULL);  	end_sector = bio_end_sector(align_bi);  	rcu_read_lock(); @@ -4128,7 +4298,8 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)  		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);  		if (!bio_fits_rdev(align_bi) || -		    is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), +		    is_badblock(rdev, align_bi->bi_iter.bi_sector, +				bio_sectors(align_bi),  				&first_bad, &bad_sectors)) {  			/* too big in some way, or has a known bad block */  			bio_put(align_bi); @@ -4137,7 +4308,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)  		}  		/* No reshape active, so we can trust rdev->data_offset */ -		align_bi->bi_sector += rdev->data_offset; +		align_bi->bi_iter.bi_sector += rdev->data_offset;  		spin_lock_irq(&conf->device_lock);  		wait_event_lock_irq(conf->wait_for_stripe, @@ -4149,7 +4320,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)  		if (mddev->gendisk)  			trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),  					      align_bi, disk_devt(mddev->gendisk), -					      raid_bio->bi_sector); +					      raid_bio->bi_iter.bi_sector);  		generic_make_request(align_bi);  		return 1;  	} else { @@ -4242,14 +4413,14 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)  		sh->group = NULL;  	}  	list_del_init(&sh->lru); -	atomic_inc(&sh->count); -	BUG_ON(atomic_read(&sh->count) != 1); +	BUG_ON(atomic_inc_return(&sh->count) != 1);  	return sh;  }  struct raid5_plug_cb {  	struct blk_plug_cb	cb;  	struct list_head	list; +	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];  };  static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) @@ -4260,6 +4431,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)  	struct mddev *mddev = cb->cb.data;  	struct r5conf *conf = mddev->private;  	int cnt = 0; +	int hash;  	if (cb->list.next && !list_empty(&cb->list)) {  		spin_lock_irq(&conf->device_lock); @@ -4271,17 +4443,20 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)  			 * STRIPE_ON_UNPLUG_LIST clear but the stripe  			 * is still in our list  			 */ -			smp_mb__before_clear_bit(); +			smp_mb__before_atomic();  			clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);  			/*  			 * STRIPE_ON_RELEASE_LIST could be set here. 
In that  			 * case, the count is always > 1 here  			 */ -			__release_stripe(conf, sh); +			hash = sh->hash_lock_index; +			__release_stripe(conf, sh, &cb->temp_inactive_list[hash]);  			cnt++;  		}  		spin_unlock_irq(&conf->device_lock);  	} +	release_inactive_stripe_list(conf, cb->temp_inactive_list, +				     NR_STRIPE_HASH_LOCKS);  	if (mddev->queue)  		trace_block_unplug(mddev->queue, cnt, !from_schedule);  	kfree(cb); @@ -4302,8 +4477,12 @@ static void release_stripe_plug(struct mddev *mddev,  	cb = container_of(blk_cb, struct raid5_plug_cb, cb); -	if (cb->list.next == NULL) +	if (cb->list.next == NULL) { +		int i;  		INIT_LIST_HEAD(&cb->list); +		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) +			INIT_LIST_HEAD(cb->temp_inactive_list + i); +	}  	if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))  		list_add_tail(&sh->lru, &cb->list); @@ -4323,8 +4502,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)  		/* Skip discard while reshape is happening */  		return; -	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); -	last_sector = bi->bi_sector + (bi->bi_size>>9); +	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); +	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);  	bi->bi_next = NULL;  	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ @@ -4410,6 +4589,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)  	struct stripe_head *sh;  	const int rw = bio_data_dir(bi);  	int remaining; +	DEFINE_WAIT(w); +	bool do_prepare;  	if (unlikely(bi->bi_rw & REQ_FLUSH)) {  		md_flush_request(mddev, bi); @@ -4428,20 +4609,23 @@ static void make_request(struct mddev *mddev, struct bio * bi)  		return;  	} -	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); +	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);  	last_sector = bio_end_sector(bi);  	bi->bi_next = NULL;  	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */ +	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);  	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { -		DEFINE_WAIT(w);  		int previous;  		int seq; +		do_prepare = false;  	retry:  		seq = read_seqcount_begin(&conf->gen_lock);  		previous = 0; -		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); +		if (do_prepare) +			prepare_to_wait(&conf->wait_for_overlap, &w, +				TASK_UNINTERRUPTIBLE);  		if (unlikely(conf->reshape_progress != MaxSector)) {  			/* spinlock is needed as reshape_progress may be  			 * 64bit on a 32bit platform, and so it might be @@ -4462,6 +4646,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)  				    : logical_sector >= conf->reshape_safe) {  					spin_unlock_irq(&conf->device_lock);  					schedule(); +					do_prepare = true;  					goto retry;  				}  			} @@ -4498,6 +4683,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)  				if (must_retry) {  					release_stripe(sh);  					schedule(); +					do_prepare = true;  					goto retry;  				}  			} @@ -4521,8 +4707,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)  				prepare_to_wait(&conf->wait_for_overlap,  						&w, TASK_INTERRUPTIBLE);  				if (logical_sector >= mddev->suspend_lo && -				    logical_sector < mddev->suspend_hi) +				    logical_sector < mddev->suspend_hi) {  					schedule(); +					do_prepare = true; +				}  				goto retry;  			} @@ -4535,9 +4723,9 @@ static void make_request(struct mddev *mddev, struct bio * bi)  				
md_wakeup_thread(mddev->thread);  				release_stripe(sh);  				schedule(); +				do_prepare = true;  				goto retry;  			} -			finish_wait(&conf->wait_for_overlap, &w);  			set_bit(STRIPE_HANDLE, &sh->state);  			clear_bit(STRIPE_DELAYED, &sh->state);  			if ((bi->bi_rw & REQ_SYNC) && @@ -4547,10 +4735,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)  		} else {  			/* cannot get stripe for read-ahead, just give-up */  			clear_bit(BIO_UPTODATE, &bi->bi_flags); -			finish_wait(&conf->wait_for_overlap, &w);  			break;  		}  	} +	finish_wait(&conf->wait_for_overlap, &w);  	remaining = raid5_dec_bi_active_stripes(bi);  	if (remaining == 0) { @@ -4686,14 +4874,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk  	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {  		/* Cannot proceed until we've updated the superblock... */  		wait_event(conf->wait_for_overlap, -			   atomic_read(&conf->reshape_stripes)==0); +			   atomic_read(&conf->reshape_stripes)==0 +			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); +		if (atomic_read(&conf->reshape_stripes) != 0) +			return 0;  		mddev->reshape_position = conf->reshape_progress;  		mddev->curr_resync_completed = sector_nr;  		conf->reshape_checkpoint = jiffies;  		set_bit(MD_CHANGE_DEVS, &mddev->flags);  		md_wakeup_thread(mddev->thread);  		wait_event(mddev->sb_wait, mddev->flags == 0 || -			   kthread_should_stop()); +			   test_bit(MD_RECOVERY_INTR, &mddev->recovery)); +		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) +			return 0;  		spin_lock_irq(&conf->device_lock);  		conf->reshape_safe = mddev->reshape_position;  		spin_unlock_irq(&conf->device_lock); @@ -4776,7 +4969,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk  	    >= mddev->resync_max - mddev->curr_resync_completed) {  		/* Cannot proceed until we've updated the superblock... 
*/  		wait_event(conf->wait_for_overlap, -			   atomic_read(&conf->reshape_stripes) == 0); +			   atomic_read(&conf->reshape_stripes) == 0 +			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); +		if (atomic_read(&conf->reshape_stripes) != 0) +			goto ret;  		mddev->reshape_position = conf->reshape_progress;  		mddev->curr_resync_completed = sector_nr;  		conf->reshape_checkpoint = jiffies; @@ -4784,13 +4980,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk  		md_wakeup_thread(mddev->thread);  		wait_event(mddev->sb_wait,  			   !test_bit(MD_CHANGE_DEVS, &mddev->flags) -			   || kthread_should_stop()); +			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); +		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) +			goto ret;  		spin_lock_irq(&conf->device_lock);  		conf->reshape_safe = mddev->reshape_position;  		spin_unlock_irq(&conf->device_lock);  		wake_up(&conf->wait_for_overlap);  		sysfs_notify(&mddev->kobj, NULL, "sync_completed");  	} +ret:  	return reshape_sectors;  } @@ -4875,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int  	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);  	set_bit(STRIPE_SYNC_REQUESTED, &sh->state); +	set_bit(STRIPE_HANDLE, &sh->state); -	handle_stripe(sh);  	release_stripe(sh);  	return STRIPE_SECTORS; @@ -4901,7 +5100,8 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)  	int remaining;  	int handled = 0; -	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); +	logical_sector = raid_bio->bi_iter.bi_sector & +		~((sector_t)STRIPE_SECTORS-1);  	sector = raid5_compute_sector(conf, logical_sector,  				      0, &dd_idx, NULL);  	last_sector = bio_end_sector(raid_bio); @@ -4915,7 +5115,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)  			/* already done this stripe */  			continue; -		sh = get_active_stripe(conf, sector, 0, 1, 0); +		sh = get_active_stripe(conf, sector, 0, 1, 1);  		if (!sh) {  			/* failed to get a stripe - must wait */ @@ -4948,27 +5148,45 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)  }  static int handle_active_stripes(struct r5conf *conf, int group, -				 struct r5worker *worker) +				 struct r5worker *worker, +				 struct list_head *temp_inactive_list)  {  	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; -	int i, batch_size = 0; +	int i, batch_size = 0, hash; +	bool release_inactive = false;  	while (batch_size < MAX_STRIPE_BATCH &&  			(sh = __get_priority_stripe(conf, group)) != NULL)  		batch[batch_size++] = sh; -	if (batch_size == 0) -		return batch_size; +	if (batch_size == 0) { +		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) +			if (!list_empty(temp_inactive_list + i)) +				break; +		if (i == NR_STRIPE_HASH_LOCKS) +			return batch_size; +		release_inactive = true; +	}  	spin_unlock_irq(&conf->device_lock); +	release_inactive_stripe_list(conf, temp_inactive_list, +				     NR_STRIPE_HASH_LOCKS); + +	if (release_inactive) { +		spin_lock_irq(&conf->device_lock); +		return 0; +	} +  	for (i = 0; i < batch_size; i++)  		handle_stripe(batch[i]);  	cond_resched();  	spin_lock_irq(&conf->device_lock); -	for (i = 0; i < batch_size; i++) -		__release_stripe(conf, batch[i]); +	for (i = 0; i < batch_size; i++) { +		hash = batch[i]->hash_lock_index; +		__release_stripe(conf, batch[i], &temp_inactive_list[hash]); +	}  	return batch_size;  } @@ -4989,9 +5207,10 @@ static void raid5_do_work(struct work_struct *work)  	while (1) {  		int 
batch_size, released; -		released = release_stripe_list(conf); +		released = release_stripe_list(conf, worker->temp_inactive_list); -		batch_size = handle_active_stripes(conf, group_id, worker); +		batch_size = handle_active_stripes(conf, group_id, worker, +						   worker->temp_inactive_list);  		worker->working = false;  		if (!batch_size && !released)  			break; @@ -5030,7 +5249,7 @@ static void raid5d(struct md_thread *thread)  		struct bio *bio;  		int batch_size, released; -		released = release_stripe_list(conf); +		released = release_stripe_list(conf, conf->temp_inactive_list);  		if (  		    !list_empty(&conf->bitmap_list)) { @@ -5040,7 +5259,7 @@ static void raid5d(struct md_thread *thread)  			bitmap_unplug(mddev->bitmap);  			spin_lock_irq(&conf->device_lock);  			conf->seq_write = conf->seq_flush; -			activate_bit_delay(conf); +			activate_bit_delay(conf, conf->temp_inactive_list);  		}  		raid5_activate_delayed(conf); @@ -5054,7 +5273,8 @@ static void raid5d(struct md_thread *thread)  			handled++;  		} -		batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); +		batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, +						   conf->temp_inactive_list);  		if (!batch_size && !released)  			break;  		handled += batch_size; @@ -5090,22 +5310,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)  {  	struct r5conf *conf = mddev->private;  	int err; +	int hash;  	if (size <= 16 || size > 32768)  		return -EINVAL; +	hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;  	while (size < conf->max_nr_stripes) { -		if (drop_one_stripe(conf)) +		if (drop_one_stripe(conf, hash))  			conf->max_nr_stripes--;  		else  			break; +		hash--; +		if (hash < 0) +			hash = NR_STRIPE_HASH_LOCKS - 1;  	}  	err = md_allow_write(mddev);  	if (err)  		return err; +	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;  	while (size > conf->max_nr_stripes) { -		if (grow_one_stripe(conf)) +		if (grow_one_stripe(conf, hash))  			conf->max_nr_stripes++;  		else break; +		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;  	}  	return 0;  } @@ -5171,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,  					raid5_store_preread_threshold);  static ssize_t +raid5_show_skip_copy(struct mddev *mddev, char *page) +{ +	struct r5conf *conf = mddev->private; +	if (conf) +		return sprintf(page, "%d\n", conf->skip_copy); +	else +		return 0; +} + +static ssize_t +raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) +{ +	struct r5conf *conf = mddev->private; +	unsigned long new; +	if (len >= PAGE_SIZE) +		return -EINVAL; +	if (!conf) +		return -ENODEV; + +	if (kstrtoul(page, 10, &new)) +		return -EINVAL; +	new = !!new; +	if (new == conf->skip_copy) +		return len; + +	mddev_suspend(mddev); +	conf->skip_copy = new; +	if (new) +		mddev->queue->backing_dev_info.capabilities |= +						BDI_CAP_STABLE_WRITES; +	else +		mddev->queue->backing_dev_info.capabilities &= +						~BDI_CAP_STABLE_WRITES; +	mddev_resume(mddev); +	return len; +} + +static struct md_sysfs_entry +raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, +					raid5_show_skip_copy, +					raid5_store_skip_copy); + + +static ssize_t  stripe_cache_active_show(struct mddev *mddev, char *page)  {  	struct r5conf *conf = mddev->private; @@ -5193,15 +5464,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)  		return 0;  } -static int alloc_thread_groups(struct r5conf *conf, int cnt); +static int alloc_thread_groups(struct r5conf *conf, int cnt, +			       int *group_cnt, +			       int 
*worker_cnt_per_group, +			       struct r5worker_group **worker_groups);  static ssize_t  raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)  {  	struct r5conf *conf = mddev->private;  	unsigned long new;  	int err; -	struct r5worker_group *old_groups; -	int old_group_cnt; +	struct r5worker_group *new_groups, *old_groups; +	int group_cnt, worker_cnt_per_group;  	if (len >= PAGE_SIZE)  		return -EINVAL; @@ -5217,14 +5491,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)  	mddev_suspend(mddev);  	old_groups = conf->worker_groups; -	old_group_cnt = conf->worker_cnt_per_group; +	if (old_groups) +		flush_workqueue(raid5_wq); + +	err = alloc_thread_groups(conf, new, +				  &group_cnt, &worker_cnt_per_group, +				  &new_groups); +	if (!err) { +		spin_lock_irq(&conf->device_lock); +		conf->group_cnt = group_cnt; +		conf->worker_cnt_per_group = worker_cnt_per_group; +		conf->worker_groups = new_groups; +		spin_unlock_irq(&conf->device_lock); -	conf->worker_groups = NULL; -	err = alloc_thread_groups(conf, new); -	if (err) { -		conf->worker_groups = old_groups; -		conf->worker_cnt_per_group = old_group_cnt; -	} else {  		if (old_groups)  			kfree(old_groups[0].workers);  		kfree(old_groups); @@ -5247,6 +5526,7 @@ static struct attribute *raid5_attrs[] =  {  	&raid5_stripecache_active.attr,  	&raid5_preread_bypass_threshold.attr,  	&raid5_group_thread_cnt.attr, +	&raid5_skip_copy.attr,  	NULL,  };  static struct attribute_group raid5_attrs_group = { @@ -5254,40 +5534,47 @@ static struct attribute_group raid5_attrs_group = {  	.attrs = raid5_attrs,  }; -static int alloc_thread_groups(struct r5conf *conf, int cnt) +static int alloc_thread_groups(struct r5conf *conf, int cnt, +			       int *group_cnt, +			       int *worker_cnt_per_group, +			       struct r5worker_group **worker_groups)  { -	int i, j; +	int i, j, k;  	ssize_t size;  	struct r5worker *workers; -	conf->worker_cnt_per_group = cnt; +	*worker_cnt_per_group = cnt;  	if (cnt == 0) { -		conf->worker_groups = NULL; +		*group_cnt = 0; +		*worker_groups = NULL;  		return 0;  	} -	conf->group_cnt = num_possible_nodes(); +	*group_cnt = num_possible_nodes();  	size = sizeof(struct r5worker) * cnt; -	workers = kzalloc(size * conf->group_cnt, GFP_NOIO); -	conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * -				conf->group_cnt, GFP_NOIO); -	if (!conf->worker_groups || !workers) { +	workers = kzalloc(size * *group_cnt, GFP_NOIO); +	*worker_groups = kzalloc(sizeof(struct r5worker_group) * +				*group_cnt, GFP_NOIO); +	if (!*worker_groups || !workers) {  		kfree(workers); -		kfree(conf->worker_groups); -		conf->worker_groups = NULL; +		kfree(*worker_groups);  		return -ENOMEM;  	} -	for (i = 0; i < conf->group_cnt; i++) { +	for (i = 0; i < *group_cnt; i++) {  		struct r5worker_group *group; -		group = &conf->worker_groups[i]; +		group = &(*worker_groups)[i];  		INIT_LIST_HEAD(&group->handle_list);  		group->conf = conf;  		group->workers = workers + i * cnt;  		for (j = 0; j < cnt; j++) { -			group->workers[j].group = group; -			INIT_WORK(&group->workers[j].work, raid5_do_work); +			struct r5worker *worker = group->workers + j; +			worker->group = group; +			INIT_WORK(&worker->work, raid5_do_work); + +			for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) +				INIT_LIST_HEAD(worker->temp_inactive_list + k);  		}  	} @@ -5318,23 +5605,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)  	return sectors * (raid_disks - conf->max_degraded);  } +static void 
free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) +{ +	safe_put_page(percpu->spare_page); +	kfree(percpu->scribble); +	percpu->spare_page = NULL; +	percpu->scribble = NULL; +} + +static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) +{ +	if (conf->level == 6 && !percpu->spare_page) +		percpu->spare_page = alloc_page(GFP_KERNEL); +	if (!percpu->scribble) +		percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); + +	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { +		free_scratch_buffer(conf, percpu); +		return -ENOMEM; +	} + +	return 0; +} +  static void raid5_free_percpu(struct r5conf *conf)  { -	struct raid5_percpu *percpu;  	unsigned long cpu;  	if (!conf->percpu)  		return; -	get_online_cpus(); -	for_each_possible_cpu(cpu) { -		percpu = per_cpu_ptr(conf->percpu, cpu); -		safe_put_page(percpu->spare_page); -		kfree(percpu->scribble); -	}  #ifdef CONFIG_HOTPLUG_CPU  	unregister_cpu_notifier(&conf->cpu_notify);  #endif + +	get_online_cpus(); +	for_each_possible_cpu(cpu) +		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));  	put_online_cpus();  	free_percpu(conf->percpu); @@ -5361,15 +5668,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN: -		if (conf->level == 6 && !percpu->spare_page) -			percpu->spare_page = alloc_page(GFP_KERNEL); -		if (!percpu->scribble) -			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); - -		if (!percpu->scribble || -		    (conf->level == 6 && !percpu->spare_page)) { -			safe_put_page(percpu->spare_page); -			kfree(percpu->scribble); +		if (alloc_scratch_buffer(conf, percpu)) {  			pr_err("%s: failed memory allocation for cpu%ld\n",  			       __func__, cpu);  			return notifier_from_errno(-ENOMEM); @@ -5377,10 +5676,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,  		break;  	case CPU_DEAD:  	case CPU_DEAD_FROZEN: -		safe_put_page(percpu->spare_page); -		kfree(percpu->scribble); -		percpu->spare_page = NULL; -		percpu->scribble = NULL; +		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));  		break;  	default:  		break; @@ -5392,40 +5688,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,  static int raid5_alloc_percpu(struct r5conf *conf)  {  	unsigned long cpu; -	struct page *spare_page; -	struct raid5_percpu __percpu *allcpus; -	void *scribble; -	int err; +	int err = 0; -	allcpus = alloc_percpu(struct raid5_percpu); -	if (!allcpus) +	conf->percpu = alloc_percpu(struct raid5_percpu); +	if (!conf->percpu)  		return -ENOMEM; -	conf->percpu = allcpus; + +#ifdef CONFIG_HOTPLUG_CPU +	conf->cpu_notify.notifier_call = raid456_cpu_notify; +	conf->cpu_notify.priority = 0; +	err = register_cpu_notifier(&conf->cpu_notify); +	if (err) +		return err; +#endif  	get_online_cpus(); -	err = 0;  	for_each_present_cpu(cpu) { -		if (conf->level == 6) { -			spare_page = alloc_page(GFP_KERNEL); -			if (!spare_page) { -				err = -ENOMEM; -				break; -			} -			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; -		} -		scribble = kmalloc(conf->scribble_len, GFP_KERNEL); -		if (!scribble) { -			err = -ENOMEM; +		err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); +		if (err) { +			pr_err("%s: failed memory allocation for cpu%ld\n", +			       __func__, cpu);  			break;  		} -		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;  	} -#ifdef CONFIG_HOTPLUG_CPU -	conf->cpu_notify.notifier_call = 
raid456_cpu_notify; -	conf->cpu_notify.priority = 0; -	if (err == 0) -		err = register_cpu_notifier(&conf->cpu_notify); -#endif  	put_online_cpus();  	return err; @@ -5438,6 +5723,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)  	struct md_rdev *rdev;  	struct disk_info *disk;  	char pers_name[6]; +	int i; +	int group_cnt, worker_cnt_per_group; +	struct r5worker_group *new_group;  	if (mddev->new_level != 5  	    && mddev->new_level != 4 @@ -5472,7 +5760,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)  	if (conf == NULL)  		goto abort;  	/* Don't enable multi-threading by default*/ -	if (alloc_thread_groups(conf, 0)) +	if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, +				 &new_group)) { +		conf->group_cnt = group_cnt; +		conf->worker_cnt_per_group = worker_cnt_per_group; +		conf->worker_groups = new_group; +	} else  		goto abort;  	spin_lock_init(&conf->device_lock);  	seqcount_init(&conf->gen_lock); @@ -5482,7 +5775,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)  	INIT_LIST_HEAD(&conf->hold_list);  	INIT_LIST_HEAD(&conf->delayed_list);  	INIT_LIST_HEAD(&conf->bitmap_list); -	INIT_LIST_HEAD(&conf->inactive_list);  	init_llist_head(&conf->released_stripes);  	atomic_set(&conf->active_stripes, 0);  	atomic_set(&conf->preread_active_stripes, 0); @@ -5508,6 +5800,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)  	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)  		goto abort; +	/* We init hash_locks[0] separately to that it can be used +	 * as the reference lock in the spin_lock_nest_lock() call +	 * in lock_all_device_hash_locks_irq in order to convince +	 * lockdep that we know what we are doing. +	 */ +	spin_lock_init(conf->hash_locks); +	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) +		spin_lock_init(conf->hash_locks + i); + +	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) +		INIT_LIST_HEAD(conf->inactive_list + i); + +	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) +		INIT_LIST_HEAD(conf->temp_inactive_list + i); +  	conf->level = mddev->new_level;  	if (raid5_alloc_percpu(conf) != 0)  		goto abort; @@ -5548,7 +5855,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)  	else  		conf->max_degraded = 1;  	conf->algorithm = mddev->new_layout; -	conf->max_nr_stripes = NR_STRIPES;  	conf->reshape_progress = mddev->reshape_position;  	if (conf->reshape_progress != MaxSector) {  		conf->prev_chunk_sectors = mddev->chunk_sectors; @@ -5557,7 +5863,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)  	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +  		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; -	if (grow_stripes(conf, conf->max_nr_stripes)) { +	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); +	if (grow_stripes(conf, NR_STRIPES)) {  		printk(KERN_ERR  		       "md/raid:%s: couldn't allocate %dkB for buffers\n",  		       mdname(mddev), memory); @@ -5885,6 +6192,7 @@ static int run(struct mddev *mddev)  		blk_queue_io_min(mddev->queue, chunk_size);  		blk_queue_io_opt(mddev->queue, chunk_size *  				 (conf->raid_disks - conf->max_degraded)); +		mddev->queue->limits.raid_partial_stripes_expensive = 1;  		/*  		 * We can only discard a whole stripe. 
It doesn't make sense to  		 * discard data disk but write parity disk @@ -6363,12 +6671,18 @@ static int raid5_start_reshape(struct mddev *mddev)  	if (!mddev->sync_thread) {  		mddev->recovery = 0;  		spin_lock_irq(&conf->device_lock); +		write_seqcount_begin(&conf->gen_lock);  		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; +		mddev->new_chunk_sectors = +			conf->chunk_sectors = conf->prev_chunk_sectors; +		mddev->new_layout = conf->algorithm = conf->prev_algo;  		rdev_for_each(rdev, mddev)  			rdev->new_data_offset = rdev->data_offset;  		smp_wmb(); +		conf->generation --;  		conf->reshape_progress = MaxSector;  		mddev->reshape_position = MaxSector; +		write_seqcount_end(&conf->gen_lock);  		spin_unlock_irq(&conf->device_lock);  		return -EAGAIN;  	} @@ -6456,27 +6770,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)  		break;  	case 1: /* stop all writes */ -		spin_lock_irq(&conf->device_lock); +		lock_all_device_hash_locks_irq(conf);  		/* '2' tells resync/reshape to pause so that all  		 * active stripes can drain  		 */  		conf->quiesce = 2; -		wait_event_lock_irq(conf->wait_for_stripe, +		wait_event_cmd(conf->wait_for_stripe,  				    atomic_read(&conf->active_stripes) == 0 &&  				    atomic_read(&conf->active_aligned_reads) == 0, -				    conf->device_lock); +				    unlock_all_device_hash_locks_irq(conf), +				    lock_all_device_hash_locks_irq(conf));  		conf->quiesce = 1; -		spin_unlock_irq(&conf->device_lock); +		unlock_all_device_hash_locks_irq(conf);  		/* allow reshape to continue */  		wake_up(&conf->wait_for_overlap);  		break;  	case 0: /* re-enable writes */ -		spin_lock_irq(&conf->device_lock); +		lock_all_device_hash_locks_irq(conf);  		conf->quiesce = 0;  		wake_up(&conf->wait_for_stripe);  		wake_up(&conf->wait_for_overlap); -		spin_unlock_irq(&conf->device_lock); +		unlock_all_device_hash_locks_irq(conf);  		break;  	}  }  | 
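Note: the helpers added at the top of the patch (stripe_hash_locks_hash(), lock_device_hash_lock(), lock_all_device_hash_locks_irq()) split the single device_lock bottleneck into per-bucket hash locks for the inactive stripe lists. Below is a minimal userspace sketch of that partitioned-lock pattern using pthreads; the bucket count, STRIPE_SHIFT value and struct layout are placeholders, and the kernel version additionally disables interrupts and uses spin_lock_nest_lock() so lockdep accepts taking every bucket lock at once.

/*
 * Userspace analogue of the per-hash "inactive list" locking added to
 * raid5.c: one lock per hash bucket, device_lock nested inside, plus a
 * take-everything helper for global operations such as quiesce.
 * Names and sizes here are illustrative, not the kernel definitions.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_HASH_LOCKS   8              /* stands in for NR_STRIPE_HASH_LOCKS */
#define HASH_LOCKS_MASK (NR_HASH_LOCKS - 1)
#define STRIPE_SHIFT    3              /* placeholder for the real STRIPE_SHIFT */

struct conf {
	pthread_mutex_t hash_locks[NR_HASH_LOCKS]; /* one per inactive list bucket */
	pthread_mutex_t device_lock;               /* still guards shared state */
};

static int stripe_hash_locks_hash(uint64_t sect)
{
	return (sect >> STRIPE_SHIFT) & HASH_LOCKS_MASK;
}

/* Per-bucket path: hash lock first, then device_lock nested inside it. */
static void lock_device_hash_lock(struct conf *c, int hash)
{
	pthread_mutex_lock(&c->hash_locks[hash]);
	pthread_mutex_lock(&c->device_lock);
}

static void unlock_device_hash_lock(struct conf *c, int hash)
{
	pthread_mutex_unlock(&c->device_lock);
	pthread_mutex_unlock(&c->hash_locks[hash]);
}

/* Global path: take every hash lock in index order, then device_lock. */
static void lock_all_device_hash_locks(struct conf *c)
{
	for (int i = 0; i < NR_HASH_LOCKS; i++)
		pthread_mutex_lock(&c->hash_locks[i]);
	pthread_mutex_lock(&c->device_lock);
}

static void unlock_all_device_hash_locks(struct conf *c)
{
	pthread_mutex_unlock(&c->device_lock);
	for (int i = NR_HASH_LOCKS - 1; i >= 0; i--)
		pthread_mutex_unlock(&c->hash_locks[i]);
}

int main(void)
{
	struct conf c;

	for (int i = 0; i < NR_HASH_LOCKS; i++)
		pthread_mutex_init(&c.hash_locks[i], NULL);
	pthread_mutex_init(&c.device_lock, NULL);

	int hash = stripe_hash_locks_hash(4096);
	lock_device_hash_lock(&c, hash);
	printf("sector 4096 maps to hash bucket %d\n", hash);
	unlock_device_hash_lock(&c, hash);

	lock_all_device_hash_locks(&c);
	unlock_all_device_hash_locks(&c);
	return 0;
}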
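Note: the async_copy_data()/ops_run_biodrain() changes add an optional zero-copy write path (the new skip_copy sysfs attribute): when a bio segment exactly covers a stripe page, dev->page is pointed at the bio's page and R5_SkipCopy is set, with dev->orig_page restored once the write completes. The fragment below is a plain-C illustration of that borrow-or-copy decision under stated assumptions (fixed 4 KiB stripe size, hypothetical drain_write() helper); it is not the kernel API, which also requires the queue to advertise stable pages (BDI_CAP_STABLE_WRITES) before enabling the shortcut.

/*
 * Sketch of the skip_copy idea: a full-page, zero-offset write lets the
 * stripe borrow the caller's page; anything else falls back to memcpy().
 */
#include <stdbool.h>
#include <string.h>
#include <stdio.h>

#define STRIPE_SIZE 4096

struct stripe_dev {
	char *page;       /* page currently used for I/O */
	char *orig_page;  /* the stripe's own backing page */
};

/*
 * Returns true if the source buffer was borrowed (no copy).  The caller must
 * keep 'src' stable until the write completes, which is why the driver only
 * enables this when stable pages are guaranteed.
 */
static bool drain_write(struct stripe_dev *dev, char *src,
			size_t page_offset, size_t b_offset, size_t len,
			bool skip_copy)
{
	if (skip_copy && page_offset == 0 && b_offset == 0 && len == STRIPE_SIZE) {
		dev->page = src;            /* borrow the caller's page */
		return true;
	}
	memcpy(dev->page + page_offset, src + b_offset, len);
	return false;
}

int main(void)
{
	static char stripe_page[STRIPE_SIZE], bio_page[STRIPE_SIZE];
	struct stripe_dev dev = { .page = stripe_page, .orig_page = stripe_page };

	bool borrowed = drain_write(&dev, bio_page, 0, 0, STRIPE_SIZE, true);
	printf("full-page write, skip_copy on: %s\n",
	       borrowed ? "borrowed bio page" : "copied");

	dev.page = dev.orig_page;   /* mirrors the restore in handle_stripe_clean_event() */
	borrowed = drain_write(&dev, bio_page, 512, 0, 1024, true);
	printf("partial write: %s\n", borrowed ? "borrowed" : "copied");
	return 0;
}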
