diff options
Diffstat (limited to 'drivers/md/raid5.c')
| -rw-r--r-- | drivers/md/raid5.c | 825 |
1 files changed, 570 insertions, 255 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7ff4f252ca1..6234b2e8458 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) return &conf->stripe_hashtbl[hash]; } +static inline int stripe_hash_locks_hash(sector_t sect) +{ + return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; +} + +static inline void lock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_lock_irq(conf->hash_locks + hash); + spin_lock(&conf->device_lock); +} + +static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_unlock(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); +} + +static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + local_irq_disable(); + spin_lock(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); + spin_lock(&conf->device_lock); +} + +static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + spin_unlock(&conf->device_lock); + for (i = NR_STRIPE_HASH_LOCKS; i; i--) + spin_unlock(conf->hash_locks + i - 1); + local_irq_enable(); +} + /* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and * a bio could span several devices. @@ -97,7 +133,7 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) { int sectors = bio_sectors(bio); - if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) + if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) return bio->bi_next; else return NULL; @@ -189,7 +225,7 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; - bi->bi_size = 0; + bi->bi_iter.bi_size = 0; trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); bio_endio(bi, 0); @@ -249,15 +285,19 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) } } -static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) { BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { list_add_tail(&sh->lru, &conf->delayed_list); - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + if (atomic_read(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { @@ -278,37 +318,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) { - list_add_tail(&sh->lru, &conf->inactive_list); - wake_up(&conf->wait_for_stripe); - if (conf->retry_read_aligned) - md_wakeup_thread(conf->mddev->thread); - } + if (!test_bit(STRIPE_EXPANDING, &sh->state)) + list_add_tail(&sh->lru, temp_inactive_list); } } -static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) { if (atomic_dec_and_test(&sh->count)) - do_release_stripe(conf, sh); + do_release_stripe(conf, sh, temp_inactive_list); } -static struct llist_node *llist_reverse_order(struct llist_node *head) +/* + * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list + * + * Be careful: Only one task can add/delete stripes from temp_inactive_list at + * given time. Adding stripes only takes device lock, while deleting stripes + * only takes hash lock. + */ +static void release_inactive_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list, + int hash) { - struct llist_node *new_head = NULL; + int size; + bool do_wakeup = false; + unsigned long flags; - while (head) { - struct llist_node *tmp = head; - head = head->next; - tmp->next = new_head; - new_head = tmp; + if (hash == NR_STRIPE_HASH_LOCKS) { + size = NR_STRIPE_HASH_LOCKS; + hash = NR_STRIPE_HASH_LOCKS - 1; + } else + size = 1; + while (size) { + struct list_head *list = &temp_inactive_list[size - 1]; + + /* + * We don't hold any lock here yet, get_active_stripe() might + * remove stripes from the list + */ + if (!list_empty_careful(list)) { + spin_lock_irqsave(conf->hash_locks + hash, flags); + if (list_empty(conf->inactive_list + hash) && + !list_empty(list)) + atomic_dec(&conf->empty_inactive_list_nr); + list_splice_tail_init(list, conf->inactive_list + hash); + do_wakeup = true; + spin_unlock_irqrestore(conf->hash_locks + hash, flags); + } + size--; + hash--; } - return new_head; + if (do_wakeup) { + wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); + } } /* should hold conf->device_lock already */ -static int release_stripe_list(struct r5conf *conf) +static int release_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list) { struct stripe_head *sh; int count = 0; @@ -317,6 +388,8 @@ static int release_stripe_list(struct r5conf *conf) head = llist_del_all(&conf->released_stripes); head = llist_reverse_order(head); while (head) { + int hash; + sh = llist_entry(head, struct stripe_head, release_list); head = llist_next(head); /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ @@ -327,7 +400,8 @@ static int release_stripe_list(struct r5conf *conf) * again, the count is always > 1. This is true for * STRIPE_ON_UNPLUG_LIST bit too. */ - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); count++; } @@ -338,9 +412,17 @@ static void release_stripe(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; unsigned long flags; + struct list_head list; + int hash; bool wakeup; - if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) + /* Avoid release_list until the last reference. + */ + if (atomic_add_unless(&sh->count, -1, 1)) + return; + + if (unlikely(!conf->mddev->thread) || + test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) goto slow_path; wakeup = llist_add(&sh->release_list, &conf->released_stripes); if (wakeup) @@ -350,8 +432,11 @@ slow_path: local_irq_save(flags); /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { - do_release_stripe(conf, sh); + INIT_LIST_HEAD(&list); + hash = sh->hash_lock_index; + do_release_stripe(conf, sh, &list); spin_unlock(&conf->device_lock); + release_inactive_stripe_list(conf, &list, hash); } local_irq_restore(flags); } @@ -376,18 +461,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) /* find an idle stripe, make sure it is unhashed, and return it. */ -static struct stripe_head *get_free_stripe(struct r5conf *conf) +static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh = NULL; struct list_head *first; - if (list_empty(&conf->inactive_list)) + if (list_empty(conf->inactive_list + hash)) goto out; - first = conf->inactive_list.next; + first = (conf->inactive_list + hash)->next; sh = list_entry(first, struct stripe_head, lru); list_del_init(first); remove_hash(sh); atomic_inc(&conf->active_stripes); + BUG_ON(hash != sh->hash_lock_index); + if (list_empty(conf->inactive_list + hash)) + atomic_inc(&conf->empty_inactive_list_nr); out: return sh; } @@ -399,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh) int num = sh->raid_conf->pool_size; for (i = 0; i < num ; i++) { + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); p = sh->dev[i].page; if (!p) continue; @@ -419,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh) return 1; } sh->dev[i].page = page; + sh->dev[i].orig_page = page; } return 0; } @@ -430,7 +520,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { struct r5conf *conf = sh->raid_conf; - int i; + int i, seq; BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); @@ -440,7 +530,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) (unsigned long long)sh->sector); remove_hash(sh); - +retry: + seq = read_seqcount_begin(&conf->gen_lock); sh->generation = conf->generation - previous; sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; @@ -462,6 +553,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) dev->flags = 0; raid5_build_block(sh, i, previous); } + if (read_seqcount_retry(&conf->gen_lock, seq)) + goto retry; insert_hash(conf, sh); sh->cpu = smp_processor_id(); } @@ -566,57 +659,55 @@ get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce) { struct stripe_head *sh; + int hash = stripe_hash_locks_hash(sector); pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); - spin_lock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); do { wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0 || noquiesce, - conf->device_lock); + *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) - sh = get_free_stripe(conf); + sh = get_free_stripe(conf, hash); if (noblock && sh == NULL) break; if (!sh) { conf->inactive_blocked = 1; - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list) && - (atomic_read(&conf->active_stripes) - < (conf->max_nr_stripes *3/4) - || !conf->inactive_blocked), - conf->device_lock); + wait_event_lock_irq( + conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash) && + (atomic_read(&conf->active_stripes) + < (conf->max_nr_stripes * 3 / 4) + || !conf->inactive_blocked), + *(conf->hash_locks + hash)); conf->inactive_blocked = 0; - } else - init_stripe(sh, sector, previous); - } else { - if (atomic_read(&sh->count)) { - BUG_ON(!list_empty(&sh->lru) - && !test_bit(STRIPE_EXPANDING, &sh->state) - && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) - && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); } else { + init_stripe(sh, sector, previous); + atomic_inc(&sh->count); + } + } else if (!atomic_inc_not_zero(&sh->count)) { + spin_lock(&conf->device_lock); + if (!atomic_read(&sh->count)) { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - if (list_empty(&sh->lru) && - !test_bit(STRIPE_EXPANDING, &sh->state)) - BUG(); + BUG_ON(list_empty(&sh->lru) && + !test_bit(STRIPE_EXPANDING, &sh->state)); list_del_init(&sh->lru); if (sh->group) { sh->group->stripes_cnt--; sh->group = NULL; } } + atomic_inc(&sh->count); + spin_unlock(&conf->device_lock); } } while (sh == NULL); - if (sh) - atomic_inc(&sh->count); - - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -766,18 +857,27 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_rw, i); atomic_inc(&sh->count); if (use_new_offset(conf, sh)) - bi->bi_sector = (sh->sector + bi->bi_iter.bi_sector = (sh->sector + rdev->new_data_offset); else - bi->bi_sector = (sh->sector + bi->bi_iter.bi_sector = (sh->sector + rdev->data_offset); if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) - bi->bi_rw |= REQ_FLUSH; + bi->bi_rw |= REQ_NOMERGE; + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; + bi->bi_iter.bi_size = STRIPE_SIZE; + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + bi->bi_vcnt = 0; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); @@ -807,15 +907,24 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rbi->bi_rw, i); atomic_inc(&sh->count); if (use_new_offset(conf, sh)) - rbi->bi_sector = (sh->sector + rbi->bi_iter.bi_sector = (sh->sector + rrdev->new_data_offset); else - rbi->bi_sector = (sh->sector + rbi->bi_iter.bi_sector = (sh->sector + rrdev->data_offset); + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; - rbi->bi_size = STRIPE_SIZE; + rbi->bi_iter.bi_size = STRIPE_SIZE; + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + rbi->bi_vcnt = 0; if (conf->mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), @@ -834,27 +943,28 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) } static struct dma_async_tx_descriptor * -async_copy_data(int frombio, struct bio *bio, struct page *page, - sector_t sector, struct dma_async_tx_descriptor *tx) +async_copy_data(int frombio, struct bio *bio, struct page **page, + sector_t sector, struct dma_async_tx_descriptor *tx, + struct stripe_head *sh) { - struct bio_vec *bvl; + struct bio_vec bvl; + struct bvec_iter iter; struct page *bio_page; - int i; int page_offset; struct async_submit_ctl submit; enum async_tx_flags flags = 0; - if (bio->bi_sector >= sector) - page_offset = (signed)(bio->bi_sector - sector) * 512; + if (bio->bi_iter.bi_sector >= sector) + page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; else - page_offset = (signed)(sector - bio->bi_sector) * -512; + page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; if (frombio) flags |= ASYNC_TX_FENCE; init_async_submit(&submit, flags, tx, NULL, NULL, NULL); - bio_for_each_segment(bvl, bio, i) { - int len = bvl->bv_len; + bio_for_each_segment(bvl, bio, iter) { + int len = bvl.bv_len; int clen; int b_offset = 0; @@ -870,13 +980,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, clen = len; if (clen > 0) { - b_offset += bvl->bv_offset; - bio_page = bvl->bv_page; - if (frombio) - tx = async_memcpy(page, bio_page, page_offset, + b_offset += bvl.bv_offset; + bio_page = bvl.bv_page; + if (frombio) { + if (sh->raid_conf->skip_copy && + b_offset == 0 && page_offset == 0 && + clen == STRIPE_SIZE) + *page = bio_page; + else + tx = async_memcpy(*page, bio_page, page_offset, b_offset, clen, &submit); - else - tx = async_memcpy(bio_page, page, b_offset, + } else + tx = async_memcpy(bio_page, *page, b_offset, page_offset, clen, &submit); } /* chain the operations */ @@ -914,7 +1029,7 @@ static void ops_complete_biofill(void *stripe_head_ref) BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; - while (rbi && rbi->bi_sector < + while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); if (!raid5_dec_bi_active_stripes(rbi)) { @@ -950,10 +1065,10 @@ static void ops_run_biofill(struct stripe_head *sh) dev->read = rbi = dev->toread; dev->toread = NULL; spin_unlock_irq(&sh->stripe_lock); - while (rbi && rbi->bi_sector < + while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { - tx = async_copy_data(0, rbi, dev->page, - dev->sector, tx); + tx = async_copy_data(0, rbi, &dev->page, + dev->sector, tx, sh); rbi = r5_next_bio(rbi, dev->sector); } } @@ -1291,8 +1406,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) BUG_ON(dev->written); wbi = dev->written = chosen; spin_unlock_irq(&sh->stripe_lock); + WARN_ON(dev->page != dev->orig_page); - while (wbi && wbi->bi_sector < + while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { if (wbi->bi_rw & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); @@ -1300,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_SyncIO, &dev->flags); if (wbi->bi_rw & REQ_DISCARD) set_bit(R5_Discard, &dev->flags); - else - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + else { + tx = async_copy_data(1, wbi, &dev->page, + dev->sector, tx, sh); + if (dev->page != dev->orig_page) { + set_bit(R5_SkipCopy, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + clear_bit(R5_OVERWRITE, &dev->flags); + } + } wbi = r5_next_bio(wbi, dev->sector); } } @@ -1333,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); @@ -1584,7 +1706,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } -static int grow_one_stripe(struct r5conf *conf) +static int grow_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); @@ -1600,6 +1722,7 @@ static int grow_one_stripe(struct r5conf *conf) kmem_cache_free(conf->slab_cache, sh); return 0; } + sh->hash_lock_index = hash; /* we just created an active stripe so... */ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); @@ -1612,6 +1735,7 @@ static int grow_stripes(struct r5conf *conf, int num) { struct kmem_cache *sc; int devs = max(conf->raid_disks, conf->previous_raid_disks); + int hash; if (conf->mddev->gendisk) sprintf(conf->cache_name[0], @@ -1629,9 +1753,13 @@ static int grow_stripes(struct r5conf *conf, int num) return 1; conf->slab_cache = sc; conf->pool_size = devs; - while (num--) - if (!grow_one_stripe(conf)) + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; + while (num--) { + if (!grow_one_stripe(conf, hash)) return 1; + conf->max_nr_stripes++; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; + } return 0; } @@ -1689,6 +1817,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) int err; struct kmem_cache *sc; int i; + int hash, cnt; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ @@ -1728,19 +1857,31 @@ static int resize_stripes(struct r5conf *conf, int newsize) * OK, we have enough stripes, start collecting inactive * stripes and copying them over */ + hash = 0; + cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list), - conf->device_lock); - osh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + lock_device_hash_lock(conf, hash); + wait_event_cmd(conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash), + unlock_device_hash_lock(conf, hash), + lock_device_hash_lock(conf, hash)); + osh = get_free_stripe(conf, hash); + unlock_device_hash_lock(conf, hash); atomic_set(&nsh->count, 1); - for(i=0; i<conf->pool_size; i++) + for(i=0; i<conf->pool_size; i++) { nsh->dev[i].page = osh->dev[i].page; + nsh->dev[i].orig_page = osh->dev[i].page; + } for( ; i<newsize; i++) nsh->dev[i].page = NULL; + nsh->hash_lock_index = hash; kmem_cache_free(conf->slab_cache, osh); + cnt++; + if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + + !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { + hash++; + cnt = 0; + } } kmem_cache_destroy(conf->slab_cache); @@ -1786,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); nsh->dev[i].page = p; + nsh->dev[i].orig_page = p; if (!p) err = -ENOMEM; } @@ -1799,13 +1941,13 @@ static int resize_stripes(struct r5conf *conf, int newsize) return err; } -static int drop_one_stripe(struct r5conf *conf) +static int drop_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; - spin_lock_irq(&conf->device_lock); - sh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); + sh = get_free_stripe(conf, hash); + spin_unlock_irq(conf->hash_locks + hash); if (!sh) return 0; BUG_ON(atomic_read(&sh->count)); @@ -1817,8 +1959,10 @@ static int drop_one_stripe(struct r5conf *conf) static void shrink_stripes(struct r5conf *conf) { - while (drop_one_stripe(conf)) - ; + int hash; + for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) + while (drop_one_stripe(conf, hash)) + ; if (conf->slab_cache) kmem_cache_destroy(conf->slab_cache); @@ -1923,6 +2067,9 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), bdn); else retry = 1; + if (set_bad && test_bit(In_sync, &rdev->flags) + && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + retry = 1; if (retry) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { set_bit(R5_ReadError, &sh->dev[i].flags); @@ -1991,6 +2138,7 @@ static void raid5_end_write_request(struct bio *bi, int error) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { if (!uptodate) { + set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@ -2017,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error) } static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); - + static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; - dev->req.bi_max_vecs++; + dev->req.bi_max_vecs = 1; dev->req.bi_private = sh; - dev->vec.bv_page = dev->page; bio_init(&dev->rreq); dev->rreq.bi_io_vec = &dev->rvec; - dev->rreq.bi_vcnt++; - dev->rreq.bi_max_vecs++; + dev->rreq.bi_max_vecs = 1; dev->rreq.bi_private = sh; - dev->rvec.bv_page = dev->page; dev->flags = 0; dev->sector = compute_blocknr(sh, i, previous); @@ -2494,7 +2638,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in int firstwrite=0; pr_debug("adding bi b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_sector, + (unsigned long long)bi->bi_iter.bi_sector, (unsigned long long)sh->sector); /* @@ -2512,12 +2656,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in firstwrite = 1; } else bip = &sh->dev[dd_idx].toread; - while (*bip && (*bip)->bi_sector < bi->bi_sector) { - if (bio_end_sector(*bip) > bi->bi_sector) + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { + if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) goto overlap; bip = & (*bip)->bi_next; } - if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) + if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) goto overlap; BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); @@ -2531,7 +2675,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in sector_t sector = sh->dev[dd_idx].sector; for (bi=sh->dev[dd_idx].towrite; sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && - bi && bi->bi_sector <= sector; + bi && bi->bi_iter.bi_sector <= sector; bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { if (bio_end_sector(bi) >= sector) sector = bio_end_sector(bi); @@ -2541,7 +2685,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in } pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)(*bip)->bi_sector, + (unsigned long long)(*bip)->bi_iter.bi_sector, (unsigned long long)sh->sector, dd_idx); spin_unlock_irq(&sh->stripe_lock); @@ -2616,7 +2760,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -2634,8 +2778,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].page = sh->dev[i].orig_page; + } + if (bi) bitmap_end = 1; - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -2659,7 +2808,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); @@ -2770,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, (s->failed >= 1 && fdev[0]->toread) || (s->failed >= 2 && fdev[1]->toread) || (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - (sh->raid_conf->level == 6 && s->failed && s->to_write))) { + (sh->raid_conf->level == 6 && s->failed && s->to_write && + s->to_write < sh->raid_conf->raid_disks - 2 && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -2875,15 +3027,20 @@ static void handle_stripe_clean_event(struct r5conf *conf, dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Discard, &dev->flags))) { + test_bit(R5_Discard, &dev->flags) || + test_bit(R5_SkipCopy, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); if (test_and_clear_bit(R5_Discard, &dev->flags)) clear_bit(R5_UPTODATE, &dev->flags); + if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { + WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); + dev->page = dev->orig_page; + } wbi = dev->written; dev->written = NULL; - while (wbi && wbi->bi_sector < + while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); if (!raid5_dec_bi_active_stripes(wbi)) { @@ -2899,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, 0); } else if (test_bit(R5_Discard, &dev->flags)) discard_pending = 1; + WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); + WARN_ON(dev->page != dev->orig_page); } if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { @@ -2910,6 +3069,14 @@ static void handle_stripe_clean_event(struct r5conf *conf, } /* now that discard is done we can proceed with any sync */ clear_bit(STRIPE_DISCARD, &sh->state); + /* + * SCSI discard will change some bio fields and the stripe has + * no updated data, so remove it from hash list and the stripe + * will be reinitialized + */ + spin_lock_irq(&conf->device_lock); + remove_hash(sh); + spin_unlock_irq(&conf->device_lock); if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) set_bit(STRIPE_HANDLE, &sh->state); @@ -2962,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; + if (test_bit(R5_Insync, &dev->flags)) + rcw++; else rcw += 2*disks; } @@ -2983,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old block " - "%d for r-m-w\n", i); + if (test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { + pr_debug("Read_old block %d for r-m-w\n", + i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; @@ -3009,10 +3177,9 @@ static void handle_stripe_dirtying(struct r5conf *conf, !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { rcw++; - if (!test_bit(R5_Insync, &dev->flags)) - continue; /* it's a failed drive */ - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + if (test_bit(R5_Insync, &dev->flags) && + test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { pr_debug("Read_old block " "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); @@ -3481,7 +3648,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) */ set_bit(R5_Insync, &dev->flags); - if (rdev && test_bit(R5_WriteError, &dev->flags)) { + if (test_bit(R5_WriteError, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ struct md_rdev *rdev2 = rcu_dereference( @@ -3494,7 +3661,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } else clear_bit(R5_WriteError, &dev->flags); } - if (rdev && test_bit(R5_MadeGood, &dev->flags)) { + if (test_bit(R5_MadeGood, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ struct md_rdev *rdev2 = rcu_dereference( @@ -3894,7 +4061,8 @@ static void raid5_activate_delayed(struct r5conf *conf) } } -static void activate_bit_delay(struct r5conf *conf) +static void activate_bit_delay(struct r5conf *conf, + struct list_head *temp_inactive_list) { /* device_lock is held */ struct list_head head; @@ -3902,9 +4070,11 @@ static void activate_bit_delay(struct r5conf *conf) list_del_init(&conf->bitmap_list); while (!list_empty(&head)) { struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + int hash; list_del_init(&sh->lru); atomic_inc(&sh->count); - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); } } @@ -3920,7 +4090,7 @@ int md_raid5_congested(struct mddev *mddev, int bits) return 1; if (conf->quiesce) return 1; - if (list_empty_careful(&conf->inactive_list)) + if (atomic_read(&conf->empty_inactive_list_nr)) return 1; return 0; @@ -3964,7 +4134,7 @@ static int raid5_mergeable_bvec(struct request_queue *q, static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); @@ -4101,9 +4271,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) /* * compute position */ - align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, - 0, - &dd_idx, NULL); + align_bi->bi_iter.bi_sector = + raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, + 0, &dd_idx, NULL); end_sector = bio_end_sector(align_bi); rcu_read_lock(); @@ -4128,7 +4298,8 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); if (!bio_fits_rdev(align_bi) || - is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), + is_badblock(rdev, align_bi->bi_iter.bi_sector, + bio_sectors(align_bi), &first_bad, &bad_sectors)) { /* too big in some way, or has a known bad block */ bio_put(align_bi); @@ -4137,7 +4308,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) } /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_sector += rdev->data_offset; + align_bi->bi_iter.bi_sector += rdev->data_offset; spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, @@ -4149,7 +4320,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), align_bi, disk_devt(mddev->gendisk), - raid_bio->bi_sector); + raid_bio->bi_iter.bi_sector); generic_make_request(align_bi); return 1; } else { @@ -4242,14 +4413,14 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) sh->group = NULL; } list_del_init(&sh->lru); - atomic_inc(&sh->count); - BUG_ON(atomic_read(&sh->count) != 1); + BUG_ON(atomic_inc_return(&sh->count) != 1); return sh; } struct raid5_plug_cb { struct blk_plug_cb cb; struct list_head list; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; }; static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) @@ -4260,6 +4431,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) struct mddev *mddev = cb->cb.data; struct r5conf *conf = mddev->private; int cnt = 0; + int hash; if (cb->list.next && !list_empty(&cb->list)) { spin_lock_irq(&conf->device_lock); @@ -4271,17 +4443,20 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) * STRIPE_ON_UNPLUG_LIST clear but the stripe * is still in our list */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); /* * STRIPE_ON_RELEASE_LIST could be set here. In that * case, the count is always > 1 here */ - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); cnt++; } spin_unlock_irq(&conf->device_lock); } + release_inactive_stripe_list(conf, cb->temp_inactive_list, + NR_STRIPE_HASH_LOCKS); if (mddev->queue) trace_block_unplug(mddev->queue, cnt, !from_schedule); kfree(cb); @@ -4302,8 +4477,12 @@ static void release_stripe_plug(struct mddev *mddev, cb = container_of(blk_cb, struct raid5_plug_cb, cb); - if (cb->list.next == NULL) + if (cb->list.next == NULL) { + int i; INIT_LIST_HEAD(&cb->list); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(cb->temp_inactive_list + i); + } if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) list_add_tail(&sh->lru, &cb->list); @@ -4323,8 +4502,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) /* Skip discard while reshape is happening */ return; - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ @@ -4410,6 +4589,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) struct stripe_head *sh; const int rw = bio_data_dir(bi); int remaining; + DEFINE_WAIT(w); + bool do_prepare; if (unlikely(bi->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bi); @@ -4428,20 +4609,23 @@ static void make_request(struct mddev *mddev, struct bio * bi) return; } - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { - DEFINE_WAIT(w); int previous; int seq; + do_prepare = false; retry: seq = read_seqcount_begin(&conf->gen_lock); previous = 0; - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); + if (do_prepare) + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); if (unlikely(conf->reshape_progress != MaxSector)) { /* spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be @@ -4462,6 +4646,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) : logical_sector >= conf->reshape_safe) { spin_unlock_irq(&conf->device_lock); schedule(); + do_prepare = true; goto retry; } } @@ -4498,6 +4683,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) if (must_retry) { release_stripe(sh); schedule(); + do_prepare = true; goto retry; } } @@ -4521,8 +4707,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) prepare_to_wait(&conf->wait_for_overlap, &w, TASK_INTERRUPTIBLE); if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) + logical_sector < mddev->suspend_hi) { schedule(); + do_prepare = true; + } goto retry; } @@ -4535,9 +4723,9 @@ static void make_request(struct mddev *mddev, struct bio * bi) md_wakeup_thread(mddev->thread); release_stripe(sh); schedule(); + do_prepare = true; goto retry; } - finish_wait(&conf->wait_for_overlap, &w); set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((bi->bi_rw & REQ_SYNC) && @@ -4547,10 +4735,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) } else { /* cannot get stripe for read-ahead, just give-up */ clear_bit(BIO_UPTODATE, &bi->bi_flags); - finish_wait(&conf->wait_for_overlap, &w); break; } } + finish_wait(&conf->wait_for_overlap, &w); remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { @@ -4686,14 +4874,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes)==0); + atomic_read(&conf->reshape_stripes)==0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || - kthread_should_stop()); + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + return 0; spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); @@ -4776,7 +4969,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes) == 0); + atomic_read(&conf->reshape_stripes) == 0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; @@ -4784,13 +4980,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto ret; spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } +ret: return reshape_sectors; } @@ -4875,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); - handle_stripe(sh); release_stripe(sh); return STRIPE_SECTORS; @@ -4901,7 +5100,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) int remaining; int handled = 0; - logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = raid_bio->bi_iter.bi_sector & + ~((sector_t)STRIPE_SECTORS-1); sector = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL); last_sector = bio_end_sector(raid_bio); @@ -4915,7 +5115,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1, 0); + sh = get_active_stripe(conf, sector, 0, 1, 1); if (!sh) { /* failed to get a stripe - must wait */ @@ -4948,27 +5148,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) } static int handle_active_stripes(struct r5conf *conf, int group, - struct r5worker *worker) + struct r5worker *worker, + struct list_head *temp_inactive_list) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; - int i, batch_size = 0; + int i, batch_size = 0, hash; + bool release_inactive = false; while (batch_size < MAX_STRIPE_BATCH && (sh = __get_priority_stripe(conf, group)) != NULL) batch[batch_size++] = sh; - if (batch_size == 0) - return batch_size; + if (batch_size == 0) { + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + if (!list_empty(temp_inactive_list + i)) + break; + if (i == NR_STRIPE_HASH_LOCKS) + return batch_size; + release_inactive = true; + } spin_unlock_irq(&conf->device_lock); + release_inactive_stripe_list(conf, temp_inactive_list, + NR_STRIPE_HASH_LOCKS); + + if (release_inactive) { + spin_lock_irq(&conf->device_lock); + return 0; + } + for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); cond_resched(); spin_lock_irq(&conf->device_lock); - for (i = 0; i < batch_size; i++) - __release_stripe(conf, batch[i]); + for (i = 0; i < batch_size; i++) { + hash = batch[i]->hash_lock_index; + __release_stripe(conf, batch[i], &temp_inactive_list[hash]); + } return batch_size; } @@ -4989,9 +5207,10 @@ static void raid5_do_work(struct work_struct *work) while (1) { int batch_size, released; - released = release_stripe_list(conf); + released = release_stripe_list(conf, worker->temp_inactive_list); - batch_size = handle_active_stripes(conf, group_id, worker); + batch_size = handle_active_stripes(conf, group_id, worker, + worker->temp_inactive_list); worker->working = false; if (!batch_size && !released) break; @@ -5030,7 +5249,7 @@ static void raid5d(struct md_thread *thread) struct bio *bio; int batch_size, released; - released = release_stripe_list(conf); + released = release_stripe_list(conf, conf->temp_inactive_list); if ( !list_empty(&conf->bitmap_list)) { @@ -5040,7 +5259,7 @@ static void raid5d(struct md_thread *thread) bitmap_unplug(mddev->bitmap); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; - activate_bit_delay(conf); + activate_bit_delay(conf, conf->temp_inactive_list); } raid5_activate_delayed(conf); @@ -5054,7 +5273,8 @@ static void raid5d(struct md_thread *thread) handled++; } - batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); + batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, + conf->temp_inactive_list); if (!batch_size && !released) break; handled += batch_size; @@ -5090,22 +5310,29 @@ raid5_set_cache_size(struct mddev *mddev, int size) { struct r5conf *conf = mddev->private; int err; + int hash; if (size <= 16 || size > 32768) return -EINVAL; + hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; while (size < conf->max_nr_stripes) { - if (drop_one_stripe(conf)) + if (drop_one_stripe(conf, hash)) conf->max_nr_stripes--; else break; + hash--; + if (hash < 0) + hash = NR_STRIPE_HASH_LOCKS - 1; } err = md_allow_write(mddev); if (err) return err; + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; while (size > conf->max_nr_stripes) { - if (grow_one_stripe(conf)) + if (grow_one_stripe(conf, hash)) conf->max_nr_stripes++; else break; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; } return 0; } @@ -5171,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, raid5_store_preread_threshold); static ssize_t +raid5_show_skip_copy(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->skip_copy); + else + return 0; +} + +static ssize_t +raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + new = !!new; + if (new == conf->skip_copy) + return len; + + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + return len; +} + +static struct md_sysfs_entry +raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, + raid5_show_skip_copy, + raid5_store_skip_copy); + + +static ssize_t stripe_cache_active_show(struct mddev *mddev, char *page) { struct r5conf *conf = mddev->private; @@ -5193,15 +5464,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) return 0; } -static int alloc_thread_groups(struct r5conf *conf, int cnt); +static int alloc_thread_groups(struct r5conf *conf, int cnt, + int *group_cnt, + int *worker_cnt_per_group, + struct r5worker_group **worker_groups); static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { struct r5conf *conf = mddev->private; unsigned long new; int err; - struct r5worker_group *old_groups; - int old_group_cnt; + struct r5worker_group *new_groups, *old_groups; + int group_cnt, worker_cnt_per_group; if (len >= PAGE_SIZE) return -EINVAL; @@ -5217,14 +5491,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) mddev_suspend(mddev); old_groups = conf->worker_groups; - old_group_cnt = conf->worker_cnt_per_group; + if (old_groups) + flush_workqueue(raid5_wq); + + err = alloc_thread_groups(conf, new, + &group_cnt, &worker_cnt_per_group, + &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); - conf->worker_groups = NULL; - err = alloc_thread_groups(conf, new); - if (err) { - conf->worker_groups = old_groups; - conf->worker_cnt_per_group = old_group_cnt; - } else { if (old_groups) kfree(old_groups[0].workers); kfree(old_groups); @@ -5247,6 +5526,7 @@ static struct attribute *raid5_attrs[] = { &raid5_stripecache_active.attr, &raid5_preread_bypass_threshold.attr, &raid5_group_thread_cnt.attr, + &raid5_skip_copy.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -5254,40 +5534,47 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int alloc_thread_groups(struct r5conf *conf, int cnt) +static int alloc_thread_groups(struct r5conf *conf, int cnt, + int *group_cnt, + int *worker_cnt_per_group, + struct r5worker_group **worker_groups) { - int i, j; + int i, j, k; ssize_t size; struct r5worker *workers; - conf->worker_cnt_per_group = cnt; + *worker_cnt_per_group = cnt; if (cnt == 0) { - conf->worker_groups = NULL; + *group_cnt = 0; + *worker_groups = NULL; return 0; } - conf->group_cnt = num_possible_nodes(); + *group_cnt = num_possible_nodes(); size = sizeof(struct r5worker) * cnt; - workers = kzalloc(size * conf->group_cnt, GFP_NOIO); - conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * - conf->group_cnt, GFP_NOIO); - if (!conf->worker_groups || !workers) { + workers = kzalloc(size * *group_cnt, GFP_NOIO); + *worker_groups = kzalloc(sizeof(struct r5worker_group) * + *group_cnt, GFP_NOIO); + if (!*worker_groups || !workers) { kfree(workers); - kfree(conf->worker_groups); - conf->worker_groups = NULL; + kfree(*worker_groups); return -ENOMEM; } - for (i = 0; i < conf->group_cnt; i++) { + for (i = 0; i < *group_cnt; i++) { struct r5worker_group *group; - group = &conf->worker_groups[i]; + group = &(*worker_groups)[i]; INIT_LIST_HEAD(&group->handle_list); group->conf = conf; group->workers = workers + i * cnt; for (j = 0; j < cnt; j++) { - group->workers[j].group = group; - INIT_WORK(&group->workers[j].work, raid5_do_work); + struct r5worker *worker = group->workers + j; + worker->group = group; + INIT_WORK(&worker->work, raid5_do_work); + + for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) + INIT_LIST_HEAD(worker->temp_inactive_list + k); } } @@ -5318,23 +5605,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) return sectors * (raid_disks - conf->max_degraded); } +static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) +{ + safe_put_page(percpu->spare_page); + kfree(percpu->scribble); + percpu->spare_page = NULL; + percpu->scribble = NULL; +} + +static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) +{ + if (conf->level == 6 && !percpu->spare_page) + percpu->spare_page = alloc_page(GFP_KERNEL); + if (!percpu->scribble) + percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); + + if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { + free_scratch_buffer(conf, percpu); + return -ENOMEM; + } + + return 0; +} + static void raid5_free_percpu(struct r5conf *conf) { - struct raid5_percpu *percpu; unsigned long cpu; if (!conf->percpu) return; - get_online_cpus(); - for_each_possible_cpu(cpu) { - percpu = per_cpu_ptr(conf->percpu, cpu); - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - } #ifdef CONFIG_HOTPLUG_CPU unregister_cpu_notifier(&conf->cpu_notify); #endif + + get_online_cpus(); + for_each_possible_cpu(cpu) + free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); put_online_cpus(); free_percpu(conf->percpu); @@ -5361,15 +5668,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (conf->level == 6 && !percpu->spare_page) - percpu->spare_page = alloc_page(GFP_KERNEL); - if (!percpu->scribble) - percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); - - if (!percpu->scribble || - (conf->level == 6 && !percpu->spare_page)) { - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); + if (alloc_scratch_buffer(conf, percpu)) { pr_err("%s: failed memory allocation for cpu%ld\n", __func__, cpu); return notifier_from_errno(-ENOMEM); @@ -5377,10 +5676,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, break; case CPU_DEAD: case CPU_DEAD_FROZEN: - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - percpu->spare_page = NULL; - percpu->scribble = NULL; + free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); break; default: break; @@ -5392,40 +5688,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, static int raid5_alloc_percpu(struct r5conf *conf) { unsigned long cpu; - struct page *spare_page; - struct raid5_percpu __percpu *allcpus; - void *scribble; - int err; + int err = 0; - allcpus = alloc_percpu(struct raid5_percpu); - if (!allcpus) + conf->percpu = alloc_percpu(struct raid5_percpu); + if (!conf->percpu) return -ENOMEM; - conf->percpu = allcpus; + +#ifdef CONFIG_HOTPLUG_CPU + conf->cpu_notify.notifier_call = raid456_cpu_notify; + conf->cpu_notify.priority = 0; + err = register_cpu_notifier(&conf->cpu_notify); + if (err) + return err; +#endif get_online_cpus(); - err = 0; for_each_present_cpu(cpu) { - if (conf->level == 6) { - spare_page = alloc_page(GFP_KERNEL); - if (!spare_page) { - err = -ENOMEM; - break; - } - per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; - } - scribble = kmalloc(conf->scribble_len, GFP_KERNEL); - if (!scribble) { - err = -ENOMEM; + err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); + if (err) { + pr_err("%s: failed memory allocation for cpu%ld\n", + __func__, cpu); break; } - per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; } -#ifdef CONFIG_HOTPLUG_CPU - conf->cpu_notify.notifier_call = raid456_cpu_notify; - conf->cpu_notify.priority = 0; - if (err == 0) - err = register_cpu_notifier(&conf->cpu_notify); -#endif put_online_cpus(); return err; @@ -5438,6 +5723,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) struct md_rdev *rdev; struct disk_info *disk; char pers_name[6]; + int i; + int group_cnt, worker_cnt_per_group; + struct r5worker_group *new_group; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -5472,7 +5760,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf == NULL) goto abort; /* Don't enable multi-threading by default*/ - if (alloc_thread_groups(conf, 0)) + if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, + &new_group)) { + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_group; + } else goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); @@ -5482,7 +5775,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -5508,6 +5800,21 @@ static struct r5conf *setup_conf(struct mddev *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; + /* We init hash_locks[0] separately to that it can be used + * as the reference lock in the spin_lock_nest_lock() call + * in lock_all_device_hash_locks_irq in order to convince + * lockdep that we know what we are doing. + */ + spin_lock_init(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_init(conf->hash_locks + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->inactive_list + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->temp_inactive_list + i); + conf->level = mddev->new_level; if (raid5_alloc_percpu(conf) != 0) goto abort; @@ -5548,7 +5855,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) else conf->max_degraded = 1; conf->algorithm = mddev->new_layout; - conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { conf->prev_chunk_sectors = mddev->chunk_sectors; @@ -5557,7 +5863,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { + atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); + if (grow_stripes(conf, NR_STRIPES)) { printk(KERN_ERR "md/raid:%s: couldn't allocate %dkB for buffers\n", mdname(mddev), memory); @@ -5885,6 +6192,7 @@ static int run(struct mddev *mddev) blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); + mddev->queue->limits.raid_partial_stripes_expensive = 1; /* * We can only discard a whole stripe. It doesn't make sense to * discard data disk but write parity disk @@ -6363,12 +6671,18 @@ static int raid5_start_reshape(struct mddev *mddev) if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); + write_seqcount_begin(&conf->gen_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + mddev->new_chunk_sectors = + conf->chunk_sectors = conf->prev_chunk_sectors; + mddev->new_layout = conf->algorithm = conf->prev_algo; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; smp_wmb(); + conf->generation --; conf->reshape_progress = MaxSector; mddev->reshape_position = MaxSector; + write_seqcount_end(&conf->gen_lock); spin_unlock_irq(&conf->device_lock); return -EAGAIN; } @@ -6456,27 +6770,28 @@ static void raid5_quiesce(struct mddev *mddev, int state) break; case 1: /* stop all writes */ - spin_lock_irq(&conf->device_lock); + lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain */ conf->quiesce = 2; - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_stripe, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, - conf->device_lock); + unlock_all_device_hash_locks_irq(conf), + lock_all_device_hash_locks_irq(conf)); conf->quiesce = 1; - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); break; case 0: /* re-enable writes */ - spin_lock_irq(&conf->device_lock); + lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_stripe); wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf); break; } } |
