diff options
-rw-r--r-- | Documentation/device-mapper/dm-raid.txt | 9 | ||||
-rw-r--r-- | crypto/xor.c | 4 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 17 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 124 | ||||
-rw-r--r-- | drivers/md/linear.c | 25 | ||||
-rw-r--r-- | drivers/md/md.c | 145 | ||||
-rw-r--r-- | drivers/md/md.h | 9 | ||||
-rw-r--r-- | drivers/md/multipath.c | 3 | ||||
-rw-r--r-- | drivers/md/raid0.c | 19 | ||||
-rw-r--r-- | drivers/md/raid1.c | 37 | ||||
-rw-r--r-- | drivers/md/raid10.c | 95 | ||||
-rw-r--r-- | drivers/md/raid5.c | 219 | ||||
-rw-r--r-- | drivers/md/raid5.h | 1 |
13 files changed, 578 insertions, 129 deletions
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 1c184495716..728c38c242d 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -132,3 +132,12 @@ Here we can see the RAID type is raid4, there are 5 devices - all of which are 'A'live, and the array is 2/490221568 complete with recovery. Faulty or missing devices are marked 'D'. Devices that are out-of-sync are marked 'a'. + + +Version History +--------------- +1.0.0 Initial version. Support for RAID 4/5/6 +1.1.0 Added support for RAID 1 +1.2.0 Handle creation of arrays that contain failed devices. +1.3.0 Added support for RAID 10 +1.3.1 Allow device replacement/rebuild for RAID 10 diff --git a/crypto/xor.c b/crypto/xor.c index 65c7b416b4a..35d6b3adf23 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -56,11 +56,11 @@ xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs) EXPORT_SYMBOL(xor_blocks); /* Set of all registered templates. */ -static struct xor_block_template *template_list; +static struct xor_block_template *__initdata template_list; #define BENCH_SIZE (PAGE_SIZE) -static void +static void __init do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) { int speed; diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 94e7f6ba2e1..7155945f8eb 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -163,20 +163,17 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde * As devices are only added or removed when raid_disk is < 0 and * nr_pending is 0 and In_sync is clear, the entries we return will * still be in the same position on the list when we re-enter - * list_for_each_continue_rcu. + * list_for_each_entry_continue_rcu. */ - struct list_head *pos; rcu_read_lock(); if (rdev == NULL) /* start at the beginning */ - pos = &mddev->disks; + rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set); else { /* release the previous rdev and start from there. */ rdev_dec_pending(rdev, mddev); - pos = &rdev->same_set; } - list_for_each_continue_rcu(pos, &mddev->disks) { - rdev = list_entry(pos, struct md_rdev, same_set); + list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ @@ -473,14 +470,10 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) { bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; - int err = -EINVAL; bitmap->storage.sb_page = alloc_page(GFP_KERNEL); - if (IS_ERR(bitmap->storage.sb_page)) { - err = PTR_ERR(bitmap->storage.sb_page); - bitmap->storage.sb_page = NULL; - return err; - } + if (bitmap->storage.sb_page == NULL) + return -ENOMEM; bitmap->storage.sb_page->index = 0; sb = kmap_atomic(bitmap->storage.sb_page); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 982e3e390c4..45d94a7e7f6 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -338,6 +338,84 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) } /* + * validate_rebuild_devices + * @rs + * + * Determine if the devices specified for rebuild can result in a valid + * usable array that is capable of rebuilding the given devices. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_rebuild_devices(struct raid_set *rs) +{ + unsigned i, rebuild_cnt = 0; + unsigned rebuilds_per_group, copies, d; + + if (!(rs->print_flags & DMPF_REBUILD)) + return 0; + + for (i = 0; i < rs->md.raid_disks; i++) + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild_cnt++; + + switch (rs->raid_type->level) { + case 1: + if (rebuild_cnt >= rs->md.raid_disks) + goto too_many; + break; + case 4: + case 5: + case 6: + if (rebuild_cnt > rs->raid_type->parity_devs) + goto too_many; + break; + case 10: + copies = raid10_md_layout_to_copies(rs->md.layout); + if (rebuild_cnt < copies) + break; + + /* + * It is possible to have a higher rebuild count for RAID10, + * as long as the failed devices occur in different mirror + * groups (i.e. different stripes). + * + * Right now, we only allow for "near" copies. When other + * formats are added, we will have to check those too. + * + * When checking "near" format, make sure no adjacent devices + * have failed beyond what can be handled. In addition to the + * simple case where the number of devices is a multiple of the + * number of copies, we must also handle cases where the number + * of devices is not a multiple of the number of copies. + * E.g. dev1 dev2 dev3 dev4 dev5 + * A A B B C + * C D D E E + */ + rebuilds_per_group = 0; + for (i = 0; i < rs->md.raid_disks * copies; i++) { + d = i % rs->md.raid_disks; + if (!test_bit(In_sync, &rs->dev[d].rdev.flags) && + (++rebuilds_per_group >= copies)) + goto too_many; + if (!((i + 1) % copies)) + rebuilds_per_group = 0; + } + break; + default: + DMERR("The rebuild parameter is not supported for %s", + rs->raid_type->name); + rs->ti->error = "Rebuild not supported for this RAID type"; + return -EINVAL; + } + + return 0; + +too_many: + rs->ti->error = "Too many rebuild devices specified"; + return -EINVAL; +} + +/* * Possible arguments are... * <chunk_size> [optional_args] * @@ -365,7 +443,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, { char *raid10_format = "near"; unsigned raid10_copies = 2; - unsigned i, rebuild_cnt = 0; + unsigned i; unsigned long value, region_size = 0; sector_t sectors_per_dev = rs->ti->len; sector_t max_io_len; @@ -461,31 +539,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, /* Parameters that take a numeric value are checked here */ if (!strcasecmp(key, "rebuild")) { - rebuild_cnt++; - - switch (rs->raid_type->level) { - case 1: - if (rebuild_cnt >= rs->md.raid_disks) { - rs->ti->error = "Too many rebuild devices specified"; - return -EINVAL; - } - break; - case 4: - case 5: - case 6: - if (rebuild_cnt > rs->raid_type->parity_devs) { - rs->ti->error = "Too many rebuild devices specified for given RAID type"; - return -EINVAL; - } - break; - case 10: - default: - DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); - rs->ti->error = "Rebuild not supported for this RAID type"; - return -EINVAL; - } - - if (value > rs->md.raid_disks) { + if (value >= rs->md.raid_disks) { rs->ti->error = "Invalid rebuild index given"; return -EINVAL; } @@ -608,6 +662,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } rs->md.dev_sectors = sectors_per_dev; + if (validate_rebuild_devices(rs)) + return -EINVAL; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; @@ -960,6 +1017,19 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) freshest = NULL; rdev_for_each_safe(rdev, tmp, mddev) { + /* + * Skipping super_load due to DMPF_SYNC will cause + * the array to undergo initialization again as + * though it were new. This is the intended effect + * of the "sync" directive. + * + * When reshaping capability is added, we must ensure + * that the "sync" directive is disallowed during the + * reshape. + */ + if (rs->print_flags & DMPF_SYNC) + continue; + if (!rdev->meta_bdev) continue; @@ -1360,7 +1430,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 3, 0}, + .version = {1, 3, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/linear.c b/drivers/md/linear.c index fa211d80fc0..21014836bdb 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -138,6 +138,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) struct linear_conf *conf; struct md_rdev *rdev; int i, cnt; + bool discard_supported = false; conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), GFP_KERNEL); @@ -171,6 +172,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->array_sectors += rdev->sectors; cnt++; + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } if (cnt != raid_disks) { printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", @@ -178,6 +181,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) goto out; } + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + /* * Here we calculate the device offsets. */ @@ -244,7 +252,9 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; - oldconf = rcu_dereference(mddev->private); + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); mddev->raid_disks++; rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); @@ -256,7 +266,10 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) static int linear_stop (struct mddev *mddev) { - struct linear_conf *conf = mddev->private; + struct linear_conf *conf = + rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); /* * We do not require rcu protection here since @@ -326,6 +339,14 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) bio->bi_sector = bio->bi_sector - start_sector + tmp_dev->rdev->data_offset; rcu_read_unlock(); + + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio, 0); + return; + } + generic_make_request(bio); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 95c88012a3b..9ab768acfb6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -674,7 +674,18 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) return NULL; } -static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) +static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} + +static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; @@ -685,6 +696,17 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) return NULL; } +static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + static struct md_personality *find_pers(int level, char *clevel) { struct md_personality *pers; @@ -2022,8 +2044,14 @@ EXPORT_SYMBOL(md_integrity_register); /* Disable data integrity if non-capable/non-matching disk is being added */ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { - struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); + struct blk_integrity *bi_rdev; + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return; + + bi_rdev = bdev_get_integrity(rdev->bdev); + bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ return; @@ -3754,6 +3782,8 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); return len; } static struct md_sysfs_entry md_resync_start = @@ -4231,6 +4261,13 @@ action_store(struct mddev *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + md_wakeup_thread(mddev->sync_thread); + } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); @@ -4241,7 +4278,8 @@ static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); } static struct md_sysfs_entry md_scan_mode = @@ -4362,6 +4400,10 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); + if (mddev->curr_resync == 1 || + mddev->curr_resync == 2) + return sprintf(page, "delayed\n"); + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; @@ -5207,7 +5249,7 @@ static void md_clean(struct mddev *mddev) mddev->new_layout = 0; mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; @@ -5509,8 +5551,9 @@ static int get_array_info(struct mddev * mddev, void __user * arg) int nr,working,insync,failed,spare; struct md_rdev *rdev; - nr=working=insync=failed=spare=0; - rdev_for_each(rdev, mddev) { + nr = working = insync = failed = spare = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -5522,6 +5565,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) spare++; } } + rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; @@ -5605,7 +5649,8 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - rdev = find_rdev_nr(mddev, info.number); + rcu_read_lock(); + rdev = find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -5624,6 +5669,7 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) info.raid_disk = -1; info.state = (1<<MD_DISK_REMOVED); } + rcu_read_unlock(); if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -6232,18 +6278,22 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) static int set_disk_faulty(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; + int err = 0; if (mddev->pers == NULL) return -ENODEV; - rdev = find_rdev(mddev, dev); + rcu_read_lock(); + rdev = find_rdev_rcu(mddev, dev); if (!rdev) - return -ENODEV; - - md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - return -EBUSY; - return 0; + err = -ENODEV; + else { + md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + err = -EBUSY; + } + rcu_read_unlock(); + return err; } /* @@ -6315,6 +6365,27 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } + /* Some actions do not requires the mutex */ + switch (cmd) { + case GET_ARRAY_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_array_info(mddev, argp); + goto abort; + + case GET_DISK_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_disk_info(mddev, argp); + goto abort; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto abort; + } + err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -6387,18 +6458,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, */ switch (cmd) { - case GET_ARRAY_INFO: - err = get_array_info(mddev, argp); - goto done_unlock; - case GET_BITMAP_FILE: err = get_bitmap_file(mddev, argp); goto done_unlock; - case GET_DISK_INFO: - err = get_disk_info(mddev, argp); - goto done_unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); goto done_unlock; @@ -6480,10 +6543,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = hot_add_disk(mddev, new_decode_dev(arg)); goto done_unlock; - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto done_unlock; - case RUN_ARRAY: err = do_md_run(mddev); goto done_unlock; @@ -6641,7 +6700,7 @@ static int md_thread(void * arg) clear_bit(THREAD_WAKEUP, &thread->flags); if (!kthread_should_stop()) - thread->run(thread->mddev); + thread->run(thread); } return 0; @@ -6656,8 +6715,8 @@ void md_wakeup_thread(struct md_thread *thread) } } -struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev, - const char *name) +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) { struct md_thread *thread; @@ -6752,7 +6811,11 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) int scale; unsigned int per_milli; - resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); + if (mddev->curr_resync <= 3) + resync = 0; + else + resync = mddev->curr_resync + - atomic_read(&mddev->recovery_active); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) @@ -6978,7 +7041,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->curr_resync > 2) { status_resync(seq, mddev); seq_printf(seq, "\n "); - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + } else if (mddev->curr_resync >= 1) seq_printf(seq, "\tresync=DELAYED\n "); else if (mddev->recovery_cp < MaxSector) seq_printf(seq, "\tresync=PENDING\n "); @@ -7206,8 +7269,9 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -void md_do_sync(struct mddev *mddev) +void md_do_sync(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct mddev *mddev2; unsigned int currspeed = 0, window; @@ -7311,7 +7375,7 @@ void md_do_sync(struct mddev *mddev) * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; @@ -7367,8 +7431,11 @@ void md_do_sync(struct mddev *mddev) "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; - } + } else + mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + md_new_event(mddev); blk_start_plug(&plug); while (j < max_sectors) { @@ -7421,7 +7488,8 @@ void md_do_sync(struct mddev *mddev) break; j += sectors; - if (j>1) mddev->curr_resync = j; + if (j > 2) + mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7543,8 +7611,6 @@ static int remove_and_add_spares(struct mddev *mddev) int spares = 0; int removed = 0; - mddev->curr_resync_completed = 0; - rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && @@ -7739,6 +7805,7 @@ void md_check_recovery(struct mddev *mddev) /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ + mddev->curr_resync_completed = 0; set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); /* Clear some bits that don't mean anything, but * might be left set @@ -7752,7 +7819,7 @@ void md_check_recovery(struct mddev *mddev) /* no recovery is running. * remove any failed drives, then * add spares if possible. - * Spare are also removed and re-added, to allow + * Spares are also removed and re-added, to allow * the personality to fail the re-add. */ diff --git a/drivers/md/md.h b/drivers/md/md.h index f385b038589..af443ab868d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -282,7 +282,7 @@ struct mddev { sector_t resync_max_sectors; /* may be set by personality */ - sector_t resync_mismatches; /* count of sectors where + atomic64_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ @@ -540,12 +540,13 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) struct md_thread { - void (*run) (struct mddev *mddev); + void (*run) (struct md_thread *thread); struct mddev *mddev; wait_queue_head_t wqueue; unsigned long flags; struct task_struct *tsk; unsigned long timeout; + void *private; }; #define THREAD_WAKEUP 0 @@ -584,7 +585,7 @@ static inline void safe_put_page(struct page *p) extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); extern struct md_thread *md_register_thread( - void (*run)(struct mddev *mddev), + void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); extern void md_unregister_thread(struct md_thread **threadp); @@ -603,7 +604,7 @@ extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, extern void md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op); -extern void md_do_sync(struct mddev *mddev); +extern void md_do_sync(struct md_thread *thread); extern void md_new_event(struct mddev *mddev); extern int md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 61a1833ebaf..1642eae75a3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -335,8 +335,9 @@ abort: * 3. Performs writes following reads for array syncronising. */ -static void multipathd (struct mddev *mddev) +static void multipathd(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index a9e4fa95dfa..24b359717a7 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -88,6 +88,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + bool discard_supported = false; if (!conf) return -ENOMEM; @@ -195,6 +196,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; + + if (blk_queue_discard(bdev_get_queue(rdev1->bdev))) + discard_supported = true; } if (cnt != mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " @@ -272,6 +276,11 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) blk_queue_io_opt(mddev->queue, (mddev->chunk_sectors << 9) * mddev->raid_disks); + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -423,6 +432,7 @@ static int raid0_run(struct mddev *mddev) return -EINVAL; blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { @@ -510,7 +520,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) sector_t sector = bio->bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers @@ -536,6 +546,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) bio->bi_sector = sector_offset + zone->dev_start + tmp_dev->data_offset; + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio, 0); + return; + } + generic_make_request(bio); return; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 611b5f79761..8034fbd6190 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -333,9 +333,10 @@ static void raid1_end_read_request(struct bio *bio, int error) spin_unlock_irqrestore(&conf->device_lock, flags); } - if (uptodate) + if (uptodate) { raid_end_bio_io(r1_bio); - else { + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + } else { /* * oops, read error: */ @@ -349,9 +350,8 @@ static void raid1_end_read_request(struct bio *bio, int error) (unsigned long long)r1_bio->sector); set_bit(R1BIO_ReadError, &r1_bio->state); reschedule_retry(r1_bio); + /* don't drop the reference on read_disk yet */ } - - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } static void close_write(struct r1bio *r1_bio) @@ -781,7 +781,12 @@ static void flush_pending_writes(struct r1conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -994,6 +999,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1295,7 +1302,7 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync; + mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); @@ -1549,6 +1556,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; } @@ -1867,7 +1876,7 @@ static int process_checks(struct r1bio *r1_bio) } else j = 0; if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; + atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { /* No need to write to this device. */ @@ -2220,6 +2229,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) unfreeze_array(conf); } else md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); bio = r1_bio->bios[r1_bio->read_disk]; bdevname(bio->bi_bdev, b); @@ -2285,8 +2295,9 @@ read_more: } } -static void raid1d(struct mddev *mddev) +static void raid1d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r1bio *r1_bio; unsigned long flags; struct r1conf *conf = mddev->private; @@ -2783,6 +2794,7 @@ static int run(struct mddev *mddev) int i; struct md_rdev *rdev; int ret; + bool discard_supported = false; if (mddev->level != 1) { printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", @@ -2812,6 +2824,8 @@ static int run(struct mddev *mddev) continue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } mddev->degraded = 0; @@ -2846,6 +2860,13 @@ static int run(struct mddev *mddev) mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); + + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } ret = md_integrity_register(mddev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0138a727c1f..906ccbd0f7d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -911,7 +911,12 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -1050,6 +1055,44 @@ static sector_t choose_data_offset(struct r10bio *r10_bio, return rdev->new_data_offset; } +struct raid10_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r10conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r10conf *conf = mddev |