From eea1bf384e05b5ab747f8530c4fba9e9e6907fff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: Fix is_mddev_idle test (again). There are two problems with is_mddev_idle. 1/ sync_io is 'atomic_t' and hence 'int'. curr_events and all the rest are 'long'. So if sync_io were to wrap on a 64bit host, the value of curr_events would go very negative suddenly, and take a very long time to return to positive. So do all calculations as 'int'. That gives us plenty of precision for what we need. 2/ To initialise rdev->last_events we simply call is_mddev_idle, on the assumption that it will make sure that last_events is in a suitable range. It used to do this, but now it does not. So now we need to be more explicit about initialisation. Signed-off-by: NeilBrown --- drivers/md/md.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 03b4cd0a634..a99c50e217c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5716,19 +5716,19 @@ int unregister_md_personality(struct mdk_personality *p) return 0; } -static int is_mddev_idle(mddev_t *mddev) +static int is_mddev_idle(mddev_t *mddev, int init) { mdk_rdev_t * rdev; int idle; - long curr_events; + int curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = part_stat_read(&disk->part0, sectors[0]) + - part_stat_read(&disk->part0, sectors[1]) - - atomic_read(&disk->sync_io); + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. @@ -5751,7 +5751,7 @@ static int is_mddev_idle(mddev_t *mddev) * always make curr_events less than last_events. * */ - if (curr_events - rdev->last_events > 4096) { + if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } @@ -5994,7 +5994,7 @@ void md_do_sync(mddev_t *mddev) "(but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); - is_mddev_idle(mddev); /* this also initializes IO event counters */ + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { @@ -6096,7 +6096,7 @@ void md_do_sync(mddev_t *mddev) if (currspeed > speed_min(mddev)) { if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev)) { + !is_mddev_idle(mddev, 0)) { msleep(500); goto repeat; } -- cgit v1.2.3-18-g5258 From 3f9d99c12a533809342b475c95452e82761bcc1c Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: MD data integrity support md: Add support for data integrity to MD If all subdevices support the same protection format the MD device is flagged as integrity capable. Signed-off-by: Martin K. Petersen Signed-off-by: NeilBrown --- drivers/md/md.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index a99c50e217c..f30b4618c02 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1438,6 +1438,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); +static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) +{ + struct mdk_personality *pers = mddev->pers; + struct gendisk *disk = mddev->gendisk; + struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); + struct blk_integrity *bi_mddev = blk_get_integrity(disk); + + /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ + if (pers && pers->level >= 4 && pers->level <= 6) + return; + + /* If rdev is integrity capable, register profile for mddev */ + if (!bi_mddev && bi_rdev) { + if (blk_integrity_register(disk, bi_rdev)) + printk(KERN_ERR "%s: %s Could not register integrity!\n", + __func__, disk->disk_name); + else + printk(KERN_NOTICE "Enabling data integrity on %s\n", + disk->disk_name); + return; + } + + /* Check that mddev and rdev have matching profiles */ + if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { + printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, + disk->disk_name, rdev->bdev->bd_disk->disk_name); + printk(KERN_NOTICE "Disabling data integrity on %s\n", + disk->disk_name); + blk_integrity_unregister(disk); + } +} + static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { char b[BDEVNAME_SIZE]; @@ -1508,6 +1540,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) /* May as well allow recovery to be retried once */ mddev->recovery_disabled = 0; + + md_integrity_check(rdev, mddev); return 0; fail: @@ -3794,6 +3828,10 @@ static int do_md_run(mddev_t * mddev) mddev->level = pers->level; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + if (pers->level >= 4 && pers->level <= 6) + /* Cannot support integrity (yet) */ + blk_integrity_unregister(mddev->gendisk); + if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ @@ -4129,6 +4167,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); err = 0; + blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); out: -- cgit v1.2.3-18-g5258 From 3dbd8c2e3ff0185585e068f190289d2a267a3e83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: stop defining MAJOR_NR MAJOR_NR was only required for magic in linux/blk.h in 2.4 or earlier kernels, so no need to keep it around. Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/md.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index f30b4618c02..3efc0bceada 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -46,8 +46,6 @@ #include #include -#define MAJOR_NR MD_MAJOR - /* 63 partitions with the alternate major number (mdp) */ #define MdpMinorShift 6 @@ -6503,13 +6501,13 @@ static void md_geninit(void) static int __init md_init(void) { - if (register_blkdev(MAJOR_NR, "md")) + if (register_blkdev(MD_MAJOR, "md")) return -1; if ((mdp_major=register_blkdev(0, "mdp"))<=0) { - unregister_blkdev(MAJOR_NR, "md"); + unregister_blkdev(MD_MAJOR, "md"); return -1; } - blk_register_region(MKDEV(MAJOR_NR, 0), 1UL< Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move headers out of include/linux/raid/ Move the headers with the local structures for the disciplines and bitmap.h into drivers/md/ so that they are more easily grepable for hacking and not far away. md.h is left where it is for now as there are some uses from the outside. Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 3efc0bceada..9a3214c8585 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,7 +34,6 @@ #include #include -#include #include #include /* for invalidate_bdev */ #include @@ -45,6 +44,7 @@ #include #include #include +#include "bitmap.h" /* 63 partitions with the alternate major number (mdp) */ #define MdpMinorShift 6 -- cgit v1.2.3-18-g5258 From 8b2b5c217c20b5460218ab8731295f2e46c7dd29 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move LEVEL_* definition from md_k.h to md_u.h .. as they are part of the user-space interface. Also move MdpMinorShift into there so we can remove duplication. Lastly move mdp_major in. It is less obviously part of the user-space interface, but do_mounts_md.c uses it, and it is acting a bit like user-space. Signed-off-by: NeilBrown --- drivers/md/md.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9a3214c8585..96336b050b5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -46,9 +46,6 @@ #include #include "bitmap.h" -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 - #define DEBUG 0 #define dprintk(x...) ((void)(DEBUG && printk(x))) -- cgit v1.2.3-18-g5258 From bff61975b3d6c18ee31457cc5b4d73042f44915f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move lots of #include lines out of .h files and into .c This makes the includes more explicit, and is preparation for moving md_k.h to drivers/md/md.h Remove include/raid/md.h as its only remaining use was to #include other files. Signed-off-by: NeilBrown --- drivers/md/md.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 96336b050b5..11d6e0e1045 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,8 +33,9 @@ */ #include -#include +#include #include +#include #include /* for invalidate_bdev */ #include #include @@ -44,6 +45,9 @@ #include #include #include +#include +#include +#include #include "bitmap.h" #define DEBUG 0 -- cgit v1.2.3-18-g5258 From 43b2e5d86d8bdd77386226db0bc961529492c043 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move md_k.h from include/linux/raid/ to drivers/md/ It really is nicer to keep related code together.. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 11d6e0e1045..aad0ac54bf9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -45,9 +45,9 @@ #include #include #include -#include #include #include +#include "md.h" #include "bitmap.h" #define DEBUG 0 -- cgit v1.2.3-18-g5258 From 97e4f42d62badb0f9fbc27c013e89bc1336a03bc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: occasionally checkpoint drive recovery to reduce duplicate effort after a crash Version 1.x metadata has the ability to record the status of a partially completed drive recovery. However we only update that record on a clean shutdown. It would be nice to update it on unclean shutdowns too, particularly when using a bitmap that removes much to the 'sync' effort after an unclean shutdown. One complication with checkpointing recovery is that we only know where we are up to in terms of IO requests started, not which ones have completed. And we need to know what has completed to record how much is recovered. So occasionally pause the recovery until all submitted requests are completed, then update the record of where we are up to. When we have a bitmap, we already do that pause occasionally to keep the bitmap up-to-date. So enhance that code to record the recovery offset and schedule a superblock update. And when there is no bitmap, just pause 16 times during the resync to do a checkpoint. '16' is a fairly arbitrary number. But we don't really have any good way to judge how often is acceptable, and it seems like a reasonable number for now. Signed-off-by: NeilBrown --- drivers/md/md.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index aad0ac54bf9..8ea208847a6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1324,10 +1324,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset > 0) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); - sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + !test_bit(In_sync, &rdev->flags)) { + if (mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + if (rdev->recovery_offset > 0) { + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); + } } if (mddev->reshape_position != MaxSector) { @@ -6072,6 +6077,18 @@ void md_do_sync(mddev_t *mddev) } if (kthread_should_stop()) goto interrupted; + + if (mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) { + /* time to update curr_resync_completed */ + blk_unplug(mddev->queue); + wait_event(mddev->recovery_wait, + atomic_read(&mddev->recovery_active) == 0); + mddev->curr_resync_completed = + mddev->curr_resync; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { @@ -6205,6 +6222,8 @@ static int remove_and_add_spares(mddev_t *mddev) mdk_rdev_t *rdev; int spares = 0; + mddev->curr_resync_completed = 0; + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && -- cgit v1.2.3-18-g5258 From 575a80fa4f623141e9791e41879d87800fb6d862 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: be more consistent about setting WriteMostly flag when adding a drive to an array When a drive is added to an array using ADD_NEW_DISK, there are two places we can get certain flags from: the metadata on the disk or the flags passed through the IOCTL. For the WriteMostly flag (aka MD_DISK_WRITEMOSTLY) we take the value from either of those sources depending on if it is set (i.e. we effectively 'or' the two sources together). This makes it awkward to clear, and is at best inconsistent. As documented code (in mdadm) requires that setting MD_DISK_WRITEMOSTLY in the ioctl will be effective, we resolve the inconsistency by always using the value for this flag from the ioctl, and ignoring the value on disk. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8ea208847a6..b2c00ce602b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4517,6 +4517,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<flags); + else + clear_bit(WriteMostly, &rdev->flags); rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); -- cgit v1.2.3-18-g5258 From 58c0fed400603a802968b23ddf78f029c5a84e41 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: Make mddev->size sector-based. This patch renames the "size" field of struct mddev_s to "dev_sectors" and stores the number of 512-byte sectors instead of the number of 1K-blocks in it. All users of that field, including raid levels 1,4-6,10, are adjusted accordingly. This simplifies the code a bit because it allows to get rid of a couple of divisions/multiplications by two. In order to make checkpatch happy, some minor coding style issues have also been addressed. In particular, size_store() now uses strict_strtoull() instead of simple_strtoull(). Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/md.c | 97 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 44 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index b2c00ce602b..be4a131e8c0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -816,7 +816,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; - mddev->size = sb->size; + mddev->dev_sectors = sb->size * 2; mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = MD_SB_BYTES >> 9; @@ -930,7 +930,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->ctime = mddev->ctime; sb->level = mddev->level; - sb->size = mddev->size; + sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; @@ -1028,7 +1028,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) static unsigned long long super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_offset) return 0; /* can't move bitmap */ @@ -1220,7 +1220,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); - mddev->size = le64_to_cpu(sb->size)/2; + mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = 1024 >> 9; @@ -1316,7 +1316,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); - sb->size = cpu_to_le64(mddev->size<<1); + sb->size = cpu_to_le64(mddev->dev_sectors); if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); @@ -1374,7 +1374,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ @@ -1490,8 +1490,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - /* make sure rdev->size exceeds mddev->size */ - if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { + /* make sure rdev->size exceeds mddev->dev_sectors / 2 */ + if (rdev->size && (mddev->dev_sectors == 0 || + rdev->size < mddev->dev_sectors / 2)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -1500,7 +1501,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (mddev->level > 0) return -ENOSPC; } else - mddev->size = rdev->size; + mddev->dev_sectors = rdev->size * 2; } /* Verify rdev->desc_nr is unique. @@ -2243,7 +2244,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) size -= rdev->data_offset/2; } } - if (size < my_mddev->size) + if (size < my_mddev->dev_sectors / 2) return -EINVAL; /* component must fit device */ rdev->size = size; @@ -2809,7 +2810,7 @@ array_state_show(mddev_t *mddev, char *page) else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && - mddev->size == 0) + mddev->dev_sectors == 0) st = clear; else st = inactive; @@ -3016,7 +3017,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t size_show(mddev_t *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->size); + return sprintf(page, "%llu\n", + (unsigned long long)mddev->dev_sectors / 2); } static int update_size(mddev_t *mddev, sector_t num_sectors); @@ -3028,20 +3030,19 @@ size_store(mddev_t *mddev, const char *buf, size_t len) * not increase it (except from 0). * If array is active, we can try an on-line resize */ - char *e; - int err = 0; - unsigned long long size = simple_strtoull(buf, &e, 10); - if (!*buf || *buf == '\n' || - (*e && *e != '\n')) - return -EINVAL; + unsigned long long sectors; + int err = strict_strtoull(buf, 10, §ors); + if (err < 0) + return err; + sectors *= 2; if (mddev->pers) { - err = update_size(mddev, size * 2); + err = update_size(mddev, sectors); md_update_sb(mddev, 1); } else { - if (mddev->size == 0 || - mddev->size > size) - mddev->size = size; + if (mddev->dev_sectors == 0 || + mddev->dev_sectors > sectors) + mddev->dev_sectors = sectors; else err = -ENOSPC; } @@ -3306,15 +3307,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(mddev_t *mddev, char *page) { - unsigned long max_blocks, resync; + unsigned long max_sectors, resync; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors; + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->size << 1; + max_sectors = mddev->dev_sectors; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); - return sprintf(page, "%lu / %lu\n", resync, max_blocks); + return sprintf(page, "%lu / %lu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); @@ -3789,11 +3790,11 @@ static int do_md_run(mddev_t * mddev) /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, - * Internal Bitmap issues has handled elsewhere. + * Internal Bitmap issues have been handled elsewhere. */ if (rdev->data_offset < rdev->sb_start) { - if (mddev->size && - rdev->data_offset + mddev->size*2 + if (mddev->dev_sectors && + rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { printk("md: %s: data overlaps metadata\n", mdname(mddev)); @@ -3875,7 +3876,9 @@ static int do_md_run(mddev_t * mddev) } mddev->recovery = 0; - mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ + /* may be over-ridden by personality */ + mddev->resync_max_sectors = mddev->dev_sectors; + mddev->barriers_work = 1; mddev->ok_start_degraded = start_dirty_degraded; @@ -4131,7 +4134,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) export_array(mddev); mddev->array_sectors = 0; - mddev->size = 0; + mddev->dev_sectors = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; mddev->resync_min = 0; @@ -4337,8 +4340,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.patch_version = MD_PATCHLEVEL_VERSION; info.ctime = mddev->ctime; info.level = mddev->level; - info.size = mddev->size; - if (info.size != mddev->size) /* overflow */ + info.size = mddev->dev_sectors / 2; + if (info.size != mddev->dev_sectors / 2) /* overflow */ info.size = -1; info.nr_disks = nr; info.raid_disks = mddev->raid_disks; @@ -4788,7 +4791,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->level = info->level; mddev->clevel[0] = 0; - mddev->size = info->size; + mddev->dev_sectors = 2 * (sector_t)info->size; mddev->raid_disks = info->raid_disks; /* don't set md_minor, it is determined by which /dev/md* was * openned @@ -4926,12 +4929,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) ) return -EINVAL; /* Check there is only one change */ - if (info->size >= 0 && mddev->size != info->size) cnt++; - if (mddev->raid_disks != info->raid_disks) cnt++; - if (mddev->layout != info->layout) cnt++; - if ((state ^ info->state) & (1< 1) return -EINVAL; + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) + cnt++; + if (mddev->raid_disks != info->raid_disks) + cnt++; + if (mddev->layout != info->layout) + cnt++; + if ((state ^ info->state) & (1< 1) + return -EINVAL; if (mddev->layout != info->layout) { /* Change layout @@ -4943,7 +4952,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) else return mddev->pers->reconfig(mddev, info->layout, -1); } - if (info->size >= 0 && mddev->size != info->size) + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); if (mddev->raid_disks != info->raid_disks) @@ -5443,7 +5452,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_blocks = mddev->resync_max_sectors >> 1; else - max_blocks = mddev->size; + max_blocks = mddev->dev_sectors / 2; /* * Should not happen. @@ -6019,10 +6028,10 @@ void md_do_sync(mddev_t *mddev) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; else { /* recovery follows the physical size of devices */ - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; j = MaxSector; list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && -- cgit v1.2.3-18-g5258 From dd8ac336c13fd8afdb082ebacb1cddd5cf727889 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: Represent raid device size in sectors. This patch renames the "size" field of struct mdk_rdev_s to "sectors" and changes this field to store sectors instead of blocks. All users of this field, linear.c, raid0.c and md.c, are fixed up accordingly which gets rid of many multiplications and divisions. Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/md.c | 96 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 48 insertions(+), 48 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index be4a131e8c0..07ab6790e29 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -413,7 +413,7 @@ static void free_disk_sb(mdk_rdev_t * rdev) rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_start = 0; - rdev->size = 0; + rdev->sectors = 0; } } @@ -779,9 +779,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); - if (rdev->size < sb->size && sb->level > 1) + if (rdev->sectors < sb->size * 2 && sb->level > 1) /* "this cannot possibly happen" ... */ ret = -EINVAL; @@ -1184,16 +1184,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) ret = 0; } if (minor_version) - rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; + rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - + le64_to_cpu(sb->data_offset); else - rdev->size = rdev->sb_start / 2; - if (rdev->size < le64_to_cpu(sb->data_size)/2) + rdev->sectors = rdev->sb_start; + if (rdev->sectors < le64_to_cpu(sb->data_size)) return -EINVAL; - rdev->size = le64_to_cpu(sb->data_size)/2; + rdev->sectors = le64_to_cpu(sb->data_size); if (le32_to_cpu(sb->chunksize)) - rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); + rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); - if (le64_to_cpu(sb->size) > rdev->size*2) + if (le64_to_cpu(sb->size) > rdev->sectors) return -EINVAL; return ret; } @@ -1390,7 +1391,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) sector_t sb_start; sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; + max_sectors = rdev->sectors + sb_start - rdev->sb_start; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; @@ -1490,9 +1491,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - /* make sure rdev->size exceeds mddev->dev_sectors / 2 */ - if (rdev->size && (mddev->dev_sectors == 0 || - rdev->size < mddev->dev_sectors / 2)) { + /* make sure rdev->sectors exceeds mddev->dev_sectors */ + if (rdev->sectors && (mddev->dev_sectors == 0 || + rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -1501,7 +1502,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (mddev->level > 0) return -ENOSPC; } else - mddev->dev_sectors = rdev->size * 2; + mddev->dev_sectors = rdev->sectors; } /* Verify rdev->desc_nr is unique. @@ -1757,8 +1758,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb) static void print_rdev(mdk_rdev_t *rdev, int major_version) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", - bdevname(rdev->bdev,b), (unsigned long long)rdev->size, + printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", + bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), rdev->desc_nr); if (rdev->sb_loaded) { @@ -2197,7 +2198,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; - if (rdev->size && rdev->mddev->external) + if (rdev->sectors && rdev->mddev->external) /* Must set offset before size, so overlap checks * can be sane */ return -EBUSY; @@ -2211,7 +2212,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); static ssize_t rdev_size_show(mdk_rdev_t *rdev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)rdev->size); + return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) @@ -2227,31 +2228,31 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { - unsigned long long size; - unsigned long long oldsize = rdev->size; mddev_t *my_mddev = rdev->mddev; + sector_t oldsectors = rdev->sectors; + unsigned long long sectors; - if (strict_strtoull(buf, 10, &size) < 0) + if (strict_strtoull(buf, 10, §ors) < 0) return -EINVAL; + sectors *= 2; if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { - size = super_types[my_mddev->major_version]. - rdev_size_change(rdev, size * 2); - if (!size) + sectors = super_types[my_mddev->major_version]. + rdev_size_change(rdev, sectors); + if (!sectors) return -EBUSY; - } else if (!size) { - size = (rdev->bdev->bd_inode->i_size >> 10); - size -= rdev->data_offset/2; - } + } else if (!sectors) + sectors = (rdev->bdev->bd_inode->i_size >> 9) - + rdev->data_offset; } - if (size < my_mddev->dev_sectors / 2) + if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */ - rdev->size = size; - if (size > oldsize && my_mddev->external) { + rdev->sectors = sectors; + if (sectors > oldsectors && my_mddev->external) { /* need to check that all other rdevs with the same ->bdev * do not overlap. We need to unlock the mddev to avoid - * a deadlock. We have already changed rdev->size, and if + * a deadlock. We have already changed rdev->sectors, and if * we have to change it back, we will have the lock again. */ mddev_t *mddev; @@ -2267,9 +2268,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (test_bit(AllReserved, &rdev2->flags) || (rdev->bdev == rdev2->bdev && rdev != rdev2 && - overlaps(rdev->data_offset, rdev->size * 2, + overlaps(rdev->data_offset, rdev->sectors, rdev2->data_offset, - rdev2->size * 2))) { + rdev2->sectors))) { overlap = 1; break; } @@ -2283,11 +2284,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. - * We put oldsize back because we *know* it is + * We put oldsectors back because we *know* it is * safe, and trust userspace not to race with * itself */ - rdev->size = oldsize; + rdev->sectors = oldsectors; return -EBUSY; } } @@ -3760,13 +3761,13 @@ static int do_md_run(mddev_t * mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { if (test_bit(Faulty, &rdev->flags)) continue; - if (rdev->size < chunk_size / 1024) { + if (rdev->sectors < chunk_size / 512) { printk(KERN_WARNING "md: Dev %s smaller than chunk_size:" - " %lluk < %dk\n", + " %llu < %d\n", bdevname(rdev->bdev,b), - (unsigned long long)rdev->size, - chunk_size / 1024); + (unsigned long long)rdev->sectors, + chunk_size / 512); return -EINVAL; } } @@ -4585,7 +4586,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; } else rdev->sb_start = calc_dev_sboffset(rdev->bdev); - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4655,7 +4656,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) else rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING @@ -4856,8 +4857,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) */ return -EBUSY; list_for_each_entry(rdev, &mddev->disks, same_set) { - sector_t avail; - avail = rdev->size * 2; + sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) num_sectors = avail; @@ -5585,7 +5585,7 @@ struct mdstat_info { static int md_seq_show(struct seq_file *seq, void *v) { mddev_t *mddev = v; - sector_t size; + sector_t sectors; mdk_rdev_t *rdev; struct mdstat_info *mi = seq->private; struct bitmap *bitmap; @@ -5621,7 +5621,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", mddev->pers->name); } - size = 0; + sectors = 0; list_for_each_entry(rdev, &mddev->disks, same_set) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", @@ -5633,7 +5633,7 @@ static int md_seq_show(struct seq_file *seq, void *v) continue; } else if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ - size += rdev->size; + sectors += rdev->sectors; } if (!list_empty(&mddev->disks)) { @@ -5643,7 +5643,7 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->array_sectors / 2); else seq_printf(seq, "\n %llu blocks", - (unsigned long long)size); + (unsigned long long)sectors / 2); } if (mddev->persistent) { if (mddev->major_version != 0 || -- cgit v1.2.3-18-g5258 From 34817e8c3948ea20316dfa8fd8947d6d0ee82ba9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:38 +1100 Subject: md: make sure new_level, new_chunksize, new_layout always have sensible values. When an md array is undergoing a change, we have new_* fields that show the new values. When no change is happening, it is least confusing if these have the same value as the normal fields. This is true in most cases, but not when the values are set via sysfs. So fix this up. A subsequent patch will BUG_ON if these things aren't consistent. Signed-off-by: NeilBrown --- drivers/md/md.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 07ab6790e29..117ea5fde56 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2633,9 +2633,9 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) return -EBUSY; - if (mddev->reshape_position != MaxSector) - mddev->new_layout = n; - else + + mddev->new_layout = n; + if (mddev->reshape_position == MaxSector) mddev->layout = n; return len; } @@ -2702,9 +2702,9 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) return -EBUSY; - else if (mddev->reshape_position != MaxSector) - mddev->new_chunk = n; - else + + mddev->new_chunk = n; + if (mddev->reshape_position == MaxSector) mddev->chunk_size = n; return len; } @@ -3831,7 +3831,10 @@ static int do_md_run(mddev_t * mddev) } mddev->pers = pers; spin_unlock(&pers_lock); - mddev->level = pers->level; + if (mddev->level != pers->level) { + mddev->level = pers->level; + mddev->new_level = pers->level; + } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (pers->level >= 4 && pers->level <= 6) -- cgit v1.2.3-18-g5258 From e0cf8f045b2023b0b3f919ee93eb94345f648434 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: md_unregister_thread should cope with being passed NULL Mostly md_unregister_thread is only called when we know that the thread is NULL, but sometimes we need to check first. It is safer to put the check inside md_unregister_thread itself. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 117ea5fde56..f30f09cb08e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5382,6 +5382,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, void md_unregister_thread(mdk_thread_t *thread) { + if (!thread) + return; dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); -- cgit v1.2.3-18-g5258 From 409c57f3801701dfee27a28103dda4831306cb20 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: enable suspend/resume of md devices. To be able to change the 'level' of an md/raid array, we need to suspend the device so that no requests are active - then move some pointers around etc. The code already keeps counts of active requests and the ->quiesce function can be used to wait until those counts hit zero. However the quiesce function blocks new requests once they are all ready 'inside' the personality module, and that is too late if we want to replace the personality modules. So make all md requests come in through a common md_make_request function that keeps track of how many requests have entered the modules but may not yet be on the internal reference counts. Allow md_make_request to be blocked when we want to suspend the device, and make it possible to wait for all those in-transit requests to be added to internal lists so that ->quiesce can wait for them. There is still a problem that when a request completes, we drop the ref count inside the personality code so there is a short time between when the refcount hits zero, and when the personality code is no longer being used. The personality code never blocks (schedule or spinlock) between dropping the refcount and exiting the routine, so this should be safe (as put_module calls synchronize_sched() before unmapping the module code). Signed-off-by: NeilBrown --- drivers/md/md.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 15 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index f30f09cb08e..6cb31f8da14 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -201,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock); ) -static int md_fail_request(struct request_queue *q, struct bio *bio) +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. + */ +static int md_make_request(struct request_queue *q, struct bio *bio) { - bio_io_error(bio); - return 0; + mddev_t *mddev = q->queuedata; + int rv; + if (mddev == NULL || mddev->pers == NULL) { + bio_io_error(bio); + return 0; + } + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + rv = mddev->pers->make_request(q, bio); + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); + + return rv; } +static void mddev_suspend(mddev_t *mddev) +{ + BUG_ON(mddev->suspended); + mddev->suspended = 1; + synchronize_rcu(); + wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + mddev->pers->quiesce(mddev, 1); + md_unregister_thread(mddev->thread); + mddev->thread = NULL; + /* we now know that no code is executing in the personality module, + * except possibly the tail end of a ->bi_end_io function, but that + * is certain to complete before the module has a chance to get + * unloaded + */ +} + +static void mddev_resume(mddev_t *mddev) +{ + mddev->suspended = 0; + wake_up(&mddev->sb_wait); + mddev->pers->quiesce(mddev, 0); +} + + static inline mddev_t *mddev_get(mddev_t *mddev) { atomic_inc(&mddev->active); @@ -314,6 +370,7 @@ static mddev_t * mddev_find(dev_t unit) init_timer(&new->safemode_timer); atomic_set(&new->active, 1); atomic_set(&new->openers, 0); + atomic_set(&new->active_io, 0); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); init_waitqueue_head(&new->recovery_wait); @@ -3632,10 +3689,12 @@ static int md_alloc(dev_t dev, char *name) mddev_put(mddev); return -ENOMEM; } + mddev->queue->queuedata = mddev; + /* Can be unlocked because the queue is new: no concurrency */ queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); - blk_queue_make_request(mddev->queue, md_fail_request); + blk_queue_make_request(mddev->queue, md_make_request); disk = alloc_disk(1 << shift); if (!disk) { @@ -3938,16 +3997,6 @@ static int do_md_run(mddev_t * mddev) set_capacity(disk, mddev->array_sectors); - /* If we call blk_queue_make_request here, it will - * re-initialise max_sectors etc which may have been - * refined inside -> run. So just set the bits we need to set. - * Most initialisation happended when we called - * blk_queue_make_request(..., md_fail_request) - * earlier. - */ - mddev->queue->queuedata = mddev; - mddev->queue->make_request_fn = mddev->pers->make_request; - /* If there is a partially-recovered drive we need to * start recovery here. If we leave it to md_check_recovery, * it will remove the drives and not do the right thing @@ -4077,7 +4126,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) md_super_wait(mddev); if (mddev->ro) set_disk_ro(disk, 0); - blk_queue_make_request(mddev->queue, md_fail_request); + mddev->pers->stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; -- cgit v1.2.3-18-g5258 From 245f46c2c221ef09c7db892f0e3fc2149be42052 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:39:39 +1100 Subject: md: add ->takeover method to support changing the personality managing an array Implement this for RAID6 to be able to 'takeover' a RAID5 array. The new RAID6 will use a layout which places Q on the last device, and that device will be missing. If there are any available spares, one will immediately have Q recovered onto it. Signed-off-by: NeilBrown --- drivers/md/md.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 9 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 6cb31f8da14..05b613b5e4b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2649,18 +2649,101 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { + char level[16]; ssize_t rv = len; - if (mddev->pers) + struct mdk_personality *pers; + void *priv; + + if (mddev->pers == NULL) { + if (len == 0) + return 0; + if (len >= sizeof(mddev->clevel)) + return -ENOSPC; + strncpy(mddev->clevel, buf, len); + if (mddev->clevel[len-1] == '\n') + len--; + mddev->clevel[len] = 0; + mddev->level = LEVEL_NONE; + return rv; + } + + /* request to change the personality. Need to ensure: + * - array is not engaged in resync/recovery/reshape + * - old personality can be suspended + * - new personality will access other array. + */ + + if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') + + if (!mddev->pers->quiesce) { + printk(KERN_WARNING "md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->name); + return -EINVAL; + } + + /* Now find the new personality */ + if (len == 0 || len >= sizeof(level)) + return -EINVAL; + strncpy(level, buf, len); + if (level[len-1] == '\n') len--; - mddev->clevel[len] = 0; - mddev->level = LEVEL_NONE; + level[len] = 0; + + request_module("md-%s", level); + spin_lock(&pers_lock); + pers = find_pers(LEVEL_NONE, level); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + printk(KERN_WARNING "md: personality %s not loaded\n", level); + return -EINVAL; + } + spin_unlock(&pers_lock); + + if (pers == mddev->pers) { + /* Nothing to do! */ + module_put(pers->owner); + return rv; + } + if (!pers->takeover) { + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", + mdname(mddev), level); + return -EINVAL; + } + + /* ->takeover must set new_* and/or delta_disks + * if it succeeds, and may set them when it fails. + */ + priv = pers->takeover(mddev); + if (IS_ERR(priv)) { + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + mddev->raid_disks -= mddev->delta_disks; + mddev->delta_disks = 0; + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s would not accept array\n", + mdname(mddev), level); + return PTR_ERR(priv); + } + + /* Looks like we have a winner */ + mddev_suspend(mddev); + mddev->pers->stop(mddev); + module_put(mddev->pers->owner); + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_size = mddev->new_chunk; + mddev->delta_disks = 0; + pers->run(mddev); + mddev_resume(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); return rv; } -- cgit v1.2.3-18-g5258 From b3546035277847028df650b147469fc943cf5c71 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:56:41 +1100 Subject: md/raid5: allow layout/chunksize to be changed on an active 2-drive raid5. 2-drive raid5's aren't very interesting. But if you are converting a raid1 into a raid5, you will at least temporarily have one. And that it a good time to set the layout/chunksize for the new RAID5 if you aren't happy with the defaults. layout and chunksize don't actually affect the placement of data on a 2-drive raid5, so we just do some internal book-keeping. Signed-off-by: NeilBrown --- drivers/md/md.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 05b613b5e4b..0689d89d263 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2771,12 +2771,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - - mddev->new_layout = n; - if (mddev->reshape_position == MaxSector) - mddev->layout = n; + if (mddev->pers) { + int err; + if (mddev->pers->reconfig == NULL) + return -EBUSY; + err = mddev->pers->reconfig(mddev, n, -1); + if (err) + return err; + } else { + mddev->new_layout = n; + if (mddev->reshape_position == MaxSector) + mddev->layout = n; + } return len; } static struct md_sysfs_entry md_layout = @@ -2833,19 +2839,24 @@ chunk_size_show(mddev_t *mddev, char *page) static ssize_t chunk_size_store(mddev_t *mddev, const char *buf, size_t len) { - /* can only set chunk_size if array is not yet active */ char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - - mddev->new_chunk = n; - if (mddev->reshape_position == MaxSector) - mddev->chunk_size = n; + if (mddev->pers) { + int err; + if (mddev->pers->reconfig == NULL) + return -EBUSY; + err = mddev->pers->reconfig(mddev, -1, n); + if (err) + return err; + } else { + mddev->new_chunk = n; + if (mddev->reshape_position == MaxSector) + mddev->chunk_size = n; + } return len; } static struct md_sysfs_entry md_chunk_size = -- cgit v1.2.3-18-g5258 From 1f403624bde3c678a166984b1e6a727a0ce06f2b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 14:59:03 +1100 Subject: md: centralize ->array_sectors modifications Get personalities out of the business of directly modifying ->array_sectors. Lays groundwork to introduce policy on when ->array_sectors can be modified. Reviewed-by: Andre Noll Signed-off-by: Dan Williams --- drivers/md/md.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 0689d89d263..76ba69b31d6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4977,6 +4977,12 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) return 0; } +void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) +{ + mddev->array_sectors = array_sectors; +} +EXPORT_SYMBOL(md_set_array_sectors); + static int update_size(mddev_t *mddev, sector_t num_sectors) { mdk_rdev_t *rdev; -- cgit v1.2.3-18-g5258 From b522adcde9c4d3fb7b579cfa9160d8bde7744be8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 15:00:31 +1100 Subject: md: 'array_size' sysfs attribute Allow userspace to set the size of the array according to the following semantics: 1/ size must be <= to the size returned by mddev->pers->size(mddev, 0, 0) a) If size is set before the array is running, do_md_run will fail if size is greater than the default size b) A reshape attempt that reduces the default size to less than the set array size should be blocked 2/ once userspace sets the size the kernel will not change it 3/ writing 'default' to this attribute returns control of the size to the kernel and reverts to the size reported by the personality Also, convert locations that need to know the default size from directly reading ->array_sectors to _size. Resync/reshape operations always follow the default size. Finally, fixup other locations that read a number of 1k-blocks from userspace to use strict_blocks_to_sectors() which checks for unsigned long long to sector_t overflow and blocks to sectors overflow. Reviewed-by: Andre Noll Signed-off-by: Dan Williams --- drivers/md/md.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 105 insertions(+), 7 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 76ba69b31d6..923d1250b9a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -387,6 +387,11 @@ static inline int mddev_lock(mddev_t * mddev) return mutex_lock_interruptible(&mddev->reconfig_mutex); } +static inline int mddev_is_locked(mddev_t *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); +} + static inline int mddev_trylock(mddev_t * mddev) { return mutex_trylock(&mddev->reconfig_mutex); @@ -2282,16 +2287,34 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) return 1; } +static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) +{ + unsigned long long blocks; + sector_t new; + + if (strict_strtoull(buf, 10, &blocks) < 0) + return -EINVAL; + + if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) + return -EINVAL; /* sector conversion overflow */ + + new = blocks * 2; + if (new != blocks * 2) + return -EINVAL; /* unsigned long long to sector_t overflow */ + + *sectors = new; + return 0; +} + static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { mddev_t *my_mddev = rdev->mddev; sector_t oldsectors = rdev->sectors; - unsigned long long sectors; + sector_t sectors; - if (strict_strtoull(buf, 10, §ors) < 0) + if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; - sectors *= 2; if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { sectors = super_types[my_mddev->major_version]. @@ -3182,12 +3205,11 @@ size_store(mddev_t *mddev, const char *buf, size_t len) * not increase it (except from 0). * If array is active, we can try an on-line resize */ - unsigned long long sectors; - int err = strict_strtoull(buf, 10, §ors); + sector_t sectors; + int err = strict_blocks_to_sectors(buf, §ors); if (err < 0) return err; - sectors *= 2; if (mddev->pers) { err = update_size(mddev, sectors); md_update_sb(mddev, 1); @@ -3627,6 +3649,57 @@ static struct md_sysfs_entry md_reshape_position = __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); +static ssize_t +array_size_show(mddev_t *mddev, char *page) +{ + if (mddev->external_size) + return sprintf(page, "%llu\n", + (unsigned long long)mddev->array_sectors/2); + else + return sprintf(page, "default\n"); +} + +static ssize_t +array_size_store(mddev_t *mddev, const char *buf, size_t len) +{ + sector_t sectors; + + if (strncmp(buf, "default", 7) == 0) { + if (mddev->pers) + sectors = mddev->pers->size(mddev, 0, 0); + else + sectors = mddev->array_sectors; + + mddev->external_size = 0; + } else { + if (strict_blocks_to_sectors(buf, §ors) < 0) + return -EINVAL; + if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + return -EINVAL; + + mddev->external_size = 1; + } + + mddev->array_sectors = sectors; + set_capacity(mddev->gendisk, mddev->array_sectors); + if (mddev->pers) { + struct block_device *bdev = bdget_disk(mddev->gendisk, 0); + + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, + (loff_t)mddev->array_sectors << 9); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } + + return len; +} + +static struct md_sysfs_entry md_array_size = +__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, + array_size_store); static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -3640,6 +3713,7 @@ static struct attribute *md_default_attrs[] = { &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, + &md_array_size.attr, NULL, }; @@ -4045,7 +4119,17 @@ static int do_md_run(mddev_t * mddev) err = mddev->pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->sync_request) { + else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + WARN_ONCE(!mddev->external_size, "%s: default size too small," + " but 'external_size' not in effect?\n", __func__); + printk(KERN_ERR + "md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + err = -EINVAL; + mddev->pers->stop(mddev); + } + if (err == 0 && mddev->pers->sync_request) { err = bitmap_create(mddev); if (err) { printk(KERN_ERR "%s: failed to create bitmap (%d)\n", @@ -4281,6 +4365,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) export_array(mddev); mddev->array_sectors = 0; + mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; @@ -4979,10 +5064,23 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) { + WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + + if (mddev->external_size) + return; + mddev->array_sectors = array_sectors; } EXPORT_SYMBOL(md_set_array_sectors); +void md_set_array_sectors_lock(mddev_t *mddev, sector_t array_sectors) +{ + mddev_lock(mddev); + md_set_array_sectors(mddev, array_sectors); + mddev_unlock(mddev); +} +EXPORT_SYMBOL(md_set_array_sectors_lock); + static int update_size(mddev_t *mddev, sector_t num_sectors) { mdk_rdev_t *rdev; -- cgit v1.2.3-18-g5258 From cea9c22800773cecb1d41f4a6139f9eb6a95368b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:15:05 +1100 Subject: md: add explicit method to signal the end of a reshape. Currently raid5 (the only module that supports restriping) notices that the reshape has finished be sync_request being given a large value, and handles any cleanup them. This patch changes it so md_check_recovery calls into an explicit finish_reshape method as well. The clean-up from sync_request can do things that need to be done promptly, typically things local to the raid5_conf_t structure. The "finish_reshape" method is called under the mddev_lock so it can do things involving reconfiguring the device. This allows us to get rid of md_set_array_sectors_locked, which would have caused a deadlock if you tried to stop and array while a reshape was happening. Signed-off-by: NeilBrown --- drivers/md/md.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 923d1250b9a..c50931352b2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5073,14 +5073,6 @@ void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) } EXPORT_SYMBOL(md_set_array_sectors); -void md_set_array_sectors_lock(mddev_t *mddev, sector_t array_sectors) -{ - mddev_lock(mddev); - md_set_array_sectors(mddev, array_sectors); - mddev_unlock(mddev); -} -EXPORT_SYMBOL(md_set_array_sectors_lock); - static int update_size(mddev_t *mddev, sector_t num_sectors) { mdk_rdev_t *rdev; @@ -6641,6 +6633,9 @@ void md_check_recovery(mddev_t *mddev) sysfs_notify(&mddev->kobj, NULL, "degraded"); } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); md_update_sb(mddev, 1); /* if array is no-longer degraded, then any saved_raid_disk -- cgit v1.2.3-18-g5258 From d1a7c50369835f9ecbd7752016cd9302ecfae678 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:24:32 +1100 Subject: md: don't display meaningless values in sysfs files resync_start and sync_speed When no resync if happening, both of these files currently have meaningless values (is slightly different ways). Change them to "none" in that case. Signed-off-by: NeilBrown --- drivers/md/md.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index c50931352b2..2be574c0a27 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2888,6 +2888,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(mddev_t *mddev, char *page) { + if (mddev->recovery_cp == MaxSector) + return sprintf(page, "none\n"); return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); } @@ -3469,6 +3471,8 @@ static ssize_t sync_speed_show(mddev_t *mddev, char *page) { unsigned long resync, dt, db; + if (mddev->curr_resync == 0) + return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; if (!dt) dt++; -- cgit v1.2.3-18-g5258