diff options
Diffstat (limited to 'drivers/md/md.c')
| -rw-r--r-- | drivers/md/md.c | 194 |
1 files changed, 134 insertions, 60 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 16d84e091e2..32fc19c540d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1075,6 +1075,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@ -1153,6 +1154,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -1168,6 +1171,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) desc->raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; + rdev->saved_raid_disk = desc->raid_disk; } else if (desc->state & (1<<MD_DISK_ACTIVE)) { /* active but not in sync implies recovery up to * reshape position. We don't know exactly where @@ -1561,6 +1565,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@ -1643,6 +1648,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -1663,10 +1670,14 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(Faulty, &rdev->flags); break; default: + rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_OFFSET)) + MD_FEATURE_RECOVERY_OFFSET)) { rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); - else + if (!(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_BITMAP)) + rdev->saved_raid_disk = -1; + } else set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; @@ -1728,6 +1739,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + if (rdev->saved_raid_disk >= 0 && mddev->bitmap) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= @@ -2469,8 +2483,7 @@ repeat: if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ - if (!test_bit(Faulty, &rdev->flags) && - rdev->saved_raid_disk == -1) { + if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); @@ -2486,11 +2499,9 @@ repeat: rdev->badblocks.size = 0; } - } else if (test_bit(Faulty, &rdev->flags)) + } else pr_debug("md: %s (skipping faulty)\n", bdevname(rdev->bdev, b)); - else - pr_debug("(skipping incremental s/r "); if (mddev->level == LEVEL_MULTIPATH) /* only need to write one superblock... */ @@ -2606,6 +2617,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * blocked - sets the Blocked flags * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active + * -insync - clear Insync for a device with a slot assigned, + * so that it gets rebuilt based on bitmap * write_error - sets WriteErrorSeen * -write_error - clears WriteErrorSeen */ @@ -2654,6 +2667,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + err = 0; } else if (cmd_match(buf, "write_error")) { set_bit(WriteErrorSeen, &rdev->flags); err = 0; @@ -2786,6 +2804,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) else rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); err = rdev->mddev->pers-> hot_add_disk(rdev->mddev, rdev); if (err) { @@ -3429,6 +3448,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->level = LEVEL_NONE; return rv; } + if (mddev->ro) + return -EROFS; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape @@ -3580,6 +3601,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) pers->run(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev_resume(mddev); + if (!mddev->thread) + md_update_sb(mddev, 1); sysfs_notify(&mddev->kobj, NULL, "level"); md_new_event(mddev); return rv; @@ -3613,6 +3636,8 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) int err; if (mddev->pers->check_reshape == NULL) return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_layout = n; err = mddev->pers->check_reshape(mddev); if (err) { @@ -3702,6 +3727,8 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len) int err; if (mddev->pers->check_reshape == NULL) return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_chunk_sectors = n >> 9; err = mddev->pers->check_reshape(mddev); if (err) { @@ -5160,32 +5187,6 @@ static int restart_array(struct mddev *mddev) return 0; } -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); - - return 0; -} - -void restore_bitmap_write_access(struct file *file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - atomic_set(&inode->i_writecount, 1); - spin_unlock(&inode->i_lock); -} - static void md_clean(struct mddev *mddev) { mddev->array_sectors = 0; @@ -5406,7 +5407,6 @@ static int do_md_stop(struct mddev * mddev, int mode, bitmap_destroy(mddev); if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); fput(mddev->bitmap_info.file); mddev->bitmap_info.file = NULL; } @@ -5599,7 +5599,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); if (mddev->bitmap && mddev->bitmap_info.offset) - info.state = (1<<MD_SB_BITMAP_PRESENT); + info.state |= (1<<MD_SB_BITMAP_PRESENT); info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; @@ -5758,8 +5758,10 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; set_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; + rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. validate_super(mddev, rdev); @@ -5772,11 +5774,6 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) return -EINVAL; } - if (test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = rdev->raid_disk; - else - rdev->saved_raid_disk = -1; - clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); @@ -5961,7 +5958,7 @@ abort_export: static int set_bitmap_file(struct mddev *mddev, int fd) { - int err; + int err = 0; if (mddev->pers) { if (!mddev->pers->quiesce) @@ -5973,6 +5970,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (fd >= 0) { + struct inode *inode; if (mddev->bitmap) return -EEXIST; /* cannot add when bitmap is present */ mddev->bitmap_info.file = fget(fd); @@ -5983,10 +5981,21 @@ static int set_bitmap_file(struct mddev *mddev, int fd) return -EBADF; } - err = deny_bitmap_write_access(mddev->bitmap_info.file); - if (err) { + inode = mddev->bitmap_info.file->f_mapping->host; + if (!S_ISREG(inode->i_mode)) { + printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", + mdname(mddev)); + err = -EBADF; + } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { + printk(KERN_ERR "%s: error: bitmap file must open for write\n", + mdname(mddev)); + err = -EBADF; + } else if (atomic_read(&inode->i_writecount) != 1) { printk(KERN_ERR "%s: error: bitmap file is already in use\n", mdname(mddev)); + err = -EBUSY; + } + if (err) { fput(mddev->bitmap_info.file); mddev->bitmap_info.file = NULL; return err; @@ -6009,10 +6018,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); + if (mddev->bitmap_info.file) fput(mddev->bitmap_info.file); - } mddev->bitmap_info.file = NULL; } @@ -6134,6 +6141,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) */ if (mddev->sync_thread) return -EBUSY; + if (mddev->ro) + return -EROFS; rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; @@ -6156,6 +6165,8 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; + if (mddev->ro) + return -EROFS; if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; @@ -6326,6 +6337,32 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } +static inline bool md_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case BLKROSET: + case GET_ARRAY_INFO: + case GET_BITMAP_FILE: + case GET_DISK_INFO: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case PRINT_RAID_DEBUG: + case RAID_AUTORUN: + case RAID_VERSION: + case RESTART_ARRAY_RW: + case RUN_ARRAY: + case SET_ARRAY_INFO: + case SET_BITMAP_FILE: + case SET_DISK_FAULTY: + case STOP_ARRAY: + case STOP_ARRAY_RO: + return true; + default: + return false; + } +} + static int md_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -6334,6 +6371,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, struct mddev *mddev = NULL; int ro; + if (!md_ioctl_valid(cmd)) + return -ENOTTY; + switch (cmd) { case RAID_VERSION: case GET_ARRAY_INFO: @@ -7135,11 +7175,14 @@ static int md_seq_open(struct inode *inode, struct file *file) return error; } +static int md_unloading; static unsigned int mdstat_poll(struct file *filp, poll_table *wait) { struct seq_file *seq = filp->private_data; int mask; + if (md_unloading) + return POLLIN|POLLRDNORM|POLLERR|POLLPRI;; poll_wait(filp, &md_event_waiters, wait); /* always allow read */ @@ -7348,8 +7391,10 @@ void md_do_sync(struct md_thread *thread) /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (mddev->ro) /* never try to sync a read-only array */ + if (mddev->ro) {/* never try to sync a read-only array */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); return; + } if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { @@ -7456,6 +7501,19 @@ void md_do_sync(struct md_thread *thread) rdev->recovery_offset < j) j = rdev->recovery_offset; rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. + */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } } printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); @@ -7704,10 +7762,12 @@ static int remove_and_add_spares(struct mddev *mddev, if (test_bit(Faulty, &rdev->flags)) continue; if (mddev->ro && - rdev->saved_raid_disk < 0) + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) continue; - rdev->recovery_offset = 0; + if (rdev->saved_raid_disk < 0) + rdev->recovery_offset = 0; if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) @@ -7785,9 +7845,13 @@ void md_check_recovery(struct mddev *mddev) * As we only add devices that are already in-sync, * we can activate the spares immediately. */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); remove_and_add_spares(mddev, NULL); - mddev->pers->spare_active(mddev); + /* There is no thread, but we need to call + * ->spare_active and clear saved_raid_disk + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } @@ -7924,14 +7988,10 @@ void md_reap_sync_thread(struct mddev *mddev) mddev->pers->finish_reshape(mddev); /* If array is no-longer degraded, then any saved_raid_disk - * information must be scrapped. Also if any device is now - * In_sync we must scrape the saved_raid_disk for that device - * do the superblock for an incrementally recovered device - * written out. + * information must be scrapped. */ - rdev_for_each(rdev, mddev) - if (!mddev->degraded || - test_bit(In_sync, &rdev->flags)) + if (!mddev->degraded) + rdev_for_each(rdev, mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); @@ -8296,7 +8356,7 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) if (a < s) { /* we need to split this range */ if (bb->count >= MD_MAX_BADBLOCKS) { - rv = 0; + rv = -ENOSPC; goto out; } memmove(p+lo+1, p+lo, (bb->count - lo) * 8); @@ -8482,7 +8542,8 @@ static int md_notify_reboot(struct notifier_block *this, if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); - mddev->safemode = 2; + if (mddev->persistent) + mddev->safemode = 2; mddev_unlock(mddev); } need_delay = 1; @@ -8624,6 +8685,7 @@ static __exit void md_exit(void) { struct mddev *mddev; struct list_head *tmp; + int delay = 1; blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); @@ -8632,7 +8694,19 @@ static __exit void md_exit(void) unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); + + /* We cannot unload the modules while some process is + * waiting for us in select() or poll() - wake them up + */ + md_unloading = 1; + while (waitqueue_active(&md_event_waiters)) { + /* not safe to leave yet */ + wake_up(&md_event_waiters); + msleep(delay); + delay += delay; + } remove_proc_entry("mdstat", NULL); + for_each_mddev(mddev, tmp) { export_array(mddev); mddev->hold_active = 0; |
