diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 4 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 43 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 2 | ||||
-rw-r--r-- | drivers/md/faulty.c | 9 | ||||
-rw-r--r-- | drivers/md/linear.c | 36 | ||||
-rw-r--r-- | drivers/md/md.c | 537 | ||||
-rw-r--r-- | drivers/md/md.h | 16 | ||||
-rw-r--r-- | drivers/md/multipath.c | 13 | ||||
-rw-r--r-- | drivers/md/raid0.c | 251 | ||||
-rw-r--r-- | drivers/md/raid0.h | 3 | ||||
-rw-r--r-- | drivers/md/raid1.c | 114 | ||||
-rw-r--r-- | drivers/md/raid10.c | 300 | ||||
-rw-r--r-- | drivers/md/raid10.h | 12 | ||||
-rw-r--r-- | drivers/md/raid5.c | 233 |
14 files changed, 989 insertions, 584 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index acb3a4e404f..4a6feac8c94 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -100,8 +100,8 @@ config MD_RAID1 If unsure, say Y. config MD_RAID10 - tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" - depends on BLK_DEV_MD && EXPERIMENTAL + tristate "RAID-10 (mirrored striping) mode" + depends on BLK_DEV_MD ---help--- RAID-10 provides a combination of striping (RAID-0) and mirroring (RAID-1) with easier configuration and more flexible diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index f084249295d..1742435ce3a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -505,7 +505,7 @@ void bitmap_update_sb(struct bitmap *bitmap) return; } spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) { /* rocking back to read-only */ @@ -526,7 +526,7 @@ void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->sb_page) return; - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); @@ -575,7 +575,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) return err; } - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; @@ -661,7 +661,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, return 0; } spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); old = le32_to_cpu(sb->state) & bits; switch (op) { case MASK_SET: sb->state |= cpu_to_le32(bits); @@ -1292,9 +1292,14 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect if (!bitmap) return 0; if (behind) { + int bw; atomic_inc(&bitmap->behind_writes); + bw = atomic_read(&bitmap->behind_writes); + if (bw > bitmap->behind_writes_used) + bitmap->behind_writes_used = bw; + PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", - atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + bw, bitmap->max_write_behind); } while (sectors) { @@ -1351,7 +1356,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto { if (!bitmap) return; if (behind) { - atomic_dec(&bitmap->behind_writes); + if (atomic_dec_and_test(&bitmap->behind_writes)) + wake_up(&bitmap->behind_wait); PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); } @@ -1675,6 +1681,7 @@ int bitmap_create(mddev_t *mddev) atomic_set(&bitmap->pending_writes, 0); init_waitqueue_head(&bitmap->write_wait); init_waitqueue_head(&bitmap->overflow_wait); + init_waitqueue_head(&bitmap->behind_wait); bitmap->mddev = mddev; @@ -1692,7 +1699,7 @@ int bitmap_create(mddev_t *mddev) * and bypass the page cache, we must sync the file * first. */ - vfs_fsync(file, file->f_dentry, 1); + vfs_fsync(file, 1); } /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ if (!mddev->bitmap_info.external) @@ -2006,6 +2013,27 @@ static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len) static struct md_sysfs_entry bitmap_can_clear = __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); +static ssize_t +behind_writes_used_show(mddev_t *mddev, char *page) +{ + if (mddev->bitmap == NULL) + return sprintf(page, "0\n"); + return sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); +} + +static ssize_t +behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap) + mddev->bitmap->behind_writes_used = 0; + return len; +} + +static struct md_sysfs_entry max_backlog_used = +__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, + behind_writes_used_show, behind_writes_used_reset); + static struct attribute *md_bitmap_attrs[] = { &bitmap_location.attr, &bitmap_timeout.attr, @@ -2013,6 +2041,7 @@ static struct attribute *md_bitmap_attrs[] = { &bitmap_chunksize.attr, &bitmap_metadata.attr, &bitmap_can_clear.attr, + &max_backlog_used.attr, NULL }; struct attribute_group md_bitmap_group = { diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index cb821d76d1b..3797dea4723 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -227,6 +227,7 @@ struct bitmap { int allclean; atomic_t behind_writes; + unsigned long behind_writes_used; /* highest actual value at runtime */ /* * the bitmap daemon - periodically wakes up and sweeps the bitmap @@ -239,6 +240,7 @@ struct bitmap { atomic_t pending_writes; /* pending writes to the bitmap file */ wait_queue_head_t write_wait; wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; struct sysfs_dirent *sysfs_can_clear; }; diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 8e3850b98cc..1a898788461 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -169,10 +169,9 @@ static void add_sector(conf_t *conf, sector_t start, int mode) conf->nfaults = n+1; } -static int make_request(struct request_queue *q, struct bio *bio) +static int make_request(mddev_t *mddev, struct bio *bio) { - mddev_t *mddev = q->queuedata; - conf_t *conf = (conf_t*)mddev->private; + conf_t *conf = mddev->private; int failit = 0; if (bio_data_dir(bio) == WRITE) { @@ -225,7 +224,7 @@ static int make_request(struct request_queue *q, struct bio *bio) static void status(struct seq_file *seq, mddev_t *mddev) { - conf_t *conf = (conf_t*)mddev->private; + conf_t *conf = mddev->private; int n; if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) @@ -328,7 +327,7 @@ static int run(mddev_t *mddev) static int stop(mddev_t *mddev) { - conf_t *conf = (conf_t *)mddev->private; + conf_t *conf = mddev->private; kfree(conf); mddev->private = NULL; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09437e95823..7e0e057db9a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -159,7 +159,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) sector_t sectors; if (j < 0 || j >= raid_disks || disk->rdev) { - printk("linear: disk numbering problem. Aborting!\n"); + printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", + mdname(mddev)); goto out; } @@ -187,7 +188,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) } if (cnt != raid_disks) { - printk("linear: not enough drives present. Aborting!\n"); + printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", + mdname(mddev)); goto out; } @@ -282,29 +284,21 @@ static int linear_stop (mddev_t *mddev) rcu_barrier(); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); + mddev->private = NULL; return 0; } -static int linear_make_request (struct request_queue *q, struct bio *bio) +static int linear_make_request (mddev_t *mddev, struct bio *bio) { - const int rw = bio_data_dir(bio); - mddev_t *mddev = q->queuedata; dev_info_t *tmp_dev; sector_t start_sector; - int cpu; if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { md_barrier_request(mddev, bio); return 0; } - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - rcu_read_lock(); tmp_dev = which_dev(mddev, bio->bi_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; @@ -314,12 +308,14 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) || (bio->bi_sector < start_sector))) { char b[BDEVNAME_SIZE]; - printk("linear_make_request: Sector %llu out of bounds on " - "dev %s: %llu sectors, offset %llu\n", - (unsigned long long)bio->bi_sector, - bdevname(tmp_dev->rdev->bdev, b), - (unsigned long long)tmp_dev->rdev->sectors, - (unsigned long long)start_sector); + printk(KERN_ERR + "md/linear:%s: make_request: Sector %llu out of bounds on " + "dev %s: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_sector, + bdevname(tmp_dev->rdev->bdev, b), + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); rcu_read_unlock(); bio_io_error(bio); return 0; @@ -336,9 +332,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) bp = bio_split(bio, end_sector - bio->bi_sector); - if (linear_make_request(q, &bp->bio1)) + if (linear_make_request(mddev, &bp->bio1)) generic_make_request(&bp->bio1); - if (linear_make_request(q, &bp->bio2)) + if (linear_make_request(mddev, &bp->bio2)) generic_make_request(&bp->bio2); bio_pair_release(bp); return 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index a9fd491796a..46b3a044ead 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -215,8 +215,11 @@ static DEFINE_SPINLOCK(all_mddevs_lock); */ static int md_make_request(struct request_queue *q, struct bio *bio) { + const int rw = bio_data_dir(bio); mddev_t *mddev = q->queuedata; int rv; + int cpu; + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return 0; @@ -237,13 +240,27 @@ static int md_make_request(struct request_queue *q, struct bio *bio) } atomic_inc(&mddev->active_io); rcu_read_unlock(); - rv = mddev->pers->make_request(q, bio); + + rv = mddev->pers->make_request(mddev, bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) wake_up(&mddev->sb_wait); return rv; } +/* mddev_suspend makes sure no new requests are submitted + * to the device, and that any requests that have been submitted + * are completely handled. + * Once ->stop is called and completes, the module will be completely + * unused. + */ static void mddev_suspend(mddev_t *mddev) { BUG_ON(mddev->suspended); @@ -251,13 +268,6 @@ static void mddev_suspend(mddev_t *mddev) synchronize_rcu(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); - md_unregister_thread(mddev->thread); - mddev->thread = NULL; - /* we now know that no code is executing in the personality module, - * except possibly the tail end of a ->bi_end_io function, but that - * is certain to complete before the module has a chance to get - * unloaded - */ } static void mddev_resume(mddev_t *mddev) @@ -344,7 +354,7 @@ static void md_submit_barrier(struct work_struct *ws) bio_endio(bio, 0); else { bio->bi_rw &= ~(1<<BIO_RW_BARRIER); - if (mddev->pers->make_request(mddev->queue, bio)) + if (mddev->pers->make_request(mddev, bio)) generic_make_request(bio); mddev->barrier = POST_REQUEST_BARRIER; submit_barriers(mddev); @@ -406,6 +416,27 @@ static void mddev_put(mddev_t *mddev) spin_unlock(&all_mddevs_lock); } +static void mddev_init(mddev_t *mddev) +{ + mutex_init(&mddev->open_mutex); + mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->bitmap_info.mutex); + INIT_LIST_HEAD(&mddev->disks); + INIT_LIST_HEAD(&mddev->all_mddevs); + init_timer(&mddev->safemode_timer); + atomic_set(&mddev->active, 1); + atomic_set(&mddev->openers, 0); + atomic_set(&mddev->active_io, 0); + spin_lock_init(&mddev->write_lock); + atomic_set(&mddev->flush_pending, 0); + init_waitqueue_head(&mddev->sb_wait); + init_waitqueue_head(&mddev->recovery_wait); + mddev->reshape_position = MaxSector; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->level = LEVEL_NONE; +} + static mddev_t * mddev_find(dev_t unit) { mddev_t *mddev, *new = NULL; @@ -472,23 +503,7 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; - mutex_init(&new->open_mutex); - mutex_init(&new->reconfig_mutex); - mutex_init(&new->bitmap_info.mutex); - INIT_LIST_HEAD(&new->disks); - INIT_LIST_HEAD(&new->all_mddevs); - init_timer(&new->safemode_timer); - atomic_set(&new->active, 1); - atomic_set(&new->openers, 0); - atomic_set(&new->active_io, 0); - spin_lock_init(&new->write_lock); - atomic_set(&new->flush_pending, 0); - init_waitqueue_head(&new->sb_wait); - init_waitqueue_head(&new->recovery_wait); - new->reshape_position = MaxSector; - new->resync_min = 0; - new->resync_max = MaxSector; - new->level = LEVEL_NONE; + mddev_init(new); goto retry; } @@ -508,9 +523,36 @@ static inline int mddev_trylock(mddev_t * mddev) return mutex_trylock(&mddev->reconfig_mutex); } -static inline void mddev_unlock(mddev_t * mddev) +static struct attribute_group md_redundancy_group; + +static void mddev_unlock(mddev_t * mddev) { - mutex_unlock(&mddev->reconfig_mutex); + if (mddev->to_remove) { + /* These cannot be removed under reconfig_mutex as + * an access to the files will try to take reconfig_mutex + * while holding the file unremovable, which leads to + * a deadlock. + * So hold open_mutex instead - we are allowed to take + * it while holding reconfig_mutex, and md_run can + * use it to wait for the remove to complete. + */ + struct attribute_group *to_remove = mddev->to_remove; + mddev->to_remove = NULL; + mutex_lock(&mddev->open_mutex); + mutex_unlock(&mddev->reconfig_mutex); + + if (to_remove != &md_redundancy_group) + sysfs_remove_group(&mddev->kobj, to_remove); + if (mddev->pers == NULL || + mddev->pers->sync_request == NULL) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + } + mutex_unlock(&mddev->open_mutex); + } else + mutex_unlock(&mddev->reconfig_mutex); md_wakeup_thread(mddev->thread); } @@ -1029,10 +1071,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->bitmap_info.default_offset; } else if (mddev->pers == NULL) { - /* Insist on good event counter while assembling */ + /* Insist on good event counter while assembling, except + * for spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (sb->disks[rdev->desc_nr].state & ( + (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) + if (ev1 < mddev->events) + return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. @@ -1428,10 +1473,14 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } } else if (mddev->pers == NULL) { - /* Insist of good event counter while assembling */ + /* Insist of good event counter while assembling, except for + * spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + if (ev1 < mddev->events) + return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old. @@ -2047,7 +2096,6 @@ static void sync_sbs(mddev_t * mddev, int nospares) if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && - (rdev->sb_events&1)==0 && rdev->sb_events+1 == mddev->events)) { /* Don't update this superblock */ rdev->sb_loaded = 2; @@ -2100,28 +2148,14 @@ repeat: * and 'events' is odd, we can roll back to the previous clean state */ if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) - && (mddev->events & 1) - && mddev->events != 1) + && mddev->can_decrease_events + && mddev->events != 1) { mddev->events--; - else { + mddev->can_decrease_events = 0; + } else { /* otherwise we have to go forward and ... */ mddev->events ++; - if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ - /* .. if the array isn't clean, an 'even' event must also go - * to spares. */ - if ((mddev->events&1)==0) { - nospares = 0; - sync_req = 2; /* force a second update to get the - * even/odd in sync */ - } - } else { - /* otherwise an 'odd' event must go to spares */ - if ((mddev->events&1)) { - nospares = 0; - sync_req = 2; /* force a second update to get the - * even/odd in sync */ - } - } + mddev->can_decrease_events = nospares; } if (!mddev->events) { @@ -2365,6 +2399,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) return err; sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&rdev->mddev->kobj, nm); + rdev->raid_disk = -1; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { @@ -2780,8 +2815,9 @@ static void analyze_sbs(mddev_t * mddev) i = 0; rdev_for_each(rdev, tmp, mddev) { - if (rdev->desc_nr >= mddev->max_disks || - i > mddev->max_disks) { + if (mddev->max_disks && + (rdev->desc_nr >= mddev->max_disks || + i > mddev->max_disks)) { printk(KERN_WARNING "md: %s: %s: only %d devices permitted\n", mdname(mddev), bdevname(rdev->bdev, b), @@ -2897,9 +2933,10 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { - char level[16]; + char clevel[16]; ssize_t rv = len; struct mdk_personality *pers; + long level; void *priv; mdk_rdev_t *rdev; @@ -2932,19 +2969,22 @@ level_store(mddev_t *mddev, const char *buf, size_t len) } /* Now find the new personality */ - if (len == 0 || len >= sizeof(level)) + if (len == 0 || len >= sizeof(clevel)) return -EINVAL; - strncpy(level, buf, len); - if (level[len-1] == '\n') + strncpy(clevel, buf, len); + if (clevel[len-1] == '\n') len--; - level[len] = 0; + clevel[len] = 0; + if (strict_strtol(clevel, 10, &level)) + level = LEVEL_NONE; - request_module("md-%s", level); + if (request_module("md-%s", clevel) != 0) + request_module("md-level-%s", clevel); spin_lock(&pers_lock); - pers = find_pers(LEVEL_NONE, level); + pers = find_pers(level, clevel); if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); - printk(KERN_WARNING "md: personality %s not loaded\n", level); + printk(KERN_WARNING "md: personality %s not loaded\n", clevel); return -EINVAL; } spin_unlock(&pers_lock); @@ -2957,7 +2997,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (!pers->takeover) { module_put(pers->owner); printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", - mdname(mddev), level); + mdname(mddev), clevel); return -EINVAL; } @@ -2973,13 +3013,44 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->delta_disks = 0; module_put(pers->owner); printk(KERN_WARNING "md: %s: %s would not accept array\n", - mdname(mddev), level); + mdname(mddev), clevel); return PTR_ERR(priv); } /* Looks like we have a winner */ mddev_suspend(mddev); mddev->pers->stop(mddev); + + if (mddev->pers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); + } + if (mddev->pers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + + if (mddev->pers->sync_request == NULL && + mddev->external) { + /* We are converting from a no-redundancy array + * to a redundancy array and metadata is managed + * externally so we need to be sure that writes + * won't block due to a need to transition + * clean->dirty + * until external management is started. + */ + mddev->in_sync = 0; + mddev->safemode_delay = 0; + mddev->safemode = 0; + } + module_put(mddev->pers->owner); /* Invalidate devices that are now superfluous */ list_for_each_entry(rdev, &mddev->disks, same_set) @@ -2994,11 +3065,20 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; + if (mddev->pers->sync_request == NULL) { + /* this is now an array without redundancy, so + * it must always be in_sync + */ + mddev->in_sync = 1; + del_timer_sync(&mddev->safemode_timer); + } pers->run(mddev); mddev_resume(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + sysfs_notify(&mddev->kobj, NULL, "level"); + md_new_event(mddev); return rv; } @@ -3237,6 +3317,7 @@ array_state_show(mddev_t *mddev, char *page) } static int do_md_stop(mddev_t * mddev, int ro, int is_open); +static int md_set_readonly(mddev_t * mddev, int is_open); static int do_md_run(mddev_t * mddev); static int restart_array(mddev_t *mddev); @@ -3267,7 +3348,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; /* not supported yet */ case readonly: if (mddev->pers) - err = do_md_stop(mddev, 1, 0); + err = md_set_readonly(mddev, 0); else { mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); @@ -3277,7 +3358,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case read_auto: if (mddev->pers) { if (mddev->ro == 0) - err = do_md_stop(mddev, 1, 0); + err = md_set_readonly(mddev, 0); else if (mddev->ro == 1) err = restart_array(mddev); if (err == 0) { @@ -4082,15 +4163,6 @@ static void mddev_delayed_delete(struct work_struct *ws) { mddev_t *mddev = container_of(ws, mddev_t, del_work); - if (mddev->private) { - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - if (mddev->private != (void*)1) - sysfs_remove_group(&mddev->kobj, mddev->private); - if (mddev->sysfs_action) - sysfs_put(mddev->sysfs_action); - mddev->sysfs_action = NULL; - mddev->private = NULL; - } sysfs_remove_group(&mddev->kobj, &md_bitmap_group); kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); @@ -4234,11 +4306,10 @@ static void md_safemode_timeout(unsigned long data) static int start_dirty_degraded; -static int do_md_run(mddev_t * mddev) +static int md_run(mddev_t *mddev) { int err; mdk_rdev_t *rdev; - struct gendisk *disk; struct mdk_personality *pers; if (list_empty(&mddev->disks)) @@ -4248,6 +4319,13 @@ static int do_md_run(mddev_t * mddev) if (mddev->pers) return -EBUSY; + /* These two calls synchronise us with the + * sysfs_remove_group calls in mddev_unlock, + * so they must have completed. + */ + mutex_lock(&mddev->open_mutex); + mutex_unlock(&mddev->open_mutex); + /* * Analyze all RAID superblock(s) */ @@ -4296,8 +4374,6 @@ static int do_md_run(mddev_t * mddev) sysfs_notify_dirent(rdev->sysfs_state); } - disk = mddev->gendisk; - spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { @@ -4425,22 +4501,32 @@ static int do_md_run(mddev_t * mddev) if (mddev->flags) md_update_sb(mddev, 0); - set_capacity(disk, mddev->array_sectors); - md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ - revalidate_disk(mddev->gendisk); - mddev->changed = 1; md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); if (mddev->sysfs_action) sysfs_notify_dirent(mddev->sysfs_action); sysfs_notify(&mddev->kobj, NULL, "degraded"); - kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); return 0; } +static int do_md_run(mddev_t *mddev) +{ + int err; + + err = md_run(mddev); + if (err) + goto out; + + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); +out: + return err; +} + static int restart_array(mddev_t *mddev) { struct gendisk *disk = mddev->gendisk; @@ -4491,9 +4577,110 @@ void restore_bitmap_write_access(struct file *file) spin_unlock(&inode->i_lock); } +static void md_clean(mddev_t *mddev) +{ + mddev->array_sectors = 0; + mddev->external_size = 0; + mddev->dev_sectors = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; + mddev->level = LEVEL_NONE; + mddev->clevel[0] = 0; + mddev->flags = 0; + mddev->ro = 0; + mddev->metadata_type[0] = 0; + mddev->chunk_sectors = 0; + mddev->ctime = mddev->utime = 0; + mddev->layout = 0; + mddev->max_disks = 0; + mddev->events = 0; + mddev->can_decrease_events = 0; + mddev->delta_disks = 0; + mddev->new_level = LEVEL_NONE; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + mddev->curr_resync = 0; + mddev->resync_mismatches = 0; + mddev->suspend_lo = mddev->suspend_hi = 0; + mddev->sync_speed_min = mddev->sync_speed_max = 0; + mddev->recovery = 0; + mddev->in_sync = 0; + mddev->degraded = 0; + mddev->barriers_work = 0; + mddev->safemode = 0; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.chunksize = 0; + mddev->bitmap_info.daemon_sleep = 0; + mddev->bitmap_info.max_write_behind = 0; +} + +static void md_stop_writes(mddev_t *mddev) +{ + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + } + + del_timer_sync(&mddev->safemode_timer); + + bitmap_flush(mddev); + md_super_wait(mddev); + + if (!mddev->in_sync || mddev->flags) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev, 1); + } +} + +static void md_stop(mddev_t *mddev) +{ + md_stop_writes(mddev); + + mddev->pers->stop(mddev); + if (mddev->pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(mddev->pers->owner); + mddev->pers = NULL; + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +} + +static int md_set_readonly(mddev_t *mddev, int is_open) +{ + int err = 0; + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > is_open) { + printk("md: %s still in use.\n",mdname(mddev)); + err = -EBUSY; + goto out; + } + if (mddev->pers) { + md_stop_writes(mddev); + + err = -ENXIO; + if (mddev->ro==1) + goto out; + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + sysfs_notify_dirent(mddev->sysfs_state); + err = 0; + } +out: + mutex_unlock(&mddev->open_mutex); + return err; +} + /* mode: * 0 - completely stop and dis-assemble array - * 1 - switch to readonly * 2 - stop but do not disassemble array */ static int do_md_stop(mddev_t * mddev, int mode, int is_open) @@ -4508,64 +4695,32 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) err = -EBUSY; } else if (mddev->pers) { - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - } - - del_timer_sync(&mddev->safemode_timer); + if (mddev->ro) + set_disk_ro(disk, 0); - switch(mode) { - case 1: /* readonly */ - err = -ENXIO; - if (mddev->ro==1) - goto out; - mddev->ro = 1; - break; - case 0: /* disassemble */ - case 2: /* stop */ - bitmap_flush(mddev); - md_super_wait(mddev); - if (mddev->ro) - set_disk_ro(disk, 0); + md_stop(mddev); + mddev->queue->merge_bvec_fn = NULL; + mddev->queue->unplug_fn = NULL; + mddev->queue->backing_dev_info.congested_fn = NULL; - mddev->pers->stop(mddev); - mddev->queue->merge_bvec_fn = NULL; - mddev->queue->unplug_fn = NULL; - mddev->queue->backing_dev_info.congested_fn = NULL; - module_put(mddev->pers->owner); - if (mddev->pers->sync_request && mddev->private == NULL) - mddev->private = (void*)1; - mddev->pers = NULL; - /* tell userspace to handle 'inactive' */ - sysfs_notify_dirent(mddev->sysfs_state); + /* tell userspace to handle 'inactive' */ + sysfs_notify_dirent(mddev->sysfs_state); - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } - set_capacity(disk, 0); - mddev->changed = 1; + set_capacity(disk, 0); + revalidate_disk(disk); - if (mddev->ro) - mddev->ro = 0; - } - if (!mddev->in_sync || mddev->flags) { - /* mark array as shutdown cleanly */ - mddev->in_sync = 1; - md_update_sb(mddev, 1); - } - if (mode == 1) - set_disk_ro(disk, 1); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (mddev->ro) + mddev->ro = 0; + err = 0; } -out: mutex_unlock(&mddev->open_mutex); if (err) return err; @@ -4586,52 +4741,12 @@ out: export_array(mddev); - mddev->array_sectors = 0; - mddev->external_size = 0; - mddev->dev_sectors = 0; - mddev->raid_disks = 0; - mddev->recovery_cp = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - mddev->reshape_position = MaxSector; - mddev->external = 0; - mddev->persistent = 0; - mddev->level = LEVEL_NONE; - mddev->clevel[0] = 0; - mddev->flags = 0; - mddev->ro = 0; |