diff options
Diffstat (limited to 'drivers/md/md.c')
| -rw-r--r-- | drivers/md/md.c | 5552 |
1 files changed, 3933 insertions, 1619 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index deeac4b4417..32fc19c540d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -32,53 +32,58 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/module.h> -#include <linux/kernel.h> #include <linux/kthread.h> -#include <linux/linkage.h> -#include <linux/raid/md.h> -#include <linux/raid/bitmap.h> +#include <linux/blkdev.h> #include <linux/sysctl.h> -#include <linux/buffer_head.h> /* for invalidate_bdev */ +#include <linux/seq_file.h> +#include <linux/fs.h> #include <linux/poll.h> -#include <linux/mutex.h> #include <linux/ctype.h> -#include <linux/freezer.h> - -#include <linux/init.h> - +#include <linux/string.h> +#include <linux/hdreg.h> +#include <linux/proc_fs.h> +#include <linux/random.h> +#include <linux/module.h> +#include <linux/reboot.h> #include <linux/file.h> - -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif - -#include <asm/unaligned.h> - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER - -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 - -#define DEBUG 0 -#define dprintk(x...) ((void)(DEBUG && printk(x))) - +#include <linux/compat.h> +#include <linux/delay.h> +#include <linux/raid/md_p.h> +#include <linux/raid/md_u.h> +#include <linux/slab.h> +#include "md.h" +#include "bitmap.h" #ifndef MODULE -static void autostart_arrays (int part); +static void autostart_arrays(int part); #endif +/* pers_list is a list of registered personalities protected + * by pers_lock. + * pers_lock does extra service to protect accesses to + * mddev->thread when the mutex cannot be held. + */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); static void md_print_devices(void); static DECLARE_WAIT_QUEUE_HEAD(resync_wait); +static struct workqueue_struct *md_wq; +static struct workqueue_struct *md_misc_wq; + +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this); #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* + * Default number of read corrections we'll attempt on an rdev + * before ejecting it from the array. We divide the read error + * count by 2 for every hour elapsed between read errors. + */ +#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. * Increase it if you want to have more _guaranteed_ speed. Note that @@ -93,13 +98,13 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; -static inline int speed_min(mddev_t *mddev) +static inline int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } -static inline int speed_max(mddev_t *mddev) +static inline int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; @@ -107,52 +112,77 @@ static inline int speed_max(mddev_t *mddev) static struct ctl_table_header *raid_table_header; -static ctl_table raid_table[] = { +static struct ctl_table raid_table[] = { { - .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; -static ctl_table raid_dir_table[] = { +static struct ctl_table raid_dir_table[] = { { - .ctl_name = DEV_RAID, .procname = "raid", .maxlen = 0, .mode = S_IRUGO|S_IXUGO, .child = raid_table, }, - { .ctl_name = 0 } + { } }; -static ctl_table raid_root_table[] = { +static struct ctl_table raid_root_table[] = { { - .ctl_name = CTL_DEV, .procname = "dev", .maxlen = 0, .mode = 0555, .child = raid_dir_table, }, - { .ctl_name = 0 } + { } }; -static struct block_device_operations md_fops; +static const struct block_device_operations md_fops; static int start_readonly; +/* bio_clone_mddev + * like bio_clone, but with a local bio set + */ + +struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, + struct mddev *mddev) +{ + struct bio *b; + + if (!mddev || !mddev->bio_set) + return bio_alloc(gfp_mask, nr_iovecs); + + b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); + if (!b) + return NULL; + return b; +} +EXPORT_SYMBOL_GPL(bio_alloc_mddev); + +struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, + struct mddev *mddev) +{ + if (!mddev || !mddev->bio_set) + return bio_clone(bio, gfp_mask); + + return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); +} +EXPORT_SYMBOL_GPL(bio_clone_mddev); + /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -165,7 +195,7 @@ static int start_readonly; */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; -void md_new_event(mddev_t *mddev) +void md_new_event(struct mddev *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); @@ -175,7 +205,7 @@ EXPORT_SYMBOL_GPL(md_new_event); /* Alternate version that can be called from interrupts * when calling sysfs_notify isn't needed. */ -static void md_new_event_inintr(mddev_t *mddev) +static void md_new_event_inintr(struct mddev *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); @@ -196,62 +226,321 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * Any code which breaks out of this loop while own * a reference to the current mddev and must mddev_put it. */ -#define for_each_mddev(mddev,tmp) \ +#define for_each_mddev(_mddev,_tmp) \ \ for (({ spin_lock(&all_mddevs_lock); \ - tmp = all_mddevs.next; \ - mddev = NULL;}); \ - ({ if (tmp != &all_mddevs) \ - mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ + _tmp = all_mddevs.next; \ + _mddev = NULL;}); \ + ({ if (_tmp != &all_mddevs) \ + mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ spin_unlock(&all_mddevs_lock); \ - if (mddev) mddev_put(mddev); \ - mddev = list_entry(tmp, mddev_t, all_mddevs); \ - tmp != &all_mddevs;}); \ + if (_mddev) mddev_put(_mddev); \ + _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ + _tmp != &all_mddevs;}); \ ({ spin_lock(&all_mddevs_lock); \ - tmp = tmp->next;}) \ + _tmp = _tmp->next;}) \ ) -static int md_fail_request (struct request_queue *q, struct bio *bio) +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. + */ +static void md_make_request(struct request_queue *q, struct bio *bio) { - bio_io_error(bio); - return 0; + const int rw = bio_data_dir(bio); + struct mddev *mddev = q->queuedata; + int cpu; + unsigned int sectors; + + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { + bio_io_error(bio); + return; + } + if (mddev->ro == 1 && unlikely(rw == WRITE)) { + bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); + return; + } + smp_rmb(); /* Ensure implications of 'active' are visible */ + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + + /* + * save the sectors now since our bio can + * go away inside make_request + */ + sectors = bio_sectors(bio); + mddev->pers->make_request(mddev, bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); + part_stat_unlock(); + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); +} + +/* mddev_suspend makes sure no new requests are submitted + * to the device, and that any requests that have been submitted + * are completely handled. + * Once ->stop is called and completes, the module will be completely + * unused. + */ +void mddev_suspend(struct mddev *mddev) +{ + BUG_ON(mddev->suspended); + mddev->suspended = 1; + synchronize_rcu(); + wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + mddev->pers->quiesce(mddev, 1); + + del_timer_sync(&mddev->safemode_timer); +} +EXPORT_SYMBOL_GPL(mddev_suspend); + +void mddev_resume(struct mddev *mddev) +{ + mddev->suspended = 0; + wake_up(&mddev->sb_wait); + mddev->pers->quiesce(mddev, 0); + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ +} +EXPORT_SYMBOL_GPL(mddev_resume); + +int mddev_congested(struct mddev *mddev, int bits) +{ + return mddev->suspended; } +EXPORT_SYMBOL(mddev_congested); + +/* + * Generic flush handling for md + */ + +static void md_end_flush(struct bio *bio, int err) +{ + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; + + rdev_dec_pending(rdev, mddev); + + if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pre-request flush has finished */ + queue_work(md_wq, &mddev->flush_work); + } + bio_put(bio); +} + +static void md_submit_flush_data(struct work_struct *ws); -static inline mddev_t *mddev_get(mddev_t *mddev) +static void submit_flushes(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct md_rdev *rdev; + + INIT_WORK(&mddev->flush_work, md_submit_flush_data); + atomic_set(&mddev->flush_pending, 1); + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) { + /* Take two references, one is dropped + * when request finishes, one after + * we reclaim rcu_read_lock + */ + struct bio *bi; + atomic_inc(&rdev->nr_pending); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); + bi->bi_end_io = md_end_flush; + bi->bi_private = rdev; + bi->bi_bdev = rdev->bdev; + atomic_inc(&mddev->flush_pending); + submit_bio(WRITE_FLUSH, bi); + rcu_read_lock(); + rdev_dec_pending(rdev, mddev); + } + rcu_read_unlock(); + if (atomic_dec_and_test(&mddev->flush_pending)) + queue_work(md_wq, &mddev->flush_work); +} + +static void md_submit_flush_data(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct bio *bio = mddev->flush_bio; + + if (bio->bi_iter.bi_size == 0) + /* an empty barrier - all done */ + bio_endio(bio, 0); + else { + bio->bi_rw &= ~REQ_FLUSH; + mddev->pers->make_request(mddev, bio); + } + + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); +} + +void md_flush_request(struct mddev *mddev, struct bio *bio) +{ + spin_lock_irq(&mddev->write_lock); + wait_event_lock_irq(mddev->sb_wait, + !mddev->flush_bio, + mddev->write_lock); + mddev->flush_bio = bio; + spin_unlock_irq(&mddev->write_lock); + + INIT_WORK(&mddev->flush_work, submit_flushes); + queue_work(md_wq, &mddev->flush_work); +} +EXPORT_SYMBOL(md_flush_request); + +void md_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct mddev *mddev = cb->data; + md_wakeup_thread(mddev->thread); + kfree(cb); +} +EXPORT_SYMBOL(md_unplug); + +static inline struct mddev *mddev_get(struct mddev *mddev) { atomic_inc(&mddev->active); return mddev; } -static void mddev_put(mddev_t *mddev) +static void mddev_delayed_delete(struct work_struct *ws); + +static void mddev_put(struct mddev *mddev) { + struct bio_set *bs = NULL; + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; - if (!mddev->raid_disks && list_empty(&mddev->disks)) { - list_del(&mddev->all_mddevs); - spin_unlock(&all_mddevs_lock); - blk_cleanup_queue(mddev->queue); - kobject_put(&mddev->kobj); - } else - spin_unlock(&all_mddevs_lock); + if (!mddev->raid_disks && list_empty(&mddev->disks) && + mddev->ctime == 0 && !mddev->hold_active) { + /* Array is not configured at all, and not held active, + * so destroy it */ + list_del_init(&mddev->all_mddevs); + bs = mddev->bio_set; + mddev->bio_set = NULL; + if (mddev->gendisk) { + /* We did a probe so need to clean up. Call + * queue_work inside the spinlock so that + * flush_workqueue() after mddev_find will + * succeed in waiting for the work to be done. + */ + INIT_WORK(&mddev->del_work, mddev_delayed_delete); + queue_work(md_misc_wq, &mddev->del_work); + } else + kfree(mddev); + } + spin_unlock(&all_mddevs_lock); + if (bs) + bioset_free(bs); +} + +void mddev_init(struct mddev *mddev) +{ + mutex_init(&mddev->open_mutex); + mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->bitmap_info.mutex); + INIT_LIST_HEAD(&mddev->disks); + INIT_LIST_HEAD(&mddev->all_mddevs); + init_timer(&mddev->safemode_timer); + atomic_set(&mddev->active, 1); + atomic_set(&mddev->openers, 0); + atomic_set(&mddev->active_io, 0); + spin_lock_init(&mddev->write_lock); + atomic_set(&mddev->flush_pending, 0); + init_waitqueue_head(&mddev->sb_wait); + init_waitqueue_head(&mddev->recovery_wait); + mddev->reshape_position = MaxSector; + mddev->reshape_backwards = 0; + mddev->last_sync_action = "none"; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->level = LEVEL_NONE; } +EXPORT_SYMBOL_GPL(mddev_init); -static mddev_t * mddev_find(dev_t unit) +static struct mddev * mddev_find(dev_t unit) { - mddev_t *mddev, *new = NULL; + struct mddev *mddev, *new = NULL; + + if (unit && MAJOR(unit) != MD_MAJOR) + unit &= ~((1<<MdpMinorShift)-1); retry: spin_lock(&all_mddevs_lock); - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == unit) { - mddev_get(mddev); + + if (unit) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + kfree(new); + return mddev; + } + + if (new) { + list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; + new->hold_active = UNTIL_IOCTL; + return new; } - - if (new) { + } else if (new) { + /* find an unused unit number */ + static int next_minor = 512; + int start = next_minor; + int is_free = 0; + int dev = 0; + while (!is_free) { + dev = MKDEV(MD_MAJOR, next_minor); + next_minor++; + if (next_minor > MINORMASK) + next_minor = 0; + if (next_minor == start) { + /* Oh dear, all in use. */ + spin_unlock(&all_mddevs_lock); + kfree(new); + return NULL; + } + + is_free = 1; + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == dev) { + is_free = 0; + break; + } + } + new->unit = dev; + new->md_minor = MINOR(dev); + new->hold_active = UNTIL_STOP; list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); return new; @@ -268,77 +557,126 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; - mutex_init(&new->reconfig_mutex); - INIT_LIST_HEAD(&new->disks); - INIT_LIST_HEAD(&new->all_mddevs); - init_timer(&new->safemode_timer); - atomic_set(&new->active, 1); - atomic_set(&new->openers, 0); - spin_lock_init(&new->write_lock); - init_waitqueue_head(&new->sb_wait); - init_waitqueue_head(&new->recovery_wait); - new->reshape_position = MaxSector; - new->resync_min = 0; - new->resync_max = MaxSector; - new->level = LEVEL_NONE; - - new->queue = blk_alloc_queue(GFP_KERNEL); - if (!new->queue) { - kfree(new); - return NULL; - } - /* Can be unlocked because the queue is new: no concurrency */ - queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue); - - blk_queue_make_request(new->queue, md_fail_request); + mddev_init(new); goto retry; } -static inline int mddev_lock(mddev_t * mddev) +static inline int __must_check mddev_lock(struct mddev * mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); } -static inline int mddev_trylock(mddev_t * mddev) +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev * mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + +static inline int mddev_is_locked(struct mddev *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); +} + +static inline int mddev_trylock(struct mddev * mddev) { return mutex_trylock(&mddev->reconfig_mutex); } -static inline void mddev_unlock(mddev_t * mddev) +static struct attribute_group md_redundancy_group; + +static void mddev_unlock(struct mddev * mddev) { - mutex_unlock(&mddev->reconfig_mutex); + if (mddev->to_remove) { + /* These cannot be removed under reconfig_mutex as + * an access to the files will try to take reconfig_mutex + * while holding the file unremovable, which leads to + * a deadlock. + * So hold set sysfs_active while the remove in happeing, + * and anything else which might set ->to_remove or my + * otherwise change the sysfs namespace will fail with + * -EBUSY if sysfs_active is still set. + * We set sysfs_active under reconfig_mutex and elsewhere + * test it under the same mutex to ensure its correct value + * is seen. + */ + struct attribute_group *to_remove = mddev->to_remove; + mddev->to_remove = NULL; + mddev->sysfs_active = 1; + mutex_unlock(&mddev->reconfig_mutex); + + if (mddev->kobj.sd) { + if (to_remove != &md_redundancy_group) + sysfs_remove_group(&mddev->kobj, to_remove); + if (mddev->pers == NULL || + mddev->pers->sync_request == NULL) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + } + } + mddev->sysfs_active = 0; + } else + mutex_unlock(&mddev->reconfig_mutex); + /* As we've dropped the mutex we need a spinlock to + * make sure the thread doesn't disappear + */ + spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + spin_unlock(&pers_lock); } -static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) { - mdk_rdev_t * rdev; - struct list_head *tmp; + struct md_rdev *rdev; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) if (rdev->desc_nr == nr) return rdev; - } + return NULL; } -static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) +static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) { - struct list_head *tmp; - mdk_rdev_t *rdev; + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} + +static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; - } + return NULL; } -static struct mdk_personality *find_pers(int level, char *clevel) +static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) { - struct mdk_personality *pers; + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + +static struct md_personality *find_pers(int level, char *clevel) +{ + struct md_personality *pers; list_for_each_entry(pers, &pers_list, list) { if (level != LEVEL_NONE && pers->level == level) return pers; @@ -349,22 +687,13 @@ static struct mdk_personality *find_pers(int level, char *clevel) } /* return the offset of the super block in 512byte sectors */ -static inline sector_t calc_dev_sboffset(struct block_device *bdev) +static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) { - sector_t num_sectors = bdev->bd_inode->i_size / 512; + sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; return MD_NEW_SIZE_SECTORS(num_sectors); } -static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) -{ - sector_t num_sectors = rdev->sb_start; - - if (chunk_size) - num_sectors &= ~((sector_t)chunk_size/512 - 1); - return num_sectors; -} - -static int alloc_disk_sb(mdk_rdev_t * rdev) +static int alloc_disk_sb(struct md_rdev * rdev) { if (rdev->sb_page) MD_BUG(); @@ -378,22 +707,28 @@ static int alloc_disk_sb(mdk_rdev_t * rdev) return 0; } -static void free_disk_sb(mdk_rdev_t * rdev) +void md_rdev_clear(struct md_rdev *rdev) { if (rdev->sb_page) { put_page(rdev->sb_page); rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_start = 0; - rdev->size = 0; + rdev->sectors = 0; + } + if (rdev->bb_page) { + put_page(rdev->bb_page); + rdev->bb_page = NULL; } + kfree(rdev->badblocks.page); + rdev->badblocks.page = NULL; } - +EXPORT_SYMBOL_GPL(md_rdev_clear); static void super_written(struct bio *bio, int error) { - mdk_rdev_t *rdev = bio->bi_private; - mddev_t *mddev = rdev->mddev; + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { printk("md: super_written gets error=%d, uptodate=%d\n", @@ -407,32 +742,7 @@ static void super_written(struct bio *bio, int error) bio_put(bio); } -static void super_written_barrier(struct bio *bio, int error) -{ - struct bio *bio2 = bio->bi_private; - mdk_rdev_t *rdev = bio2->bi_private; - mddev_t *mddev = rdev->mddev; - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && - error == -EOPNOTSUPP) { - unsigned long flags; - /* barriers don't appear to be supported :-( */ - set_bit(BarriersNotsupp, &rdev->flags); - mddev->barriers_work = 0; - spin_lock_irqsave(&mddev->write_lock, flags); - bio2->bi_next = mddev->biolist; - mddev->biolist = bio2; - spin_unlock_irqrestore(&mddev->write_lock, flags); - wake_up(&mddev->sb_wait); - bio_put(bio); - } else { - bio_put(bio2); - bio->bi_private = rdev; - super_written(bio, error); - } -} - -void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, +void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page) { /* write first size bytes of page to sector of rdev @@ -440,78 +750,50 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error - * - * As we might need to resubmit the request if BIO_RW_BARRIER - * causes ENOTSUPP, we allocate a spare bio... */ - struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); - bio->bi_bdev = rdev->bdev; - bio->bi_sector = sector; + bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; + bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio->bi_rw = rw; atomic_inc(&mddev->pending_writes); - if (!test_bit(BarriersNotsupp, &rdev->flags)) { - struct bio *rbio; - rw |= (1<<BIO_RW_BARRIER); - rbio = bio_clone(bio, GFP_NOIO); - rbio->bi_private = bio; - rbio->bi_end_io = super_written_barrier; - submit_bio(rw, rbio); - } else - submit_bio(rw, bio); + submit_bio(WRITE_FLUSH_FUA, bio); } -void md_super_wait(mddev_t *mddev) +void md_super_wait(struct mddev *mddev) { - /* wait for all superblock writes that were scheduled to complete. - * if any had to be retried (due to BARRIER problems), retry them - */ + /* wait for all superblock writes that were scheduled to complete */ DEFINE_WAIT(wq); for(;;) { prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); if (atomic_read(&mddev->pending_writes)==0) break; - while (mddev->biolist) { - struct bio *bio; - spin_lock_irq(&mddev->write_lock); - bio = mddev->biolist; - mddev->biolist = bio->bi_next ; - bio->bi_next = NULL; - spin_unlock_irq(&mddev->write_lock); - submit_bio(bio->bi_rw, bio); - } schedule(); } finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); -} - -int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw) +int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op) { - struct bio *bio = bio_alloc(GFP_NOIO, 1); - struct completion event; + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); int ret; - rw |= (1 << BIO_RW_SYNC); - - bio->bi_bdev = bdev; - bio->bi_sector = sector; + bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? + rdev->meta_bdev : rdev->bdev; + if (metadata_op) + bio->bi_iter.bi_sector = sector + rdev->sb_start; + else if (rdev->mddev->reshape_position != MaxSector && + (rdev->mddev->reshape_backwards == + (sector >= rdev->mddev->reshape_position))) + bio->bi_iter.bi_sector = sector + rdev->new_data_offset; + else + bio->bi_iter.bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, &bio->bi_flags); bio_put(bio); @@ -519,7 +801,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, } EXPORT_SYMBOL_GPL(sync_page_io); -static int read_disk_sb(mdk_rdev_t * rdev, int size) +static int read_disk_sb(struct md_rdev * rdev, int size) { char b[BDEVNAME_SIZE]; if (!rdev->sb_page) { @@ -530,7 +812,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) goto fail; rdev->sb_loaded = 1; return 0; @@ -626,7 +908,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) * We rely on user-space to write the initial superblock, and support * reading and updating of superblocks. * Interface methods are: - * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) + * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) * loads and validates a superblock on dev. * if refdev != NULL, compare superblocks on both devices * Return: @@ -636,13 +918,13 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) * -EINVAL superblock incompatible or invalid * -othererror e.g. -EIO * - * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) + * int validate_super(struct mddev *mddev, struct md_rdev *dev) * Verify that dev is acceptable into mddev. * The first time, mddev->raid_disks will be 0, and data from * dev should be merged in. Subsequent calls check that dev * is new enough. Return 0 or -EINVAL * - * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) + * void sync_super(struct mddev *mddev, struct md_rdev *dev) * Update the superblock for rdev with data in mddev * This does not write to disc. * @@ -651,18 +933,41 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) struct super_type { char *name; struct module *owner; - int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, + int (*load_super)(struct md_rdev *rdev, + struct md_rdev *refdev, int minor_version); - int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); - void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); - unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, + int (*validate_super)(struct mddev *mddev, + struct md_rdev *rdev); + void (*sync_super)(struct mddev *mddev, + struct md_rdev *rdev); + unsigned long long (*rdev_size_change)(struct md_rdev *rdev, sector_t num_sectors); + int (*allow_new_offset)(struct md_rdev *rdev, + unsigned long long new_offset); }; /* + * Check that the given mddev has no bitmap. + * + * This function is called from the run method of all personalities that do not + * support bitmaps. It prints an error message and returns non-zero if mddev + * has a bitmap. Otherwise, it returns 0. + * + */ +int md_check_no_bitmap(struct mddev *mddev) +{ + if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) + return 0; + printk(KERN_ERR "%s: bitmaps are not supported for %s\n", + mdname(mddev), mddev->pers->name); + return 1; +} +EXPORT_SYMBOL(md_check_no_bitmap); + +/* * load_super for 0.90.0 */ -static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; @@ -674,7 +979,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version * * It also happens to be a multiple of 4Kb. */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; @@ -682,7 +987,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = -EINVAL; bdevname(rdev->bdev, b); - sb = (mdp_super_t*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { printk(KERN_ERR "md: invalid raid superblock magic on %s\n", @@ -710,18 +1015,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->new_data_offset = 0; rdev->sb_size = MD_SB_BYTES; - - if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { - if (sb->level != 1 && sb->level != 4 - && sb->level != 5 && sb->level != 6 - && sb->level != 10) { - /* FIXME use a better test */ - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - goto abort; - } - } + rdev->badblocks.shift = -1; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -732,7 +1028,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = 1; } else { __u64 ev1, ev2; - mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); + mdp_super_t *refsb = page_address(refdev->sb_page); if (!uuid_equal(refsb, sb)) { printk(KERN_WARNING "md: %s has different UUID to %s\n", b, bdevname(refdev->bdev,b2)); @@ -751,9 +1047,15 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; + rdev->sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * (not needed for Linear and RAID0 as metadata doesn't + * record this size) + */ + if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) + rdev->sectors = (2ULL << 32) - 2; - if (rdev->size < sb->size && sb->level > 1) + if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ ret = -EINVAL; @@ -764,47 +1066,53 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version /* * validate_super for 0.90.0 */ -static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) { mdp_disk_t *desc; - mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + mdp_super_t *sb = page_address(rdev->sb_page); __u64 ev1 = md_event(sb); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; mddev->external = 0; - mddev->chunk_size = sb->chunk_size; + mddev->chunk_sectors = sb->chunk_size >> 9; mddev->ctime = sb->ctime; mddev->utime = sb->utime; mddev->level = sb->level; mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; - mddev->size = sb->size; + mddev->dev_sectors = ((sector_t)sb->size) * 2; mddev->events = ev1; - mddev->bitmap_offset = 0; - mddev->default_bitmap_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* bitmap can use 60 K after the 4K superblocks */ + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); + mddev->reshape_backwards = 0; if (mddev->minor_version >= 91) { mddev->reshape_position = sb->reshape_position; mddev->delta_disks = sb->delta_disks; mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; - mddev->new_chunk = sb->new_chunk; + mddev->new_chunk_sectors = sb->new_chunk >> 9; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; } if (sb->state & (1<<MD_SB_CLEAN)) @@ -825,20 +1133,29 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && - mddev->bitmap_file == NULL) - mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->bitmap_info.file == NULL) { + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.default_space; + } } else if (mddev->pers == NULL) { - /* Insist on good event counter while assembling */ + /* Insist on good event counter while assembling, except + * for spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (sb->disks[rdev->desc_nr].state & ( + (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) + if (ev1 < mddev->events) + return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -854,6 +1171,15 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) desc->raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; + rdev->saved_raid_disk = desc->raid_disk; + } else if (desc->state & (1<<MD_DISK_ACTIVE)) { + /* active but not in sync implies recovery up to + * reshape position. We don't know exactly where + * that is, so set to zero for now */ + if (mddev->minor_version >= 91) { + rdev->recovery_offset = 0; + rdev->raid_disk = desc->raid_disk; + } } if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); @@ -865,11 +1191,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) /* * sync_super for 0.90.0 */ -static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) +static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) { mdp_super_t *sb; - struct list_head *tmp; - mdk_rdev_t *rdev2; + struct md_rdev *rdev2; int next_spare = mddev->raid_disks; @@ -888,7 +1213,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) rdev->sb_size = MD_SB_BYTES; - sb = (mdp_super_t*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); @@ -903,7 +1228,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->ctime = mddev->ctime; sb->level = mddev->level; - sb->size = mddev->size; + sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; @@ -920,7 +1245,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_level = mddev->new_level; sb->delta_disks = mddev->delta_disks; sb->new_layout = mddev->new_layout; - sb->new_chunk = mddev->new_chunk; + sb->new_chunk = mddev->new_chunk_sectors << 9; } mddev->minor_version = sb->minor_version; if (mddev->in_sync) @@ -934,17 +1259,28 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->recovery_cp = 0; sb->layout = mddev->layout; - sb->chunk_size = mddev->chunk_size; + sb->chunk_size = mddev->chunk_sectors << 9; - if (mddev->bitmap && mddev->bitmap_file == NULL) + if (mddev->bitmap && mddev->bitmap_info.file == NULL) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); - rdev_for_each(rdev2, tmp, mddev) { + rdev_for_each(rdev2, mddev) { mdp_disk_t *d; int desc_nr; - if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) - && !test_bit(Faulty, &rdev2->flags)) + int is_active = test_bit(In_sync, &rdev2->flags); + + if (rdev2->raid_disk >= 0 && + sb->minor_version >= 91) + /* we have nowhere to store the recovery_offset, + * but if it is not below the reshape_position, + * we can piggy-back on that. + */ + is_active = 1; + if (rdev2->raid_disk < 0 || + test_bit(Faulty, &rdev2->flags)) + is_active = 0; + if (is_active) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; @@ -954,16 +1290,16 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); - if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) - && !test_bit(Faulty, &rdev2->flags)) + if (is_active) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ if (test_bit(Faulty, &rdev2->flags)) d->state = (1<<MD_DISK_FAULTY); - else if (test_bit(In_sync, &rdev2->flags)) { + else if (is_active) { d->state = (1<<MD_DISK_ACTIVE); - d->state |= (1<<MD_DISK_SYNC); + if (test_bit(In_sync, &rdev2->flags)) + d->state |= (1<<MD_DISK_SYNC); active++; working++; } else { @@ -999,21 +1335,32 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) * rdev_size_change for 0.90.0 */ static unsigned long long -super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) +super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ - if (rdev->mddev->bitmap_offset) + if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * 4TB == 2^32 KB, or 2*2^32 sectors. + */ + if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) + num_sectors = (2ULL << 32) - 2; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); - return num_sectors / 2; /* kB for sysfs */ + return num_sectors; } +static int +super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) +{ + /* non-zero offset changes not possible with v0.90 */ + return new_offset == 0; +} /* * version 1 superblock @@ -1026,12 +1373,11 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; __le32 *isuper = (__le32*)sb; - int i; disk_csum = sb->sb_csum; sb->sb_csum = 0; newcsum = 0; - for (i=0; size>=4; size -= 4 ) + for (; size >= 4; size -= 4) newcsum += le32_to_cpu(*isuper++); if (size == 2) @@ -1042,11 +1388,14 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) return cpu_to_le32(csum); } -static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged); +static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; int ret; sector_t sb_start; + sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; @@ -1060,7 +1409,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) */ switch(minor_version) { case 0: - sb_start = rdev->bdev->bd_inode->i_size >> 9; + sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; sb_start -= 8*2; sb_start &= ~(sector_t)(4*2-1); break; @@ -1082,7 +1431,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (ret) return ret; - sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || sb->major_version != cpu_to_le32(1) || @@ -1101,42 +1450,83 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) bdevname(rdev->bdev,b)); return -EINVAL; } - if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { - if (sb->level != cpu_to_le32(1) && - sb->level != cpu_to_le32(4) && - sb->level != cpu_to_le32(5) && - sb->level != cpu_to_le32(6) && - sb->level != cpu_to_le32(10)) { - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - return -EINVAL; - } - } + if (sb->pad0 || + sb->pad3[0] || + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) + /* Some padding is non-zero, might be a new feature */ + return -EINVAL; rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->new_data_offset = rdev->data_offset; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; - bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; if (minor_version && rdev->data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; + if (minor_version + && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) + return -EINVAL; if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) rdev->desc_nr = -1; else rdev->desc_nr = le32_to_cpu(sb->dev_number); + if (!rdev->bb_page) { + rdev->bb_page = alloc_page(GFP_KERNEL); + if (!rdev->bb_page) + return -ENOMEM; + } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && + rdev->badblocks.count == 0) { + /* need to load the bad block list. + * Currently we limit it to one page. + */ + s32 offset; + sector_t bb_sector; + u64 *bbp; + int i; + int sectors = le16_to_cpu(sb->bblog_size); + if (sectors > (PAGE_SIZE / 512)) + return -EINVAL; + offset = le32_to_cpu(sb->bblog_offset); + if (offset == 0) + return -EINVAL; + bb_sector = (long long)offset; + if (!sync_page_io(rdev, bb_sector, sectors << 9, + rdev->bb_page, READ, true)) + return -EIO; + bbp = (u64 *)page_address(rdev->bb_page); + rdev->badblocks.shift = sb->bblog_shift; + for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { + u64 bb = le64_to_cpu(*bbp); + int count = bb & (0x3ff); + u64 sector = bb >> 10; + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + if (bb + 1 == 0) + break; + if (md_set_badblocks(&rdev->badblocks, + sector, count, 1) == 0) + return -EINVAL; + } + } else if (sb->bblog_offset != 0) + rdev->badblocks.shift = 0; + if (!refdev) { ret = 1; } else { __u64 ev1, ev2; - struct mdp_superblock_1 *refsb = - (struct mdp_superblock_1*)page_address(refdev->sb_page); + struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || sb->level != refsb->level || @@ -1156,82 +1546,110 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) else ret = 0; } - if (minor_version) - rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; - else - rdev->size = rdev->sb_start / 2; - if (rdev->size < le64_to_cpu(sb->data_size)/2) - return -EINVAL; - rdev->size = le64_to_cpu(sb->data_size)/2; - if (le32_to_cpu(sb->chunksize)) - rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); - - if (le64_to_cpu(sb->size) > rdev->size*2) + if (minor_version) { + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); + sectors -= rdev->data_offset; + } else + sectors = rdev->sb_start; + if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; + rdev->sectors = le64_to_cpu(sb->data_size); return ret; } -static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) +static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) { - struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; mddev->external = 0; - mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; + mddev->chunk_sectors = le32_to_cpu(sb->chunksize); mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); - mddev->size = le64_to_cpu(sb->size)/2; + mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; - mddev->bitmap_offset = 0; - mddev->default_bitmap_offset = 1024 >> 9; - + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* Default location for bitmap is 1K after superblock + * using 3K - total of 4K + */ + mddev->bitmap_info.default_offset = 1024 >> 9; + mddev->bitmap_info.default_space = (4096-1024) >> 9; + mddev->reshape_backwards = 0; + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && - mddev->bitmap_file == NULL ) - mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); + mddev->bitmap_info.file == NULL) { + mddev->bitmap_info.offset = + (__s32)le32_to_cpu(sb->bitmap_offset); + /* Metadata doesn't record how much space is available. + * For 1.0, we assume we can use up to the superblock + * if before, else to 4K beyond superblock. + * For others, assume no change is possible. + */ + if (mddev->minor_version > 0) + mddev->bitmap_info.space = 0; + else if (mddev->bitmap_info.offset > 0) + mddev->bitmap_info.space = + 8 - mddev->bitmap_info.offset; + else + mddev->bitmap_info.space = + -mddev->bitmap_info.offset; + } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); mddev->delta_disks = le32_to_cpu(sb->delta_disks); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); - mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; + mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); + if (mddev->delta_disks < 0 || + (mddev->delta_disks == 0 && + (le32_to_cpu(sb->feature_map) + & MD_FEATURE_RESHAPE_BACKWARDS))) + mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; } } else if (mddev->pers == NULL) { - /* Insist of good event counter while assembling */ + /* Insist of good event counter while assembling, except for + * spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + if (ev1 < mddev->events) + return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -1239,7 +1657,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } if (mddev->level != LEVEL_MULTIPATH) { int role; - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = 0xffff; + rdev->desc_nr = -1; + } else + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ break; @@ -1247,37 +1670,40 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); break; default: + rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_OFFSET)) + MD_FEATURE_RECOVERY_OFFSET)) { rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); - else + if (!(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_BITMAP)) + rdev->saved_raid_disk = -1; + } else set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; } if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); return 0; } -static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) +static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb; - struct list_head *tmp; - mdk_rdev_t *rdev2; + struct md_rdev *rdev2; int max_dev, i; /* make rdev->sb match mddev and rdev data. */ - sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); - memset(sb->pad1, 0, sizeof(sb->pad1)); - memset(sb->pad2, 0, sizeof(sb->pad2)); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); @@ -1290,19 +1716,36 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); - sb->size = cpu_to_le64(mddev->size<<1); + sb->size = cpu_to_le64(mddev->dev_sectors); + sb->chunksize = cpu_to_le32(mddev->chunk_sectors); + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + + if (test_bit(WriteMostly, &rdev->flags)) + sb->devflags |= WriteMostly1; + else + sb->devflags &= ~WriteMostly1; + sb->data_offset = cpu_to_le64(rdev->data_offset); + sb->data_size = cpu_to_le64(rdev->sectors); - if (mddev->bitmap && mddev->bitmap_file == NULL) { - sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); + if (mddev->bitmap && mddev->bitmap_info.file == NULL) { + sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset > 0) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); - sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); - } + !test_bit(In_sync, &rdev->flags)) { + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); + if (rdev->saved_raid_disk >= 0 && mddev->bitmap) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); + } + if (test_bit(Replacement, &rdev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_REPLACEMENT); if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); @@ -1310,26 +1753,78 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_layout = cpu_to_le32(mddev->new_layout); sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); - sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); + sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); + if (mddev->delta_disks == 0 && + mddev->reshape_backwards) + sb->feature_map + |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + if (rdev->new_data_offset != rdev->data_offset) { + sb->feature_map + |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); + sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset + - rdev->data_offset)); + } + } + + if (rdev->badblocks.count == 0) + /* Nothing to do for bad blocks*/ ; + else if (sb->bblog_offset == 0) + /* Cannot record bad blocks on this device */ + md_error(mddev, rdev); + else { + struct badblocks *bb = &rdev->badblocks; + u64 *bbp = (u64 *)page_address(rdev->bb_page); + u64 *p = bb->page; + sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); + if (bb->changed) { + unsigned seq; + +retry: + seq = read_seqbegin(&bb->lock); + + memset(bbp, 0xff, PAGE_SIZE); + + for (i = 0 ; i < bb->count ; i++) { + u64 internal_bb = p[i]; + u64 store_bb = ((BB_OFFSET(internal_bb) << 10) + | BB_LEN(internal_bb)); + bbp[i] = cpu_to_le64(store_bb); + } + bb->changed = 0; + if (read_seqretry(&bb->lock, seq)) + goto retry; + + bb->sector = (rdev->sb_start + + (int)le32_to_cpu(sb->bblog_offset)); + bb->size = le16_to_cpu(sb->bblog_size); + } } max_dev = 0; - rdev_for_each(rdev2, tmp, mddev) + rdev_for_each(rdev2, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; - if (max_dev > le32_to_cpu(sb->max_dev)) + if (max_dev > le32_to_cpu(sb->max_dev)) { + int bmask; sb->max_dev = cpu_to_le32(max_dev); + rdev->sb_size = max_dev * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + } else + max_dev = le32_to_cpu(sb->max_dev); + for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); - rdev_for_each(rdev2, tmp, mddev) { + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); - else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) + else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(0xffff); @@ -1339,39 +1834,75 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } static unsigned long long -super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) +super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ + if (rdev->data_offset != rdev->new_data_offset) + return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ - max_sectors = rdev->bdev->bd_inode->i_size >> 9; + max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; max_sectors -= rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; - } else if (rdev->mddev->bitmap_offset) { + } else if (rdev->mddev->bitmap_info.offset) { /* minor version 0 with bitmap we can't move */ return 0; } else { /* minor version 0; superblock after data */ sector_t sb_start; - sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; + sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; + max_sectors = rdev->sectors + sb_start - rdev->sb_start; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; } - sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); sb->super_offset = rdev->sb_start; sb->sb_csum = calc_sb_1_csum(sb); md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); - return num_sectors / 2; /* kB for sysfs */ + return num_sectors; + +} + +static int +super_1_allow_new_offset(struct md_rdev *rdev, + unsigned long long new_offset) +{ + /* All necessary checks on new >= old have been done */ + struct bitmap *bitmap; + if (new_offset >= rdev->data_offset) + return 1; + + /* with 1.0 metadata, there is no metadata to tread on + * so we can always move back */ + if (rdev->mddev->minor_version == 0) + return 1; + + /* otherwise we must be sure not to step on + * any metadata, so stay: + * 36K beyond start of superblock + * beyond end of badblocks + * beyond write-intent bitmap + */ + if (rdev->sb_start + (32+4)*2 > new_offset) + return 0; + bitmap = rdev->mddev->bitmap; + if (bitmap && !rdev->mddev->bitmap_info.file && + rdev->sb_start + rdev->mddev->bitmap_info.offset + + bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) + return 0; + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) + return 0; + + return 1; } static struct super_type super_types[] = { @@ -1382,6 +1913,7 @@ static struct super_type super_types[] = { .validate_super = super_90_validate, .sync_super = super_90_sync, .rdev_size_change = super_90_rdev_size_change, + .allow_new_offset = super_90_allow_new_offset, }, [1] = { .name = "md-1", @@ -1390,12 +1922,25 @@ static struct super_type super_types[] = { .validate_super = super_1_validate, .sync_super = super_1_sync, .rdev_size_change = super_1_rdev_size_change, + .allow_new_offset = super_1_allow_new_offset, }, }; -static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +static void sync_super(struct mddev *mddev, struct md_rdev *rdev) { - mdk_rdev_t *rdev, *rdev2; + if (mddev->sync_super) { + mddev->sync_super(mddev, rdev); + return; + } + + BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); + + super_types[mddev->major_version].sync_super(mddev, rdev); +} + +static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) +{ + struct md_rdev *rdev, *rdev2; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev1) @@ -1411,7 +1956,84 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); -static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +/* + * Try to register data integrity profile for an mddev + * + * This is called when an array is started and after a disk has been kicked + * from the array. It only succeeds if all working and active component devices + * are integrity capable with matching profiles. + */ +int md_integrity_register(struct mddev *mddev) +{ + struct md_rdev *rdev, *reference = NULL; + + if (list_empty(&mddev->disks)) + return 0; /* nothing to do */ + if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register, or already is */ + rdev_for_each(rdev, mddev) { + /* skip spares and non-functional disks */ + if (test_bit(Faulty, &rdev->flags)) + continue; + if (rdev->raid_disk < 0) + continue; + if (!reference) { + /* Use the first rdev as the reference */ + reference = rdev; + continue; + } + /* does this rdev's profile match the reference profile? */ + if (blk_integrity_compare(reference->bdev->bd_disk, + rdev->bdev->bd_disk) < 0) + return -EINVAL; + } + if (!reference || !bdev_get_integrity(reference->bdev)) + return 0; + /* + * All component devices are integrity capable and have matching + * profiles, register the common profile for the md device. + */ + if (blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)) != 0) { + printk(KERN_ERR "md: failed to register integrity for %s\n", + mdname(mddev)); + return -EINVAL; + } + printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); + if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { + printk(KERN_ERR "md: failed to create integrity pool for %s\n", + mdname(mddev)); + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(md_integrity_register); + +/* Disable data integrity if non-capable/non-matching disk is being added */ +void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + struct blk_integrity *bi_rdev; + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return; + + bi_rdev = bdev_get_integrity(rdev->bdev); + bi_mddev = blk_get_integrity(mddev->gendisk); + + if (!bi_mddev) /* nothing to do */ + return; + if (rdev->raid_disk < 0) /* skip spares */ + return; + if (bi_rdev && blk_integrity_compare(mddev->gendisk, + rdev->bdev->bd_disk) >= 0) + return; + printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); + blk_integrity_unregister(mddev->gendisk); +} +EXPORT_SYMBOL(md_integrity_add_rdev); + +static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) { char b[BDEVNAME_SIZE]; struct kobject *ko; @@ -1427,8 +2049,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - /* make sure rdev->size exceeds mddev->size */ - if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { + /* make sure rdev->sectors exceeds mddev->dev_sectors */ + if (rdev->sectors && (mddev->dev_sectors == 0 || + rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -1437,7 +2060,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (mddev->level > 0) return -ENOSPC; } else - mddev->size = rdev->size; + mddev->dev_sectors = rdev->sectors; } /* Verify rdev->desc_nr is unique. @@ -1454,6 +2077,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev_nr(mddev, rdev->desc_nr)) return -EBUSY; } + if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + printk(KERN_WARNING "md: %s: array is limited to %d devices\n", + mdname(mddev), mddev->max_disks); + return -EBUSY; + } bdevname(rdev->bdev,b); while ( (s=strchr(b, '/')) != NULL) *s = '!'; @@ -1464,16 +2092,17 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; - if (rdev->bdev->bd_part) - ko = &rdev->bdev->bd_part->dev.kobj; - else - ko = &rdev->bdev->bd_disk->dev.kobj; - if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { - kobject_del(&rdev->kobj); - goto fail; - } + ko = &part_to_dev(rdev->bdev->bd_part)->kobj; + if (sysfs_create_link(&rdev->kobj, ko, "block")) + /* failure here is OK */; + rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + list_add_rcu(&rdev->same_set, &mddev->disks); - bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); + bd_link_disk_holder(rdev->bdev, mddev->gendisk); + + /* May as well allow recovery to be retried once */ + mddev->recovery_disabled++; + return 0; fail: @@ -1484,24 +2113,26 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) static void md_delayed_delete(struct work_struct *ws) { - mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); + struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); kobject_del(&rdev->kobj); kobject_put(&rdev->kobj); } -static void unbind_rdev_from_array(mdk_rdev_t * rdev) +static void unbind_rdev_from_array(struct md_rdev * rdev) { char b[BDEVNAME_SIZE]; if (!rdev->mddev) { MD_BUG(); return; } - bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); - + sysfs_put(rdev->sysfs_state); + rdev->sysfs_state = NULL; + rdev->badblocks.count = 0; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need * to delay it due to rcu usage. @@ -1509,7 +2140,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) synchronize_rcu(); INIT_WORK(&rdev->del_work, md_delayed_delete); kobject_get(&rdev->kobj); - schedule_work(&rdev->del_work); + queue_work(md_misc_wq, &rdev->del_work); } /* @@ -1517,51 +2148,42 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) * otherwise reused by a RAID array (or any other kernel * subsystem), by bd_claiming the device. */ -static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) +static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + shared ? (struct md_rdev *)lock_rdev : rdev); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); - if (err) { - printk(KERN_ERR "md: could not bd_claim %s.\n", - bdevname(bdev, b)); - blkdev_put(bdev); - return err; - } - if (!shared) - set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; return err; } -static void unlock_rdev(mdk_rdev_t *rdev) +static void unlock_rdev(struct md_rdev *rdev) { struct block_device *bdev = rdev->bdev; rdev->bdev = NULL; if (!bdev) MD_BUG(); - bd_release(bdev); - blkdev_put(bdev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } void md_autodetect_dev(dev_t dev); -static void export_rdev(mdk_rdev_t * rdev) +static void export_rdev(struct md_rdev * rdev) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); if (rdev->mddev) MD_BUG(); - free_disk_sb(rdev); + md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); @@ -1570,18 +2192,17 @@ static void export_rdev(mdk_rdev_t * rdev) kobject_put(&rdev->kobj); } -static void kick_rdev_from_array(mdk_rdev_t * rdev) +static void kick_rdev_from_array(struct md_rdev * rdev) { unbind_rdev_from_array(rdev); export_rdev(rdev); } -static void export_array(mddev_t *mddev) +static void export_array(struct mddev *mddev) { - struct list_head *tmp; - mdk_rdev_t *rdev; + struct md_rdev *rdev, *tmp; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (!rdev->mddev) { MD_BUG(); continue; @@ -1600,7 +2221,7 @@ static void print_desc(mdp_disk_t *desc) desc->major,desc->minor,desc->raid_disk,desc->state); } -static void print_sb(mdp_super_t *sb) +static void print_sb_90(mdp_super_t *sb) { int i; @@ -1631,28 +2252,76 @@ static void print_sb(mdp_super_t *sb) } printk(KERN_INFO "md: THIS: "); print_desc(&sb->this_disk); - } -static void print_rdev(mdk_rdev_t *rdev) +static void print_sb_1(struct mdp_superblock_1 *sb) +{ + __u8 *uuid; + + uuid = sb->set_uuid; + printk(KERN_INFO + "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" + "md: Name: \"%s\" CT:%llu\n", + le32_to_cpu(sb->major_version), + le32_to_cpu(sb->feature_map), + uuid, + sb->set_name, + (unsigned long long)le64_to_cpu(sb->ctime) + & MD_SUPERBLOCK_1_TIME_SEC_MASK); + + uuid = sb->device_uuid; + printk(KERN_INFO + "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" + " RO:%llu\n" + "md: Dev:%08x UUID: %pU\n" + "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" + "md: (MaxDev:%u) \n", + le32_to_cpu(sb->level), + (unsigned long long)le64_to_cpu(sb->size), + le32_to_cpu(sb->raid_disks), + le32_to_cpu(sb->layout), + le32_to_cpu(sb->chunksize), + (unsigned long long)le64_to_cpu(sb->data_offset), + (unsigned long long)le64_to_cpu(sb->data_size), + (unsigned long long)le64_to_cpu(sb->super_offset), + (unsigned long long)le64_to_cpu(sb->recovery_offset), + le32_to_cpu(sb->dev_number), + uuid, + sb->devflags, + (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, + (unsigned long long)le64_to_cpu(sb->events), + (unsigned long long)le64_to_cpu(sb->resync_offset), + le32_to_cpu(sb->sb_csum), + le32_to_cpu(sb->max_dev) + ); +} + +static void print_rdev(struct md_rdev *rdev, int major_version) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", - bdevname(rdev->bdev,b), (unsigned long long)rdev->size, + printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", + bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), rdev->desc_nr); if (rdev->sb_loaded) { - printk(KERN_INFO "md: rdev superblock:\n"); - print_sb((mdp_super_t*)page_address(rdev->sb_page)); + printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); + switch (major_version) { + case 0: + print_sb_90(page_address(rdev->sb_page)); + break; + case 1: + print_sb_1(page_address(rdev->sb_page)); + break; + } } else printk(KERN_INFO "md: no rdev superblock!\n"); } static void md_print_devices(void) { - struct list_head *tmp, *tmp2; - mdk_rdev_t *rdev; - mddev_t *mddev; + struct list_head *tmp; + struct md_rdev *rdev; + struct mddev *mddev; char b[BDEVNAME_SIZE]; printk("\n"); @@ -1665,19 +2334,19 @@ static void md_print_devices(void) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - rdev_for_each(rdev, tmp2, mddev) + rdev_for_each(rdev, mddev) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - rdev_for_each(rdev, tmp2, mddev) - print_rdev(rdev); + rdev_for_each(rdev, mddev) + print_rdev(rdev, mddev->major_version); } printk("md: **********************************\n"); printk("\n"); } -static void sync_sbs(mddev_t * mddev, int nospares) +static void sync_sbs(struct mddev * mddev, int nospares) { /* Update each superblock (in-memory image), but * if we are allowed to, skip spares which already @@ -1685,38 +2354,67 @@ static void sync_sbs(mddev_t * mddev, int nospares) * (which would mean they aren't being marked as dirty * with the rest of the array) */ - mdk_rdev_t *rdev; - struct list_head *tmp; - - rdev_for_each(rdev, tmp, mddev) { + struct md_rdev *rdev; + rdev_for_each(rdev, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && - (rdev->sb_events&1)==0 && rdev->sb_events+1 == mddev->events)) { /* Don't update this superblock */ rdev->sb_loaded = 2; } else { - super_types[mddev->major_version]. - sync_super(mddev, rdev); + sync_super(mddev, rdev); rdev->sb_loaded = 1; } } } -static void md_update_sb(mddev_t * mddev, int force_change) +static void md_update_sb(struct mddev * mddev, int force_change) { - struct list_head *tmp; - mdk_rdev_t *rdev; + struct md_rdev *rdev; int sync_req; int nospares = 0; + int any_badblocks_changed = 0; - if (mddev->external) + if (mddev->ro) { + if (force_change) + set_bit(MD_CHANGE_DEVS, &mddev->flags); return; + } repeat: + /* First make sure individual recovery_offsets are correct */ + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(In_sync, &rdev->flags) && + mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + + } + if (!mddev->persistent) { + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_DEVS, &mddev->flags); + if (!mddev->external) { + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) { + rdev->badblocks.changed = 0; + md_ack_all_badblocks(&rdev->badblocks); + md_error(mddev, rdev); + } + clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } + } + wake_up(&mddev->sb_wait); + return; + } + spin_lock_irq(&mddev->write_lock); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + mddev->utime = get_seconds(); + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) @@ -1740,31 +2438,19 @@ repeat: nospares = 0; sync_req = mddev->in_sync; - mddev->utime = get_seconds(); /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) - && (mddev->events & 1) - && mddev->events != 1) + && mddev->can_decrease_events + && mddev->events != 1) { mddev->events--; - else { + mddev->can_decrease_events = 0; + } else { /* otherwise we have to go forward and ... */ mddev->events ++; - if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ - /* .. if the array isn't clean, insist on an odd 'events' */ - if ((mddev->events&1)==0) { - mddev->events++; - nospares = 0; - } - } else { - /* otherwise insist on an even 'events' (for clean states) */ - if ((mddev->events&1)) { - mddev->events++; - nospares = 0; - } - } + mddev->can_decrease_events = nospares; } if (!mddev->events) { @@ -1777,46 +2463,46 @@ repeat: mddev->events --; } - /* - * do not write anything to disk if using - * nonpersistent superblocks - */ - if (!mddev->persistent) { - if (!mddev->external) - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - - spin_unlock_irq(&mddev->write_lock); - wake_up(&mddev->sb_wait); - return; + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) + any_badblocks_changed++; + if (test_bit(Faulty, &rdev->flags)) + set_bit(FaultRecorded, &rdev->flags); } + sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); - dprintk(KERN_INFO - "md: updating %s RAID superblock on device (in sync %d)\n", - mdname(mddev),mddev->in_sync); + pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", + mdname(mddev), mddev->in_sync); bitmap_update_sb(mddev->bitmap); - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; - dprintk(KERN_INFO "md: "); + if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ - if (test_bit(Faulty, &rdev->flags)) - dprintk("(skipping faulty "); - dprintk("%s ", bdevname(rdev->bdev,b)); if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); - dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", - bdevname(rdev->bdev,b), - (unsigned long long)rdev->sb_start); + pr_debug("md: (write) %s's sb offset: %llu\n", + bdevname(rdev->bdev, b), + (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; + if (rdev->badblocks.size) { + md_super_write(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page); + rdev->badblocks.size = 0; + } } else - dprintk(")\n"); + pr_debug("md: %s (skipping faulty)\n", + bdevname(rdev->bdev, b)); + if (mddev->level == LEVEL_MULTIPATH) /* only need to write one superblock... */ break; @@ -1834,7 +2520,18 @@ repeat: clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + + rdev_for_each(rdev, mddev) { + if (test_and_clear_bit(FaultRecorded, &rdev->flags)) + clear_bit(Blocked, &rdev->flags); + if (any_badblocks_changed) + md_ack_all_badblocks(&rdev->badblocks); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } } /* words written to sysfs files may, or may not, be \n terminated. @@ -1859,17 +2556,18 @@ static int cmd_match(const char *cmd, const char *str) struct rdev_sysfs_entry { struct attribute attr; - ssize_t (*show)(mdk_rdev_t *, char *); - ssize_t (*store)(mdk_rdev_t *, const char *, size_t); + ssize_t (*show)(struct md_rdev *, char *); + ssize_t (*store)(struct md_rdev *, const char *, size_t); }; static ssize_t -state_show(mdk_rdev_t *rdev, char *page) +state_show(struct md_rdev *rdev, char *page) { char *sep = ""; size_t len = 0; - if (test_bit(Faulty, &rdev->flags)) { + if (test_bit(Faulty, &rdev->flags) || + rdev->badblocks.unacked_exist) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } @@ -1881,7 +2579,9 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags)) { + if (test_bit(Blocked, &rdev->flags) || + (rdev->badblocks.unacked_exist + && !test_bit(Faulty, &rdev->flags))) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } @@ -1890,29 +2590,50 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%sspare", sep); sep = ","; } + if (test_bit(WriteErrorSeen, &rdev->flags)) { + len += sprintf(page+len, "%swrite_error", sep); + sep = ","; + } + if (test_bit(WantReplacement, &rdev->flags)) { + len += sprintf(page+len, "%swant_replacement", sep); + sep = ","; + } + if (test_bit(Replacement, &rdev->flags)) { + len += sprintf(page+len, "%sreplacement", sep); + sep = ","; + } + return len+sprintf(page+len, "\n"); } static ssize_t -state_store(mdk_rdev_t *rdev, const char *buf, size_t len) +state_store(struct md_rdev *rdev, const char *buf, size_t len) { /* can write - * faulty - simulates and error + * faulty - simulates an error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly - * blocked - sets the Blocked flag - * -blocked - clears the Blocked flag + * blocked - sets the Blocked flags + * -blocked - clears the Blocked and possibly simulates an error + * insync - sets Insync providing device isn't active + * -insync - clear Insync for a device with a slot assigned, + * so that it gets rebuilt based on bitmap + * write_error - sets WriteErrorSeen + * -write_error - clears WriteErrorSeen */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); - err = 0; + if (test_bit(Faulty, &rdev->flags)) + err = 0; + else + err = -EBUSY; } else if (cmd_match(buf, "remove")) { if (rdev->raid_disk >= 0) err = -EBUSY; else { - mddev_t *mddev = rdev->mddev; + struct mddev *mddev = rdev->mddev; kick_rdev_from_array(rdev); if (mddev->pers) md_update_sb(mddev, 1); @@ -1929,28 +2650,86 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { + if (!test_bit(Faulty, &rdev->flags) && + rdev->badblocks.unacked_exist) { + /* metadata handler doesn't understand badblocks, + * so we need to fail the device + */ + md_error(rdev->mddev, rdev); + } clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); err = 0; + } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { + set_bit(In_sync, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + err = 0; + } else if (cmd_match(buf, "write_error")) { + set_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-write_error")) { + clear_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "want_replacement")) { + /* Any non-spare device that is not a replacement can + * become want_replacement at any time, but we then need to + * check if recovery is needed. + */ + if (rdev->raid_disk >= 0 && + !test_bit(Replacement, &rdev->flags)) + set_bit(WantReplacement, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + err = 0; + } else if (cmd_match(buf, "-want_replacement")) { + /* Clearing 'want_replacement' is always allowed. + * Once replacements starts it is too late though. + */ + err = 0; + clear_bit(WantReplacement, &rdev->flags); + } else if (cmd_match(buf, "replacement")) { + /* Can only set a device as a replacement when array has not + * yet been started. Once running, replacement is automatic + * from spares, or by assigning 'slot'. + */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + set_bit(Replacement, &rdev->flags); + err = 0; + } + } else if (cmd_match(buf, "-replacement")) { + /* Similarly, can only clear Replacement before start */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + clear_bit(Replacement, &rdev->flags); + err = 0; + } } if (!err) - sysfs_notify(&rdev->kobj, NULL, "state"); + sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t -errors_show(mdk_rdev_t *rdev, char *page) +errors_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); } static ssize_t -errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) +errors_store(struct md_rdev *rdev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); @@ -1964,7 +2743,7 @@ static struct rdev_sysfs_entry rdev_errors = __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t -slot_show(mdk_rdev_t *rdev, char *page) +slot_show(struct md_rdev *rdev, char *page) { if (rdev->raid_disk < 0) return sprintf(page, "none\n"); @@ -1973,11 +2752,10 @@ slot_show(mdk_rdev_t *rdev, char *page) } static ssize_t -slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) +slot_store(struct md_rdev *rdev, const char *buf, size_t len) { char *e; int err; - char nm[20]; int slot = simple_strtoul(buf, &e, 10); if (strncmp(buf, "none", 4)==0) slot = -1; @@ -1994,62 +2772,59 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev->raid_disk == -1) return -EEXIST; /* personality does all needed checks */ - if (rdev->mddev->pers->hot_add_disk == NULL) + if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; - err = rdev->mddev->pers-> - hot_remove_disk(rdev->mddev, rdev->raid_disk); - if (err) - return err; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&rdev->mddev->kobj, nm); + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk >= 0) + return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { - mdk_rdev_t *rdev2; - struct list_head *tmp; /* Activating a spare .. or possibly reactivating - * if we every get bitmaps working here. + * if we ever get bitmaps working here. */ if (rdev->raid_disk != -1) return -EBUSY; + if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) + return -EBUSY; + if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; - rdev_for_each(rdev2, tmp, rdev->mddev) - if (rdev2->raid_disk == slot) - return -EEXIST; + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) + return -ENOSPC; rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; else rdev->saved_raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); err = rdev->mddev->pers-> hot_add_disk(rdev->mddev, rdev); if (err) { rdev->raid_disk = -1; return err; } else - sysfs_notify(&rdev->kobj, NULL, "state"); - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(rdev->mddev)); - + sysfs_notify_dirent_safe(rdev->sysfs_state); + if (sysfs_link_rdev(rdev->mddev, rdev)) + /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { - if (slot >= rdev->mddev->raid_disks) + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); - sysfs_notify(&rdev->kobj, NULL, "state"); + sysfs_notify_dirent_safe(rdev->sysfs_state); } return len; } @@ -2059,35 +2834,92 @@ static struct rdev_sysfs_entry rdev_slot = __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); static ssize_t -offset_show(mdk_rdev_t *rdev, char *page) +offset_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); } static ssize_t -offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) +offset_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; - unsigned long long offset = simple_strtoull(buf, &e, 10); - if (e==buf || (*e && *e != '\n')) + unsigned long long offset; + if (kstrtoull(buf, 10, &offset) < 0) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; - if (rdev->size && rdev->mddev->external) + if (rdev->sectors && rdev->mddev->external) /* Must set offset before size, so overlap checks * can be sane */ return -EBUSY; rdev->data_offset = offset; + rdev->new_data_offset = offset; return len; } static struct rdev_sysfs_entry rdev_offset = __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); +static ssize_t new_offset_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)rdev->new_data_offset); +} + +static ssize_t new_offset_store(struct md_rdev *rdev, + const char *buf, size_t len) +{ + unsigned long long new_offset; + struct mddev *mddev = rdev->mddev; + + if (kstrtoull(buf, 10, &new_offset) < 0) + return -EINVAL; + + if (mddev->sync_thread) + return -EBUSY; + if (new_offset == rdev->data_offset) + /* reset is always permitted */ + ; + else if (new_offset > rdev->data_offset) { + /* must not push array size beyond rdev_sectors */ + if (new_offset - rdev->data_offset + + mddev->dev_sectors > rdev->sectors) + return -E2BIG; + } + /* Metadata worries about other space details. */ + + /* decreasing the offset is inconsistent with a backwards + * reshape. + */ + if (new_offset < rdev->data_offset && + mddev->reshape_backwards) + return -EINVAL; + /* Increasing offset is inconsistent with forwards + * reshape. reshape_direction should be set to + * 'backwards' first. + */ + if (new_offset > rdev->data_offset && + !mddev->reshape_backwards) + return -EINVAL; + + if (mddev->pers && mddev->persistent && + !super_types[mddev->major_version] + .allow_new_offset(rdev, new_offset)) + return -E2BIG; + rdev->new_data_offset = new_offset; + if (new_offset > rdev->data_offset) + mddev->reshape_backwards = 1; + else if (new_offset < rdev->data_offset) + mddev->reshape_backwards = 0; + + return len; +} +static struct rdev_sysfs_entry rdev_new_offset = +__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); + static ssize_t -rdev_size_show(mdk_rdev_t *rdev, char *page) +rdev_size_show(struct md_rdev *rdev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)rdev->size); + return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) @@ -2100,54 +2932,74 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) return 1; } -static ssize_t -rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) +static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) { - unsigned long long size; - unsigned long long oldsize = rdev->size; - mddev_t *my_mddev = rdev->mddev; + unsigned long long blocks; + sector_t new; - if (strict_strtoull(buf, 10, &size) < 0) + if (kstrtoull(buf, 10, &blocks) < 0) return -EINVAL; - if (size < my_mddev->size) + + if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) + return -EINVAL; /* sector conversion overflow */ + + new = blocks * 2; + if (new != blocks * 2) + return -EINVAL; /* unsigned long long to sector_t overflow */ + + *sectors = new; + return 0; +} + +static ssize_t +rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + struct mddev *my_mddev = rdev->mddev; + sector_t oldsectors = rdev->sectors; + sector_t sectors; + + if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; + if (rdev->data_offset != rdev->new_data_offset) + return -EINVAL; /* too confusing */ if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { - size = super_types[my_mddev->major_version]. - rdev_size_change(rdev, size * 2); - if (!size) + sectors = super_types[my_mddev->major_version]. + rdev_size_change(rdev, sectors); + if (!sectors) return -EBUSY; - } else if (!size) { - size = (rdev->bdev->bd_inode->i_size >> 10); - size -= rdev->data_offset/2; - } - if (size < my_mddev->size) - return -EINVAL; /* component must fit device */ + } else if (!sectors) + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - + rdev->data_offset; + if (!my_mddev->pers->resize) + /* Cannot change size for RAID0 or Linear etc */ + return -EINVAL; } + if (sectors < my_mddev->dev_sectors) + return -EINVAL; /* component must fit device */ - rdev->size = size; - if (size > oldsize && my_mddev->external) { + rdev->sectors = sectors; + if (sectors > oldsectors && my_mddev->external) { /* need to check that all other rdevs with the same ->bdev * do not overlap. We need to unlock the mddev to avoid - * a deadlock. We have already changed rdev->size, and if + * a deadlock. We have already changed rdev->sectors, and if * we have to change it back, we will have the lock again. */ - mddev_t *mddev; + struct mddev *mddev; int overlap = 0; - struct list_head *tmp, *tmp2; + struct list_head *tmp; mddev_unlock(my_mddev); for_each_mddev(mddev, tmp) { - mdk_rdev_t *rdev2; - - mddev_lock(mddev); - rdev_for_each(rdev2, tmp2, mddev) - if (test_bit(AllReserved, &rdev2->flags) || - (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->size * 2, - rdev2->data_offset, - rdev2->size * 2))) { + struct md_rdev *rdev2; + + mddev_lock_nointr(mddev); + rdev_for_each(rdev2, mddev) + if (rdev->bdev == rdev2->bdev && + rdev != rdev2 && + overlaps(rdev->data_offset, rdev->sectors, + rdev2->data_offset, + rdev2->sectors)) { overlap = 1; break; } @@ -2157,15 +3009,15 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) break; } } - mddev_lock(my_mddev); + mddev_lock_nointr(my_mddev); if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. - * We put oldsize back because we *know* it is + * We put oldsectors back because we *know* it is * safe, and trust userspace not to race with * itself */ - rdev->size = oldsize; + rdev->sectors = oldsectors; return -EBUSY; } } @@ -2175,20 +3027,93 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); + +static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) +{ + unsigned long long recovery_start = rdev->recovery_offset; + + if (test_bit(In_sync, &rdev->flags) || + recovery_start == MaxSector) + return sprintf(page, "none\n"); + + return sprintf(page, "%llu\n", recovery_start); +} + +static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned long long recovery_start; + + if (cmd_match(buf, "none")) + recovery_start = MaxSector; + else if (kstrtoull(buf, 10, &recovery_start)) + return -EINVAL; + + if (rdev->mddev->pers && + rdev->raid_disk >= 0) + return -EBUSY; + + rdev->recovery_offset = recovery_start; + if (recovery_start == MaxSector) + set_bit(In_sync, &rdev->flags); + else + clear_bit(In_sync, &rdev->flags); + return len; +} + +static struct rdev_sysfs_entry rdev_recovery_start = +__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); + + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack); +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); + +static ssize_t bb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 0); +} +static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + int rv = badblocks_store(&rdev->badblocks, page, len, 0); + /* Maybe that ack was all we needed */ + if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) + wake_up(&rdev->blocked_wait); + return rv; +} +static struct rdev_sysfs_entry rdev_bad_blocks = +__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); + + +static ssize_t ubb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 1); +} +static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + return badblocks_store(&rdev->badblocks, page, len, 1); +} +static struct rdev_sysfs_entry rdev_unack_bad_blocks = +__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, + &rdev_new_offset.attr, &rdev_size.attr, + &rdev_recovery_start.attr, + &rdev_bad_blocks.attr, + &rdev_unack_bad_blocks.attr, NULL, }; static ssize_t rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); - mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); - mddev_t *mddev = rdev->mddev; + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct mddev *mddev = rdev->mddev; ssize_t rv; if (!entry->show) @@ -2210,9 +3135,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); - mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); ssize_t rv; - mddev_t *mddev = rdev->mddev; + struct mddev *mddev = rdev->mddev; if (!entry->store) return -EIO; @@ -2231,10 +3156,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, static void rdev_free(struct kobject *ko) { - mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); + struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); kfree(rdev); } -static struct sysfs_ops rdev_sysfs_ops = { +static const struct sysfs_ops rdev_sysfs_ops = { .show = rdev_attr_show, .store = rdev_attr_store, }; @@ -2244,6 +3169,40 @@ static struct kobj_type rdev_ktype = { .default_attrs = rdev_default_attrs, }; +int md_rdev_init(struct md_rdev *rdev) +{ + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->new_data_offset = 0; + rdev->sb_events = 0; + rdev->last_read_error.tv_sec = 0; + rdev->last_read_error.tv_nsec = 0; + rdev->sb_loaded = 0; + rdev->bb_page = NULL; + atomic_set(&rdev->nr_pending, 0); + atomic_set(&rdev->read_errors, 0); + atomic_set(&rdev->corrected_errors, 0); + + INIT_LIST_HEAD(&rdev->same_set); + init_waitqueue_head(&rdev->blocked_wait); + + /* Add space to store bad block list. + * This reserves the space even on arrays where it cannot + * be used - I wonder if that matters + */ + rdev->badblocks.count = 0; + rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ + rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); + seqlock_init(&rdev->badblocks.lock); + if (rdev->badblocks.page == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(md_rdev_init); /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * @@ -2254,11 +3213,11 @@ static struct kobj_type rdev_ktype = { * * a faulty rdev _never_ has rdev->sb set. */ -static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) +static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { char b[BDEVNAME_SIZE]; int err; - mdk_rdev_t *rdev; + struct md_rdev *rdev; sector_t size; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); @@ -2267,7 +3226,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi return ERR_PTR(-ENOMEM); } - if ((err = alloc_disk_sb(rdev))) + err = md_rdev_init(rdev); + if (err) + goto abort_free; + err = alloc_disk_sb(rdev); + if (err) goto abort_free; err = lock_rdev(rdev, newdev, super_format == -2); @@ -2276,17 +3239,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj, &rdev_ktype); - rdev->desc_nr = -1; - rdev->saved_raid_disk = -1; - rdev->raid_disk = -1; - rdev->flags = 0; - rdev->data_offset = 0; - rdev->sb_events = 0; - atomic_set(&rdev->nr_pending, 0); - atomic_set(&rdev->read_errors, 0); - atomic_set(&rdev->corrected_errors, 0); - - size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; if (!size) { printk(KERN_WARNING "md: %s has zero or unknown size, marking faulty!\n", @@ -2314,17 +3267,12 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi } } - INIT_LIST_HEAD(&rdev->same_set); - init_waitqueue_head(&rdev->blocked_wait); - return rdev; abort_free: - if (rdev->sb_page) { - if (rdev->bdev) - unlock_rdev(rdev); - free_disk_sb(rdev); - } + if (rdev->bdev) + unlock_rdev(rdev); + md_rdev_clear(rdev); kfree(rdev); return ERR_PTR(err); } @@ -2334,15 +3282,14 @@ abort_free: */ -static void analyze_sbs(mddev_t * mddev) +static void analyze_sbs(struct mddev * mddev) { int i; - struct list_head *tmp; - mdk_rdev_t *rdev, *freshest; + struct md_rdev *rdev, *freshest, *tmp; char b[BDEVNAME_SIZE]; freshest = NULL; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each_safe(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: @@ -2363,7 +3310,17 @@ static void analyze_sbs(mddev_t * mddev) validate_super(mddev, freshest); i = 0; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { + if (mddev->max_disks && + (rdev->desc_nr >= mddev->max_disks || + i > mddev->max_disks)) { + printk(KERN_WARNING + "md: %s: %s: only %d devices permitted\n", + mdname(mddev), bdevname(rdev->bdev, b), + mddev->max_disks); + kick_rdev_from_array(rdev); + continue; + } if (rdev != freshest) if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { @@ -2377,60 +3334,69 @@ static void analyze_sbs(mddev_t * mddev) rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= mddev->raid_disks) { + } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } } +} - - - if (mddev->recovery_cp != MaxSector && - mddev->level >= 1) - printk(KERN_ERR "md: %s: raid array is not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - +/* Read a fixed-point number. + * Numbers in sysfs attributes should be in "standard" units where + * possible, so time should be in seconds. + * However we internally use a a much smaller unit such as + * milliseconds or jiffies. + * This function takes a decimal number with a possible fractional + * component, and produces an integer which is the result of + * multiplying that number by 10^'scale'. + * all without any floating-point arithmetic. + */ +int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) +{ + unsigned long result = 0; + long decimals = -1; + while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { + if (*cp == '.') + decimals = 0; + else if (decimals < scale) { + unsigned int value; + value = *cp - '0'; + result = result * 10 + value; + if (decimals >= 0) + decimals++; + } + cp++; + } + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + if (decimals < 0) + decimals = 0; + while (decimals < scale) { + result *= 10; + decimals ++; + } + *res = result; + return 0; } + static void md_safemode_timeout(unsigned long data); static ssize_t -safe_delay_show(mddev_t *mddev, char *page) +safe_delay_show(struct mddev *mddev, char *page) { int msec = (mddev->safemode_delay*1000)/HZ; return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); } static ssize_t -safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) +safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { - int scale=1; - int dot=0; - int i; unsigned long msec; - char buf[30]; - char *e; - /* remove a period, and count digits after it */ - if (len >= sizeof(buf)) - return -EINVAL; - strlcpy(buf, cbuf, len); - buf[len] = 0; - for (i=0; i<len; i++) { - if (dot) { - if (isdigit(buf[i])) { - buf[i-1] = buf[i]; - scale *= 10; - } - buf[i] = 0; - } else if (buf[i] == '.') { - dot=1; - buf[i] = 0; - } - } - msec = simple_strtoul(buf, &e, 10); - if (e == buf || (*e && *e != '\n')) + + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) return -EINVAL; - msec = (msec * 1000) / scale; if (msec == 0) mddev->safemode_delay = 0; else { @@ -2438,7 +3404,7 @@ safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) mddev->safemode_delay = (msec*HZ)/1000; if (mddev->safemode_delay == 0) mddev->safemode_delay = 1; - if (mddev->safemode_delay < old_delay) + if (mddev->safemode_delay < old_delay || old_delay == 0) md_safemode_timeout((unsigned long)mddev); } return len; @@ -2447,9 +3413,9 @@ static struct md_sysfs_entry md_safe_delay = __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t -level_show(mddev_t *mddev, char *page) +level_show(struct mddev *mddev, char *page) { - struct mdk_personality *p = mddev->pers; + struct md_personality *p = mddev->pers; if (p) return sprintf(page, "%s\n", p->name); else if (mddev->clevel[0]) @@ -2461,20 +3427,184 @@ level_show(mddev_t *mddev, char *page) } static ssize_t -level_store(mddev_t *mddev, const char *buf, size_t len) +level_store(struct mddev *mddev, const char *buf, size_t len) { + char clevel[16]; ssize_t rv = len; - if (mddev->pers) + struct md_personality *pers; + long level; + void *priv; + struct md_rdev *rdev; + + if (mddev->pers == NULL) { + if (len == 0) + return 0; + if (len >= sizeof(mddev->clevel)) + return -ENOSPC; + strncpy(mddev->clevel, buf, len); + if (mddev->clevel[len-1] == '\n') + len--; + mddev->clevel[len] = 0; + mddev->level = LEVEL_NONE; + return rv; + } + if (mddev->ro) + return -EROFS; + + /* request to change the personality. Need to ensure: + * - array is not engaged in resync/recovery/reshape + * - old personality can be suspended + * - new personality will access other array. + */ + + if (mddev->sync_thread || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) return -EBUSY; - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') + + if (!mddev->pers->quiesce) { + printk(KERN_WARNING "md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->name); + return -EINVAL; + } + + /* Now find the new personality */ + if (len == 0 || len >= sizeof(clevel)) + return -EINVAL; + strncpy(clevel, buf, len); + if (clevel[len-1] == '\n') len--; - mddev->clevel[len] = 0; - mddev->level = LEVEL_NONE; + clevel[len] = 0; + if (kstrtol(clevel, 10, &level)) + level = LEVEL_NONE; + + if (request_module("md-%s", clevel) != 0) + request_module("md-level-%s", clevel); + spin_lock(&pers_lock); + pers = find_pers(level, clevel); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + printk(KERN_WARNING "md: personality %s not loaded\n", clevel); + return -EINVAL; + } + spin_unlock(&pers_lock); + + if (pers == mddev->pers) { + /* Nothing to do! */ + module_put(pers->owner); + return rv; + } + if (!pers->takeover) { + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", + mdname(mddev), clevel); + return -EINVAL; + } + + rdev_for_each(rdev, mddev) + rdev->new_raid_disk = rdev->raid_disk; + + /* ->takeover must set new_* and/or delta_disks + * if it succeeds, and may set them when it fails. + */ + priv = pers->takeover(mddev); + if (IS_ERR(priv)) { + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks -= mddev->delta_disks; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s would not accept array\n", + mdname(mddev), clevel); + return PTR_ERR(priv); + } + + /* Looks like we have a winner */ + mddev_suspend(mddev); + mddev->pers->stop(mddev); + + if (mddev->pers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + } + if (mddev->pers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + + if (mddev->pers->sync_request == NULL && + mddev->external) { + /* We are converting from a no-redundancy array + * to a redundancy array and metadata is managed + * externally so we need to be sure that writes + * won't block due to a need to transition + * clean->dirty + * until external management is started. + */ + mddev->in_sync = 0; + mddev->safemode_delay = 0; + mddev->safemode = 0; + } + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk >= mddev->raid_disks) + rdev->new_raid_disk = -1; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + sysfs_unlink_rdev(mddev, rdev); + } + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + rdev->raid_disk = rdev->new_raid_disk; + if (rdev->raid_disk < 0) + clear_bit(In_sync, &rdev->flags); + else { + if (sysfs_link_rdev(mddev, rdev)) + printk(KERN_WARNING "md: cannot register rd%d" + " for %s after level change\n", + rdev->raid_disk, mdname(mddev)); + } + } + + module_put(mddev->pers->owner); + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->degraded = 0; + if (mddev->pers->sync_request == NULL) { + /* this is now an array without redundancy, so + * it must always be in_sync + */ + mddev->in_sync = 1; + del_timer_sync(&mddev->safemode_timer); + } + blk_set_stacking_limits(&mddev->queue->limits); + pers->run(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + mddev_resume(mddev); + if (!mddev->thread) + md_update_sb(mddev, 1); + sysfs_notify(&mddev->kobj, NULL, "level"); + md_new_event(mddev); return rv; } @@ -2483,7 +3613,7 @@ __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); static ssize_t -layout_show(mddev_t *mddev, char *page) +layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ if (mddev->reshape_position != MaxSector && @@ -2494,7 +3624,7 @@ layout_show(mddev_t *mddev, char *page) } static ssize_t -layout_store(mddev_t *mddev, const char *buf, size_t len) +layout_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); @@ -2502,12 +3632,23 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - if (mddev->reshape_position != MaxSector) + if (mddev->pers) { + int err; + if (mddev->pers->check_reshape == NULL) + return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_layout = n; - else - mddev->layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_layout = mddev->layout; + return err; + } + } else { + mddev->new_layout = n; + if (mddev->reshape_position == MaxSector) + mddev->layout = n; + } return len; } static struct md_sysfs_entry md_layout = @@ -2515,7 +3656,7 @@ __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); static ssize_t -raid_disks_show(mddev_t *mddev, char *page) +raid_disks_show(struct mddev *mddev, char *page) { if (mddev->raid_disks == 0) return 0; @@ -2526,10 +3667,10 @@ raid_disks_show(mddev_t *mddev, char *page) return sprintf(page, "%d\n", mddev->raid_disks); } -static int update_raid_disks(mddev_t *mddev, int raid_disks); +static int update_raid_disks(struct mddev *mddev, int raid_disks); static ssize_t -raid_disks_store(mddev_t *mddev, const char *buf, size_t len) +raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { char *e; int rv = 0; @@ -2541,9 +3682,20 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) rv = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { + struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + + rdev_for_each(rdev, mddev) { + if (olddisks < n && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (olddisks > n && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } mddev->delta_disks = n - olddisks; mddev->raid_disks = n; + mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; return rv ? rv : len; @@ -2552,54 +3704,71 @@ static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t -chunk_size_show(mddev_t *mddev, char *page) +chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && - mddev->chunk_size != mddev->new_chunk) - return sprintf(page, "%d (%d)\n", mddev->new_chunk, - mddev->chunk_size); - return sprintf(page, "%d\n", mddev->chunk_size); + mddev->chunk_sectors != mddev->new_chunk_sectors) + return sprintf(page, "%d (%d)\n", + mddev->new_chunk_sectors << 9, + mddev->chunk_sectors << 9); + return sprintf(page, "%d\n", mddev->chunk_sectors << 9); } static ssize_t -chunk_size_store(mddev_t *mddev, const char *buf, size_t len) +chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { - /* can only set chunk_size if array is not yet active */ char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - else if (mddev->reshape_position != MaxSector) - mddev->new_chunk = n; - else - mddev->chunk_size = n; + if (mddev->pers) { + int err; + if (mddev->pers->check_reshape == NULL) + return -EBUSY; + if (mddev->ro) + return -EROFS; + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_chunk_sectors = mddev->chunk_sectors; + return err; + } + } else { + mddev->new_chunk_sectors = n >> 9; + if (mddev->reshape_position == MaxSector) + mddev->chunk_sectors = n >> 9; + } return len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t -resync_start_show(mddev_t *mddev, char *page) +resync_start_show(struct mddev *mddev, char *page) { + if (mddev->recovery_cp == MaxSector) + return sprintf(page, "none\n"); return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); } static ssize_t -resync_start_store(mddev_t *mddev, const char *buf, size_t len) +resync_start_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long n = simple_strtoull(buf, &e, 10); - if (mddev->pers) + if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) return -EBUSY; - if (!*buf || (*e && *e != '\n')) + if (cmd_match(buf, "none")) + n = MaxSector; + else if (!*buf || (*e && *e != '\n')) return -EINVAL; mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); return len; } static struct md_sysfs_entry md_resync_start = @@ -2657,7 +3826,7 @@ static int match_word(const char *word, char **list) } static ssize_t -array_state_show(mddev_t *mddev, char *page) +array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; @@ -2672,7 +3841,7 @@ array_state_show(mddev_t *mddev, char *page) case 0: if (mddev->in_sync) st = clean; - else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) st = write_pending; else if (mddev->safemode) st = active_idle; @@ -2682,7 +3851,7 @@ array_state_show(mddev_t *mddev, char *page) else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && - mddev->size == 0) + mddev->dev_sectors == 0) st = clear; else st = inactive; @@ -2690,12 +3859,13 @@ array_state_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", array_states[st]); } -static int do_md_stop(mddev_t * mddev, int ro, int is_open); -static int do_md_run(mddev_t * mddev); -static int restart_array(mddev_t *mddev); +static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); +static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); +static int do_md_run(struct mddev * mddev); +static int restart_array(struct mddev *mddev); static ssize_t -array_state_store(mddev_t *mddev, const char *buf, size_t len) +array_state_store(struct mddev *mddev, const char *buf, size_t len) { int err = -EINVAL; enum array_state st = match_word(buf, array_states); @@ -2704,24 +3874,20 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; case clear: /* stopping an active array */ - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; - err = do_md_stop(mddev, 0, 0); + err = do_md_stop(mddev, 0, NULL); break; case inactive: /* stopping an active array */ - if (mddev->pers) { - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; - err = do_md_stop(mddev, 2, 0); - } else + if (mddev->pers) + err = do_md_stop(mddev, 2, NULL); + else err = 0; /* already inactive */ break; case suspended: break; /* not supported yet */ case readonly: if (mddev->pers) - err = do_md_stop(mddev, 1, 0); + err = md_set_readonly(mddev, NULL); else { mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); @@ -2730,9 +3896,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; case read_auto: if (mddev->pers) { - if (mddev->ro != 1) - err = do_md_stop(mddev, 1, 0); - else + if (mddev->ro == 0) + err = md_set_readonly(mddev, NULL); + else if (mddev->ro == 1) err = restart_array(mddev); if (err == 0) { mddev->ro = 2; @@ -2752,25 +3918,19 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) mddev->in_sync = 1; if (mddev->safemode == 1) mddev->safemode = 0; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, - &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } err = 0; } else err = -EBUSY; spin_unlock_irq(&mddev->write_lock); - } else { - mddev->ro = 0; - mddev->recovery_cp = MaxSector; - err = do_md_run(mddev); - } + } else + err = -EINVAL; break; case active: if (mddev->pers) { restart_array(mddev); - if (mddev->external) - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -2787,7 +3947,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (err) return err; else { - sysfs_notify(&mddev->kobj, NULL, "array_state"); + if (mddev->hold_active == UNTIL_IOCTL) + mddev->hold_active = 0; + sysfs_notify_dirent_safe(mddev->sysfs_state); return len; } } @@ -2795,13 +3957,36 @@ static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t -null_show(mddev_t *mddev, char *page) +max_corrected_read_errors_show(struct mddev *mddev, char *page) { + return sprintf(page, "%d\n", + atomic_read(&mddev->max_corr_read_errors)); +} + +static ssize_t +max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) { + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (*buf && (*e == 0 || *e == '\n')) { + atomic_set(&mddev->max_corr_read_errors, n); + return len; + } return -EINVAL; } +static struct md_sysfs_entry max_corr_read_errors = +__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, + max_corrected_read_errors_store); + static ssize_t -new_dev_store(mddev_t *mddev, const char *buf, size_t len) +null_show(struct mddev *mddev, char *page) +{ + return -EINVAL; +} + +static ssize_t +new_dev_store(struct mddev *mddev, const char *buf, size_t len) { /* buf must be %d:%d\n? giving major and minor numbers */ /* The new device is added to the array. @@ -2814,7 +3999,7 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) int major = simple_strtoul(buf, &e, 10); int minor; dev_t dev; - mdk_rdev_t *rdev; + struct md_rdev *rdev; int err; if (!*buf || *e != ':' || !e[1] || e[1] == '\n') @@ -2832,8 +4017,9 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { - mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, - mdk_rdev_t, same_set); + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) @@ -2857,7 +4043,7 @@ static struct md_sysfs_entry md_new_device = __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); static ssize_t -bitmap_store(mddev_t *mddev, const char *buf, size_t len) +bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; @@ -2875,8 +4061,7 @@ bitmap_store(mddev_t *mddev, const char *buf, size_t len) } if (*end && !isspace(*end)) break; bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); - buf = end; - while (isspace(*buf)) buf++; + buf = skip_spaces(end); } bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: @@ -2887,34 +4072,33 @@ static struct md_sysfs_entry md_bitmap = __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t -size_show(mddev_t *mddev, char *page) +size_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->size); + return sprintf(page, "%llu\n", + (unsigned long long)mddev->dev_sectors / 2); } -static int update_size(mddev_t *mddev, sector_t num_sectors); +static int update_size(struct mddev *mddev, sector_t num_sectors); static ssize_t -size_store(mddev_t *mddev, const char *buf, size_t len) +size_store(struct mddev *mddev, const char *buf, size_t len) { /* If array is inactive, we can reduce the component size, but * not increase it (except from 0). * If array is active, we can try an on-line resize */ - char *e; - int err = 0; - unsigned long long size = simple_strtoull(buf, &e, 10); - if (!*buf || *buf == '\n' || - (*e && *e != '\n')) - return -EINVAL; + sector_t sectors; + int err = strict_blocks_to_sectors(buf, §ors); + if (err < 0) + return err; if (mddev->pers) { - err = update_size(mddev, size * 2); + err = update_size(mddev, sectors); md_update_sb(mddev, 1); } else { - if (mddev->size == 0 || - mddev->size > size) - mddev->size = size; + if (mddev->dev_sectors == 0 || + mddev->dev_sectors > sectors) + mddev->dev_sectors = sectors; else err = -ENOSPC; } @@ -2925,14 +4109,14 @@ static struct md_sysfs_entry md_size = __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); -/* Metdata version. +/* Metadata version. * This is one of * 'none' for arrays with no metadata (good luck...) * 'external' for arrays with externally managed metadata, * or N.M for internally known formats */ static ssize_t -metadata_show(mddev_t *mddev, char *page) +metadata_show(struct mddev *mddev, char *page) { if (mddev->persistent) return sprintf(page, "%d.%d\n", @@ -2944,11 +4128,17 @@ metadata_show(mddev_t *mddev, char *page) } static ssize_t -metadata_store(mddev_t *mddev, const char *buf, size_t len) +metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; - if (!list_empty(&mddev->disks)) + /* Changing the details of 'external' metadata is + * always permitted. Otherwise there must be + * no devices attached to the array. + */ + if (mddev->external && strncmp(buf, "external:", 9) == 0) + ; + else if (!list_empty(&mddev->disks)) return -EBUSY; if (cmd_match(buf, "none")) { @@ -2992,10 +4182,12 @@ static struct md_sysfs_entry md_metadata = __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t -action_show(mddev_t *mddev, char *page) +action_show(struct mddev *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + type = "frozen"; + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) type = "reshape"; @@ -3013,17 +4205,20 @@ action_show(mddev_t *mddev, char *page) } static ssize_t -action_store(mddev_t *mddev, const char *page, size_t len) +action_store(struct mddev *mddev, const char *page, size_t len) { if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (cmd_match(page, "idle")) { + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - mddev->recovery = 0; + md_reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -3049,34 +4244,49 @@ action_store(mddev_t *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + md_wakeup_thread(mddev->sync_thread); + } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + sysfs_notify_dirent_safe(mddev->sysfs_action); return len; } +static struct md_sysfs_entry md_scan_mode = +__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); + static ssize_t -mismatch_cnt_show(mddev_t *mddev, char *page) +last_sync_action_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); + return sprintf(page, "%s\n", mddev->last_sync_action); } -static struct md_sysfs_entry md_scan_mode = -__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); +static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); +static ssize_t +mismatch_cnt_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); +} static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); static ssize_t -sync_min_show(mddev_t *mddev, char *page) +sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), mddev->sync_speed_min ? "local": "system"); } static ssize_t -sync_min_store(mddev_t *mddev, const char *buf, size_t len) +sync_min_store(struct mddev *mddev, const char *buf, size_t len) { int min; char *e; @@ -3095,14 +4305,14 @@ static struct md_sysfs_entry md_sync_min = __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); static ssize_t -sync_max_show(mddev_t *mddev, char *page) +sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), mddev->sync_speed_max ? "local": "system"); } static ssize_t -sync_max_store(mddev_t *mddev, const char *buf, size_t len) +sync_max_store(struct mddev *mddev, const char *buf, size_t len) { int max; char *e; @@ -3121,24 +4331,24 @@ static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t -degraded_show(mddev_t *mddev, char *page) +degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); } static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); static ssize_t -sync_force_parallel_show(mddev_t *mddev, char *page) +sync_force_parallel_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->parallel_resync); } static ssize_t -sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) +sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) { long n; - if (strict_strtol(buf, 10, &n)) + if (kstrtol(buf, 10, &n)) return -EINVAL; if (n != 0 && n != 1) @@ -3158,9 +4368,11 @@ __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, sync_force_parallel_show, sync_force_parallel_store); static ssize_t -sync_speed_show(mddev_t *mddev, char *page) +sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; + if (mddev->curr_resync == 0) + return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; if (!dt) dt++; @@ -3171,32 +4383,40 @@ sync_speed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t -sync_completed_show(mddev_t *mddev, char *page) +sync_completed_show(struct mddev *mddev, char *page) { - unsigned long max_blocks, resync; + unsigned long long max_sectors, resync; + + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return sprintf(page, "none\n"); + + if (mddev->curr_resync == 1 || + mddev->curr_resync == 2) + return sprintf(page, "delayed\n"); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->size << 1; + max_sectors = mddev->dev_sectors; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); - return sprintf(page, "%lu / %lu\n", resync, max_blocks); + resync = mddev->curr_resync_completed; + return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); static ssize_t -min_sync_show(mddev_t *mddev, char *page) +min_sync_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_min); } static ssize_t -min_sync_store(mddev_t *mddev, const char *buf, size_t len) +min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; - if (strict_strtoull(buf, 10, &min)) + if (kstrtoull(buf, 10, &min)) return -EINVAL; if (min > mddev->resync_max) return -EINVAL; @@ -3204,8 +4424,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { - if (min & (sector_t)((mddev->chunk_size>>9)-1)) + if (mddev->chunk_sectors) { + sector_t temp = min; + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_min = min; @@ -3217,7 +4438,7 @@ static struct md_sysfs_entry md_min_sync = __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); static ssize_t -max_sync_show(mddev_t *mddev, char *page) +max_sync_show(struct mddev *mddev, char *page) { if (mddev->resync_max == MaxSector) return sprintf(page, "max\n"); @@ -3226,23 +4447,25 @@ max_sync_show(mddev_t *mddev, char *page) (unsigned long long)mddev->resync_max); } static ssize_t -max_sync_store(mddev_t *mddev, const char *buf, size_t len) +max_sync_store(struct mddev *mddev, const char *buf, size_t len) { if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; - if (strict_strtoull(buf, 10, &max)) + if (kstrtoull(buf, 10, &max)) return -EINVAL; if (max < mddev->resync_min) return -EINVAL; if (max < mddev->resync_max && + mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { - if (max & (sector_t)((mddev->chunk_size>>9)-1)) + if (mddev->chunk_sectors) { + sector_t temp = max; + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_max = max; @@ -3255,63 +4478,74 @@ static struct md_sysfs_entry md_max_sync = __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t -suspend_lo_show(mddev_t *mddev, char *page) +suspend_lo_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); } static ssize_t -suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) +suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_lo; - if (mddev->pers->quiesce == NULL) + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if (new >= mddev->suspend_hi || - (new > mddev->suspend_lo && new < mddev->suspend_hi)) { - mddev->suspend_lo = new; + + mddev->suspend_lo = new; + if (new >= old) + /* Shrinking suspended region */ mddev->pers->quiesce(mddev, 2); - return len; - } else - return -EINVAL; + else { + /* Expanding suspended region - need to wait */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t -suspend_hi_show(mddev_t *mddev, char *page) +suspend_hi_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); } static ssize_t -suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) +suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_hi; - if (mddev->pers->quiesce == NULL) + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || - (new > mddev->suspend_lo && new > mddev->suspend_hi)) { - mddev->suspend_hi = new; + + mddev->suspend_hi = new; + if (new <= old) + /* Shrinking suspended region */ + mddev->pers->quiesce(mddev, 2); + else { + /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); - return len; - } else - return -EINVAL; + } + return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); static ssize_t -reshape_position_show(mddev_t *mddev, char *page) +reshape_position_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector) return sprintf(page, "%llu\n", @@ -3321,8 +4555,9 @@ reshape_position_show(mddev_t *mddev, char *page) } static ssize_t -reshape_position_store(mddev_t *mddev, const char *buf, size_t len) +reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { + struct md_rdev *rdev; char *e; unsigned long long new = simple_strtoull(buf, &e, 10); if (mddev->pers) @@ -3331,9 +4566,12 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; mddev->reshape_position = new; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; return len; } @@ -3341,6 +4579,84 @@ static struct md_sysfs_entry md_reshape_position = __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); +static ssize_t +reshape_direction_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", + mddev->reshape_backwards ? "backwards" : "forwards"); +} + +static ssize_t +reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) +{ + int backwards = 0; + if (cmd_match(buf, "forwards")) + backwards = 0; + else if (cmd_match(buf, "backwards")) + backwards = 1; + else + return -EINVAL; + if (mddev->reshape_backwards == backwards) + return len; + + /* check if we are allowed to change */ + if (mddev->delta_disks) + return -EBUSY; + + if (mddev->persistent && + mddev->major_version == 0) + return -EINVAL; + + mddev->reshape_backwards = backwards; + return len; +} + +static struct md_sysfs_entry md_reshape_direction = +__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, + reshape_direction_store); + +static ssize_t +array_size_show(struct mddev *mddev, char *page) +{ + if (mddev->external_size) + return sprintf(page, "%llu\n", + (unsigned long long)mddev->array_sectors/2); + else + return sprintf(page, "default\n"); +} + +static ssize_t +array_size_store(struct mddev *mddev, const char *buf, size_t len) +{ + sector_t sectors; + + if (strncmp(buf, "default", 7) == 0) { + if (mddev->pers) + sectors = mddev->pers->size(mddev, 0, 0); + else + sectors = mddev->array_sectors; + + mddev->external_size = 0; + } else { + if (strict_blocks_to_sectors(buf, §ors) < 0) + return -EINVAL; + if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + return -E2BIG; + + mddev->external_size = 1; + } + + mddev->array_sectors = sectors; + if (mddev->pers) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } + return len; +} + +static struct md_sysfs_entry md_array_size = +__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, + array_size_store); static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -3354,11 +4670,15 @@ static struct attribute *md_default_attrs[] = { &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, + &md_reshape_direction.attr, + &md_array_size.attr, + &max_corr_read_errors.attr, NULL, }; static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, + &md_last_scan_mode.attr, &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, @@ -3383,16 +4703,25 @@ static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); - mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->show) return -EIO; + spin_lock(&all_mddevs_lock); + if (list_empty(&mddev->all_mddevs)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + rv = mddev_lock(mddev); if (!rv) { rv = entry->show(mddev, page); mddev_unlock(mddev); } + mddev_put(mddev); return rv; } @@ -3401,28 +4730,49 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); - mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + spin_lock(&all_mddevs_lock); + if (list_empty(&mddev->all_mddevs)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + if (entry->store == new_dev_store) + flush_workqueue(md_misc_wq); rv = mddev_lock(mddev); if (!rv) { rv = entry->store(mddev, page, length); mddev_unlock(mddev); } + mddev_put(mddev); return rv; } static void md_free(struct kobject *ko) { - mddev_t *mddev = container_of(ko, mddev_t, kobj); + struct mddev *mddev = container_of(ko, struct mddev, kobj); + + if (mddev->sysfs_state) + sysfs_put(mddev->sysfs_state); + + if (mddev->gendisk) { + del_gendisk(mddev->gendisk); + put_disk(mddev->gendisk); + } + if (mddev->queue) + blk_cleanup_queue(mddev->queue); + kfree(mddev); } -static struct sysfs_ops md_sysfs_ops = { +static const struct sysfs_ops md_sysfs_ops = { .show = md_attr_show, .store = md_attr_store, }; @@ -3434,76 +4784,164 @@ static struct kobj_type md_ktype = { int mdp_major = 0; -static struct kobject *md_probe(dev_t dev, int *part, void *data) +static void mddev_delayed_delete(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, del_work); + + sysfs_remove_group(&mddev->kobj, &md_bitmap_group); + kobject_del(&mddev->kobj); + kobject_put(&mddev->kobj); +} + +static int md_alloc(dev_t dev, char *name) { static DEFINE_MUTEX(disks_mutex); - mddev_t *mddev = mddev_find(dev); + struct mddev *mddev = mddev_find(dev); struct gendisk *disk; - int partitioned = (MAJOR(dev) != MD_MAJOR); - int shift = partitioned ? MdpMinorShift : 0; - int unit = MINOR(dev) >> shift; + int partitioned; + int shift; + int unit; int error; if (!mddev) - return NULL; + return -ENODEV; + + partitioned = (MAJOR(mddev->unit) != MD_MAJOR); + shift = partitioned ? MdpMinorShift : 0; + unit = MINOR(mddev->unit) >> shift; + + /* wait for any previous instance of this device to be + * completely removed (mddev_delayed_delete). + */ + flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); - if (mddev->gendisk) { - mutex_unlock(&disks_mutex); - mddev_put(mddev); - return NULL; + error = -EEXIST; + if (mddev->gendisk) + goto abort; + + if (name) { + /* Need to ensure that 'name' is not a duplicate. + */ + struct mddev *mddev2; + spin_lock(&all_mddevs_lock); + + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) + if (mddev2->gendisk && + strcmp(mddev2->gendisk->disk_name, name) == 0) { + spin_unlock(&all_mddevs_lock); + goto abort; + } + spin_unlock(&all_mddevs_lock); } + + error = -ENOMEM; + mddev->queue = blk_alloc_queue(GFP_KERNEL); + if (!mddev->queue) + goto abort; + mddev->queue->queuedata = mddev; + + blk_queue_make_request(mddev->queue, md_make_request); + blk_set_stacking_limits(&mddev->queue->limits); + disk = alloc_disk(1 << shift); if (!disk) { - mutex_unlock(&disks_mutex); - mddev_put(mddev); - return NULL; + blk_cleanup_queue(mddev->queue); + mddev->queue = NULL; + goto abort; } - disk->major = MAJOR(dev); + disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; - if (partitioned) + if (name) + strcpy(disk->disk_name, name); + else if (partitioned) sprintf(disk->disk_name, "md_d%d", unit); else sprintf(disk->disk_name, "md%d", unit); disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; - add_disk(disk); + blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + /* Allow extended partitions. This makes the + * 'mdp' device redundant, but we can't really + * remove it now. + */ + disk->flags |= GENHD_FL_EXT_DEVT; mddev->gendisk = disk; - error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, - "%s", "md"); - mutex_unlock(&disks_mutex); - if (error) + /* As soon as we call add_disk(), another thread could get + * through to md_open, so make sure it doesn't get too far + */ + mutex_lock(&mddev->open_mutex); + add_disk(disk); + + error = kobject_init_and_add(&mddev->kobj, &md_ktype, + &disk_to_dev(disk)->kobj, "%s", "md"); + if (error) { + /* This isn't possible, but as kobject_init_and_add is marked + * __must_check, we must do something with the result + */ printk(KERN_WARNING "md: cannot register %s/md - name in use\n", disk->disk_name); - else + error = 0; + } + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_bitmap_group)) + printk(KERN_DEBUG "pointless warning\n"); + mutex_unlock(&mddev->open_mutex); + abort: + mutex_unlock(&disks_mutex); + if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); + mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + } + mddev_put(mddev); + return error; +} + +static struct kobject *md_probe(dev_t dev, int *part, void *data) +{ + md_alloc(dev, NULL); return NULL; } +static int add_named_array(const char *val, struct kernel_param *kp) +{ + /* val must be "md_*" where * is not all digits. + * We allocate an array with a large free minor number, and + * set the name to val. val must not already be an active name. + */ + int len = strlen(val); + char buf[DISK_NAME_LEN]; + + while (len && val[len-1] == '\n') + len--; + if (len >= DISK_NAME_LEN) + return -E2BIG; + strlcpy(buf, val, len+1); + if (strncmp(buf, "md_", 3) != 0) + return -EINVAL; + return md_alloc(0, buf); +} + static void md_safemode_timeout(unsigned long data) { - mddev_t *mddev = (mddev_t *) data; + struct mddev *mddev = (struct mddev *) data; if (!atomic_read(&mddev->writes_pending)) { mddev->safemode = 1; if (mddev->external) - set_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags); + sysfs_notify_dirent_safe(mddev->sysfs_state); } md_wakeup_thread(mddev->thread); } static int start_dirty_degraded; -static int do_md_run(mddev_t * mddev) +int md_run(struct mddev *mddev) { int err; - int chunk_size; - struct list_head *tmp; - mdk_rdev_t *rdev; - struct gendisk *disk; - struct mdk_personality *pers; - char b[BDEVNAME_SIZE]; + struct md_rdev *rdev; + struct md_personality *pers; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. */ @@ -3511,6 +4949,9 @@ static int do_md_run(mddev_t * mddev) if (mddev->pers) return -EBUSY; + /* Cannot run until previous stop completes properly */ + if (mddev->sysfs_active) + return -EBUSY; /* * Analyze all RAID superblock(s) @@ -3521,56 +4962,17 @@ static int do_md_run(mddev_t * mddev) analyze_sbs(mddev); } - chunk_size = mddev->chunk_size; - - if (chunk_size) { - if (chunk_size > MAX_CHUNK_SIZE) { - printk(KERN_ERR "too big chunk_size: %d > %d\n", - chunk_size, MAX_CHUNK_SIZE); - return -EINVAL; - } - /* - * chunk-size has to be a power of 2 and multiples of PAGE_SIZE - */ - if ( (1 << ffz(~chunk_size)) != chunk_size) { - printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); - return -EINVAL; - } - if (chunk_size < PAGE_SIZE) { - printk(KERN_ERR "too small chunk_size: %d < %ld\n", - chunk_size, PAGE_SIZE); - return -EINVAL; - } - - /* devices must have minimum size of one chunk */ - rdev_for_each(rdev, tmp, mddev) { - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->size < chunk_size / 1024) { - printk(KERN_WARNING - "md: Dev %s smaller than chunk_size:" - " %lluk < %dk\n", - bdevname(rdev->bdev,b), - (unsigned long long)rdev->size, - chunk_size / 1024); - return -EINVAL; - } - } - } - -#ifdef CONFIG_KMOD if (mddev->level != LEVEL_NONE) request_module("md-level-%d", mddev->level); else if (mddev->clevel[0]) request_module("md-%s", mddev->clevel); -#endif /* * Drop all container device buffers, from now on * the only valid external interface is through the md * device. */ - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -3578,11 +4980,13 @@ static int do_md_run(mddev_t * mddev) /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, - * Internal Bitmap issues has handled elsewhere. + * Internal Bitmap issues have been handled elsewhere. */ - if (rdev->data_offset < rdev->sb_start) { - if (mddev->size && - rdev->data_offset + mddev->size*2 + if (rdev->meta_bdev) { + /* Nothing to check */; + } else if (rdev->data_offset < rdev->sb_start) { + if (mddev->dev_sectors && + rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { printk("md: %s: data overlaps metadata\n", mdname(mddev)); @@ -3596,13 +5000,11 @@ static int do_md_run(mddev_t * mddev) return -EINVAL; } } - sysfs_notify(&rdev->kobj, NULL, "state"); + sysfs_notify_dirent_safe(rdev->sysfs_state); } - md_probe(mddev->unit, NULL, NULL); - disk = mddev->gendisk; - if (!disk) - return -ENOMEM; + if (mddev->bio_set == NULL) + mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -3618,7 +5020,10 @@ static int do_md_run(mddev_t * mddev) } mddev->pers = pers; spin_unlock(&pers_lock); - mddev->level = pers->level; + if (mddev->level != pers->level) { + mddev->level = pers->level; + mddev->new_level = pers->level; + } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && @@ -3634,11 +5039,11 @@ static int do_md_run(mddev_t * mddev) * configuration. */ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - mdk_rdev_t *rdev2; - struct list_head *tmp2; + struct md_rdev *rdev2; int warned = 0; - rdev_for_each(rdev, tmp, mddev) { - rdev_for_each(rdev2, tmp2, mddev) { + + rdev_for_each(rdev, mddev) + rdev_for_each(rdev2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -3652,7 +5057,7 @@ static int do_md_run(mddev_t * mddev) warned = 1; } } - } + if (warned) printk(KERN_WARNING "True protection against single-disk" @@ -3660,17 +5065,29 @@ static int do_md_run(mddev_t * mddev) } mddev->recovery = 0; - mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ - mddev->barriers_work = 1; + /* may be over-ridden by personality */ + mddev->resync_max_sectors = mddev->dev_sectors; + mddev->ok_start_degraded = start_dirty_degraded; - if (start_readonly) + if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ err = mddev->pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->sync_request) { + else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + WARN_ONCE(!mddev->external_size, "%s: default size too small," + " but 'external_size' not in effect?\n", __func__); + printk(KERN_ERR + "md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + err = -EINVAL; + mddev->pers->stop(mddev); + } + if (err == 0 && mddev->pers->sync_request && + (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = bitmap_create(mddev); if (err) { printk(KERN_ERR "%s: failed to create bitmap (%d)\n", @@ -3685,87 +5102,68 @@ static int do_md_run(mddev_t * mddev) return err; } if (mddev->pers->sync_request) { - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_redundancy_group)) printk(KERN_WARNING "md: cannot register extra attributes for %s\n", mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; atomic_set(&mddev->writes_pending,0); + atomic_set(&mddev->max_corr_read_errors, + MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; mddev->safemode_timer.function = md_safemode_timeout; mddev->safemode_timer.data = (unsigned long) mddev; mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - - rdev_for_each(rdev, tmp, mddev) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk("md: cannot register %s for %s\n", - nm, mdname(mddev)); - } + smp_wmb(); + mddev->ready = 1; + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (mddev->flags) + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - set_capacity(disk, mddev->array_sectors); + md_new_event(mddev); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); + sysfs_notify(&mddev->kobj, NULL, "degraded"); + return 0; +} +EXPORT_SYMBOL_GPL(md_run); - /* If we call blk_queue_make_request here, it will - * re-initialise max_sectors etc which may have been - * refined inside -> run. So just set the bits we need to set. - * Most initialisation happended when we called - * blk_queue_make_request(..., md_fail_request) - * earlier. - */ - mddev->queue->queuedata = mddev; - mddev->queue->make_request_fn = mddev->pers->make_request; +static int do_md_run(struct mddev *mddev) +{ + int err; - /* If there is a partially-recovered drive we need to - * start recovery here. If we leave it to md_check_recovery, - * it will remove the drives and not do the right thing - */ - if (mddev->degraded && !mddev->sync_thread) { - struct list_head *rtmp; - int spares = 0; - rdev_for_each(rdev, rtmp, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) - /* complete an interrupted recovery */ - spares++; - if (spares && mddev->pers->sync_request) { - mddev->recovery = 0; - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "%s_resync"); - if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); - /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; - } - } + err = md_run(mddev); + if (err) + goto out; + err = bitmap_load(mddev); + if (err) { + bitmap_destroy(mddev); + goto out; } + md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); mddev->changed = 1; - md_new_event(mddev); - sysfs_notify(&mddev->kobj, NULL, "array_state"); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); - sysfs_notify(&mddev->kobj, NULL, "degraded"); - kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); - return 0; + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); +out: + return err; } -static int restart_array(mddev_t *mddev) +static int restart_array(struct mddev *mddev) { struct gendisk *disk = mddev->gendisk; @@ -3785,182 +5183,252 @@ static int restart_array(mddev_t *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) +static void md_clean(struct mddev *mddev) +{ + mddev->array_sectors = 0; + mddev->external_size = 0; + mddev->dev_sectors = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; + mddev->level = LEVEL_NONE; + mddev->clevel[0] = 0; + mddev->flags = 0; + mddev->ro = 0; + mddev->metadata_type[0] = 0; + mddev->chunk_sectors = 0; + mddev->ctime = mddev->utime = 0; + mddev->layout = 0; + mddev->max_disks = 0; + mddev->events = 0; + mddev->can_decrease_events = 0; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->new_level = LEVEL_NONE; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + mddev->curr_resync = 0; + atomic64_set(&mddev->resync_mismatches, 0); + mddev->suspend_lo = mddev->suspend_hi = 0; + mddev->sync_speed_min = mddev->sync_speed_max = 0; + mddev->recovery = 0; + mddev->in_sync = 0; + mddev->changed = 0; + mddev->degraded = 0; + mddev->safemode = 0; + mddev->merge_check_needed = 0; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.default_space = 0; + mddev->bitmap_info.chunksize = 0; + mddev->bitmap_info.daemon_sleep = 0; + mddev->bitmap_info.max_write_behind = 0; +} + +static void __md_stop_writes(struct mddev *mddev) { - struct inode *inode = file->f_mapping->host; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + } + + del_timer_sync(&mddev->safemode_timer); - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; + bitmap_flush(mddev); + md_super_wait(mddev); + + if (mddev->ro == 0 && + (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev, 1); } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); +} - return 0; +void md_stop_writes(struct mddev *mddev) +{ + mddev_lock_nointr(mddev); + __md_stop_writes(mddev); + mddev_unlock(mddev); } +EXPORT_SYMBOL_GPL(md_stop_writes); -static void restore_bitmap_write_access(struct file *file) +static void __md_stop(struct mddev *mddev) { - struct inode *inode = file->f_mapping->host; + mddev->ready = 0; + mddev->pers->stop(mddev); + if (mddev->pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(mddev->pers->owner); + mddev->pers = NULL; + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +} - spin_lock(&inode->i_lock); - atomic_set(&inode->i_writecount, 1); - spin_unlock(&inode->i_lock); +void md_stop(struct mddev *mddev) +{ + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop(mddev); + bitmap_destroy(mddev); + if (mddev->bio_set) + bioset_free(mddev->bio_set); +} + +EXPORT_SYMBOL_GPL(md_stop); + +static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) +{ + int err = 0; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); + + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > !!bdev || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { + printk("md: %s still in use.\n",mdname(mddev)); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + err = -EBUSY; + goto out; + } + if (mddev->pers) { + __md_stop_writes(mddev); + + err = -ENXIO; + if (mddev->ro==1) + goto out; + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_state); + err = 0; + } +out: + mutex_unlock(&mddev->open_mutex); + return err; } /* mode: * 0 - completely stop and dis-assemble array - * 1 - switch to readonly * 2 - stop but do not disassemble array */ -static int do_md_stop(mddev_t * mddev, int mode, int is_open) +static int do_md_stop(struct mddev * mddev, int mode, + struct block_device *bdev) { - int err = 0; struct gendisk *disk = mddev->gendisk; + struct md_rdev *rdev; + int did_freeze = 0; - if (atomic_read(&mddev->openers) > is_open) { + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); + + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > !!bdev || + mddev->sysfs_active || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); + mutex_unlock(&mddev->open_mutex); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } return -EBUSY; } - if (mddev->pers) { + if (mddev->ro) + set_disk_ro(disk, 0); - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - } - - del_timer_sync(&mddev->safemode_timer); - - switch(mode) { - case 1: /* readonly */ - err = -ENXIO; - if (mddev->ro==1) - goto out; - mddev->ro = 1; - break; - case 0: /* disassemble */ - case 2: /* stop */ - bitmap_flush(mddev); - md_super_wait(mddev); - if (mddev->ro) - set_disk_ro(disk, 0); - blk_queue_make_request(mddev->queue, md_fail_request); - mddev->pers->stop(mddev); - mddev->queue->merge_bvec_fn = NULL; - mddev->queue->unplug_fn = NULL; - mddev->queue->backing_dev_info.congested_fn = NULL; - if (mddev->pers->sync_request) - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + __md_stop_writes(mddev); + __md_stop(mddev); + mddev->queue->merge_bvec_fn = NULL; + mddev->queue->backing_dev_info.congested_fn = NULL; - module_put(mddev->pers->owner); - mddev->pers = NULL; - /* tell userspace to handle 'inactive' */ - sysfs_notify(&mddev->kobj, NULL, "array_state"); + /* tell userspace to handle 'inactive' */ + sysfs_notify_dirent_safe(mddev->sysfs_state); - set_capacity(disk, 0); - mddev->changed = 1; + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + sysfs_unlink_rdev(mddev, rdev); - if (mddev->ro) - mddev->ro = 0; - } - if (!mddev->in_sync || mddev->flags) { - /* mark array as shutdown cleanly */ - mddev->in_sync = 1; - md_update_sb(mddev, 1); - } - if (mode == 1) - set_disk_ro(disk, 1); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } + set_capacity(disk, 0); + mutex_unlock(&mddev->open_mutex); + mddev->changed = 1; + revalidate_disk(disk); + if (mddev->ro) + mddev->ro = 0; + } else + mutex_unlock(&mddev->open_mutex); /* * Free resources if final stop */ if (mode == 0) { - mdk_rdev_t *rdev; - struct list_head *tmp; - printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); - if (mddev->bitmap_file) { - restore_bitmap_write_access(mddev->bitmap_file); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; + if (mddev->bitmap_info.file) { + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; } - mddev->bitmap_offset = 0; - - rdev_for_each(rdev, tmp, mddev) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } - - /* make sure all md_delayed_delete calls have finished */ - flush_scheduled_work(); + mddev->bitmap_info.offset = 0; export_array(mddev); - mddev->array_sectors = 0; - mddev->size = 0; - mddev->raid_disks = 0; - mddev->recovery_cp = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - mddev->reshape_position = MaxSector; - mddev->external = 0; - mddev->persistent = 0; - mddev->level = LEVEL_NONE; - mddev->clevel[0] = 0; - mddev->flags = 0; - mddev->ro = 0; - mddev->metadata_type[0] = 0; - mddev->chunk_size = 0; - mddev->ctime = mddev->utime = 0; - mddev->layout = 0; - mddev->max_disks = 0; - mddev->events = 0; - mddev->delta_disks = 0; - mddev->new_level = LEVEL_NONE; - mddev->new_layout = 0; - mddev->new_chunk = 0; - mddev->curr_resync = 0; - mddev->resync_mismatches = 0; - mddev->suspend_lo = mddev->suspend_hi = 0; - mddev->sync_speed_min = mddev->sync_speed_max = 0; - mddev->recovery = 0; - mddev->in_sync = 0; - mddev->changed = 0; - mddev->degraded = 0; - mddev->barriers_work = 0; - mddev->safemode = 0; - - } else if (mddev->pers) - printk(KERN_INFO "md: %s switched to read-only mode.\n", - mdname(mddev)); - err = 0; + md_clean(mddev); + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); + if (mddev->hold_active == UNTIL_STOP) + mddev->hold_active = 0; + } + blk_integrity_unregister(disk); md_new_event(mddev); - sysfs_notify(&mddev->kobj, NULL, "array_state"); -out: - return err; + sysfs_notify_dirent_safe(mddev->sysfs_state); + return 0; } #ifndef MODULE -static void autorun_array(mddev_t *mddev) +static void autorun_array(struct mddev *mddev) { - mdk_rdev_t *rdev; - struct list_head *tmp; + struct md_rdev *rdev; int err; if (list_empty(&mddev->disks)) @@ -3968,16 +5436,16 @@ static void autorun_array(mddev_t *mddev) printk(KERN_INFO "md: running: "); - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } printk("\n"); - err = do_md_run (mddev); + err = do_md_run(mddev); if (err) { printk(KERN_WARNING "md: do_md_run() returned %d\n", err); - do_md_stop (mddev, 0, 0); + do_md_stop(mddev, 0, NULL); } } @@ -3995,9 +5463,8 @@ static void autorun_array(mddev_t *mddev) */ static void autorun_devices(int part) { - struct list_head *tmp; - mdk_rdev_t *rdev0, *rdev; - mddev_t *mddev; + struct md_rdev *rdev0, *rdev, *tmp; + struct mddev *mddev; char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: autorun ...\n"); @@ -4006,12 +5473,12 @@ static void autorun_devices(int part) dev_t dev; LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, - mdk_rdev_t, same_set); + struct md_rdev, same_set); printk(KERN_INFO "md: considering %s ...\n", bdevname(rdev0->bdev,b)); INIT_LIST_HEAD(&candidates); - rdev_for_each_list(rdev, tmp, pending_raid_disks) + rdev_for_each_list(rdev, tmp, &pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { printk(KERN_INFO "md: adding %s ...\n", bdevname(rdev->bdev,b)); @@ -4057,7 +5524,7 @@ static void autorun_devices(int part) } else { printk(KERN_INFO "md: created %s\n", mdname(mddev)); mddev->persistent = 1; - rdev_for_each_list(rdev, tmp, candidates) { + rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev); @@ -4068,7 +5535,7 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... */ - rdev_for_each_list(rdev, tmp, candidates) { + rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); export_rdev(rdev); } @@ -4092,34 +5559,35 @@ static int get_version(void __user * arg) return 0; } -static int get_array_info(mddev_t * mddev, void __user * arg) +static int get_array_info(struct mddev * mddev, void __user * arg) { mdu_array_info_t info; - int nr,working,active,failed,spare; - mdk_rdev_t *rdev; - struct list_head *tmp; + int nr,working,insync,failed,spare; + struct md_rdev *rdev; - nr=working=active=failed=spare=0; - rdev_for_each(rdev, tmp, mddev) { + nr = working = insync = failed = spare = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; else { working++; if (test_bit(In_sync, &rdev->flags)) - active++; + insync++; else spare++; } } + rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; info.patch_version = MD_PATCHLEVEL_VERSION; info.ctime = mddev->ctime; info.level = mddev->level; - info.size = mddev->size; - if (info.size != mddev->size) /* overflow */ + info.size = mddev->dev_sectors / 2; + if (info.size != mddev->dev_sectors / 2) /* overflow */ info.size = -1; info.nr_disks = nr; info.raid_disks = mddev->raid_disks; @@ -4130,15 +5598,15 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); - if (mddev->bitmap && mddev->bitmap_offset) - info.state = (1<<MD_SB_BITMAP_PRESENT); - info.active_disks = active; + if (mddev->bitmap && mddev->bitmap_info.offset) + info.state |= (1<<MD_SB_BITMAP_PRESENT); + info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; info.spare_disks = spare; info.layout = mddev->layout; - info.chunk_size = mddev->chunk_size; + info.chunk_size = mddev->chunk_sectors << 9; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -4146,22 +5614,19 @@ static int get_array_info(mddev_t * mddev, void __user * arg) return 0; } -static int get_bitmap_file(mddev_t * mddev, void __user * arg) +static int get_bitmap_file(struct mddev * mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ char *ptr, *buf = NULL; int err = -ENOMEM; - if (md_allow_write(mddev)) - file = kmalloc(sizeof(*file), GFP_NOIO); - else - file = kmalloc(sizeof(*file), GFP_KERNEL); + file = kmalloc(sizeof(*file), GFP_NOIO); if (!file) goto out; /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->file) { + if (!mddev->bitmap || !mddev->bitmap->storage.file) { file->pathname[0] = '\0'; goto copy_out; } @@ -4170,7 +5635,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) if (!buf) goto out; - ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); + ptr = d_path(&mddev->bitmap->storage.file->f_path, + buf, sizeof(file->pathname)); if (IS_ERR(ptr)) goto out; @@ -4186,15 +5652,16 @@ out: return err; } -static int get_disk_info(mddev_t * mddev, void __user * arg) +static int get_disk_info(struct mddev * mddev, void __user * arg) { mdu_disk_info_t info; - mdk_rdev_t *rdev; + struct md_rdev *rdev; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - rdev = find_rdev_nr(mddev, info.number); + rcu_read_lock(); + rdev = find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -4213,6 +5680,7 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) info.raid_disk = -1; info.state = (1<<MD_DISK_REMOVED); } + rcu_read_unlock(); if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -4220,10 +5688,10 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) return 0; } -static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) +static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) { char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - mdk_rdev_t *rdev; + struct md_rdev *rdev; dev_t dev = MKDEV(info->major,info->minor); if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) @@ -4240,9 +5708,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) return PTR_ERR(rdev); } if (!list_empty(&mddev->disks)) { - mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, - mdk_rdev_t, same_set); - int err = super_types[mddev->major_version] + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); + err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { printk(KERN_WARNING @@ -4283,21 +5752,33 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) PTR_ERR(rdev)); return PTR_ERR(rdev); } - /* set save_raid_disk if appropriate */ + /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<<MD_DISK_SYNC) && - info->raid_disk < mddev->raid_disks) + info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - else + set_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); + } else rdev->raid_disk = -1; + rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. validate_super(mddev, rdev); - rdev->saved_raid_disk = rdev->raid_disk; + if ((info->state & (1<<MD_DISK_SYNC)) && + rdev->raid_disk != info->raid_disk) { + /* This was a hot-add request, but events doesn't + * match, so reject it. + */ + export_rdev(rdev); + return -EINVAL; + } clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); + else + clear_bit(WriteMostly, &rdev->flags); rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); @@ -4315,12 +5796,14 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (err) export_rdev(rdev); else - sysfs_notify(&rdev->kobj, NULL, "state"); + sysfs_notify_dirent_safe(rdev->sysfs_state); - md_update_sb(mddev, 1); + set_bit(MD_CHANGE_DEVS, &mddev->flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (!err) + md_new_event(mddev); md_wakeup_thread(mddev->thread); return err; } @@ -4336,7 +5819,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (!(info->state & (1<<MD_DISK_FAULTY))) { int err; - rdev = md_import_device (dev, -1, 0); + rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", @@ -4358,10 +5841,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (!mddev->persistent) { printk(KERN_INFO "md: nonpersistent superblock ...\n"); - rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - } else - rdev->sb_start = calc_dev_sboffset(rdev->bdev); - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; + } else + rdev->sb_start = calc_dev_sboffset(rdev); + rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4373,15 +5856,18 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) return 0; } -static int hot_remove_disk(mddev_t * mddev, dev_t dev) +static int hot_remove_disk(struct mddev * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; - mdk_rdev_t *rdev; + struct md_rdev *rdev; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(mddev, rdev); + if (rdev->raid_disk >= 0) goto busy; @@ -4396,11 +5882,11 @@ busy: return -EBUSY; } -static int hot_add_disk(mddev_t * mddev, dev_t dev) +static int hot_add_disk(struct mddev * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; int err; - mdk_rdev_t *rdev; + struct md_rdev *rdev; if (!mddev->pers) return -ENODEV; @@ -4418,7 +5904,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) return -EINVAL; } - rdev = md_import_device (dev, -1, 0); + rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", @@ -4427,11 +5913,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } if (mddev->persistent) - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); else - rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; + rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING @@ -4452,13 +5938,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) * noticed in interrupt contexts ... */ - if (rdev->desc_nr == mddev->max_disks) { - printk(KERN_WARNING "%s: can not hot-add to full array!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unbind_export; - } - rdev->raid_disk = -1; md_update_sb(mddev, 1); @@ -4472,17 +5951,14 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) md_new_event(mddev); return 0; -abort_unbind_export: - unbind_rdev_from_array(rdev); - abort_export: export_rdev(rdev); return err; } -static int set_bitmap_file(mddev_t *mddev, int fd) +static int set_bitmap_file(struct mddev *mddev, int fd) { - int err; + int err = 0; if (mddev->pers) { if (!mddev->pers->quiesce) @@ -4494,32 +5970,47 @@ static int set_bitmap_file(mddev_t *mddev, int fd) if (fd >= 0) { + struct inode *inode; if (mddev->bitmap) return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_file = fget(fd); + mddev->bitmap_info.file = fget(fd); - if (mddev->bitmap_file == NULL) { + if (mddev->bitmap_info.file == NULL) { printk(KERN_ERR "%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } - err = deny_bitmap_write_access(mddev->bitmap_file); - if (err) { + inode = mddev->bitmap_info.file->f_mapping->host; + if (!S_ISREG(inode->i_mode)) { + printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", + mdname(mddev)); + err = -EBADF; + } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { + printk(KERN_ERR "%s: error: bitmap file must open for write\n", + mdname(mddev)); + err = -EBADF; + } else if (atomic_read(&inode->i_writecount) != 1) { printk(KERN_ERR "%s: error: bitmap file is already in use\n", mdname(mddev)); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; + err = -EBUSY; + } + if (err) { + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; return err; } - mddev->bitmap_offset = 0; /* file overrides offset */ + mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { mddev->pers->quiesce(mddev, 1); - if (fd >= 0) + if (fd >= 0) { err = bitmap_create(mddev); + if (!err) + err = bitmap_load(mddev); + } if (fd < 0 || err) { bitmap_destroy(mddev); fd = -1; /* make sure to put the file */ @@ -4527,11 +6018,9 @@ static int set_bitmap_file(mddev_t *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_file) { - restore_bitmap_write_access(mddev->bitmap_file); - fput(mddev->bitmap_file); - } - mddev->bitmap_file = NULL; + if (mddev->bitmap_info.file) + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; } return err; @@ -4550,7 +6039,7 @@ static int set_bitmap_file(mddev_t *mddev, int fd) * The minor and patch _version numbers are also kept incase the * super_block handler wishes to interpret them. */ -static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) { if (info->raid_disks == 0) { @@ -4568,6 +6057,10 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->minor_version = info->minor_version; mddev->patch_version = info->patch_version; mddev->persistent = !info->not_persistent; + /* ensure mddev_put doesn't delete this now that there + * is some minimal configuration. + */ + mddev->ctime = get_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; @@ -4577,7 +6070,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->level = info->level; mddev->clevel[0] = 0; - mddev->size = info->size; + mddev->dev_sectors = 2 * (sector_t)info->size; mddev->raid_disks = info->raid_disks; /* don't set md_minor, it is determined by which /dev/md* was * openned @@ -4590,7 +6083,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; - mddev->chunk_size = info->chunk_size; + mddev->chunk_sectors = info->chunk_size >> 9; mddev->max_disks = MD_SB_DISKS; @@ -4598,8 +6091,9 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->flags = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); - mddev->default_bitmap_offset = MD_SB_BYTES >> 9; - mddev->bitmap_offset = 0; + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); + mddev->bitmap_info.offset = 0; mddev->reshape_position = MaxSector; @@ -4609,18 +6103,29 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) get_random_bytes(mddev->uuid, 16); mddev->new_level = mddev->level; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; return 0; } -static int update_size(mddev_t *mddev, sector_t num_sectors) +void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) +{ + WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + + if (mddev->external_size) + return; + + mddev->array_sectors = array_sectors; +} +EXPORT_SYMBOL(md_set_array_sectors); + +static int update_size(struct mddev *mddev, sector_t num_sectors) { - mdk_rdev_t * rdev; + struct md_rdev *rdev; int rv; - struct list_head *tmp; int fit = (num_sectors == 0); if (mddev->pers->resize == NULL) @@ -4633,18 +6138,14 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) * sb_start or, if that is <data_offset, it must fit before the size * of each device. If num_sectors is zero, we find the largest size * that fits. - */ if (mddev->sync_thread) return -EBUSY; - if (mddev->bitmap) - /* Sorry, cannot grow a bitmap yet, just remove it, - * grow, and re-add. - */ - return -EBUSY; - rdev_for_each(rdev, tmp, mddev) { - sector_t avail; - avail = rdev->size * 2; + if (mddev->ro) + return -EROFS; + + rdev_for_each(rdev, mddev) { + sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) num_sectors = avail; @@ -4652,35 +6153,46 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) { - struct block_device *bdev; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (!rv) + revalidate_disk(mddev->gendisk); return rv; } -static int update_raid_disks(mddev_t *mddev, int raid_disks) +static int update_raid_disks(struct mddev *mddev, int raid_disks) { int rv; + struct md_rdev *rdev; /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; + if (mddev->ro) + return -EROFS; if (raid_disks <= 0 || - raid_disks >= mddev->max_disks) + (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; + + rdev_for_each(rdev, mddev) { + if (mddev->raid_disks < raid_disks && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (mddev->raid_disks > raid_disks && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } + mddev->delta_disks = raid_disks - mddev->raid_disks; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; + else if (mddev->delta_disks > 0) + mddev->reshape_backwards = 0; rv = mddev->pers->check_reshape(mddev); + if (rv < 0) { + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + } return rv; } @@ -4693,14 +6205,14 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) * Any differences that cannot be handled will cause an error. * Normally, only one change can be managed at a time. */ -static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) +static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) { int rv = 0; int cnt = 0; int state = 0; /* calculate expected state,ignoring low bits */ - if (mddev->bitmap && mddev->bitmap_offset) + if (mddev->bitmap && mddev->bitmap_info.offset) state |= (1 << MD_SB_BITMAP_PRESENT); if (mddev->major_version != info->major_version || @@ -4710,30 +6222,41 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->level != info->level || /* mddev->layout != info->layout || */ !mddev->persistent != info->not_persistent|| - mddev->chunk_size != info->chunk_size || + mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) ) return -EINVAL; /* Check there is only one change */ - if (info->size >= 0 && mddev->size != info->size) cnt++; - if (mddev->raid_disks != info->raid_disks) cnt++; - if (mddev->layout != info->layout) cnt++; - if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; - if (cnt == 0) return 0; - if (cnt > 1) return -EINVAL; + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) + cnt++; + if (mddev->raid_disks != info->raid_disks) + cnt++; + if (mddev->layout != info->layout) + cnt++; + if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) + cnt++; + if (cnt == 0) + return 0; + if (cnt > 1) + return -EINVAL; if (mddev->layout != info->layout) { /* Change layout * we don't need to do anything at the md level, the * personality will take care of it all. */ - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EINVAL; - else - return mddev->pers->reconfig(mddev, info->layout, -1); + else { + mddev->new_layout = info->layout; + rv = mddev->pers->check_reshape(mddev); + if (rv) + mddev->new_layout = mddev->layout; + return rv; + } } - if (info->size >= 0 && mddev->size != info->size) + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); if (mddev->raid_disks != info->raid_disks) @@ -4748,11 +6271,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) /* add the bitmap */ if (mddev->bitmap) return -EEXIST; - if (mddev->default_bitmap_offset == 0) + if (mddev->bitmap_info.default_offset == 0) return -EINVAL; - mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.default_space; mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); if (rv) bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); @@ -4760,31 +6288,37 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) /* remove the bitmap */ if (!mddev->bitmap) return -ENOENT; - if (mddev->bitmap->file) + if (mddev->bitmap->storage.file) return -EINVAL; mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); - mddev->bitmap_offset = 0; + mddev->bitmap_info.offset = 0; } } md_update_sb(mddev, 1); return rv; } -static int set_disk_faulty(mddev_t *mddev, dev_t dev) +static int set_disk_faulty(struct mddev *mddev, dev_t dev) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; + int err = 0; if (mddev->pers == NULL) return -ENODEV; - rdev = find_rdev(mddev, dev); + rcu_read_lock(); + rdev = find_rdev_rcu(mddev, dev); if (!rdev) - return -ENODEV; - - md_error(mddev, rdev); - return 0; + err = -ENODEV; + else { + md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + err = -EBUSY; + } + rcu_read_unlock(); + return err; } /* @@ -4795,59 +6329,140 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) */ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - mddev_t *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = bdev->bd_disk->private_data; geo->heads = 2; geo->sectors = 4; - geo->cylinders = get_capacity(mddev->gendisk) / 8; + geo->cylinders = mddev->array_sectors / 8; return 0; } -static int md_ioctl(struct inode *inode, struct file *file, +static inline bool md_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case BLKROSET: + case GET_ARRAY_INFO: + case GET_BITMAP_FILE: + case GET_DISK_INFO: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case PRINT_RAID_DEBUG: + case RAID_AUTORUN: + case RAID_VERSION: + case RESTART_ARRAY_RW: + case RUN_ARRAY: + case SET_ARRAY_INFO: + case SET_BITMAP_FILE: + case SET_DISK_FAULTY: + case STOP_ARRAY: + case STOP_ARRAY_RO: + return true; + default: + return false; + } +} + +static int md_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { int err = 0; void __user *argp = (void __user *)arg; - mddev_t *mddev = NULL; + struct mddev *mddev = NULL; + int ro; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + if (!md_ioctl_valid(cmd)) + return -ENOTTY; + + switch (cmd) { + case RAID_VERSION: + case GET_ARRAY_INFO: + case GET_DISK_INFO: + break; + default: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } /* * Commands dealing with the RAID driver but not any * particular array: */ - switch (cmd) - { - case RAID_VERSION: - err = get_version(argp); - goto done; + switch (cmd) { + case RAID_VERSION: + err = get_version(argp); + goto done; - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto done; + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; #ifndef MODULE - case RAID_AUTORUN: - err = 0; - autostart_arrays(arg); - goto done; + case RAID_AUTORUN: + err = 0; + autostart_arrays(arg); + goto done; #endif - default:; + default:; } /* * Commands creating/starting a new array: */ - mddev = inode->i_bdev->bd_disk->private_data; + mddev = bdev->bd_disk->private_data; if (!mddev) { BUG(); goto abort; } + /* Some actions do not requires the mutex */ + switch (cmd) { + case GET_ARRAY_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_array_info(mddev, argp); + goto abort; + + case GET_DISK_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_disk_info(mddev, argp); + goto abort; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto abort; + } + + if (cmd == ADD_NEW_DISK) + /* need to ensure md_delayed_delete() has completed */ + flush_workqueue(md_misc_wq); + + if (cmd == HOT_REMOVE_DISK) + /* need to ensure recovery thread has run */ + wait_event_interruptible_timeout(mddev->sb_wait, + !test_bit(MD_RECOVERY_NEEDED, + &mddev->flags), + msecs_to_jiffies(5000)); + if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { + /* Need to flush page cache, and ensure no-one else opens + * and writes + */ + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > 1) { + mutex_unlock(&mddev->open_mutex); + err = -EBUSY; + goto abort; + } + set_bit(MD_STILL_CLOSED, &mddev->flags); + mutex_unlock(&mddev->open_mutex); + sync_blockdev(bdev); + } err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -4856,50 +6471,44 @@ static int md_ioctl(struct inode *inode, struct file *file, goto abort; } - switch (cmd) - { - case SET_ARRAY_INFO: - { - mdu_array_info_t info; - if (!arg) - memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, argp, sizeof(info))) { - err = -EFAULT; - goto abort_unlock; - } - if (mddev->pers) { - err = update_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't update" - " array info. %d\n", err); - goto abort_unlock; - } - goto done_unlock; - } - if (!list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: array %s already has disks!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - if (mddev->raid_disks) { - printk(KERN_WARNING - "md: array %s already initialised!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - err = set_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't set" - " array info. %d\n", err); - goto abort_unlock; - } + if (cmd == SET_ARRAY_INFO) { + mdu_array_info_t info; + if (!arg) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't update" + " array info. %d\n", err); + goto abort_unlock; } goto done_unlock; - - default:; + } + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: array %s already has disks!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->raid_disks) { + printk(KERN_WARNING + "md: array %s already initialised!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + goto done_unlock; } /* @@ -4918,32 +6527,73 @@ static int md_ioctl(struct inode *inode, struct file *file, /* * Commands even a read-only array can execute: */ - switch (cmd) - { - case GET_ARRAY_INFO: - err = get_array_info(mddev, argp); - goto done_unlock; - - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto done_unlock; - - case GET_DISK_INFO: - err = get_disk_info(mddev, argp); + switch (cmd) { + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop(mddev, 0, bdev); + goto done_unlock; + + case STOP_ARRAY_RO: + err = md_set_readonly(mddev, bdev); + goto done_unlock; + + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto done_unlock; + + case ADD_NEW_DISK: + /* We can support ADD_NEW_DISK on read-only arrays + * on if we are re-adding a preexisting device. + * So require mddev->pers and MD_DISK_SYNC. + */ + if (mddev->pers) { + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else if (!(info.state & (1<<MD_DISK_SYNC))) + /* Need to clear read-only for this */ + break; + else + err = add_new_disk(mddev, &info); goto done_unlock; + } + break; - case RESTART_ARRAY_RW: - err = restart_array(mddev); + case BLKROSET: + if (get_user(ro, (int __user *)(arg))) { + err = -EFAULT; goto done_unlock; + } + err = -EINVAL; - case STOP_ARRAY: - err = do_md_stop (mddev, 0, 1); + /* if the bdev is going readonly the value of mddev->ro + * does not matter, no writes are coming + */ + if (ro) goto done_unlock; - case STOP_ARRAY_RO: - err = do_md_stop (mddev, 1, 1); + /* are we are already prepared for writes? */ + if (mddev->ro != 1) goto done_unlock; + /* transitioning to readauto need only happen for + * arrays that call md_write_start + */ + if (mddev->pers) { + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); + } + } + goto done_unlock; } /* @@ -4956,54 +6606,58 @@ static int md_ioctl(struct inode *inode, struct file *file, if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { if (mddev->ro == 2) { mddev->ro = 0; - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + /* mddev_unlock will wake thread */ + /* If a device failed while we were read-only, we + * need to make sure the metadata is updated now. + */ + if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + mddev_unlock(mddev); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_DEVS, &mddev->flags) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + mddev_lock_nointr(mddev); + } } else { err = -EROFS; goto abort_unlock; } } - switch (cmd) + switch (cmd) { + case ADD_NEW_DISK: { - case ADD_NEW_DISK: - { - mdu_disk_info_t info; - if (copy_from_user(&info, argp, sizeof(info))) - err = -EFAULT; - else - err = add_new_disk(mddev, &info); - goto done_unlock; - } - - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; - - case HOT_ADD_DISK: - err = hot_add_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_ADD_DISK: + err = hot_add_disk(mddev, new_decode_dev(arg)); + goto done_unlock; - case RUN_ARRAY: - err = do_md_run (mddev); - goto done_unlock; + case RUN_ARRAY: + err = do_md_run(mddev); + goto done_unlock; - case SET_BITMAP_FILE: - err = set_bitmap_file(mddev, (int)arg); - goto done_unlock; + case SET_BITMAP_FILE: + err = set_bitmap_file(mddev, (int)arg); + goto done_unlock; - default: - err = -EINVAL; - goto abort_unlock; + default: + err = -EINVAL; + goto abort_unlock; } done_unlock: abort_unlock: + if (mddev->hold_active == UNTIL_IOCTL && + err != -EINVAL) + mddev->hold_active = 0; mddev_unlock(mddev); return err; @@ -5013,68 +6667,103 @@ done: abort: return err; } +#ifdef CONFIG_COMPAT +static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* These take in integer arg, do not convert */ + break; + default: + arg = (unsigned long)compat_ptr(arg); + break; + } + + return md_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ -static int md_open(struct inode *inode, struct file *file) +static int md_open(struct block_device *bdev, fmode_t mode) { /* * Succeed if we can lock the mddev, which confirms that * it isn't being stopped right now. */ - mddev_t *mddev = inode->i_bdev->bd_disk->private_data; + struct mddev *mddev = mddev_find(bdev->bd_dev); int err; - if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) + if (!mddev) + return -ENODEV; + + if (mddev->gendisk != bdev->bd_disk) { + /* we are racing with mddev_put which is discarding this + * bd_disk. + */ + mddev_put(mddev); + /* Wait until bdev->bd_disk is definitely gone */ + flush_workqueue(md_misc_wq); + /* Then retry the open from the top */ + return -ERESTARTSYS; + } + BUG_ON(mddev != bdev->bd_disk->private_data); + + if ((err = mutex_lock_interruptible(&mddev->open_mutex))) goto out; err = 0; - mddev_get(mddev); atomic_inc(&mddev->openers); - mddev_unlock(mddev); + clear_bit(MD_STILL_CLOSED, &mddev->flags); + mutex_unlock(&mddev->open_mutex); - check_disk_change(inode->i_bdev); + check_disk_change(bdev); out: return err; } -static int md_release(struct inode *inode, struct file * file) +static void md_release(struct gendisk *disk, fmode_t mode) { - mddev_t *mddev = inode->i_bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; BUG_ON(!mddev); atomic_dec(&mddev->openers); mddev_put(mddev); - - return 0; } static int md_media_changed(struct gendisk *disk) { - mddev_t *mddev = disk->private_data; + struct mddev *mddev = disk->private_data; return mddev->changed; } static int md_revalidate(struct gendisk *disk) { - mddev_t *mddev = disk->private_data; + struct mddev *mddev = disk->private_data; mddev->changed = 0; return 0; } -static struct block_device_operations md_fops = +static const struct block_device_operations md_fops = { .owner = THIS_MODULE, .open = md_open, .release = md_release, .ioctl = md_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = md_compat_ioctl, +#endif .getgeo = md_getgeo, - .media_changed = md_media_changed, + .media_changed = md_media_changed, .revalidate_disk= md_revalidate, }; static int md_thread(void * arg) { - mdk_thread_t *thread = arg; + struct md_thread *thread = arg; /* * md_thread is a 'system-thread', it's priority should be very @@ -5106,28 +6795,28 @@ static int md_thread(void * arg) thread->timeout); clear_bit(THREAD_WAKEUP, &thread->flags); - - thread->run(thread->mddev); + if (!kthread_should_stop()) + thread->run(thread); } return 0; } -void md_wakeup_thread(mdk_thread_t *thread) +void md_wakeup_thread(struct md_thread *thread) { if (thread) { - dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); + pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); set_bit(THREAD_WAKEUP, &thread->flags); wake_up(&thread->wqueue); } } -mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, - const char *name) +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) { - mdk_thread_t *thread; + struct md_thread *thread; - thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); + thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); if (!thread) return NULL; @@ -5136,7 +6825,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, thread->run = run; thread->mddev = mddev; thread->timeout = MAX_SCHEDULE_TIMEOUT; - thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); + thread->tsk = kthread_run(md_thread, thread, + "%s_%s", + mdname(thread->mddev), + name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; @@ -5144,15 +6836,24 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, return thread; } -void md_unregister_thread(mdk_thread_t *thread) +void md_unregister_thread(struct md_thread **threadp) { - dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); + struct md_thread *thread = *threadp; + if (!thread) + return; + pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); + /* Locking ensures that mddev_unlock does not wake_up a + * non-existent thread + */ + spin_lock(&pers_lock); + *threadp = NULL; + spin_unlock(&pers_lock); kthread_stop(thread->tsk); kfree(thread); } -void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +void md_error(struct mddev *mddev, struct md_rdev *rdev) { if (!mddev) { MD_BUG(); @@ -5162,26 +6863,17 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!rdev || test_bit(Faulty, &rdev->flags)) return; - if (mddev->external) - set_bit(Blocked, &rdev->flags); -/* - dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", - mdname(mddev), - MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), - __builtin_return_address(0),__builtin_return_address(1), - __builtin_return_address(2),__builtin_return_address(3)); -*/ - if (!mddev->pers) - return; - if (!mddev->pers->error_handler) + if (!mddev->pers || !mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(StateChanged, &rdev->flags); + sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); md_new_event_inintr(mddev); } @@ -5190,12 +6882,11 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) static void status_unused(struct seq_file *seq) { int i = 0; - mdk_rdev_t *rdev; - struct list_head *tmp; + struct md_rdev *rdev; seq_printf(seq, "unused devices: "); - rdev_for_each_list(rdev, tmp, pending_raid_disks) { + list_for_each_entry(rdev, &pending_raid_disks, same_set) { char b[BDEVNAME_SIZE]; i++; seq_printf(seq, "%s ", @@ -5208,39 +6899,45 @@ static void status_unused(struct seq_file *seq) } -static void status_resync(struct seq_file *seq, mddev_t * mddev) +static void status_resync(struct seq_file *seq, struct mddev * mddev) { - sector_t max_blocks, resync, res; - unsigned long dt, db, rt; + sector_t max_sectors, resync, res; + unsigned long dt, db; + sector_t rt; int scale; unsigned int per_milli; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + if (mddev->curr_resync <= 3) + resync = 0; + else + resync = mddev->curr_resync + - atomic_read(&mddev->recovery_active); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors >> 1; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->size; + max_sectors = mddev->dev_sectors; /* * Should not happen. */ - if (!max_blocks) { + if (!max_sectors) { MD_BUG(); return; } /* Pick 'scale' such that (resync>>scale)*1000 will fit - * in a sector_t, and (max_blocks>>scale) will fit in a + * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. * Thus 'scale' must be at least 10 */ scale = 10; if (sizeof(sector_t) > sizeof(unsigned long)) { - while ( max_blocks/2 > (1ULL<<(scale+32))) + while ( max_sectors/2 > (1ULL<<(scale+32))) scale++; } res = (resync>>scale)*1000; - sector_div(res, (u32)((max_blocks>>scale)+1)); + sector_div(res, (u32)((max_sectors>>scale)+1)); per_milli = res; { @@ -5261,25 +6958,35 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"))), per_milli/10, per_milli % 10, - (unsigned long long) resync, - (unsigned long long) max_blocks); + (unsigned long long) resync/2, + (unsigned long long) max_sectors/2); /* - * We do not want to overflow, so the order of operands and - * the * 100 / 100 trick are important. We do a +1 to be - * safe against division by zero. We only estimate anyway. - * * dt: time from mark until now * db: blocks written from mark until now * rt: remaining time + * + * rt is a sector_t, so could be 32bit or 64bit. + * So we divide before multiply in case it is 32bit and close + * to the limit. + * We scale the divisor (db) by 32 to avoid losing precision + * near the end of resync when the number of remaining sectors + * is close to 'db'. + * We then divide rt by 32 after multiplying by db to compensate. + * The '+1' avoids division by zero if db is very small. */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) - mddev->resync_mark_cnt; - rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; - seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + rt = max_sectors - resync; /* number of remaining sectors */ + sector_div(rt, db/32+1); + rt *= dt; + rt >>= 5; + + seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, + ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); } @@ -5288,7 +6995,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) { struct list_head *tmp; loff_t l = *pos; - mddev_t *mddev; + struct mddev *mddev; if (l >= 0x10000) return NULL; @@ -5299,7 +7006,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) spin_lock(&all_mddevs_lock); list_for_each(tmp,&all_mddevs) if (!l--) { - mddev = list_entry(tmp, mddev_t, all_mddevs); + mddev = list_entry(tmp, struct mddev, all_mddevs); mddev_get(mddev); spin_unlock(&all_mddevs_lock); return mddev; @@ -5313,7 +7020,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct list_head *tmp; - mddev_t *next_mddev, *mddev = v; + struct mddev *next_mddev, *mddev = v; ++*pos; if (v == (void*)2) @@ -5325,7 +7032,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) else tmp = mddev->all_mddevs.next; if (tmp != &all_mddevs) - next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); + next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); else { next_mddev = (void*)2; *pos = 0x10000; @@ -5340,27 +7047,20 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void md_seq_stop(struct seq_file *seq, void *v) { - mddev_t *mddev = v; + struct mddev *mddev = v; if (mddev && v != (void*)1 && v != (void*)2) mddev_put(mddev); } -struct mdstat_info { - int event; -}; - static int md_seq_show(struct seq_file *seq, void *v) { - mddev_t *mddev = v; - sector_t size; - struct list_head *tmp2; - mdk_rdev_t *rdev; - struct mdstat_info *mi = seq->private; - struct bitmap *bitmap; + struct mddev *mddev = v; + sector_t sectors; + struct md_rdev *rdev; if (v == (void*)1) { - struct mdk_personality *pers; + struct md_personality *pers; seq_printf(seq, "Personalities : "); spin_lock(&pers_lock); list_for_each_entry(pers, &pers_list, list) @@ -5368,7 +7068,7 @@ static int md_seq_show(struct seq_file *seq, void *v) spin_unlock(&pers_lock); seq_printf(seq, "\n"); - mi->event = atomic_read(&md_event_count); + seq->poll_event = atomic_read(&md_event_count); return 0; } if (v == (void*)2) { @@ -5390,8 +7090,8 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", mddev->pers->name); } - size = 0; - rdev_for_each(rdev, tmp2, mddev) { + sectors = 0; + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -5400,9 +7100,12 @@ static int md_seq_show(struct seq_file *seq, void *v) if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; - } else if (rdev->raid_disk < 0) + } + if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ - size += rdev->size; + if (test_bit(Replacement, &rdev->flags)) + seq_printf(seq, "(R)"); + sectors += rdev->sectors; } if (!list_empty(&mddev->disks)) { @@ -5412,7 +7115,7 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->array_sectors / 2); else seq_printf(seq, "\n %llu blocks", - (unsigned long long)size); + (unsigned long long)sectors / 2); } if (mddev->persistent) { if (mddev->major_version != 0 || @@ -5428,13 +7131,13 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " super non-persistent"); if (mddev->pers) { - mddev->pers->status (seq, mddev); + mddev->pers->status(seq, mddev); seq_printf(seq, "\n "); if (mddev->pers->sync_request) { if (mddev->curr_resync > 2) { - status_resync (seq, mddev); + status_resync(seq, mddev); seq_printf(seq, "\n "); - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + } else if (mddev->curr_resync >= 1) seq_printf(seq, "\tresync=DELAYED\n "); else if (mddev->recovery_cp < MaxSector) seq_printf(seq, "\tresync=PENDING\n "); @@ -5442,27 +7145,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - if ((bitmap = mddev->bitmap)) { - unsigned long chunk_kb; - unsigned long flags; - spin_lock_irqsave(&bitmap->lock, flags); - chunk_kb = bitmap->chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - bitmap->pages - bitmap->missing_pages, - bitmap->pages, - (bitmap->pages - bitmap->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->file) { - seq_printf(seq, ", file: "); - seq_path(seq, &bitmap->file->f_path, " \t\n"); - } - - seq_printf(seq, "\n"); - spin_unlock_irqrestore(&bitmap->lock, flags); - } + bitmap_status(seq, mddev->bitmap); seq_printf(seq, "\n"); } @@ -5471,7 +7154,7 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations md_seq_ops = { +static const struct seq_operations md_seq_ops = { .start = md_seq_start, .next = md_seq_next, .stop = md_seq_stop, @@ -5480,34 +7163,32 @@ static struct seq_operations md_seq_ops = { static int md_seq_open(struct inode *inode, struct file *file) { + struct seq_file *seq; int error; - struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); - if (mi == NULL) - return -ENOMEM; error = seq_open(file, &md_seq_ops); if (error) - kfree(mi); - else { - struct seq_file *p = file->private_data; - p->private = mi; - mi->event = atomic_read(&md_event_count); - } + return error; + + seq = file->private_data; + seq->poll_event = atomic_read(&md_event_count); return error; } +static int md_unloading; static unsigned int mdstat_poll(struct file *filp, poll_table *wait) { - struct seq_file *m = filp->private_data; - struct mdstat_info *mi = m->private; + struct seq_file *seq = filp->private_data; int mask; + if (md_unloading) + return POLLIN|POLLRDNORM|POLLERR|POLLPRI;; poll_wait(filp, &md_event_waiters, wait); /* always allow read */ mask = POLLIN | POLLRDNORM; - if (mi->event != atomic_read(&md_event_count)) + if (seq->poll_event != atomic_read(&md_event_count)) mask |= POLLERR | POLLPRI; return mask; } @@ -5521,7 +7202,7 @@ static const struct file_operations md_seq_fops = { .poll = mdstat_poll, }; -int register_md_personality(struct mdk_personality *p) +int register_md_personality(struct md_personality *p) { spin_lock(&pers_lock); list_add_tail(&p->list, &pers_list); @@ -5530,7 +7211,7 @@ int register_md_personality(struct mdk_personality *p) return 0; } -int unregister_md_personality(struct mdk_personality *p) +int unregister_md_personality(struct md_personality *p) { printk(KERN_INFO "md: %s personality unregistered\n", p->name); spin_lock(&pers_lock); @@ -5539,19 +7220,19 @@ int unregister_md_personality(struct mdk_personality *p) return 0; } -static int is_mddev_idle(mddev_t *mddev) +static int is_mddev_idle(struct mddev *mddev, int init) { - mdk_rdev_t * rdev; + struct md_rdev * rdev; int idle; - long curr_events; + int curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = disk_stat_read(disk, sectors[0]) + - disk_stat_read(disk, sectors[1]) - - atomic_read(&disk->sync_io); + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. @@ -5574,7 +7255,7 @@ static int is_mddev_idle(mddev_t *mddev) * always make curr_events less than last_events. * */ - if (curr_events - rdev->last_events > 4096) { + if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } @@ -5583,13 +7264,14 @@ static int is_mddev_idle(mddev_t *mddev) return idle; } -void md_done_sync(mddev_t *mddev, int blocks, int ok) +void md_done_sync(struct mddev *mddev, int blocks, int ok) { /* another "blocks" (512byte) blocks have been synced */ atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); if (!ok) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(MD_RECOVERY_ERROR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync .... } @@ -5601,7 +7283,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) * in superblock) before writing, schedule a superblock update * and wait for it to complete. */ -void md_write_start(mddev_t *mddev, struct bio *bi) +void md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) @@ -5624,19 +7306,19 @@ void md_write_start(mddev_t *mddev, struct bio *bi) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); md_wakeup_thread(mddev->thread); did_change = 1; } spin_unlock_irq(&mddev->write_lock); } if (did_change) - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } -void md_write_end(mddev_t *mddev) +void md_write_end(struct mddev *mddev) { if (atomic_dec_and_test(&mddev->writes_pending)) { if (mddev->safemode == 2) @@ -5655,7 +7337,7 @@ void md_write_end(mddev_t *mddev) * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock * is dropped, so return -EAGAIN after notifying userspace. */ -int md_allow_write(mddev_t *mddev) +int md_allow_write(struct mddev *mddev) { if (!mddev->pers) return 0; @@ -5668,16 +7350,17 @@ int md_allow_write(mddev_t *mddev) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; spin_unlock_irq(&mddev->write_lock); md_update_sb(mddev, 0); - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock_irq(&mddev->write_lock); - if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) return -EAGAIN; else return 0; @@ -5686,40 +7369,49 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -void md_do_sync(mddev_t *mddev) +#define UPDATE_FREQUENCY (5*60*HZ) +void md_do_sync(struct md_thread *thread) { - mddev_t *mddev2; + struct mddev *mddev = thread->mddev; + struct mddev *mddev2; unsigned int currspeed = 0, window; sector_t max_sectors,j, io_sectors; unsigned long mark[SYNC_MARKS]; + unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; struct list_head *tmp; sector_t last_check; int skipped = 0; - struct list_head *rtmp; - mdk_rdev_t *rdev; - char *desc; + struct md_rdev *rdev; + char *desc, *action = NULL; + struct blk_plug plug; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (mddev->ro) /* never try to sync a read-only array */ + if (mddev->ro) {/* never try to sync a read-only array */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); return; + } if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { desc = "data-check"; - else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + action = "check"; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { desc = "requested-resync"; - else + action = "repair"; + } else desc = "resync"; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) desc = "reshape"; else desc = "recovery"; + mddev->last_sync_action = action ?: desc; + /* we overload curr_resync somewhat here. * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync @@ -5740,10 +7432,8 @@ void md_do_sync(mddev_t *mddev) mddev->curr_resync = 2; try_again: - if (kthread_should_stop()) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; - } for_each_mddev(mddev2, tmp) { if (mddev2 == mddev) continue; @@ -5766,7 +7456,7 @@ void md_do_sync(mddev_t *mddev) * be caught by 'softlockup' */ prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { printk(KERN_INFO "md: delaying %s of %s" " until %s has finished (they" @@ -5790,7 +7480,7 @@ void md_do_sync(mddev_t *mddev) * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; @@ -5798,17 +7488,32 @@ void md_do_sync(mddev_t *mddev) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->size << 1; + max_sectors = mddev->resync_max_sectors; else { /* recovery follows the physical size of devices */ - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; j = MaxSector; - rdev_for_each(rdev, rtmp, mddev) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) j = rdev->recovery_offset; + rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. + */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } } printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); @@ -5818,7 +7523,7 @@ void md_do_sync(mddev_t *mddev) "(but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); - is_mddev_idle(mddev); /* this also initializes IO event counters */ + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { @@ -5833,36 +7538,71 @@ void md_do_sync(mddev_t *mddev) * Tune reconstruction: */ window = 32*(PAGE_SIZE/512); - printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", - window/2,(unsigned long long) max_sectors/2); + printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", + window/2, (unsigned long long)max_sectors/2); atomic_set(&mddev->recovery_active, 0); last_check = 0; if (j>2) { - printk(KERN_INFO + printk(KERN_INFO "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; - } + } else + mddev->curr_resync = 3; /* no longer delayed */ + mddev->curr_resync_completed = j; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + md_new_event(mddev); + update_time = jiffies; + blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; skipped = 0; - if (j >= mddev->resync_max) { - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + ((mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) || + time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || + (j - mddev->curr_resync_completed)*2 + >= mddev->resync_max - mddev->curr_resync_completed + )) { + /* time to update curr_resync_completed */ wait_event(mddev->recovery_wait, - mddev->resync_max > j - || kthread_should_stop()); + atomic_read(&mddev->recovery_active) == 0); + mddev->curr_resync_completed = j; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + j > mddev->recovery_cp) + mddev->recovery_cp = j; + update_time = jiffies; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + } + + while (j >= mddev->resync_max && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + /* As this condition is controlled by user-space, + * we can block indefinitely, so use '_interruptible' + * to avoid triggering warnings. + */ + flush_signals(current); /* just in case */ + wait_event_interruptible(mddev->recovery_wait, + mddev->resync_max > j + || test_bit(MD_RECOVERY_INTR, + &mddev->recovery)); } - if (kthread_should_stop()) - goto interrupted; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; + break; } if (!skipped) { /* actual IO requested */ @@ -5870,11 +7610,15 @@ void md_do_sync(mddev_t *mddev) atomic_add(sectors, &mddev->recovery_active); } + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + j += sectors; - if (j>1) mddev->curr_resync = j; + if (j > 2) + mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) - /* this is the earliers that rebuilt will be + /* this is the earliest that rebuild will be * visible in /proc/mdstat */ md_new_event(mddev); @@ -5883,10 +7627,6 @@ void md_do_sync(mddev_t *mddev) continue; last_check = io_sectors; - - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - break; - repeat: if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ @@ -5899,10 +7639,8 @@ void md_do_sync(mddev_t *mddev) last_mark = next; } - - if (kthread_should_stop()) - goto interrupted; - + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; /* * this loop exits only if either when we are slower than @@ -5912,7 +7650,6 @@ void md_do_sync(mddev_t *mddev) * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ - blk_unplug(mddev->queue); cond_resched(); currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 @@ -5920,19 +7657,19 @@ void md_do_sync(mddev_t *mddev) if (currspeed > speed_min(mddev)) { if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev)) { + !is_mddev_idle(mddev, 0)) { msleep(500); goto repeat; } } } - printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); + printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, + test_bit(MD_RECOVERY_INTR, &mddev->recovery) + ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ - out: - blk_unplug(mddev->queue); - + blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ @@ -5946,95 +7683,106 @@ void md_do_sync(mddev_t *mddev) printk(KERN_INFO "md: checkpointing %s of %s.\n", desc, mdname(mddev)); - mddev->recovery_cp = mddev->curr_resync; + if (test_bit(MD_RECOVERY_ERROR, + &mddev->recovery)) + mddev->recovery_cp = + mddev->curr_resync_completed; + else + mddev->recovery_cp = + mddev->curr_resync; } } else mddev->recovery_cp = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rdev_for_each(rdev, rtmp, mddev) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); } } + skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); - skip: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + /* We completed so min/max setting can be forgotten if used. */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); return; - - interrupted: - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } EXPORT_SYMBOL_GPL(md_do_sync); - -static int remove_and_add_spares(mddev_t *mddev) +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) { - mdk_rdev_t *rdev; - struct list_head *rtmp; + struct md_rdev *rdev; int spares = 0; + int removed = 0; - rdev_for_each(rdev, rtmp, mddev) - if (rdev->raid_disk >= 0 && + rdev_for_each(rdev, mddev) + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( - mddev, rdev->raid_disk)==0) { - char nm[20]; - sprintf(nm,"rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + mddev, rdev) == 0) { + sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; + removed++; } } + if (removed && mddev->kobj.sd) + sysfs_notify(&mddev->kobj, NULL, "degraded"); - if (mddev->degraded && ! mddev->ro) { - rdev_for_each(rdev, rtmp, mddev) { - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Blocked, &rdev->flags)) - spares++; - if (rdev->raid_disk < 0 - && !test_bit(Faulty, &rdev->flags)) { - rdev->recovery_offset = 0; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(mddev)); - spares++; - md_new_event(mddev); - } else - break; - } + if (this) + goto no_add; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk >= 0) + continue; + if (test_bit(Faulty, &rdev->flags)) + continue; + if (mddev->ro && + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; + + if (rdev->saved_raid_disk < 0) + rdev->recovery_offset = 0; + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; + spares++; + md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); } } +no_add: + if (removed) + set_bit(MD_CHANGE_DEVS, &mddev->flags); return spares; } + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -6057,20 +7805,13 @@ static int remove_and_add_spares(mddev_t *mddev) * 5/ If array is degraded, try to add spares devices * 6/ If array has spares or is not in-sync, start a resync thread. */ -void md_check_recovery(mddev_t *mddev) +void md_check_recovery(struct mddev *mddev) { - mdk_rdev_t *rdev; - struct list_head *rtmp; - + if (mddev->suspended) + return; if (mddev->bitmap) - bitmap_daemon_work(mddev->bitmap); - - if (test_and_clear_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags)) - sysfs_notify(&mddev->kobj, NULL, "array_state"); - - if (mddev->ro) - return; + bitmap_daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { @@ -6084,7 +7825,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags && !mddev->external) || + (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || @@ -6097,10 +7838,19 @@ void md_check_recovery(mddev_t *mddev) int spares = 0; if (mddev->ro) { - /* Only thing we do on a ro array is remove - * failed devices. + /* On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself + * is in-sync. + * As we only add devices that are already in-sync, + * we can activate the spares immediately. + */ + remove_and_add_spares(mddev, NULL); + /* There is no thread, but we need to call + * ->spare_active and clear saved_raid_disk */ - remove_and_add_spares(mddev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } @@ -6114,24 +7864,18 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; did_change = 1; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; spin_unlock_irq(&mddev->write_lock); if (did_change) - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags) + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - rdev_for_each(rdev, rtmp, mddev) - if (test_and_clear_bit(StateChanged, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, "state"); - - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { /* resync/recovery still happening */ @@ -6139,60 +7883,38 @@ void md_check_recovery(mddev_t *mddev) goto unlock; } if (mddev->sync_thread) { - /* resync has finished, collect result */ - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - } - md_update_sb(mddev, 1); - - /* if array is no-longer degraded, then any saved_raid_disk - * information must be scrapped - */ - if (!mddev->degraded) - rdev_for_each(rdev, rtmp, mddev) - rdev->saved_raid_disk = -1; - - mddev->recovery = 0; - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); - md_new_event(mddev); + md_reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ + mddev->curr_resync_completed = 0; set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); /* Clear some bits that don't mean anything, but * might be left set */ clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) goto unlock; /* no recovery is running. * remove any failed drives, then * add spares if possible. - * Spare are also removed and re-added, to allow + * Spares are also removed and re-added, to allow * the personality to fail the re-add. */ if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape(mddev) != 0) + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) /* Cannot proceed */ goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev))) { + } else if ((spares = remove_and_add_spares(mddev, NULL))) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -6205,7 +7927,7 @@ void md_check_recovery(mddev_t *mddev) goto unlock; if (mddev->pers->sync_request) { - if (spares && mddev->bitmap && ! mddev->bitmap->file) { + if (spares) { /* We are adding a device or devices to an array * which has the bitmap stored on all devices. * So make sure all bitmap pages get written @@ -6214,66 +7936,627 @@ void md_check_recovery(mddev_t *mddev) } mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "%s_resync"); + "resync"); if (!mddev->sync_thread) { printk(KERN_ERR "%s: could not start resync" " thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); } else md_wakeup_thread(mddev->sync_thread); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(mddev); } unlock: + wake_up(&mddev->sb_wait); + if (!mddev->sync_thread) { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + if (mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); } mddev_unlock(mddev); } } -void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) +void md_reap_sync_thread(struct mddev *mddev) { - sysfs_notify(&rdev->kobj, NULL, "state"); + struct md_rdev *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(&mddev->sync_thread); + wake_up(&resync_wait); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) { + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + } + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + + /* If array is no-longer degraded, then any saved_raid_disk + * information must be scrapped. + */ + if (!mddev->degraded) + rdev_for_each(rdev, mddev) + rdev->saved_raid_disk = -1; + + md_update_sb(mddev, 1); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); +} + +void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags), + !test_bit(Blocked, &rdev->flags) && + !test_bit(BlockedBadBlocks, &rdev->flags), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } EXPORT_SYMBOL(md_wait_for_blocked_rdev); -static int md_notify_reboot(struct notifier_block *this, - unsigned long code, void *x) +void md_finish_reshape(struct mddev *mddev) { - struct list_head *tmp; - mddev_t *mddev; + /* called be personality module when reshape completes. */ + struct md_rdev *rdev; - if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { + rdev_for_each(rdev, mddev) { + if (rdev->data_offset > rdev->new_data_offset) + rdev->sectors += rdev->data_offset - rdev->new_data_offset; + else + rdev->sectors -= rdev->new_data_offset - rdev->data_offset; + rdev->data_offset = rdev->new_data_offset; + } +} +EXPORT_SYMBOL(md_finish_reshape); - printk(KERN_INFO "md: stopping all md devices.\n"); +/* Bad block management. + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so md_is_badblock + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * We return + * 0 if there are no known bad blocks in the range + * 1 if there are known bad block which are all acknowledged + * -1 if there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + /* This could still be the one, earlier ranges + * could not. */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } - for_each_mddev(mddev, tmp) - if (mddev_trylock(mddev)) { - /* Force a switch to readonly even array - * appears to still be in use. Hence - * the '100'. + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(md_is_badblock); + +/* + * Add a range of bad blocks to the table. + * This might extend the table, or might contract it + * if two adjacent ranges can be merged. + * We binary-search to find the 'insertion' point, then + * decide how best to handle it. + */ +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 1; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal */ - do_md_stop (mddev, 1, 100); - mddev_unlock(mddev); + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; } - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' */ + if (bb->count >= MD_MAX_BADBLOCKS) { + /* No room for more */ + rv = 0; + break; + } else { + int this_sectors = sectors; + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} + +int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) +{ + int rv; + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + rv = md_set_badblocks(&rdev->badblocks, + s, sectors, 0); + if (rv) { + /* Make sure they get written out promptly */ + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); + md_wakeup_thread(rdev->mddev->thread); + } + return rv; +} +EXPORT_SYMBOL_GPL(rdev_set_badblocks); + +/* + * Remove a range of bad blocks from the table. + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + */ +static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. */ - mdelay(1000*1); + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + lo = mid; + else + hi = mid; } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MD_MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} + +int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) +{ + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + return md_clear_badblocks(&rdev->badblocks, + s, sectors); +} +EXPORT_SYMBOL_GPL(rdev_clear_badblocks); + +/* + * Acknowledge all bad blocks in a list. + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void md_ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(md_ack_all_badblocks); + +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} + +#define DO_DEBUG 1 + +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) +{ + unsigned long long sector; + int length; + char newline; +#ifdef DO_DEBUG + /* Allow clearing via sysfs *only* for testing/debugging. + * Normally only a successful write may clear a badblock + */ + int clear = 0; + if (page[0] == '-') { + clear = 1; + page++; + } +#endif /* DO_DEBUG */ + + switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + +#ifdef DO_DEBUG + if (clear) { + md_clear_badblocks(bb, sector, length); + return len; + } +#endif /* DO_DEBUG */ + if (md_set_badblocks(bb, sector, length, !unack)) + return len; + else + return -ENOSPC; +} + +static int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct list_head *tmp; + struct mddev *mddev; + int need_delay = 0; + + for_each_mddev(mddev, tmp) { + if (mddev_trylock(mddev)) { + if (mddev->pers) + __md_stop_writes(mddev); + if (mddev->persistent) + mddev->safemode = 2; + mddev_unlock(mddev); + } + need_delay = 1; + } + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + if (need_delay) + mdelay(1000*1); + return NOTIFY_DONE; } @@ -6285,20 +8568,31 @@ static struct notifier_block md_notifier = { static void md_geninit(void) { - dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); } static int __init md_init(void) { - if (register_blkdev(MAJOR_NR, "md")) - return -1; - if ((mdp_major=register_blkdev(0, "mdp"))<=0) { - unregister_blkdev(MAJOR_NR, "md"); - return -1; - } - blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, + int ret = -ENOMEM; + + md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); + if (!md_wq) + goto err_wq; + + md_misc_wq = alloc_workqueue("md_misc", 0, 0); + if (!md_misc_wq) + goto err_misc_wq; + + if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) + goto err_md; + + if ((ret = register_blkdev(0, "mdp")) < 0) + goto err_mdp; + mdp_major = ret; + + blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); @@ -6307,9 +8601,17 @@ static int __init md_init(void) raid_table_header = register_sysctl_table(raid_root_table); md_geninit(); - return (0); -} + return 0; +err_mdp: + unregister_blkdev(MD_MAJOR, "md"); +err_md: + destroy_workqueue(md_misc_wq); +err_misc_wq: + destroy_workqueue(md_wq); +err_wq: + return ret; +} #ifndef MODULE @@ -6341,7 +8643,7 @@ void md_autodetect_dev(dev_t dev) static void autostart_arrays(int part) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; struct detected_devices_node *node_detected_dev; dev_t dev; int i_scanned, i_passed; @@ -6381,27 +8683,36 @@ static void autostart_arrays(int part) static __exit void md_exit(void) { - mddev_t *mddev; + struct mddev *mddev; struct list_head *tmp; + int delay = 1; - blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); + blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); - unregister_blkdev(MAJOR_NR,"md"); + unregister_blkdev(MD_MAJOR,"md"); unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); + + /* We cannot unload the modules while some process is + * waiting for us in select() or poll() - wake them up + */ + md_unloading = 1; + while (waitqueue_active(&md_event_waiters)) { + /* not safe to leave yet */ + wake_up(&md_event_waiters); + msleep(delay); + delay += delay; + } remove_proc_entry("mdstat", NULL); + for_each_mddev(mddev, tmp) { - struct gendisk *disk = mddev->gendisk; - if (!disk) - continue; export_array(mddev); - del_gendisk(disk); - put_disk(disk); - mddev->gendisk = NULL; - mddev_put(mddev); + mddev->hold_active = 0; } + destroy_workqueue(md_misc_wq); + destroy_workqueue(md_wq); } subsys_initcall(md_init); @@ -6425,6 +8736,7 @@ static int set_ro(const char *val, struct kernel_param *kp) module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); +module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); EXPORT_SYMBOL(register_md_personality); EXPORT_SYMBOL(unregister_md_personality); @@ -6436,6 +8748,8 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_check_recovery); +EXPORT_SYMBOL(md_reap_sync_thread); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); |
