diff options
Diffstat (limited to 'drivers/md/md.c')
| -rw-r--r-- | drivers/md/md.c | 3250 |
1 files changed, 2240 insertions, 1010 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 818313e277e..32fc19c540d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -36,14 +36,14 @@ #include <linux/blkdev.h> #include <linux/sysctl.h> #include <linux/seq_file.h> -#include <linux/mutex.h> -#include <linux/buffer_head.h> /* for invalidate_bdev */ +#include <linux/fs.h> #include <linux/poll.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/hdreg.h> #include <linux/proc_fs.h> #include <linux/random.h> +#include <linux/module.h> #include <linux/reboot.h> #include <linux/file.h> #include <linux/compat.h> @@ -54,13 +54,15 @@ #include "md.h" #include "bitmap.h" -#define DEBUG 0 -#define dprintk(x...) ((void)(DEBUG && printk(x))) - #ifndef MODULE static void autostart_arrays(int part); #endif +/* pers_list is a list of registered personalities protected + * by pers_lock. + * pers_lock does extra service to protect accesses to + * mddev->thread when the mutex cannot be held. + */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); @@ -70,6 +72,9 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this); + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* @@ -93,13 +98,13 @@ static struct workqueue_struct *md_misc_wq; static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; -static inline int speed_min(mddev_t *mddev) +static inline int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } -static inline int speed_max(mddev_t *mddev) +static inline int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; @@ -107,7 +112,7 @@ static inline int speed_max(mddev_t *mddev) static struct ctl_table_header *raid_table_header; -static ctl_table raid_table[] = { +static struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, @@ -125,7 +130,7 @@ static ctl_table raid_table[] = { { } }; -static ctl_table raid_dir_table[] = { +static struct ctl_table raid_dir_table[] = { { .procname = "raid", .maxlen = 0, @@ -135,7 +140,7 @@ static ctl_table raid_dir_table[] = { { } }; -static ctl_table raid_root_table[] = { +static struct ctl_table raid_root_table[] = { { .procname = "dev", .maxlen = 0, @@ -153,65 +158,28 @@ static int start_readonly; * like bio_clone, but with a local bio set */ -static void mddev_bio_destructor(struct bio *bio) -{ - mddev_t *mddev, **mddevp; - - mddevp = (void*)bio; - mddev = mddevp[-1]; - - bio_free(bio, mddev->bio_set); -} - struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, - mddev_t *mddev) + struct mddev *mddev) { struct bio *b; - mddev_t **mddevp; if (!mddev || !mddev->bio_set) return bio_alloc(gfp_mask, nr_iovecs); - b = bio_alloc_bioset(gfp_mask, nr_iovecs, - mddev->bio_set); + b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); if (!b) return NULL; - mddevp = (void*)b; - mddevp[-1] = mddev; - b->bi_destructor = mddev_bio_destructor; return b; } EXPORT_SYMBOL_GPL(bio_alloc_mddev); struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - mddev_t *mddev) + struct mddev *mddev) { - struct bio *b; - mddev_t **mddevp; - if (!mddev || !mddev->bio_set) return bio_clone(bio, gfp_mask); - b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, - mddev->bio_set); - if (!b) - return NULL; - mddevp = (void*)b; - mddevp[-1] = mddev; - b->bi_destructor = mddev_bio_destructor; - __bio_clone(b, bio); - if (bio_integrity(bio)) { - int ret; - - ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); - - if (ret < 0) { - bio_put(b); - return NULL; - } - } - - return b; + return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); } EXPORT_SYMBOL_GPL(bio_clone_mddev); @@ -227,7 +195,7 @@ EXPORT_SYMBOL_GPL(bio_clone_mddev); */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; -void md_new_event(mddev_t *mddev) +void md_new_event(struct mddev *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); @@ -237,7 +205,7 @@ EXPORT_SYMBOL_GPL(md_new_event); /* Alternate version that can be called from interrupts * when calling sysfs_notify isn't needed. */ -static void md_new_event_inintr(mddev_t *mddev) +static void md_new_event_inintr(struct mddev *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); @@ -258,19 +226,19 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * Any code which breaks out of this loop while own * a reference to the current mddev and must mddev_put it. */ -#define for_each_mddev(mddev,tmp) \ +#define for_each_mddev(_mddev,_tmp) \ \ for (({ spin_lock(&all_mddevs_lock); \ - tmp = all_mddevs.next; \ - mddev = NULL;}); \ - ({ if (tmp != &all_mddevs) \ - mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ + _tmp = all_mddevs.next; \ + _mddev = NULL;}); \ + ({ if (_tmp != &all_mddevs) \ + mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ spin_unlock(&all_mddevs_lock); \ - if (mddev) mddev_put(mddev); \ - mddev = list_entry(tmp, mddev_t, all_mddevs); \ - tmp != &all_mddevs;}); \ + if (_mddev) mddev_put(_mddev); \ + _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ + _tmp != &all_mddevs;}); \ ({ spin_lock(&all_mddevs_lock); \ - tmp = tmp->next;}) \ + _tmp = _tmp->next;}) \ ) @@ -281,18 +249,21 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ -static int md_make_request(struct request_queue *q, struct bio *bio) +static void md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); - mddev_t *mddev = q->queuedata; - int rv; + struct mddev *mddev = q->queuedata; int cpu; unsigned int sectors; if (mddev == NULL || mddev->pers == NULL || !mddev->ready) { bio_io_error(bio); - return 0; + return; + } + if (mddev->ro == 1 && unlikely(rw == WRITE)) { + bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); + return; } smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); @@ -317,7 +288,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio) * go away inside make_request */ sectors = bio_sectors(bio); - rv = mddev->pers->make_request(mddev, bio); + mddev->pers->make_request(mddev, bio); cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); @@ -326,8 +297,6 @@ static int md_make_request(struct request_queue *q, struct bio *bio) if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) wake_up(&mddev->sb_wait); - - return rv; } /* mddev_suspend makes sure no new requests are submitted @@ -336,25 +305,31 @@ static int md_make_request(struct request_queue *q, struct bio *bio) * Once ->stop is called and completes, the module will be completely * unused. */ -void mddev_suspend(mddev_t *mddev) +void mddev_suspend(struct mddev *mddev) { BUG_ON(mddev->suspended); mddev->suspended = 1; synchronize_rcu(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); + + del_timer_sync(&mddev->safemode_timer); } EXPORT_SYMBOL_GPL(mddev_suspend); -void mddev_resume(mddev_t *mddev) +void mddev_resume(struct mddev *mddev) { mddev->suspended = 0; wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 0); + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ } EXPORT_SYMBOL_GPL(mddev_resume); -int mddev_congested(mddev_t *mddev, int bits) +int mddev_congested(struct mddev *mddev, int bits) { return mddev->suspended; } @@ -366,8 +341,8 @@ EXPORT_SYMBOL(mddev_congested); static void md_end_flush(struct bio *bio, int err) { - mdk_rdev_t *rdev = bio->bi_private; - mddev_t *mddev = rdev->mddev; + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; rdev_dec_pending(rdev, mddev); @@ -382,13 +357,13 @@ static void md_submit_flush_data(struct work_struct *ws); static void submit_flushes(struct work_struct *ws) { - mddev_t *mddev = container_of(ws, mddev_t, flush_work); - mdk_rdev_t *rdev; + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct md_rdev *rdev; INIT_WORK(&mddev->flush_work, md_submit_flush_data); atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* Take two references, one is dropped @@ -399,7 +374,7 @@ static void submit_flushes(struct work_struct *ws) atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); + bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; @@ -415,28 +390,27 @@ static void submit_flushes(struct work_struct *ws) static void md_submit_flush_data(struct work_struct *ws) { - mddev_t *mddev = container_of(ws, mddev_t, flush_work); + struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; - if (bio->bi_size == 0) + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio, 0); else { bio->bi_rw &= ~REQ_FLUSH; - if (mddev->pers->make_request(mddev, bio)) - generic_make_request(bio); + mddev->pers->make_request(mddev, bio); } mddev->flush_bio = NULL; wake_up(&mddev->sb_wait); } -void md_flush_request(mddev_t *mddev, struct bio *bio) +void md_flush_request(struct mddev *mddev, struct bio *bio) { spin_lock_irq(&mddev->write_lock); wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio, - mddev->write_lock, /*nothing*/); + mddev->write_lock); mddev->flush_bio = bio; spin_unlock_irq(&mddev->write_lock); @@ -445,52 +419,15 @@ void md_flush_request(mddev_t *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -/* Support for plugging. - * This mirrors the plugging support in request_queue, but does not - * require having a whole queue - */ -static void plugger_work(struct work_struct *work) -{ - struct plug_handle *plug = - container_of(work, struct plug_handle, unplug_work); - plug->unplug_fn(plug); -} -static void plugger_timeout(unsigned long data) -{ - struct plug_handle *plug = (void *)data; - kblockd_schedule_work(NULL, &plug->unplug_work); -} -void plugger_init(struct plug_handle *plug, - void (*unplug_fn)(struct plug_handle *)) -{ - plug->unplug_flag = 0; - plug->unplug_fn = unplug_fn; - init_timer(&plug->unplug_timer); - plug->unplug_timer.function = plugger_timeout; - plug->unplug_timer.data = (unsigned long)plug; - INIT_WORK(&plug->unplug_work, plugger_work); -} -EXPORT_SYMBOL_GPL(plugger_init); - -void plugger_set_plug(struct plug_handle *plug) +void md_unplug(struct blk_plug_cb *cb, bool from_schedule) { - if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag)) - mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1); + struct mddev *mddev = cb->data; + md_wakeup_thread(mddev->thread); + kfree(cb); } -EXPORT_SYMBOL_GPL(plugger_set_plug); +EXPORT_SYMBOL(md_unplug); -int plugger_remove_plug(struct plug_handle *plug) -{ - if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) { - del_timer(&plug->unplug_timer); - return 1; - } else - return 0; -} -EXPORT_SYMBOL_GPL(plugger_remove_plug); - - -static inline mddev_t *mddev_get(mddev_t *mddev) +static inline struct mddev *mddev_get(struct mddev *mddev) { atomic_inc(&mddev->active); return mddev; @@ -498,7 +435,7 @@ static inline mddev_t *mddev_get(mddev_t *mddev) static void mddev_delayed_delete(struct work_struct *ws); -static void mddev_put(mddev_t *mddev) +static void mddev_put(struct mddev *mddev) { struct bio_set *bs = NULL; @@ -508,7 +445,7 @@ static void mddev_put(mddev_t *mddev) mddev->ctime == 0 && !mddev->hold_active) { /* Array is not configured at all, and not held active, * so destroy it */ - list_del(&mddev->all_mddevs); + list_del_init(&mddev->all_mddevs); bs = mddev->bio_set; mddev->bio_set = NULL; if (mddev->gendisk) { @@ -527,7 +464,7 @@ static void mddev_put(mddev_t *mddev) bioset_free(bs); } -void mddev_init(mddev_t *mddev) +void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); @@ -543,15 +480,17 @@ void mddev_init(mddev_t *mddev) init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; + mddev->reshape_backwards = 0; + mddev->last_sync_action = "none"; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; } EXPORT_SYMBOL_GPL(mddev_init); -static mddev_t * mddev_find(dev_t unit) +static struct mddev * mddev_find(dev_t unit) { - mddev_t *mddev, *new = NULL; + struct mddev *mddev, *new = NULL; if (unit && MAJOR(unit) != MD_MAJOR) unit &= ~((1<<MdpMinorShift)-1); @@ -623,24 +562,32 @@ static mddev_t * mddev_find(dev_t unit) goto retry; } -static inline int mddev_lock(mddev_t * mddev) +static inline int __must_check mddev_lock(struct mddev * mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); } -static inline int mddev_is_locked(mddev_t *mddev) +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev * mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + +static inline int mddev_is_locked(struct mddev *mddev) { return mutex_is_locked(&mddev->reconfig_mutex); } -static inline int mddev_trylock(mddev_t * mddev) +static inline int mddev_trylock(struct mddev * mddev) { return mutex_trylock(&mddev->reconfig_mutex); } static struct attribute_group md_redundancy_group; -static void mddev_unlock(mddev_t * mddev) +static void mddev_unlock(struct mddev * mddev) { if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as @@ -675,34 +622,61 @@ static void mddev_unlock(mddev_t * mddev) } else mutex_unlock(&mddev->reconfig_mutex); + /* As we've dropped the mutex we need a spinlock to + * make sure the thread doesn't disappear + */ + spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + spin_unlock(&pers_lock); } -static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->desc_nr == nr) return rdev; return NULL; } -static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) +static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} - list_for_each_entry(rdev, &mddev->disks, same_set) +static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; return NULL; } -static struct mdk_personality *find_pers(int level, char *clevel) +static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) { - struct mdk_personality *pers; + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + +static struct md_personality *find_pers(int level, char *clevel) +{ + struct md_personality *pers; list_for_each_entry(pers, &pers_list, list) { if (level != LEVEL_NONE && pers->level == level) return pers; @@ -713,13 +687,13 @@ static struct mdk_personality *find_pers(int level, char *clevel) } /* return the offset of the super block in 512byte sectors */ -static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev) +static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) { sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; return MD_NEW_SIZE_SECTORS(num_sectors); } -static int alloc_disk_sb(mdk_rdev_t * rdev) +static int alloc_disk_sb(struct md_rdev * rdev) { if (rdev->sb_page) MD_BUG(); @@ -733,7 +707,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev) return 0; } -static void free_disk_sb(mdk_rdev_t * rdev) +void md_rdev_clear(struct md_rdev *rdev) { if (rdev->sb_page) { put_page(rdev->sb_page); @@ -742,13 +716,19 @@ static void free_disk_sb(mdk_rdev_t * rdev) rdev->sb_start = 0; rdev->sectors = 0; } + if (rdev->bb_page) { + put_page(rdev->bb_page); + rdev->bb_page = NULL; + } + kfree(rdev->badblocks.page); + rdev->badblocks.page = NULL; } - +EXPORT_SYMBOL_GPL(md_rdev_clear); static void super_written(struct bio *bio, int error) { - mdk_rdev_t *rdev = bio->bi_private; - mddev_t *mddev = rdev->mddev; + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { printk("md: super_written gets error=%d, uptodate=%d\n", @@ -762,7 +742,7 @@ static void super_written(struct bio *bio, int error) bio_put(bio); } -void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, +void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page) { /* write first size bytes of page to sector of rdev @@ -774,17 +754,16 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; atomic_inc(&mddev->pending_writes); - submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA, - bio); + submit_bio(WRITE_FLUSH_FUA, bio); } -void md_super_wait(mddev_t *mddev) +void md_super_wait(struct mddev *mddev) { /* wait for all superblock writes that were scheduled to complete */ DEFINE_WAIT(wq); @@ -797,32 +776,24 @@ void md_super_wait(mddev_t *mddev) finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); -} - -int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, +int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); - struct completion event; int ret; - rw |= REQ_SYNC | REQ_UNPLUG; - bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; if (metadata_op) - bio->bi_sector = sector + rdev->sb_start; + bio->bi_iter.bi_sector = sector + rdev->sb_start; + else if (rdev->mddev->reshape_position != MaxSector && + (rdev->mddev->reshape_backwards == + (sector >= rdev->mddev->reshape_position))) + bio->bi_iter.bi_sector = sector + rdev->new_data_offset; else - bio->bi_sector = sector + rdev->data_offset; + bio->bi_iter.bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, &bio->bi_flags); bio_put(bio); @@ -830,7 +801,7 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, } EXPORT_SYMBOL_GPL(sync_page_io); -static int read_disk_sb(mdk_rdev_t * rdev, int size) +static int read_disk_sb(struct md_rdev * rdev, int size) { char b[BDEVNAME_SIZE]; if (!rdev->sb_page) { @@ -937,7 +908,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) * We rely on user-space to write the initial superblock, and support * reading and updating of superblocks. * Interface methods are: - * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) + * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) * loads and validates a superblock on dev. * if refdev != NULL, compare superblocks on both devices * Return: @@ -947,13 +918,13 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) * -EINVAL superblock incompatible or invalid * -othererror e.g. -EIO * - * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) + * int validate_super(struct mddev *mddev, struct md_rdev *dev) * Verify that dev is acceptable into mddev. * The first time, mddev->raid_disks will be 0, and data from * dev should be merged in. Subsequent calls check that dev * is new enough. Return 0 or -EINVAL * - * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) + * void sync_super(struct mddev *mddev, struct md_rdev *dev) * Update the superblock for rdev with data in mddev * This does not write to disc. * @@ -962,12 +933,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) struct super_type { char *name; struct module *owner; - int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, + int (*load_super)(struct md_rdev *rdev, + struct md_rdev *refdev, int minor_version); - int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); - void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); - unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, + int (*validate_super)(struct mddev *mddev, + struct md_rdev *rdev); + void (*sync_super)(struct mddev *mddev, + struct md_rdev *rdev); + unsigned long long (*rdev_size_change)(struct md_rdev *rdev, sector_t num_sectors); + int (*allow_new_offset)(struct md_rdev *rdev, + unsigned long long new_offset); }; /* @@ -978,7 +954,7 @@ struct super_type { * has a bitmap. Otherwise, it returns 0. * */ -int md_check_no_bitmap(mddev_t *mddev) +int md_check_no_bitmap(struct mddev *mddev) { if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; @@ -991,7 +967,7 @@ EXPORT_SYMBOL(md_check_no_bitmap); /* * load_super for 0.90.0 */ -static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; @@ -1011,7 +987,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = -EINVAL; bdevname(rdev->bdev, b); - sb = (mdp_super_t*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { printk(KERN_ERR "md: invalid raid superblock magic on %s\n", @@ -1039,7 +1015,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->new_data_offset = 0; rdev->sb_size = MD_SB_BYTES; + rdev->badblocks.shift = -1; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -1050,7 +1028,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = 1; } else { __u64 ev1, ev2; - mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); + mdp_super_t *refsb = page_address(refdev->sb_page); if (!uuid_equal(refsb, sb)) { printk(KERN_WARNING "md: %s has different UUID to %s\n", b, bdevname(refdev->bdev,b2)); @@ -1070,8 +1048,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = 0; } rdev->sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * (not needed for Linear and RAID0 as metadata doesn't + * record this size) + */ + if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) + rdev->sectors = (2ULL << 32) - 2; - if (rdev->sectors < sb->size * 2 && sb->level > 1) + if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ ret = -EINVAL; @@ -1082,15 +1066,16 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version /* * validate_super for 0.90.0 */ -static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) { mdp_disk_t *desc; - mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + mdp_super_t *sb = page_address(rdev->sb_page); __u64 ev1 = md_event(sb); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@ -1105,10 +1090,14 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; - mddev->dev_sectors = sb->size * 2; + mddev->dev_sectors = ((sector_t)sb->size) * 2; mddev->events = ev1; mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* bitmap can use 60 K after the 4K superblocks */ mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); + mddev->reshape_backwards = 0; if (mddev->minor_version >= 91) { mddev->reshape_position = sb->reshape_position; @@ -1116,6 +1105,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; mddev->new_chunk_sectors = sb->new_chunk >> 9; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; @@ -1142,9 +1133,12 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && - mddev->bitmap_info.file == NULL) + mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.default_space; + } } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling, except @@ -1160,6 +1154,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -1175,6 +1171,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) desc->raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; + rdev->saved_raid_disk = desc->raid_disk; } else if (desc->state & (1<<MD_DISK_ACTIVE)) { /* active but not in sync implies recovery up to * reshape position. We don't know exactly where @@ -1194,10 +1191,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) /* * sync_super for 0.90.0 */ -static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) +static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) { mdp_super_t *sb; - mdk_rdev_t *rdev2; + struct md_rdev *rdev2; int next_spare = mddev->raid_disks; @@ -1216,7 +1213,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) rdev->sb_size = MD_SB_BYTES; - sb = (mdp_super_t*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); @@ -1268,7 +1265,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { mdp_disk_t *d; int desc_nr; int is_active = test_bit(In_sync, &rdev2->flags); @@ -1338,7 +1335,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) * rdev_size_change for 0.90.0 */ static unsigned long long -super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) +super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ @@ -1347,12 +1344,23 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * 4TB == 2^32 KB, or 2*2^32 sectors. + */ + if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) + num_sectors = (2ULL << 32) - 2; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); return num_sectors; } +static int +super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) +{ + /* non-zero offset changes not possible with v0.90 */ + return new_offset == 0; +} /* * version 1 superblock @@ -1365,12 +1373,11 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; __le32 *isuper = (__le32*)sb; - int i; disk_csum = sb->sb_csum; sb->sb_csum = 0; newcsum = 0; - for (i=0; size>=4; size -= 4 ) + for (; size >= 4; size -= 4) newcsum += le32_to_cpu(*isuper++); if (size == 2) @@ -1381,11 +1388,14 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) return cpu_to_le32(csum); } -static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged); +static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; int ret; sector_t sb_start; + sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; @@ -1421,7 +1431,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (ret) return ret; - sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || sb->major_version != cpu_to_le32(1) || @@ -1440,9 +1450,18 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) bdevname(rdev->bdev,b)); return -EINVAL; } + if (sb->pad0 || + sb->pad3[0] || + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) + /* Some padding is non-zero, might be a new feature */ + return -EINVAL; rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->new_data_offset = rdev->data_offset; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; @@ -1453,18 +1472,61 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (minor_version && rdev->data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; + if (minor_version + && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) + return -EINVAL; if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) rdev->desc_nr = -1; else rdev->desc_nr = le32_to_cpu(sb->dev_number); + if (!rdev->bb_page) { + rdev->bb_page = alloc_page(GFP_KERNEL); + if (!rdev->bb_page) + return -ENOMEM; + } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && + rdev->badblocks.count == 0) { + /* need to load the bad block list. + * Currently we limit it to one page. + */ + s32 offset; + sector_t bb_sector; + u64 *bbp; + int i; + int sectors = le16_to_cpu(sb->bblog_size); + if (sectors > (PAGE_SIZE / 512)) + return -EINVAL; + offset = le32_to_cpu(sb->bblog_offset); + if (offset == 0) + return -EINVAL; + bb_sector = (long long)offset; + if (!sync_page_io(rdev, bb_sector, sectors << 9, + rdev->bb_page, READ, true)) + return -EIO; + bbp = (u64 *)page_address(rdev->bb_page); + rdev->badblocks.shift = sb->bblog_shift; + for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { + u64 bb = le64_to_cpu(*bbp); + int count = bb & (0x3ff); + u64 sector = bb >> 10; + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + if (bb + 1 == 0) + break; + if (md_set_badblocks(&rdev->badblocks, + sector, count, 1) == 0) + return -EINVAL; + } + } else if (sb->bblog_offset != 0) + rdev->badblocks.shift = 0; + if (!refdev) { ret = 1; } else { __u64 ev1, ev2; - struct mdp_superblock_1 *refsb = - (struct mdp_superblock_1*)page_address(refdev->sb_page); + struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || sb->level != refsb->level || @@ -1484,27 +1546,26 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) else ret = 0; } - if (minor_version) - rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - - le64_to_cpu(sb->data_offset); - else - rdev->sectors = rdev->sb_start; - if (rdev->sectors < le64_to_cpu(sb->data_size)) + if (minor_version) { + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); + sectors -= rdev->data_offset; + } else + sectors = rdev->sb_start; + if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); - if (le64_to_cpu(sb->size) > rdev->sectors) - return -EINVAL; return ret; } -static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) +static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) { - struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@ -1521,17 +1582,37 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* Default location for bitmap is 1K after superblock + * using 3K - total of 4K + */ mddev->bitmap_info.default_offset = 1024 >> 9; - + mddev->bitmap_info.default_space = (4096-1024) >> 9; + mddev->reshape_backwards = 0; + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && - mddev->bitmap_info.file == NULL ) + mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = (__s32)le32_to_cpu(sb->bitmap_offset); + /* Metadata doesn't record how much space is available. + * For 1.0, we assume we can use up to the superblock + * if before, else to 4K beyond superblock. + * For others, assume no change is possible. + */ + if (mddev->minor_version > 0) + mddev->bitmap_info.space = 0; + else if (mddev->bitmap_info.offset > 0) + mddev->bitmap_info.space = + 8 - mddev->bitmap_info.offset; + else + mddev->bitmap_info.space = + -mddev->bitmap_info.offset; + } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); @@ -1539,6 +1620,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); + if (mddev->delta_disks < 0 || + (mddev->delta_disks == 0 && + (le32_to_cpu(sb->feature_map) + & MD_FEATURE_RESHAPE_BACKWARDS))) + mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; @@ -1562,6 +1648,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -1582,36 +1670,40 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); break; default: + rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_OFFSET)) + MD_FEATURE_RECOVERY_OFFSET)) { rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); - else + if (!(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_BITMAP)) + rdev->saved_raid_disk = -1; + } else set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; } if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); return 0; } -static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) +static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb; - mdk_rdev_t *rdev2; + struct md_rdev *rdev2; int max_dev, i; /* make rdev->sb match mddev and rdev data. */ - sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); - memset(sb->pad1, 0, sizeof(sb->pad1)); - memset(sb->pad2, 0, sizeof(sb->pad2)); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); @@ -1629,6 +1721,13 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); + if (test_bit(WriteMostly, &rdev->flags)) + sb->devflags |= WriteMostly1; + else + sb->devflags &= ~WriteMostly1; + sb->data_offset = cpu_to_le64(rdev->data_offset); + sb->data_size = cpu_to_le64(rdev->sectors); + if (mddev->bitmap && mddev->bitmap_info.file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); @@ -1640,7 +1739,13 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + if (rdev->saved_raid_disk >= 0 && mddev->bitmap) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } + if (test_bit(Replacement, &rdev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_REPLACEMENT); if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); @@ -1649,10 +1754,54 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); + if (mddev->delta_disks == 0 && + mddev->reshape_backwards) + sb->feature_map + |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + if (rdev->new_data_offset != rdev->data_offset) { + sb->feature_map + |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); + sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset + - rdev->data_offset)); + } + } + + if (rdev->badblocks.count == 0) + /* Nothing to do for bad blocks*/ ; + else if (sb->bblog_offset == 0) + /* Cannot record bad blocks on this device */ + md_error(mddev, rdev); + else { + struct badblocks *bb = &rdev->badblocks; + u64 *bbp = (u64 *)page_address(rdev->bb_page); + u64 *p = bb->page; + sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); + if (bb->changed) { + unsigned seq; + +retry: + seq = read_seqbegin(&bb->lock); + + memset(bbp, 0xff, PAGE_SIZE); + + for (i = 0 ; i < bb->count ; i++) { + u64 internal_bb = p[i]; + u64 store_bb = ((BB_OFFSET(internal_bb) << 10) + | BB_LEN(internal_bb)); + bbp[i] = cpu_to_le64(store_bb); + } + bb->changed = 0; + if (read_seqretry(&bb->lock, seq)) + goto retry; + + bb->sector = (rdev->sb_start + + (int)le32_to_cpu(sb->bblog_offset)); + bb->size = le16_to_cpu(sb->bblog_size); + } } max_dev = 0; - list_for_each_entry(rdev2, &mddev->disks, same_set) + rdev_for_each(rdev2, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; @@ -1669,7 +1818,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1685,12 +1834,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } static unsigned long long -super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) +super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ + if (rdev->data_offset != rdev->new_data_offset) + return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; @@ -1710,7 +1861,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; } - sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); sb->super_offset = rdev->sb_start; sb->sb_csum = calc_sb_1_csum(sb); @@ -1718,6 +1869,40 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) rdev->sb_page); md_super_wait(rdev->mddev); return num_sectors; + +} + +static int +super_1_allow_new_offset(struct md_rdev *rdev, + unsigned long long new_offset) +{ + /* All necessary checks on new >= old have been done */ + struct bitmap *bitmap; + if (new_offset >= rdev->data_offset) + return 1; + + /* with 1.0 metadata, there is no metadata to tread on + * so we can always move back */ + if (rdev->mddev->minor_version == 0) + return 1; + + /* otherwise we must be sure not to step on + * any metadata, so stay: + * 36K beyond start of superblock + * beyond end of badblocks + * beyond write-intent bitmap + */ + if (rdev->sb_start + (32+4)*2 > new_offset) + return 0; + bitmap = rdev->mddev->bitmap; + if (bitmap && !rdev->mddev->bitmap_info.file && + rdev->sb_start + rdev->mddev->bitmap_info.offset + + bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) + return 0; + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) + return 0; + + return 1; } static struct super_type super_types[] = { @@ -1728,6 +1913,7 @@ static struct super_type super_types[] = { .validate_super = super_90_validate, .sync_super = super_90_sync, .rdev_size_change = super_90_rdev_size_change, + .allow_new_offset = super_90_allow_new_offset, }, [1] = { .name = "md-1", @@ -1736,12 +1922,25 @@ static struct super_type super_types[] = { .validate_super = super_1_validate, .sync_super = super_1_sync, .rdev_size_change = super_1_rdev_size_change, + .allow_new_offset = super_1_allow_new_offset, }, }; -static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +static void sync_super(struct mddev *mddev, struct md_rdev *rdev) +{ + if (mddev->sync_super) { + mddev->sync_super(mddev, rdev); + return; + } + + BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); + + super_types[mddev->major_version].sync_super(mddev, rdev); +} + +static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) { - mdk_rdev_t *rdev, *rdev2; + struct md_rdev *rdev, *rdev2; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev1) @@ -1764,26 +1963,20 @@ static LIST_HEAD(pending_raid_disks); * from the array. It only succeeds if all working and active component devices * are integrity capable with matching profiles. */ -int md_integrity_register(mddev_t *mddev) +int md_integrity_register(struct mddev *mddev) { - mdk_rdev_t *rdev, *reference = NULL; + struct md_rdev *rdev, *reference = NULL; if (list_empty(&mddev->disks)) return 0; /* nothing to do */ - if (blk_get_integrity(mddev->gendisk)) - return 0; /* already registered */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register, or already is */ + rdev_for_each(rdev, mddev) { /* skip spares and non-functional disks */ if (test_bit(Faulty, &rdev->flags)) continue; if (rdev->raid_disk < 0) continue; - /* - * If at least one rdev is not integrity capable, we can not - * enable data integrity for the md device. - */ - if (!bdev_get_integrity(rdev->bdev)) - return -EINVAL; if (!reference) { /* Use the first rdev as the reference */ reference = rdev; @@ -1794,6 +1987,8 @@ int md_integrity_register(mddev_t *mddev) rdev->bdev->bd_disk) < 0) return -EINVAL; } + if (!reference || !bdev_get_integrity(reference->bdev)) + return 0; /* * All component devices are integrity capable and have matching * profiles, register the common profile for the md device. @@ -1804,17 +1999,27 @@ int md_integrity_register(mddev_t *mddev) mdname(mddev)); return -EINVAL; } - printk(KERN_NOTICE "md: data integrity on %s enabled\n", - mdname(mddev)); + printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); + if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { + printk(KERN_ERR "md: failed to create integrity pool for %s\n", + mdname(mddev)); + return -EINVAL; + } return 0; } EXPORT_SYMBOL(md_integrity_register); /* Disable data integrity if non-capable/non-matching disk is being added */ -void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) +void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { - struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); + struct blk_integrity *bi_rdev; + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return; + + bi_rdev = bdev_get_integrity(rdev->bdev); + bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ return; @@ -1828,7 +2033,7 @@ void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) } EXPORT_SYMBOL(md_integrity_add_rdev); -static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) { char b[BDEVNAME_SIZE]; struct kobject *ko; @@ -1896,7 +2101,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) bd_link_disk_holder(rdev->bdev, mddev->gendisk); /* May as well allow recovery to be retried once */ - mddev->recovery_disabled = 0; + mddev->recovery_disabled++; return 0; @@ -1908,12 +2113,12 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) static void md_delayed_delete(struct work_struct *ws) { - mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); + struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); kobject_del(&rdev->kobj); kobject_put(&rdev->kobj); } -static void unbind_rdev_from_array(mdk_rdev_t * rdev) +static void unbind_rdev_from_array(struct md_rdev * rdev) { char b[BDEVNAME_SIZE]; if (!rdev->mddev) { @@ -1927,6 +2132,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); rdev->sysfs_state = NULL; + rdev->badblocks.count = 0; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need * to delay it due to rcu usage. @@ -1942,14 +2148,14 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) * otherwise reused by a RAID array (or any other kernel * subsystem), by bd_claiming the device. */ -static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) +static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - shared ? (mdk_rdev_t *)lock_rdev : rdev); + shared ? (struct md_rdev *)lock_rdev : rdev); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); @@ -1959,7 +2165,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) return err; } -static void unlock_rdev(mdk_rdev_t *rdev) +static void unlock_rdev(struct md_rdev *rdev) { struct block_device *bdev = rdev->bdev; rdev->bdev = NULL; @@ -1970,14 +2176,14 @@ static void unlock_rdev(mdk_rdev_t *rdev) void md_autodetect_dev(dev_t dev); -static void export_rdev(mdk_rdev_t * rdev) +static void export_rdev(struct md_rdev * rdev) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); if (rdev->mddev) MD_BUG(); - free_disk_sb(rdev); + md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); @@ -1986,17 +2192,17 @@ static void export_rdev(mdk_rdev_t * rdev) kobject_put(&rdev->kobj); } -static void kick_rdev_from_array(mdk_rdev_t * rdev) +static void kick_rdev_from_array(struct md_rdev * rdev) { unbind_rdev_from_array(rdev); export_rdev(rdev); } -static void export_array(mddev_t *mddev) +static void export_array(struct mddev *mddev) { - mdk_rdev_t *rdev, *tmp; + struct md_rdev *rdev, *tmp; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (!rdev->mddev) { MD_BUG(); continue; @@ -2090,7 +2296,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb) ); } -static void print_rdev(mdk_rdev_t *rdev, int major_version) +static void print_rdev(struct md_rdev *rdev, int major_version) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", @@ -2101,10 +2307,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version) printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); switch (major_version) { case 0: - print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); + print_sb_90(page_address(rdev->sb_page)); break; case 1: - print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); + print_sb_1(page_address(rdev->sb_page)); break; } } else @@ -2114,8 +2320,8 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version) static void md_print_devices(void) { struct list_head *tmp; - mdk_rdev_t *rdev; - mddev_t *mddev; + struct md_rdev *rdev; + struct mddev *mddev; char b[BDEVNAME_SIZE]; printk("\n"); @@ -2128,11 +2334,11 @@ static void md_print_devices(void) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) print_rdev(rdev, mddev->major_version); } printk("md: **********************************\n"); @@ -2140,7 +2346,7 @@ static void md_print_devices(void) } -static void sync_sbs(mddev_t * mddev, int nospares) +static void sync_sbs(struct mddev * mddev, int nospares) { /* Update each superblock (in-memory image), but * if we are allowed to, skip spares which already @@ -2148,8 +2354,8 @@ static void sync_sbs(mddev_t * mddev, int nospares) * (which would mean they aren't being marked as dirty * with the rest of the array) */ - mdk_rdev_t *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) { + struct md_rdev *rdev; + rdev_for_each(rdev, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && @@ -2157,22 +2363,27 @@ static void sync_sbs(mddev_t * mddev, int nospares) /* Don't update this superblock */ rdev->sb_loaded = 2; } else { - super_types[mddev->major_version]. - sync_super(mddev, rdev); + sync_super(mddev, rdev); rdev->sb_loaded = 1; } } } -static void md_update_sb(mddev_t * mddev, int force_change) +static void md_update_sb(struct mddev * mddev, int force_change) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; int sync_req; int nospares = 0; + int any_badblocks_changed = 0; + if (mddev->ro) { + if (force_change) + set_bit(MD_CHANGE_DEVS, &mddev->flags); + return; + } repeat: /* First make sure individual recovery_offsets are correct */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(In_sync, &rdev->flags) && @@ -2183,8 +2394,19 @@ repeat: if (!mddev->persistent) { clear_bit(MD_CHANGE_CLEAN, &mddev->flags); clear_bit(MD_CHANGE_DEVS, &mddev->flags); - if (!mddev->external) + if (!mddev->external) { clear_bit(MD_CHANGE_PENDING, &mddev->flags); + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) { + rdev->badblocks.changed = 0; + md_ack_all_badblocks(&rdev->badblocks); + md_error(mddev, rdev); + } + clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } + } wake_up(&mddev->sb_wait); return; } @@ -2240,34 +2462,47 @@ repeat: MD_BUG(); mddev->events --; } + + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) + any_badblocks_changed++; + if (test_bit(Faulty, &rdev->flags)) + set_bit(FaultRecorded, &rdev->flags); + } + sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); - dprintk(KERN_INFO - "md: updating %s RAID superblock on device (in sync %d)\n", - mdname(mddev),mddev->in_sync); + pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", + mdname(mddev), mddev->in_sync); bitmap_update_sb(mddev->bitmap); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; - dprintk(KERN_INFO "md: "); + if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ - if (test_bit(Faulty, &rdev->flags)) - dprintk("(skipping faulty "); - dprintk("%s ", bdevname(rdev->bdev,b)); if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); - dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", - bdevname(rdev->bdev,b), - (unsigned long long)rdev->sb_start); + pr_debug("md: (write) %s's sb offset: %llu\n", + bdevname(rdev->bdev, b), + (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; + if (rdev->badblocks.size) { + md_super_write(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page); + rdev->badblocks.size = 0; + } } else - dprintk(")\n"); + pr_debug("md: %s (skipping faulty)\n", + bdevname(rdev->bdev, b)); + if (mddev->level == LEVEL_MULTIPATH) /* only need to write one superblock... */ break; @@ -2288,6 +2523,15 @@ repeat: if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + rdev_for_each(rdev, mddev) { + if (test_and_clear_bit(FaultRecorded, &rdev->flags)) + clear_bit(Blocked, &rdev->flags); + + if (any_badblocks_changed) + md_ack_all_badblocks(&rdev->badblocks); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } } /* words written to sysfs files may, or may not, be \n terminated. @@ -2312,17 +2556,18 @@ static int cmd_match(const char *cmd, const char *str) struct rdev_sysfs_entry { struct attribute attr; - ssize_t (*show)(mdk_rdev_t *, char *); - ssize_t (*store)(mdk_rdev_t *, const char *, size_t); + ssize_t (*show)(struct md_rdev *, char *); + ssize_t (*store)(struct md_rdev *, const char *, size_t); }; static ssize_t -state_show(mdk_rdev_t *rdev, char *page) +state_show(struct md_rdev *rdev, char *page) { char *sep = ""; size_t len = 0; - if (test_bit(Faulty, &rdev->flags)) { + if (test_bit(Faulty, &rdev->flags) || + rdev->badblocks.unacked_exist) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } @@ -2334,7 +2579,9 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags)) { + if (test_bit(Blocked, &rdev->flags) || + (rdev->badblocks.unacked_exist + && !test_bit(Faulty, &rdev->flags))) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } @@ -2343,30 +2590,50 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%sspare", sep); sep = ","; } + if (test_bit(WriteErrorSeen, &rdev->flags)) { + len += sprintf(page+len, "%swrite_error", sep); + sep = ","; + } + if (test_bit(WantReplacement, &rdev->flags)) { + len += sprintf(page+len, "%swant_replacement", sep); + sep = ","; + } + if (test_bit(Replacement, &rdev->flags)) { + len += sprintf(page+len, "%sreplacement", sep); + sep = ","; + } + return len+sprintf(page+len, "\n"); } static ssize_t -state_store(mdk_rdev_t *rdev, const char *buf, size_t len) +state_store(struct md_rdev *rdev, const char *buf, size_t len) { /* can write - * faulty - simulates and error + * faulty - simulates an error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly - * blocked - sets the Blocked flag - * -blocked - clears the Blocked flag + * blocked - sets the Blocked flags + * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active + * -insync - clear Insync for a device with a slot assigned, + * so that it gets rebuilt based on bitmap + * write_error - sets WriteErrorSeen + * -write_error - clears WriteErrorSeen */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); - err = 0; + if (test_bit(Faulty, &rdev->flags)) + err = 0; + else + err = -EBUSY; } else if (cmd_match(buf, "remove")) { if (rdev->raid_disk >= 0) err = -EBUSY; else { - mddev_t *mddev = rdev->mddev; + struct mddev *mddev = rdev->mddev; kick_rdev_from_array(rdev); if (mddev->pers) md_update_sb(mddev, 1); @@ -2383,7 +2650,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { + if (!test_bit(Faulty, &rdev->flags) && + rdev->badblocks.unacked_exist) { + /* metadata handler doesn't understand badblocks, + * so we need to fail the device + */ + md_error(rdev->mddev, rdev); + } clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); @@ -2392,6 +2667,53 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + err = 0; + } else if (cmd_match(buf, "write_error")) { + set_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-write_error")) { + clear_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "want_replacement")) { + /* Any non-spare device that is not a replacement can + * become want_replacement at any time, but we then need to + * check if recovery is needed. + */ + if (rdev->raid_disk >= 0 && + !test_bit(Replacement, &rdev->flags)) + set_bit(WantReplacement, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + err = 0; + } else if (cmd_match(buf, "-want_replacement")) { + /* Clearing 'want_replacement' is always allowed. + * Once replacements starts it is too late though. + */ + err = 0; + clear_bit(WantReplacement, &rdev->flags); + } else if (cmd_match(buf, "replacement")) { + /* Can only set a device as a replacement when array has not + * yet been started. Once running, replacement is automatic + * from spares, or by assigning 'slot'. + */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + set_bit(Replacement, &rdev->flags); + err = 0; + } + } else if (cmd_match(buf, "-replacement")) { + /* Similarly, can only clear Replacement before start */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + clear_bit(Replacement, &rdev->flags); + err = 0; + } } if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -2401,13 +2723,13 @@ static struct rdev_sysfs_entry rdev_state = __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t -errors_show(mdk_rdev_t *rdev, char *page) +errors_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); } static ssize_t -errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) +errors_store(struct md_rdev *rdev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); @@ -2421,7 +2743,7 @@ static struct rdev_sysfs_entry rdev_errors = __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t -slot_show(mdk_rdev_t *rdev, char *page) +slot_show(struct md_rdev *rdev, char *page) { if (rdev->raid_disk < 0) return sprintf(page, "none\n"); @@ -2430,11 +2752,10 @@ slot_show(mdk_rdev_t *rdev, char *page) } static ssize_t -slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) +slot_store(struct md_rdev *rdev, const char *buf, size_t len) { char *e; int err; - char nm[20]; int slot = simple_strtoul(buf, &e, 10); if (strncmp(buf, "none", 4)==0) slot = -1; @@ -2451,19 +2772,15 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev->raid_disk == -1) return -EEXIST; /* personality does all needed checks */ - if (rdev->mddev->pers->hot_add_disk == NULL) + if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; - err = rdev->mddev->pers-> - hot_remove_disk(rdev->mddev, rdev->raid_disk); - if (err) - return err; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&rdev->mddev->kobj, nm); - rdev->raid_disk = -1; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk >= 0) + return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { - mdk_rdev_t *rdev2; /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. */ @@ -2477,10 +2794,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; - list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) - if (rdev2->raid_disk == slot) - return -EEXIST; - if (slot >= rdev->mddev->raid_disks && slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; @@ -2490,6 +2803,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) rdev->saved_raid_disk = slot; else rdev->saved_raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); err = rdev->mddev->pers-> hot_add_disk(rdev->mddev, rdev); if (err) { @@ -2497,8 +2812,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) + if (sysfs_link_rdev(rdev->mddev, rdev)) /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { @@ -2520,17 +2834,16 @@ static struct rdev_sysfs_entry rdev_slot = __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); static ssize_t -offset_show(mdk_rdev_t *rdev, char *page) +offset_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); } static ssize_t -offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) +offset_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; - unsigned long long offset = simple_strtoull(buf, &e, 10); - if (e==buf || (*e && *e != '\n')) + unsigned long long offset; + if (kstrtoull(buf, 10, &offset) < 0) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; @@ -2539,14 +2852,72 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) * can be sane */ return -EBUSY; rdev->data_offset = offset; + rdev->new_data_offset = offset; return len; } static struct rdev_sysfs_entry rdev_offset = __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); +static ssize_t new_offset_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)rdev->new_data_offset); +} + +static ssize_t new_offset_store(struct md_rdev *rdev, + const char *buf, size_t len) +{ + unsigned long long new_offset; + struct mddev *mddev = rdev->mddev; + + if (kstrtoull(buf, 10, &new_offset) < 0) + return -EINVAL; + + if (mddev->sync_thread) + return -EBUSY; + if (new_offset == rdev->data_offset) + /* reset is always permitted */ + ; + else if (new_offset > rdev->data_offset) { + /* must not push array size beyond rdev_sectors */ + if (new_offset - rdev->data_offset + + mddev->dev_sectors > rdev->sectors) + return -E2BIG; + } + /* Metadata worries about other space details. */ + + /* decreasing the offset is inconsistent with a backwards + * reshape. + */ + if (new_offset < rdev->data_offset && + mddev->reshape_backwards) + return -EINVAL; + /* Increasing offset is inconsistent with forwards + * reshape. reshape_direction should be set to + * 'backwards' first. + */ + if (new_offset > rdev->data_offset && + !mddev->reshape_backwards) + return -EINVAL; + + if (mddev->pers && mddev->persistent && + !super_types[mddev->major_version] + .allow_new_offset(rdev, new_offset)) + return -E2BIG; + rdev->new_data_offset = new_offset; + if (new_offset > rdev->data_offset) + mddev->reshape_backwards = 1; + else if (new_offset < rdev->data_offset) + mddev->reshape_backwards = 0; + + return len; +} +static struct rdev_sysfs_entry rdev_new_offset = +__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); + static ssize_t -rdev_size_show(mdk_rdev_t *rdev, char *page) +rdev_size_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } @@ -2566,7 +2937,7 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) unsigned long long blocks; sector_t new; - if (strict_strtoull(buf, 10, &blocks) < 0) + if (kstrtoull(buf, 10, &blocks) < 0) return -EINVAL; if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) @@ -2581,14 +2952,16 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) } static ssize_t -rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) +rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) { - mddev_t *my_mddev = rdev->mddev; + struct mddev *my_mddev = rdev->mddev; sector_t oldsectors = rdev->sectors; sector_t sectors; if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; + if (rdev->data_offset != rdev->new_data_offset) + return -EINVAL; /* too confusing */ if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { sectors = super_types[my_mddev->major_version]. @@ -2598,6 +2971,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) } else if (!sectors) sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - rdev->data_offset; + if (!my_mddev->pers->resize) + /* Cannot change size for RAID0 or Linear etc */ + return -EINVAL; } if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */ @@ -2609,16 +2985,16 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) * a deadlock. We have already changed rdev->sectors, and if * we have to change it back, we will have the lock again. */ - mddev_t *mddev; + struct mddev *mddev; int overlap = 0; struct list_head *tmp; mddev_unlock(my_mddev); for_each_mddev(mddev, tmp) { - mdk_rdev_t *rdev2; + struct md_rdev *rdev2; - mddev_lock(mddev); - list_for_each_entry(rdev2, &mddev->disks, same_set) + mddev_lock_nointr(mddev); + rdev_for_each(rdev2, mddev) if (rdev->bdev == rdev2->bdev && rdev != rdev2 && overlaps(rdev->data_offset, rdev->sectors, @@ -2633,7 +3009,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) break; } } - mddev_lock(my_mddev); + mddev_lock_nointr(my_mddev); if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. @@ -2652,7 +3028,7 @@ static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); -static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page) +static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) { unsigned long long recovery_start = rdev->recovery_offset; @@ -2663,13 +3039,13 @@ static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page) return sprintf(page, "%llu\n", recovery_start); } -static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len) +static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long recovery_start; if (cmd_match(buf, "none")) recovery_start = MaxSector; - else if (strict_strtoull(buf, 10, &recovery_start)) + else if (kstrtoull(buf, 10, &recovery_start)) return -EINVAL; if (rdev->mddev->pers && @@ -2687,21 +3063,57 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack); +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); + +static ssize_t bb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 0); +} +static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + int rv = badblocks_store(&rdev->badblocks, page, len, 0); + /* Maybe that ack was all we needed */ + if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) + wake_up(&rdev->blocked_wait); + return rv; +} +static struct rdev_sysfs_entry rdev_bad_blocks = +__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); + + +static ssize_t ubb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 1); +} +static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + return badblocks_store(&rdev->badblocks, page, len, 1); +} +static struct rdev_sysfs_entry rdev_unack_bad_blocks = +__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, + &rdev_new_offset.attr, &rdev_size.attr, &rdev_recovery_start.attr, + &rdev_bad_blocks.attr, + &rdev_unack_bad_blocks.attr, NULL, }; static ssize_t rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); - mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); - mddev_t *mddev = rdev->mddev; + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct mddev *mddev = rdev->mddev; ssize_t rv; if (!entry->show) @@ -2723,9 +3135,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); - mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); ssize_t rv; - mddev_t *mddev = rdev->mddev; + struct mddev *mddev = rdev->mddev; if (!entry->store) return -EIO; @@ -2744,7 +3156,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, static void rdev_free(struct kobject *ko) { - mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); + struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); kfree(rdev); } static const struct sysfs_ops rdev_sysfs_ops = { @@ -2757,22 +3169,38 @@ static struct kobj_type rdev_ktype = { .default_attrs = rdev_default_attrs, }; -void md_rdev_init(mdk_rdev_t *rdev) +int md_rdev_init(struct md_rdev *rdev) { rdev->desc_nr = -1; rdev->saved_raid_disk = -1; rdev->raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; + rdev->new_data_offset = 0; rdev->sb_events = 0; rdev->last_read_error.tv_sec = 0; rdev->last_read_error.tv_nsec = 0; + rdev->sb_loaded = 0; + rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); INIT_LIST_HEAD(&rdev->same_set); init_waitqueue_head(&rdev->blocked_wait); + + /* Add space to store bad block list. + * This reserves the space even on arrays where it cannot + * be used - I wonder if that matters + */ + rdev->badblocks.count = 0; + rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ + rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); + seqlock_init(&rdev->badblocks.lock); + if (rdev->badblocks.page == NULL) + return -ENOMEM; + + return 0; } EXPORT_SYMBOL_GPL(md_rdev_init); /* @@ -2785,11 +3213,11 @@ EXPORT_SYMBOL_GPL(md_rdev_init); * * a faulty rdev _never_ has rdev->sb set. */ -static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) +static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { char b[BDEVNAME_SIZE]; int err; - mdk_rdev_t *rdev; + struct md_rdev *rdev; sector_t size; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); @@ -2798,8 +3226,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi return ERR_PTR(-ENOMEM); } - md_rdev_init(rdev); - if ((err = alloc_disk_sb(rdev))) + err = md_rdev_init(rdev); + if (err) + goto abort_free; + err = alloc_disk_sb(rdev); + if (err) goto abort_free; err = lock_rdev(rdev, newdev, super_format == -2); @@ -2839,11 +3270,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi return rdev; abort_free: - if (rdev->sb_page) { - if (rdev->bdev) - unlock_rdev(rdev); - free_disk_sb(rdev); - } + if (rdev->bdev) + unlock_rdev(rdev); + md_rdev_clear(rdev); kfree(rdev); return ERR_PTR(err); } @@ -2853,14 +3282,14 @@ abort_free: */ -static void analyze_sbs(mddev_t * mddev) +static void analyze_sbs(struct mddev * mddev) { int i; - mdk_rdev_t *rdev, *freshest, *tmp; + struct md_rdev *rdev, *freshest, *tmp; char b[BDEVNAME_SIZE]; freshest = NULL; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each_safe(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: @@ -2881,7 +3310,7 @@ static void analyze_sbs(mddev_t * mddev) validate_super(mddev, freshest); i = 0; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { @@ -2956,13 +3385,13 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) static void md_safemode_timeout(unsigned long data); static ssize_t -safe_delay_show(mddev_t *mddev, char *page) +safe_delay_show(struct mddev *mddev, char *page) { int msec = (mddev->safemode_delay*1000)/HZ; return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); } static ssize_t -safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) +safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { unsigned long msec; @@ -2975,7 +3404,7 @@ safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) mddev->safemode_delay = (msec*HZ)/1000; if (mddev->safemode_delay == 0) mddev->safemode_delay = 1; - if (mddev->safemode_delay < old_delay) + if (mddev->safemode_delay < old_delay || old_delay == 0) md_safemode_timeout((unsigned long)mddev); } return len; @@ -2984,9 +3413,9 @@ static struct md_sysfs_entry md_safe_delay = __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t -level_show(mddev_t *mddev, char *page) +level_show(struct mddev *mddev, char *page) { - struct mdk_personality *p = mddev->pers; + struct md_personality *p = mddev->pers; if (p) return sprintf(page, "%s\n", p->name); else if (mddev->clevel[0]) @@ -2998,14 +3427,14 @@ level_show(mddev_t *mddev, char *page) } static ssize_t -level_store(mddev_t *mddev, const char *buf, size_t len) +level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; ssize_t rv = len; - struct mdk_personality *pers; + struct md_personality *pers; long level; void *priv; - mdk_rdev_t *rdev; + struct md_rdev *rdev; if (mddev->pers == NULL) { if (len == 0) @@ -3019,6 +3448,8 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->level = LEVEL_NONE; return rv; } + if (mddev->ro) + return -EROFS; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape @@ -3044,7 +3475,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (clevel[len-1] == '\n') len--; clevel[len] = 0; - if (strict_strtol(clevel, 10, &level)) + if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; if (request_module("md-%s", clevel) != 0) @@ -3070,7 +3501,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; } - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) rdev->new_raid_disk = rdev->raid_disk; /* ->takeover must set new_* and/or delta_disks @@ -3083,6 +3514,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; module_put(pers->owner); printk(KERN_WARNING "md: %s: %s would not accept array\n", mdname(mddev), clevel); @@ -3100,7 +3532,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) printk(KERN_WARNING "md: cannot register extra attributes for %s\n", mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); } if (mddev->pers->sync_request != NULL && pers->sync_request == NULL) { @@ -3123,18 +3555,16 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->safemode = 0; } - list_for_each_entry(rdev, &mddev->disks, same_set) { - char nm[20]; + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + sysfs_unlink_rdev(mddev, rdev); } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk == rdev->raid_disk) @@ -3143,11 +3573,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); else { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk("md: cannot register %s for %s after level change\n", - nm, mdname(mddev)); + if (sysfs_link_rdev(mddev, rdev)) + printk(KERN_WARNING "md: cannot register rd%d" + " for %s after level change\n", + rdev->raid_disk, mdname(mddev)); } } @@ -3159,6 +3588,8 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->degraded = 0; if (mddev->pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync @@ -3166,11 +3597,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->in_sync = 1; del_timer_sync(&mddev->safemode_timer); } + blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); - mddev_resume(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + mddev_resume(mddev); + if (!mddev->thread) + md_update_sb(mddev, 1); sysfs_notify(&mddev->kobj, NULL, "level"); md_new_event(mddev); return rv; @@ -3181,7 +3613,7 @@ __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); static ssize_t -layout_show(mddev_t *mddev, char *page) +layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ if (mddev->reshape_position != MaxSector && @@ -3192,7 +3624,7 @@ layout_show(mddev_t *mddev, char *page) } static ssize_t -layout_store(mddev_t *mddev, const char *buf, size_t len) +layout_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); @@ -3204,6 +3636,8 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) int err; if (mddev->pers->check_reshape == NULL) return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_layout = n; err = mddev->pers->check_reshape(mddev); if (err) { @@ -3222,7 +3656,7 @@ __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); static ssize_t -raid_disks_show(mddev_t *mddev, char *page) +raid_disks_show(struct mddev *mddev, char *page) { if (mddev->raid_disks == 0) return 0; @@ -3233,10 +3667,10 @@ raid_disks_show(mddev_t *mddev, char *page) return sprintf(page, "%d\n", mddev->raid_disks); } -static int update_raid_disks(mddev_t *mddev, int raid_disks); +static int update_raid_disks(struct mddev *mddev, int raid_disks); static ssize_t -raid_disks_store(mddev_t *mddev, const char *buf, size_t len) +raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { char *e; int rv = 0; @@ -3248,9 +3682,20 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) rv = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { + struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + + rdev_for_each(rdev, mddev) { + if (olddisks < n && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (olddisks > n && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } mddev->delta_disks = n - olddisks; mddev->raid_disks = n; + mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; return rv ? rv : len; @@ -3259,7 +3704,7 @@ static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t -chunk_size_show(mddev_t *mddev, char *page) +chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && mddev->chunk_sectors != mddev->new_chunk_sectors) @@ -3270,7 +3715,7 @@ chunk_size_show(mddev_t *mddev, char *page) } static ssize_t -chunk_size_store(mddev_t *mddev, const char *buf, size_t len) +chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); @@ -3282,6 +3727,8 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) int err; if (mddev->pers->check_reshape == NULL) return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_chunk_sectors = n >> 9; err = mddev->pers->check_reshape(mddev); if (err) { @@ -3299,7 +3746,7 @@ static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t -resync_start_show(mddev_t *mddev, char *page) +resync_start_show(struct mddev *mddev, char *page) { if (mddev->recovery_cp == MaxSector) return sprintf(page, "none\n"); @@ -3307,12 +3754,12 @@ resync_start_show(mddev_t *mddev, char *page) } static ssize_t -resync_start_store(mddev_t *mddev, const char *buf, size_t len) +resync_start_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long n = simple_strtoull(buf, &e, 10); - if (mddev->pers) + if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) return -EBUSY; if (cmd_match(buf, "none")) n = MaxSector; @@ -3320,6 +3767,8 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); return len; } static struct md_sysfs_entry md_resync_start = @@ -3377,7 +3826,7 @@ static int match_word(const char *word, char **list) } static ssize_t -array_state_show(mddev_t *mddev, char *page) +array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; @@ -3410,13 +3859,13 @@ array_state_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", array_states[st]); } -static int do_md_stop(mddev_t * mddev, int ro, int is_open); -static int md_set_readonly(mddev_t * mddev, int is_open); -static int do_md_run(mddev_t * mddev); -static int restart_array(mddev_t *mddev); +static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); +static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); +static int do_md_run(struct mddev * mddev); +static int restart_array(struct mddev *mddev); static ssize_t -array_state_store(mddev_t *mddev, const char *buf, size_t len) +array_state_store(struct mddev *mddev, const char *buf, size_t len) { int err = -EINVAL; enum array_state st = match_word(buf, array_states); @@ -3425,24 +3874,20 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; case clear: /* stopping an active array */ - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; - err = do_md_stop(mddev, 0, 0); + err = do_md_stop(mddev, 0, NULL); break; case inactive: /* stopping an active array */ - if (mddev->pers) { - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; - err = do_md_stop(mddev, 2, 0); - } else + if (mddev->pers) + err = do_md_stop(mddev, 2, NULL); + else err = 0; /* already inactive */ break; case suspended: break; /* not supported yet */ case readonly: if (mddev->pers) - err = md_set_readonly(mddev, 0); + err = md_set_readonly(mddev, NULL); else { mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); @@ -3452,7 +3897,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case read_auto: if (mddev->pers) { if (mddev->ro == 0) - err = md_set_readonly(mddev, 0); + err = md_set_readonly(mddev, NULL); else if (mddev->ro == 1) err = restart_array(mddev); if (err == 0) { @@ -3502,6 +3947,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (err) return err; else { + if (mddev->hold_active == UNTIL_IOCTL) + mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); return len; } @@ -3510,13 +3957,13 @@ static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t -max_corrected_read_errors_show(mddev_t *mddev, char *page) { +max_corrected_read_errors_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", atomic_read(&mddev->max_corr_read_errors)); } static ssize_t -max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len) +max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); @@ -3533,13 +3980,13 @@ __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, max_corrected_read_errors_store); static ssize_t -null_show(mddev_t *mddev, char *page) +null_show(struct mddev *mddev, char *page) { return -EINVAL; } static ssize_t -new_dev_store(mddev_t *mddev, const char *buf, size_t len) +new_dev_store(struct mddev *mddev, const char *buf, size_t len) { /* buf must be %d:%d\n? giving major and minor numbers */ /* The new device is added to the array. @@ -3552,7 +3999,7 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) int major = simple_strtoul(buf, &e, 10); int minor; dev_t dev; - mdk_rdev_t *rdev; + struct md_rdev *rdev; int err; if (!*buf || *e != ':' || !e[1] || e[1] == '\n') @@ -3570,8 +4017,9 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { - mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, - mdk_rdev_t, same_set); + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) @@ -3595,7 +4043,7 @@ static struct md_sysfs_entry md_new_device = __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); static ssize_t -bitmap_store(mddev_t *mddev, const char *buf, size_t len) +bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; @@ -3624,16 +4072,16 @@ static struct md_sysfs_entry md_bitmap = __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t -size_show(mddev_t *mddev, char *page) +size_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->dev_sectors / 2); } -static int update_size(mddev_t *mddev, sector_t num_sectors); +static int update_size(struct mddev *mddev, sector_t num_sectors); static ssize_t -size_store(mddev_t *mddev, const char *buf, size_t len) +size_store(struct mddev *mddev, const char *buf, size_t len) { /* If array is inactive, we can reduce the component size, but * not increase it (except from 0). @@ -3661,14 +4109,14 @@ static struct md_sysfs_entry md_size = __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); -/* Metdata version. +/* Metadata version. * This is one of * 'none' for arrays with no metadata (good luck...) * 'external' for arrays with externally managed metadata, * or N.M for internally known formats */ static ssize_t -metadata_show(mddev_t *mddev, char *page) +metadata_show(struct mddev *mddev, char *page) { if (mddev->persistent) return sprintf(page, "%d.%d\n", @@ -3680,7 +4128,7 @@ metadata_show(mddev_t *mddev, char *page) } static ssize_t -metadata_store(mddev_t *mddev, const char *buf, size_t len) +metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; @@ -3734,7 +4182,7 @@ static struct md_sysfs_entry md_metadata = __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t -action_show(mddev_t *mddev, char *page) +action_show(struct mddev *mddev, char *page) { char *type = "idle"; if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) @@ -3756,10 +4204,8 @@ action_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", type); } -static void reap_sync_thread(mddev_t *mddev); - static ssize_t -action_store(mddev_t *mddev, const char *page, size_t len) +action_store(struct mddev *mddev, const char *page, size_t len) { if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; @@ -3772,7 +4218,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -3798,34 +4244,49 @@ action_store(mddev_t *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + md_wakeup_thread(mddev->sync_thread); + } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); return len; } +static struct md_sysfs_entry md_scan_mode = +__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); + static ssize_t -mismatch_cnt_show(mddev_t *mddev, char *page) +last_sync_action_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); + return sprintf(page, "%s\n", mddev->last_sync_action); } -static struct md_sysfs_entry md_scan_mode = -__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); +static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); +static ssize_t +mismatch_cnt_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); +} static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); static ssize_t -sync_min_show(mddev_t *mddev, char *page) +sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), mddev->sync_speed_min ? "local": "system"); } static ssize_t -sync_min_store(mddev_t *mddev, const char *buf, size_t len) +sync_min_store(struct mddev *mddev, const char *buf, size_t len) { int min; char *e; @@ -3844,14 +4305,14 @@ static struct md_sysfs_entry md_sync_min = __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); static ssize_t -sync_max_show(mddev_t *mddev, char *page) +sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), mddev->sync_speed_max ? "local": "system"); } static ssize_t -sync_max_store(mddev_t *mddev, const char *buf, size_t len) +sync_max_store(struct mddev *mddev, const char *buf, size_t len) { int max; char *e; @@ -3870,24 +4331,24 @@ static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t -degraded_show(mddev_t *mddev, char *page) +degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); } static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); static ssize_t -sync_force_parallel_show(mddev_t *mddev, char *page) +sync_force_parallel_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->parallel_resync); } static ssize_t -sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) +sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) { long n; - if (strict_strtol(buf, 10, &n)) + if (kstrtol(buf, 10, &n)) return -EINVAL; if (n != 0 && n != 1) @@ -3907,7 +4368,7 @@ __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, sync_force_parallel_show, sync_force_parallel_store); static ssize_t -sync_speed_show(mddev_t *mddev, char *page) +sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; if (mddev->curr_resync == 0) @@ -3922,14 +4383,19 @@ sync_speed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t -sync_completed_show(mddev_t *mddev, char *page) +sync_completed_show(struct mddev *mddev, char *page) { unsigned long long max_sectors, resync; if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + if (mddev->curr_resync == 1 || + mddev->curr_resync == 2) + return sprintf(page, "delayed\n"); + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; @@ -3941,16 +4407,16 @@ sync_completed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); static ssize_t -min_sync_show(mddev_t *mddev, char *page) +min_sync_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_min); } static ssize_t -min_sync_store(mddev_t *mddev, const char *buf, size_t len) +min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; - if (strict_strtoull(buf, 10, &min)) + if (kstrtoull(buf, 10, &min)) return -EINVAL; if (min > mddev->resync_max) return -EINVAL; @@ -3972,7 +4438,7 @@ static struct md_sysfs_entry md_min_sync = __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); static ssize_t -max_sync_show(mddev_t *mddev, char *page) +max_sync_show(struct mddev *mddev, char *page) { if (mddev->resync_max == MaxSector) return sprintf(page, "max\n"); @@ -3981,13 +4447,13 @@ max_sync_show(mddev_t *mddev, char *page) (unsigned long long)mddev->resync_max); } static ssize_t -max_sync_store(mddev_t *mddev, const char *buf, size_t len) +max_sync_store(struct mddev *mddev, const char *buf, size_t len) { if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; - if (strict_strtoull(buf, 10, &max)) + if (kstrtoull(buf, 10, &max)) return -EINVAL; if (max < mddev->resync_min) return -EINVAL; @@ -4012,13 +4478,13 @@ static struct md_sysfs_entry md_max_sync = __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t -suspend_lo_show(mddev_t *mddev, char *page) +suspend_lo_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); } static ssize_t -suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) +suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); @@ -4046,13 +4512,13 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t -suspend_hi_show(mddev_t *mddev, char *page) +suspend_hi_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); } static ssize_t -suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) +suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); @@ -4079,7 +4545,7 @@ static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); static ssize_t -reshape_position_show(mddev_t *mddev, char *page) +reshape_position_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector) return sprintf(page, "%llu\n", @@ -4089,8 +4555,9 @@ reshape_position_show(mddev_t *mddev, char *page) } static ssize_t -reshape_position_store(mddev_t *mddev, const char *buf, size_t len) +reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { + struct md_rdev *rdev; char *e; unsigned long long new = simple_strtoull(buf, &e, 10); if (mddev->pers) @@ -4099,9 +4566,12 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; mddev->reshape_position = new; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; return len; } @@ -4110,7 +4580,43 @@ __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); static ssize_t -array_size_show(mddev_t *mddev, char *page) +reshape_direction_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", + mddev->reshape_backwards ? "backwards" : "forwards"); +} + +static ssize_t +reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) +{ + int backwards = 0; + if (cmd_match(buf, "forwards")) + backwards = 0; + else if (cmd_match(buf, "backwards")) + backwards = 1; + else + return -EINVAL; + if (mddev->reshape_backwards == backwards) + return len; + + /* check if we are allowed to change */ + if (mddev->delta_disks) + return -EBUSY; + + if (mddev->persistent && + mddev->major_version == 0) + return -EINVAL; + + mddev->reshape_backwards = backwards; + return len; +} + +static struct md_sysfs_entry md_reshape_direction = +__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, + reshape_direction_store); + +static ssize_t +array_size_show(struct mddev *mddev, char *page) { if (mddev->external_size) return sprintf(page, "%llu\n", @@ -4120,7 +4626,7 @@ array_size_show(mddev_t *mddev, char *page) } static ssize_t -array_size_store(mddev_t *mddev, const char *buf, size_t len) +array_size_store(struct mddev *mddev, const char *buf, size_t len) { sector_t sectors; @@ -4164,6 +4670,7 @@ static struct attribute *md_default_attrs[] = { &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, + &md_reshape_direction.attr, &md_array_size.attr, &max_corr_read_errors.attr, NULL, @@ -4171,6 +4678,7 @@ static struct attribute *md_default_attrs[] = { static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, + &md_last_scan_mode.attr, &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, @@ -4195,16 +4703,25 @@ static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); - mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->show) return -EIO; + spin_lock(&all_mddevs_lock); + if (list_empty(&mddev->all_mddevs)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + rv = mddev_lock(mddev); if (!rv) { rv = entry->show(mddev, page); mddev_unlock(mddev); } + mddev_put(mddev); return rv; } @@ -4213,26 +4730,34 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); - mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + spin_lock(&all_mddevs_lock); + if (list_empty(&mddev->all_mddevs)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + if (entry->store == new_dev_store) + flush_workqueue(md_misc_wq); rv = mddev_lock(mddev); - if (mddev->hold_active == UNTIL_IOCTL) - mddev->hold_active = 0; if (!rv) { rv = entry->store(mddev, page, length); mddev_unlock(mddev); } + mddev_put(mddev); return rv; } static void md_free(struct kobject *ko) { - mddev_t *mddev = container_of(ko, mddev_t, kobj); + struct mddev *mddev = container_of(ko, struct mddev, kobj); if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); @@ -4261,7 +4786,7 @@ int mdp_major = 0; static void mddev_delayed_delete(struct work_struct *ws) { - mddev_t *mddev = container_of(ws, mddev_t, del_work); + struct mddev *mddev = container_of(ws, struct mddev, del_work); sysfs_remove_group(&mddev->kobj, &md_bitmap_group); kobject_del(&mddev->kobj); @@ -4271,7 +4796,7 @@ static void mddev_delayed_delete(struct work_struct *ws) static int md_alloc(dev_t dev, char *name) { static DEFINE_MUTEX(disks_mutex); - mddev_t *mddev = mddev_find(dev); + struct mddev *mddev = mddev_find(dev); struct gendisk *disk; int partitioned; int shift; @@ -4298,7 +4823,7 @@ static int md_alloc(dev_t dev, char *name) if (name) { /* Need to ensure that 'name' is not a duplicate. */ - mddev_t *mddev2; + struct mddev *mddev2; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev2, &all_mddevs, all_mddevs) @@ -4317,6 +4842,7 @@ static int md_alloc(dev_t dev, char *name) mddev->queue->queuedata = mddev; blk_queue_make_request(mddev->queue, md_make_request); + blk_set_stacking_limits(&mddev->queue->limits); disk = alloc_disk(1 << shift); if (!disk) { @@ -4335,13 +4861,19 @@ static int md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; + blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); /* Allow extended partitions. This makes the * 'mdp' device redundant, but we can't really * remove it now. */ disk->flags |= GENHD_FL_EXT_DEVT; - add_disk(disk); mddev->gendisk = disk; + /* As soon as we call add_disk(), another thread could get + * through to md_open, so make sure it doesn't get too far + */ + mutex_lock(&mddev->open_mutex); + add_disk(disk); + error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk_to_dev(disk)->kobj, "%s", "md"); if (error) { @@ -4355,8 +4887,7 @@ static int md_alloc(dev_t dev, char *name) if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_bitmap_group)) printk(KERN_DEBUG "pointless warning\n"); - - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + mutex_unlock(&mddev->open_mutex); abort: mutex_unlock(&disks_mutex); if (!error && mddev->kobj.sd) { @@ -4394,7 +4925,7 @@ static int add_named_array(const char *val, struct kernel_param *kp) static void md_safemode_timeout(unsigned long data) { - mddev_t *mddev = (mddev_t *) data; + struct mddev *mddev = (struct mddev *) data; if (!atomic_read(&mddev->writes_pending)) { mddev->safemode = 1; @@ -4406,11 +4937,11 @@ static void md_safemode_timeout(unsigned long data) static int start_dirty_degraded; -int md_run(mddev_t *mddev) +int md_run(struct mddev *mddev) { int err; - mdk_rdev_t *rdev; - struct mdk_personality *pers; + struct md_rdev *rdev; + struct md_personality *pers; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. */ @@ -4441,7 +4972,7 @@ int md_run(mddev_t *mddev) * the only valid external interface is through the md * device. */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -4473,7 +5004,7 @@ int md_run(mddev_t *mddev) } if (mddev->bio_set == NULL) - mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); + mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -4508,11 +5039,11 @@ int md_run(mddev_t *mddev) * configuration. */ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - mdk_rdev_t *rdev2; + struct md_rdev *rdev2; int warned = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) + rdev_for_each(rdev2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -4555,7 +5086,8 @@ int md_run(mddev_t *mddev) err = -EINVAL; mddev->pers->stop(mddev); } - if (err == 0 && mddev->pers->sync_request) { + if (err == 0 && mddev->pers->sync_request && + (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = bitmap_create(mddev); if (err) { printk(KERN_ERR "%s: failed to create bitmap (%d)\n", @@ -4589,22 +5121,16 @@ int md_run(mddev_t *mddev) mddev->in_sync = 1; smp_wmb(); mddev->ready = 1; - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; - } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (mddev->flags) + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - md_wakeup_thread(mddev->thread); - md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ - md_new_event(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_action); @@ -4613,7 +5139,7 @@ int md_run(mddev_t *mddev) } EXPORT_SYMBOL_GPL(md_run); -static int do_md_run(mddev_t *mddev) +static int do_md_run(struct mddev *mddev) { int err; @@ -4625,6 +5151,10 @@ static int do_md_run(mddev_t *mddev) bitmap_destroy(mddev); goto out; } + + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); mddev->changed = 1; @@ -4633,7 +5163,7 @@ out: return err; } -static int restart_array(mddev_t *mddev) +static int restart_array(struct mddev *mddev) { struct gendisk *disk = mddev->gendisk; @@ -4657,33 +5187,7 @@ static int restart_array(mddev_t *mddev) return 0; } -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); - - return 0; -} - -void restore_bitmap_write_access(struct file *file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - atomic_set(&inode->i_writecount, 1); - spin_unlock(&inode->i_lock); -} - -static void md_clean(mddev_t *mddev) +static void md_clean(struct mddev *mddev) { mddev->array_sectors = 0; mddev->external_size = 0; @@ -4707,11 +5211,12 @@ static void md_clean(mddev_t *mddev) mddev->events = 0; mddev->can_decrease_events = 0; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; @@ -4719,20 +5224,21 @@ static void md_clean(mddev_t *mddev) mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; + mddev->merge_check_needed = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.default_space = 0; mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; - mddev->plug = NULL; } -static void __md_stop_writes(mddev_t *mddev) +static void __md_stop_writes(struct mddev *mddev) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); } del_timer_sync(&mddev->safemode_timer); @@ -4740,22 +5246,23 @@ static void __md_stop_writes(mddev_t *mddev) bitmap_flush(mddev); md_super_wait(mddev); - if (!mddev->in_sync || mddev->flags) { + if (mddev->ro == 0 && + (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; md_update_sb(mddev, 1); } } -void md_stop_writes(mddev_t *mddev) +void md_stop_writes(struct mddev *mddev) { - mddev_lock(mddev); + mddev_lock_nointr(mddev); __md_stop_writes(mddev); mddev_unlock(mddev); } EXPORT_SYMBOL_GPL(md_stop_writes); -void md_stop(mddev_t *mddev) +static void __md_stop(struct mddev *mddev) { mddev->ready = 0; mddev->pers->stop(mddev); @@ -4765,14 +5272,49 @@ void md_stop(mddev_t *mddev) mddev->pers = NULL; clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } + +void md_stop(struct mddev *mddev) +{ + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop(mddev); + bitmap_destroy(mddev); + if (mddev->bio_set) + bioset_free(mddev->bio_set); +} + EXPORT_SYMBOL_GPL(md_stop); -static int md_set_readonly(mddev_t *mddev, int is_open) +static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) { int err = 0; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); + mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > is_open) { + if (atomic_read(&mddev->openers) > !!bdev || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } err = -EBUSY; goto out; } @@ -4786,7 +5328,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_state); - err = 0; + err = 0; } out: mutex_unlock(&mddev->open_mutex); @@ -4797,38 +5339,56 @@ out: * 0 - completely stop and dis-assemble array * 2 - stop but do not disassemble array */ -static int do_md_stop(mddev_t * mddev, int mode, int is_open) +static int do_md_stop(struct mddev * mddev, int mode, + struct block_device *bdev) { struct gendisk *disk = mddev->gendisk; - mdk_rdev_t *rdev; + struct md_rdev *rdev; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > is_open || - mddev->sysfs_active) { + if (atomic_read(&mddev->openers) > !!bdev || + mddev->sysfs_active || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } return -EBUSY; } - if (mddev->pers) { if (mddev->ro) set_disk_ro(disk, 0); __md_stop_writes(mddev); - md_stop(mddev); + __md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; - mddev->queue->unplug_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + sysfs_unlink_rdev(mddev, rdev); set_capacity(disk, 0); mutex_unlock(&mddev->open_mutex); @@ -4847,7 +5407,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) bitmap_destroy(mddev); if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); fput(mddev->bitmap_info.file); mddev->bitmap_info.file = NULL; } @@ -4867,9 +5426,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } #ifndef MODULE -static void autorun_array(mddev_t *mddev) +static void autorun_array(struct mddev *mddev) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; int err; if (list_empty(&mddev->disks)) @@ -4877,7 +5436,7 @@ static void autorun_array(mddev_t *mddev) printk(KERN_INFO "md: running: "); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } @@ -4886,7 +5445,7 @@ static void autorun_array(mddev_t *mddev) err = do_md_run(mddev); if (err) { printk(KERN_WARNING "md: do_md_run() returned %d\n", err); - do_md_stop(mddev, 0, 0); + do_md_stop(mddev, 0, NULL); } } @@ -4904,8 +5463,8 @@ static void autorun_array(mddev_t *mddev) */ static void autorun_devices(int part) { - mdk_rdev_t *rdev0, *rdev, *tmp; - mddev_t *mddev; + struct md_rdev *rdev0, *rdev, *tmp; + struct mddev *mddev; char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: autorun ...\n"); @@ -4914,7 +5473,7 @@ static void autorun_devices(int part) dev_t dev; LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, - mdk_rdev_t, same_set); + struct md_rdev, same_set); printk(KERN_INFO "md: considering %s ...\n", bdevname(rdev0->bdev,b)); @@ -5000,14 +5559,15 @@ static int get_version(void __user * arg) return 0; } -static int get_array_info(mddev_t * mddev, void __user * arg) +static int get_array_info(struct mddev * mddev, void __user * arg) { mdu_array_info_t info; int nr,working,insync,failed,spare; - mdk_rdev_t *rdev; + struct md_rdev *rdev; - nr=working=insync=failed=spare=0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + nr = working = insync = failed = spare = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -5019,6 +5579,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) spare++; } } + rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; @@ -5038,7 +5599,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); if (mddev->bitmap && mddev->bitmap_info.offset) - info.state = (1<<MD_SB_BITMAP_PRESENT); + info.state |= (1<<MD_SB_BITMAP_PRESENT); info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; @@ -5053,22 +5614,19 @@ static int get_array_info(mddev_t * mddev, void __user * arg) return 0; } -static int get_bitmap_file(mddev_t * mddev, void __user * arg) +static int get_bitmap_file(struct mddev * mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ char *ptr, *buf = NULL; int err = -ENOMEM; - if (md_allow_write(mddev)) - file = kmalloc(sizeof(*file), GFP_NOIO); - else - file = kmalloc(sizeof(*file), GFP_KERNEL); + file = kmalloc(sizeof(*file), GFP_NOIO); if (!file) goto out; /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->file) { + if (!mddev->bitmap || !mddev->bitmap->storage.file) { file->pathname[0] = '\0'; goto copy_out; } @@ -5077,7 +5635,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) if (!buf) goto out; - ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); + ptr = d_path(&mddev->bitmap->storage.file->f_path, + buf, sizeof(file->pathname)); if (IS_ERR(ptr)) goto out; @@ -5093,15 +5652,16 @@ out: return err; } -static int get_disk_info(mddev_t * mddev, void __user * arg) +static int get_disk_info(struct mddev * mddev, void __user * arg) { mdu_disk_info_t info; - mdk_rdev_t *rdev; + struct md_rdev *rdev; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - rdev = find_rdev_nr(mddev, info.number); + rcu_read_lock(); + rdev = find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -5120,6 +5680,7 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) info.raid_disk = -1; info.state = (1<<MD_DISK_REMOVED); } + rcu_read_unlock(); if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -5127,10 +5688,10 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) return 0; } -static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) +static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) { char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - mdk_rdev_t *rdev; + struct md_rdev *rdev; dev_t dev = MKDEV(info->major,info->minor); if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) @@ -5147,8 +5708,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) return PTR_ERR(rdev); } if (!list_empty(&mddev->disks)) { - mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, - mdk_rdev_t, same_set); + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { @@ -5196,15 +5758,21 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; set_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; + rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. validate_super(mddev, rdev); - if (test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = rdev->raid_disk; - else - rdev->saved_raid_disk = -1; + if ((info->state & (1<<MD_DISK_SYNC)) && + rdev->raid_disk != info->raid_disk) { + /* This was a hot-add request, but events doesn't + * match, so reject it. + */ + export_rdev(rdev); + return -EINVAL; + } clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY)) @@ -5230,10 +5798,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else sysfs_notify_dirent_safe(rdev->sysfs_state); - md_update_sb(mddev, 1); + set_bit(MD_CHANGE_DEVS, &mddev->flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (!err) + md_new_event(mddev); md_wakeup_thread(mddev->thread); return err; } @@ -5286,15 +5856,18 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) return 0; } -static int hot_remove_disk(mddev_t * mddev, dev_t dev) +static int hot_remove_disk(struct mddev * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; - mdk_rdev_t *rdev; + struct md_rdev *rdev; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(mddev, rdev); + if (rdev->raid_disk >= 0) goto busy; @@ -5309,11 +5882,11 @@ busy: return -EBUSY; } -static int hot_add_disk(mddev_t * mddev, dev_t dev) +static int hot_add_disk(struct mddev * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; int err; - mdk_rdev_t *rdev; + struct md_rdev *rdev; if (!mddev->pers) return -ENODEV; @@ -5383,9 +5956,9 @@ abort_export: return err; } -static int set_bitmap_file(mddev_t *mddev, int fd) +static int set_bitmap_file(struct mddev *mddev, int fd) { - int err; + int err = 0; if (mddev->pers) { if (!mddev->pers->quiesce) @@ -5397,6 +5970,7 @@ static int set_bitmap_file(mddev_t *mddev, int fd) if (fd >= 0) { + struct inode *inode; if (mddev->bitmap) return -EEXIST; /* cannot add when bitmap is present */ mddev->bitmap_info.file = fget(fd); @@ -5407,10 +5981,21 @@ static int set_bitmap_file(mddev_t *mddev, int fd) return -EBADF; } - err = deny_bitmap_write_access(mddev->bitmap_info.file); - if (err) { + inode = mddev->bitmap_info.file->f_mapping->host; + if (!S_ISREG(inode->i_mode)) { + printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", + mdname(mddev)); + err = -EBADF; + } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { + printk(KERN_ERR "%s: error: bitmap file must open for write\n", + mdname(mddev)); + err = -EBADF; + } else if (atomic_read(&inode->i_writecount) != 1) { printk(KERN_ERR "%s: error: bitmap file is already in use\n", mdname(mddev)); + err = -EBUSY; + } + if (err) { fput(mddev->bitmap_info.file); mddev->bitmap_info.file = NULL; return err; @@ -5433,10 +6018,8 @@ static int set_bitmap_file(mddev_t *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); + if (mddev->bitmap_info.file) fput(mddev->bitmap_info.file); - } mddev->bitmap_info.file = NULL; } @@ -5456,7 +6039,7 @@ static int set_bitmap_file(mddev_t *mddev, int fd) * The minor and patch _version numbers are also kept incase the * super_block handler wishes to interpret them. */ -static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) { if (info->raid_disks == 0) { @@ -5509,6 +6092,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); mddev->bitmap_info.offset = 0; mddev->reshape_position = MaxSector; @@ -5522,11 +6106,12 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; return 0; } -void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) +void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); @@ -5537,9 +6122,9 @@ void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) } EXPORT_SYMBOL(md_set_array_sectors); -static int update_size(mddev_t *mddev, sector_t num_sectors) +static int update_size(struct mddev *mddev, sector_t num_sectors) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; int rv; int fit = (num_sectors == 0); @@ -5556,12 +6141,10 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) */ if (mddev->sync_thread) return -EBUSY; - if (mddev->bitmap) - /* Sorry, cannot grow a bitmap yet, just remove it, - * grow, and re-add. - */ - return -EBUSY; - list_for_each_entry(rdev, &mddev->disks, same_set) { + if (mddev->ro) + return -EROFS; + + rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) @@ -5575,22 +6158,41 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) return rv; } -static int update_raid_disks(mddev_t *mddev, int raid_disks) +static int update_raid_disks(struct mddev *mddev, int raid_disks) { int rv; + struct md_rdev *rdev; /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; + if (mddev->ro) + return -EROFS; if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; + + rdev_for_each(rdev, mddev) { + if (mddev->raid_disks < raid_disks && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (mddev->raid_disks > raid_disks && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } + mddev->delta_disks = raid_disks - mddev->raid_disks; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; + else if (mddev->delta_disks > 0) + mddev->reshape_backwards = 0; rv = mddev->pers->check_reshape(mddev); - if (rv < 0) + if (rv < 0) { mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + } return rv; } @@ -5603,7 +6205,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) * Any differences that cannot be handled will cause an error. * Normally, only one change can be managed at a time. */ -static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) +static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) { int rv = 0; int cnt = 0; @@ -5673,6 +6275,8 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) return -EINVAL; mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.default_space; mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); if (!rv) @@ -5684,7 +6288,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) /* remove the bitmap */ if (!mddev->bitmap) return -ENOENT; - if (mddev->bitmap->file) + if (mddev->bitmap->storage.file) return -EINVAL; mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); @@ -5696,19 +6300,25 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) return rv; } -static int set_disk_faulty(mddev_t *mddev, dev_t dev) +static int set_disk_faulty(struct mddev *mddev, dev_t dev) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; + int err = 0; if (mddev->pers == NULL) return -ENODEV; - rdev = find_rdev(mddev, dev); + rcu_read_lock(); + rdev = find_rdev_rcu(mddev, dev); if (!rdev) - return -ENODEV; - - md_error(mddev, rdev); - return 0; + err = -ENODEV; + else { + md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + err = -EBUSY; + } + rcu_read_unlock(); + return err; } /* @@ -5719,7 +6329,7 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) */ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - mddev_t *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = bdev->bd_disk->private_data; geo->heads = 2; geo->sectors = 4; @@ -5727,39 +6337,74 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } +static inline bool md_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case BLKROSET: + case GET_ARRAY_INFO: + case GET_BITMAP_FILE: + case GET_DISK_INFO: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case PRINT_RAID_DEBUG: + case RAID_AUTORUN: + case RAID_VERSION: + case RESTART_ARRAY_RW: + case RUN_ARRAY: + case SET_ARRAY_INFO: + case SET_BITMAP_FILE: + case SET_DISK_FAULTY: + case STOP_ARRAY: + case STOP_ARRAY_RO: + return true; + default: + return false; + } +} + static int md_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { int err = 0; void __user *argp = (void __user *)arg; - mddev_t *mddev = NULL; + struct mddev *mddev = NULL; int ro; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + if (!md_ioctl_valid(cmd)) + return -ENOTTY; + + switch (cmd) { + case RAID_VERSION: + case GET_ARRAY_INFO: + case GET_DISK_INFO: + break; + default: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } /* * Commands dealing with the RAID driver but not any * particular array: */ - switch (cmd) - { - case RAID_VERSION: - err = get_version(argp); - goto done; + switch (cmd) { + case RAID_VERSION: + err = get_version(argp); + goto done; - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto done; + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; #ifndef MODULE - case RAID_AUTORUN: - err = 0; - autostart_arrays(arg); - goto done; + case RAID_AUTORUN: + err = 0; + autostart_arrays(arg); + goto done; #endif - default:; + default:; } /* @@ -5773,6 +6418,51 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } + /* Some actions do not requires the mutex */ + switch (cmd) { + case GET_ARRAY_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_array_info(mddev, argp); + goto abort; + + case GET_DISK_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_disk_info(mddev, argp); + goto abort; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto abort; + } + + if (cmd == ADD_NEW_DISK) + /* need to ensure md_delayed_delete() has completed */ + flush_workqueue(md_misc_wq); + + if (cmd == HOT_REMOVE_DISK) + /* need to ensure recovery thread has run */ + wait_event_interruptible_timeout(mddev->sb_wait, + !test_bit(MD_RECOVERY_NEEDED, + &mddev->flags), + msecs_to_jiffies(5000)); + if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { + /* Need to flush page cache, and ensure no-one else opens + * and writes + */ + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > 1) { + mutex_unlock(&mddev->open_mutex); + err = -EBUSY; + goto abort; + } + set_bit(MD_STILL_CLOSED, &mddev->flags); + mutex_unlock(&mddev->open_mutex); + sync_blockdev(bdev); + } err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -5781,50 +6471,44 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } - switch (cmd) - { - case SET_ARRAY_INFO: - { - mdu_array_info_t info; - if (!arg) - memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, argp, sizeof(info))) { - err = -EFAULT; - goto abort_unlock; - } - if (mddev->pers) { - err = update_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't update" - " array info. %d\n", err); - goto abort_unlock; - } - goto done_unlock; - } - if (!list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: array %s already has disks!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - if (mddev->raid_disks) { - printk(KERN_WARNING - "md: array %s already initialised!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - err = set_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't set" - " array info. %d\n", err); - goto abort_unlock; - } + if (cmd == SET_ARRAY_INFO) { + mdu_array_info_t info; + if (!arg) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't update" + " array info. %d\n", err); + goto abort_unlock; } goto done_unlock; - - default:; + } + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: array %s already has disks!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->raid_disks) { + printk(KERN_WARNING + "md: array %s already initialised!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + goto done_unlock; } /* @@ -5843,60 +6527,73 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, /* * Commands even a read-only array can execute: */ - switch (cmd) - { - case GET_ARRAY_INFO: - err = get_array_info(mddev, argp); - goto done_unlock; + switch (cmd) { + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto done_unlock; - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto done_unlock; + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; - case GET_DISK_INFO: - err = get_disk_info(mddev, argp); - goto done_unlock; + case STOP_ARRAY: + err = do_md_stop(mddev, 0, bdev); + goto done_unlock; - case RESTART_ARRAY_RW: - err = restart_array(mddev); - goto done_unlock; + case STOP_ARRAY_RO: + err = md_set_readonly(mddev, bdev); + goto done_unlock; - case STOP_ARRAY: - err = do_md_stop(mddev, 0, 1); - goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto done_unlock; - case STOP_ARRAY_RO: - err = md_set_readonly(mddev, 1); + case ADD_NEW_DISK: + /* We can support ADD_NEW_DISK on read-only arrays + * on if we are re-adding a preexisting device. + * So require mddev->pers and MD_DISK_SYNC. + */ + if (mddev->pers) { + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else if (!(info.state & (1<<MD_DISK_SYNC))) + /* Need to clear read-only for this */ + break; + else + err = add_new_disk(mddev, &info); goto done_unlock; + } + break; - case BLKROSET: - if (get_user(ro, (int __user *)(arg))) { - err = -EFAULT; - goto done_unlock; - } - err = -EINVAL; + case BLKROSET: + if (get_user(ro, (int __user *)(arg))) { + err = -EFAULT; + goto done_unlock; + } + err = -EINVAL; - /* if the bdev is going readonly the value of mddev->ro - * does not matter, no writes are coming - */ - if (ro) - goto done_unlock; + /* if the bdev is going readonly the value of mddev->ro + * does not matter, no writes are coming + */ + if (ro) + goto done_unlock; - /* are we are already prepared for writes? */ - if (mddev->ro != 1) - goto done_unlock; + /* are we are already prepared for writes? */ + if (mddev->ro != 1) + goto done_unlock; - /* transitioning to readauto need only happen for - * arrays that call md_write_start - */ - if (mddev->pers) { - err = restart_array(mddev); - if (err == 0) { - mddev->ro = 2; - set_disk_ro(mddev->gendisk, 0); - } + /* transitioning to readauto need only happen for + * arrays that call md_write_start + */ + if (mddev->pers) { + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); } - goto done_unlock; + } + goto done_unlock; } /* @@ -5911,48 +6608,49 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, mddev->ro = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + /* mddev_unlock will wake thread */ + /* If a device failed while we were read-only, we + * need to make sure the metadata is updated now. + */ + if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + mddev_unlock(mddev); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_DEVS, &mddev->flags) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + mddev_lock_nointr(mddev); + } } else { err = -EROFS; goto abort_unlock; } } - switch (cmd) + switch (cmd) { + case ADD_NEW_DISK: { - case ADD_NEW_DISK: - { - mdu_disk_info_t info; - if (copy_from_user(&info, argp, sizeof(info))) - err = -EFAULT; - else - err = add_new_disk(mddev, &info); - goto done_unlock; - } - - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; - - case HOT_ADD_DISK: - err = hot_add_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_ADD_DISK: + err = hot_add_disk(mddev, new_decode_dev(arg)); + goto done_unlock; - case RUN_ARRAY: - err = do_md_run(mddev); - goto done_unlock; + case RUN_ARRAY: + err = do_md_run(mddev); + goto done_unlock; - case SET_BITMAP_FILE: - err = set_bitmap_file(mddev, (int)arg); - goto done_unlock; + case SET_BITMAP_FILE: + err = set_bitmap_file(mddev, (int)arg); + goto done_unlock; - default: - err = -EINVAL; - goto abort_unlock; + default: + err = -EINVAL; + goto abort_unlock; } done_unlock: @@ -5995,9 +6693,12 @@ static int md_open(struct block_device *bdev, fmode_t mode) * Succeed if we can lock the mddev, which confirms that * it isn't being stopped right now. */ - mddev_t *mddev = mddev_find(bdev->bd_dev); + struct mddev *mddev = mddev_find(bdev->bd_dev); int err; + if (!mddev) + return -ENODEV; + if (mddev->gendisk != bdev->bd_disk) { /* we are racing with mddev_put which is discarding this * bd_disk. @@ -6015,6 +6716,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) err = 0; atomic_inc(&mddev->openers); + clear_bit(MD_STILL_CLOSED, &mddev->flags); mutex_unlock(&mddev->open_mutex); check_disk_change(bdev); @@ -6022,27 +6724,25 @@ static int md_open(struct block_device *bdev, fmode_t mode) return err; } -static int md_release(struct gendisk *disk, fmode_t mode) +static void md_release(struct gendisk *disk, fmode_t mode) { - mddev_t *mddev = disk->private_data; + struct mddev *mddev = disk->private_data; BUG_ON(!mddev); atomic_dec(&mddev->openers); mddev_put(mddev); - - return 0; } static int md_media_changed(struct gendisk *disk) { - mddev_t *mddev = disk->private_data; + struct mddev *mddev = disk->private_data; return mddev->changed; } static int md_revalidate(struct gendisk *disk) { - mddev_t *mddev = disk->private_data; + struct mddev *mddev = disk->private_data; mddev->changed = 0; return 0; @@ -6063,7 +6763,7 @@ static const struct block_device_operations md_fops = static int md_thread(void * arg) { - mdk_thread_t *thread = arg; + struct md_thread *thread = arg; /* * md_thread is a 'system-thread', it's priority should be very @@ -6096,27 +6796,27 @@ static int md_thread(void * arg) clear_bit(THREAD_WAKEUP, &thread->flags); if (!kthread_should_stop()) - thread->run(thread->mddev); + thread->run(thread); } return 0; } -void md_wakeup_thread(mdk_thread_t *thread) +void md_wakeup_thread(struct md_thread *thread) { if (thread) { - dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); + pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); set_bit(THREAD_WAKEUP, &thread->flags); wake_up(&thread->wqueue); } } -mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, - const char *name) +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) { - mdk_thread_t *thread; + struct md_thread *thread; - thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); + thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); if (!thread) return NULL; @@ -6128,7 +6828,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, thread->tsk = kthread_run(md_thread, thread, "%s_%s", mdname(thread->mddev), - name ?: mddev->pers->name); + name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; @@ -6136,17 +6836,24 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, return thread; } -void md_unregister_thread(mdk_thread_t *thread) +void md_unregister_thread(struct md_thread **threadp) { + struct md_thread *thread = *threadp; if (!thread) return; - dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); + pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); + /* Locking ensures that mddev_unlock does not wake_up a + * non-existent thread + */ + spin_lock(&pers_lock); + *threadp = NULL; + spin_unlock(&pers_lock); kthread_stop(thread->tsk); kfree(thread); } -void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +void md_error(struct mddev *mddev, struct md_rdev *rdev) { if (!mddev) { MD_BUG(); @@ -6156,18 +6863,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!rdev || test_bit(Faulty, &rdev->flags)) return; - if (mddev->external) - set_bit(Blocked, &rdev->flags); -/* - dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", - mdname(mddev), - MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), - __builtin_return_address(0),__builtin_return_address(1), - __builtin_return_address(2),__builtin_return_address(3)); -*/ - if (!mddev->pers) - return; - if (!mddev->pers->error_handler) + if (!mddev->pers || !mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) @@ -6186,7 +6882,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) static void status_unused(struct seq_file *seq) { int i = 0; - mdk_rdev_t *rdev; + struct md_rdev *rdev; seq_printf(seq, "unused devices: "); @@ -6203,7 +6899,7 @@ static void status_unused(struct seq_file *seq) } -static void status_resync(struct seq_file *seq, mddev_t * mddev) +static void status_resync(struct seq_file *seq, struct mddev * mddev) { sector_t max_sectors, resync, res; unsigned long dt, db; @@ -6211,9 +6907,14 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) int scale; unsigned int per_milli; - resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); + if (mddev->curr_resync <= 3) + resync = 0; + else + resync = mddev->curr_resync + - atomic_read(&mddev->recovery_active); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; @@ -6268,7 +6969,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) * rt is a sector_t, so could be 32bit or 64bit. * So we divide before multiply in case it is 32bit and close * to the limit. - * We scale the divisor (db) by 32 to avoid loosing precision + * We scale the divisor (db) by 32 to avoid losing precision * near the end of resync when the number of remaining sectors * is close to 'db'. * We then divide rt by 32 after multiplying by db to compensate. @@ -6294,7 +6995,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) { struct list_head *tmp; loff_t l = *pos; - mddev_t *mddev; + struct mddev *mddev; if (l >= 0x10000) return NULL; @@ -6305,7 +7006,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) spin_lock(&all_mddevs_lock); list_for_each(tmp,&all_mddevs) if (!l--) { - mddev = list_entry(tmp, mddev_t, all_mddevs); + mddev = list_entry(tmp, struct mddev, all_mddevs); mddev_get(mddev); spin_unlock(&all_mddevs_lock); return mddev; @@ -6319,7 +7020,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct list_head *tmp; - mddev_t *next_mddev, *mddev = v; + struct mddev *next_mddev, *mddev = v; ++*pos; if (v == (void*)2) @@ -6331,7 +7032,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) else tmp = mddev->all_mddevs.next; if (tmp != &all_mddevs) - next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); + next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); else { next_mddev = (void*)2; *pos = 0x10000; @@ -6346,26 +7047,20 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void md_seq_stop(struct seq_file *seq, void *v) { - mddev_t *mddev = v; + struct mddev *mddev = v; if (mddev && v != (void*)1 && v != (void*)2) mddev_put(mddev); } -struct mdstat_info { - int event; -}; - static int md_seq_show(struct seq_file *seq, void *v) { - mddev_t *mddev = v; + struct mddev *mddev = v; sector_t sectors; - mdk_rdev_t *rdev; - struct mdstat_info *mi = seq->private; - struct bitmap *bitmap; + struct md_rdev *rdev; if (v == (void*)1) { - struct mdk_personality *pers; + struct md_personality *pers; seq_printf(seq, "Personalities : "); spin_lock(&pers_lock); list_for_each_entry(pers, &pers_list, list) @@ -6373,7 +7068,7 @@ static int md_seq_show(struct seq_file *seq, void *v) spin_unlock(&pers_lock); seq_printf(seq, "\n"); - mi->event = atomic_read(&md_event_count); + seq->poll_event = atomic_read(&md_event_count); return 0; } if (v == (void*)2) { @@ -6396,7 +7091,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } sectors = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -6405,8 +7100,11 @@ static int md_seq_show(struct seq_file *seq, void *v) if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; - } else if (rdev->raid_disk < 0) + } + if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ + if (test_bit(Replacement, &rdev->flags)) + seq_printf(seq, "(R)"); sectors += rdev->sectors; } @@ -6439,7 +7137,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->curr_resync > 2) { status_resync(seq, mddev); seq_printf(seq, "\n "); - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + } else if (mddev->curr_resync >= 1) seq_printf(seq, "\tresync=DELAYED\n "); else if (mddev->recovery_cp < MaxSector) seq_printf(seq, "\tresync=PENDING\n "); @@ -6447,27 +7145,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - if ((bitmap = mddev->bitmap)) { - unsigned long chunk_kb; - unsigned long flags; - spin_lock_irqsave(&bitmap->lock, flags); - chunk_kb = mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - bitmap->pages - bitmap->missing_pages, - bitmap->pages, - (bitmap->pages - bitmap->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->file) { - seq_printf(seq, ", file: "); - seq_path(seq, &bitmap->file->f_path, " \t\n"); - } - - seq_printf(seq, "\n"); - spin_unlock_irqrestore(&bitmap->lock, flags); - } + bitmap_status(seq, mddev->bitmap); seq_printf(seq, "\n"); } @@ -6485,34 +7163,32 @@ static const struct seq_operations md_seq_ops = { static int md_seq_open(struct inode *inode, struct file *file) { + struct seq_file *seq; int error; - struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); - if (mi == NULL) - return -ENOMEM; error = seq_open(file, &md_seq_ops); if (error) - kfree(mi); - else { - struct seq_file *p = file->private_data; - p->private = mi; - mi->event = atomic_read(&md_event_count); - } + return error; + + seq = file->private_data; + seq->poll_event = atomic_read(&md_event_count); return error; } +static int md_unloading; static unsigned int mdstat_poll(struct file *filp, poll_table *wait) { - struct seq_file *m = filp->private_data; - struct mdstat_info *mi = m->private; + struct seq_file *seq = filp->private_data; int mask; + if (md_unloading) + return POLLIN|POLLRDNORM|POLLERR|POLLPRI;; poll_wait(filp, &md_event_waiters, wait); /* always allow read */ mask = POLLIN | POLLRDNORM; - if (mi->event != atomic_read(&md_event_count)) + if (seq->poll_event != atomic_read(&md_event_count)) mask |= POLLERR | POLLPRI; return mask; } @@ -6526,7 +7202,7 @@ static const struct file_operations md_seq_fops = { .poll = mdstat_poll, }; -int register_md_personality(struct mdk_personality *p) +int register_md_personality(struct md_personality *p) { spin_lock(&pers_lock); list_add_tail(&p->list, &pers_list); @@ -6535,7 +7211,7 @@ int register_md_personality(struct mdk_personality *p) return 0; } -int unregister_md_personality(struct mdk_personality *p) +int unregister_md_personality(struct md_personality *p) { printk(KERN_INFO "md: %s personality unregistered\n", p->name); spin_lock(&pers_lock); @@ -6544,9 +7220,9 @@ int unregister_md_personality(struct mdk_personality *p) return 0; } -static int is_mddev_idle(mddev_t *mddev, int init) +static int is_mddev_idle(struct mddev *mddev, int init) { - mdk_rdev_t * rdev; + struct md_rdev * rdev; int idle; int curr_events; @@ -6588,13 +7264,14 @@ static int is_mddev_idle(mddev_t *mddev, int init) return idle; } -void md_done_sync(mddev_t *mddev, int blocks, int ok) +void md_done_sync(struct mddev *mddev, int blocks, int ok) { /* another "blocks" (512byte) blocks have been synced */ atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); if (!ok) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(MD_RECOVERY_ERROR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync .... } @@ -6606,7 +7283,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) * in superblock) before writing, schedule a superblock update * and wait for it to complete. */ -void md_write_start(mddev_t *mddev, struct bio *bi) +void md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) @@ -6641,7 +7318,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } -void md_write_end(mddev_t *mddev) +void md_write_end(struct mddev *mddev) { if (atomic_dec_and_test(&mddev->writes_pending)) { if (mddev->safemode == 2) @@ -6660,7 +7337,7 @@ void md_write_end(mddev_t *mddev) * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock * is dropped, so return -EAGAIN after notifying userspace. */ -int md_allow_write(mddev_t *mddev) +int md_allow_write(struct mddev *mddev) { if (!mddev->pers) return 0; @@ -6690,49 +7367,51 @@ int md_allow_write(mddev_t *mddev) } EXPORT_SYMBOL_GPL(md_allow_write); -void md_unplug(mddev_t *mddev) -{ - if (mddev->queue) - blk_unplug(mddev->queue); - if (mddev->plug) - mddev->plug->unplug_fn(mddev->plug); -} - #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -void md_do_sync(mddev_t *mddev) +#define UPDATE_FREQUENCY (5*60*HZ) +void md_do_sync(struct md_thread *thread) { - mddev_t *mddev2; + struct mddev *mddev = thread->mddev; + struct mddev *mddev2; unsigned int currspeed = 0, window; sector_t max_sectors,j, io_sectors; unsigned long mark[SYNC_MARKS]; + unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; struct list_head *tmp; sector_t last_check; int skipped = 0; - mdk_rdev_t *rdev; - char *desc; + struct md_rdev *rdev; + char *desc, *action = NULL; + struct blk_plug plug; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (mddev->ro) /* never try to sync a read-only array */ + if (mddev->ro) {/* never try to sync a read-only array */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); return; + } if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { desc = "data-check"; - else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + action = "check"; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { desc = "requested-resync"; - else + action = "repair"; + } else desc = "resync"; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) desc = "reshape"; else desc = "recovery"; + mddev->last_sync_action = action ?: desc; + /* we overload curr_resync somewhat here. * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync @@ -6753,9 +7432,6 @@ void md_do_sync(mddev_t *mddev) mddev->curr_resync = 2; try_again: - if (kthread_should_stop()) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; for_each_mddev(mddev2, tmp) { @@ -6780,7 +7456,7 @@ void md_do_sync(mddev_t *mddev) * be caught by 'softlockup' */ prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { printk(KERN_INFO "md: delaying %s of %s" " until %s has finished (they" @@ -6804,7 +7480,7 @@ void md_do_sync(mddev_t *mddev) * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; @@ -6812,19 +7488,32 @@ void md_do_sync(mddev_t *mddev) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->dev_sectors; + max_sectors = mddev->resync_max_sectors; else { /* recovery follows the physical size of devices */ max_sectors = mddev->dev_sectors; j = MaxSector; rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) j = rdev->recovery_offset; rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. + */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } } printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); @@ -6849,20 +7538,25 @@ void md_do_sync(mddev_t *mddev) * Tune reconstruction: */ window = 32*(PAGE_SIZE/512); - printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", - window/2,(unsigned long long) max_sectors/2); + printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", + window/2, (unsigned long long)max_sectors/2); atomic_set(&mddev->recovery_active, 0); last_check = 0; if (j>2) { - printk(KERN_INFO + printk(KERN_INFO "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; - } + } else + mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + md_new_event(mddev); + update_time = jiffies; + blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -6872,19 +7566,24 @@ void md_do_sync(mddev_t *mddev) ((mddev->curr_resync > mddev->curr_resync_completed && (mddev->curr_resync - mddev->curr_resync_completed) > (max_sectors >> 4)) || + time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || (j - mddev->curr_resync_completed)*2 >= mddev->resync_max - mddev->curr_resync_completed )) { /* time to update curr_resync_completed */ - md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + j > mddev->recovery_cp) + mddev->recovery_cp = j; + update_time = jiffies; set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - while (j >= mddev->resync_max && !kthread_should_stop()) { + while (j >= mddev->resync_max && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* As this condition is controlled by user-space, * we can block indefinitely, so use '_interruptible' * to avoid triggering warnings. @@ -6892,17 +7591,18 @@ void md_do_sync(mddev_t *mddev) flush_signals(current); /* just in case */ wait_event_interruptible(mddev->recovery_wait, mddev->resync_max > j - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, + &mddev->recovery)); } - if (kthread_should_stop()) - goto interrupted; + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; + break; } if (!skipped) { /* actual IO requested */ @@ -6910,11 +7610,15 @@ void md_do_sync(mddev_t *mddev) atomic_add(sectors, &mddev->recovery_active); } + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + j += sectors; - if (j>1) mddev->curr_resync = j; + if (j > 2) + mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) - /* this is the earliers that rebuilt will be + /* this is the earliest that rebuild will be * visible in /proc/mdstat */ md_new_event(mddev); @@ -6923,10 +7627,6 @@ void md_do_sync(mddev_t *mddev) continue; last_check = io_sectors; - - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - break; - repeat: if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ @@ -6939,10 +7639,8 @@ void md_do_sync(mddev_t *mddev) last_mark = next; } - - if (kthread_should_stop()) - goto interrupted; - + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; /* * this loop exits only if either when we are slower than @@ -6952,7 +7650,6 @@ void md_do_sync(mddev_t *mddev) * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ - md_unplug(mddev); cond_resched(); currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 @@ -6966,13 +7663,13 @@ void md_do_sync(mddev_t *mddev) } } } - printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); + printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, + test_bit(MD_RECOVERY_INTR, &mddev->recovery) + ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ - out: - md_unplug(mddev); - + blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ @@ -6986,7 +7683,13 @@ void md_do_sync(mddev_t *mddev) printk(KERN_INFO "md: checkpointing %s of %s.\n", desc, mdname(mddev)); - mddev->recovery_cp = mddev->curr_resync; + if (test_bit(MD_RECOVERY_ERROR, + &mddev->recovery)) + mddev->recovery_cp = + mddev->curr_resync_completed; + else + mddev->recovery_cp = + mddev->curr_resync; } } else mddev->recovery_cp = MaxSector; @@ -6994,7 +7697,7 @@ void md_do_sync(mddev_t *mddev) if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && @@ -7004,9 +7707,9 @@ void md_do_sync(mddev_t *mddev) rcu_read_unlock(); } } + skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); - skip: if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) @@ -7019,105 +7722,65 @@ void md_do_sync(mddev_t *mddev) set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); return; - - interrupted: - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } EXPORT_SYMBOL_GPL(md_do_sync); - -static int remove_and_add_spares(mddev_t *mddev) +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; int spares = 0; + int removed = 0; - mddev->curr_resync_completed = 0; - - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0 && + rdev_for_each(rdev, mddev) + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( - mddev, rdev->raid_disk)==0) { - char nm[20]; - sprintf(nm,"rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + mddev, rdev) == 0) { + sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; + removed++; } } + if (removed && mddev->kobj.sd) + sysfs_notify(&mddev->kobj, NULL, "degraded"); - if (mddev->degraded && !mddev->recovery_disabled) { - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Blocked, &rdev->flags)) - spares++; - if (rdev->raid_disk < 0 - && !test_bit(Faulty, &rdev->flags)) { - rdev->recovery_offset = 0; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) - /* failure here is OK */; - spares++; - md_new_event(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } else - break; - } - } - } - return spares; -} + if (this) + goto no_add; -static void reap_sync_thread(mddev_t *mddev) -{ - mdk_rdev_t *rdev; + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk >= 0) + continue; + if (test_bit(Faulty, &rdev->flags)) + continue; + if (mddev->ro && + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; - /* resync has finished, collect result */ - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); + if (rdev->saved_raid_disk < 0) + rdev->recovery_offset = 0; + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; + spares++; + md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + } } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - md_update_sb(mddev, 1); - - /* if array is no-longer degraded, then any saved_raid_disk - * information must be scrapped - */ - if (!mddev->degraded) - list_for_each_entry(rdev, &mddev->disks, same_set) - rdev->saved_raid_disk = -1; - - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); +no_add: + if (removed) + set_bit(MD_CHANGE_DEVS, &mddev->flags); + return spares; } /* @@ -7142,14 +7805,14 @@ static void reap_sync_thread(mddev_t *mddev) * 5/ If array is degraded, try to add spares devices * 6/ If array has spares or is not in-sync, start a resync thread. */ -void md_check_recovery(mddev_t *mddev) +void md_check_recovery(struct mddev *mddev) { + if (mddev->suspended) + return; + if (mddev->bitmap) bitmap_daemon_work(mddev); - if (mddev->ro) - return; - if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { printk(KERN_INFO "md: %s in immediate safe mode\n", @@ -7162,7 +7825,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || + (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || @@ -7175,23 +7838,19 @@ void md_check_recovery(mddev_t *mddev) int spares = 0; if (mddev->ro) { - /* Only thing we do on a ro array is remove - * failed devices. + /* On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself + * is in-sync. + * As we only add devices that are already in-sync, + * we can activate the spares immediately. */ - mdk_rdev_t *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - test_bit(Faulty, &rdev->flags) && - atomic_read(&rdev->nr_pending)==0) { - if (mddev->pers->hot_remove_disk( - mddev, rdev->raid_disk)==0) { - char nm[20]; - sprintf(nm,"rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - rdev->raid_disk = -1; - } - } + remove_and_add_spares(mddev, NULL); + /* There is no thread, but we need to call + * ->spare_active and clear saved_raid_disk + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } @@ -7214,7 +7873,7 @@ void md_check_recovery(mddev_t *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags) + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && @@ -7224,26 +7883,27 @@ void md_check_recovery(mddev_t *mddev) goto unlock; } if (mddev->sync_thread) { - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ + mddev->curr_resync_completed = 0; set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); /* Clear some bits that don't mean anything, but * might be left set */ clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) goto unlock; /* no recovery is running. * remove any failed drives, then * add spares if possible. - * Spare are also removed and re-added, to allow + * Spares are also removed and re-added, to allow * the personality to fail the re-add. */ @@ -7254,7 +7914,7 @@ void md_check_recovery(mddev_t *mddev) goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev))) { + } else if ((spares = remove_and_add_spares(mddev, NULL))) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -7267,7 +7927,7 @@ void md_check_recovery(mddev_t *mddev) goto unlock; if (mddev->pers->sync_request) { - if (spares && mddev->bitmap && ! mddev->bitmap->file) { + if (spares) { /* We are adding a device or devices to an array * which has the bitmap stored on all devices. * So make sure all bitmap pages get written @@ -7293,6 +7953,8 @@ void md_check_recovery(mddev_t *mddev) md_new_event(mddev); } unlock: + wake_up(&mddev->sb_wait); + if (!mddev->sync_thread) { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (test_and_clear_bit(MD_RECOVERY_RECOVER, @@ -7304,43 +7966,597 @@ void md_check_recovery(mddev_t *mddev) } } -void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) +void md_reap_sync_thread(struct mddev *mddev) +{ + struct md_rdev *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(&mddev->sync_thread); + wake_up(&resync_wait); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) { + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + } + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + + /* If array is no-longer degraded, then any saved_raid_disk + * information must be scrapped. + */ + if (!mddev->degraded) + rdev_for_each(rdev, mddev) + rdev->saved_raid_disk = -1; + + md_update_sb(mddev, 1); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); +} + +void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags), + !test_bit(Blocked, &rdev->flags) && + !test_bit(BlockedBadBlocks, &rdev->flags), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } EXPORT_SYMBOL(md_wait_for_blocked_rdev); -static int md_notify_reboot(struct notifier_block *this, - unsigned long code, void *x) +void md_finish_reshape(struct mddev *mddev) { - struct list_head *tmp; - mddev_t *mddev; + /* called be personality module when reshape completes. */ + struct md_rdev *rdev; - if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { + rdev_for_each(rdev, mddev) { + if (rdev->data_offset > rdev->new_data_offset) + rdev->sectors += rdev->data_offset - rdev->new_data_offset; + else + rdev->sectors -= rdev->new_data_offset - rdev->data_offset; + rdev->data_offset = rdev->new_data_offset; + } +} +EXPORT_SYMBOL(md_finish_reshape); - printk(KERN_INFO "md: stopping all md devices.\n"); +/* Bad block management. + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so md_is_badblock + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * We return + * 0 if there are no known bad blocks in the range + * 1 if there are known bad block which are all acknowledged + * -1 if there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + /* This could still be the one, earlier ranges + * could not. */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(md_is_badblock); + +/* + * Add a range of bad blocks to the table. + * This might extend the table, or might contract it + * if two adjacent ranges can be merged. + * We binary-search to find the 'insertion' point, then + * decide how best to handle it. + */ +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 1; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; - for_each_mddev(mddev, tmp) - if (mddev_trylock(mddev)) { - /* Force a switch to readonly even array - * appears to still be in use. Hence - * the '100'. + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal */ - md_set_readonly(mddev, 100); - mddev_unlock(mddev); + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; } - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' */ + if (bb->count >= MD_MAX_BADBLOCKS) { + /* No room for more */ + rv = 0; + break; + } else { + int this_sectors = sectors; + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} + +int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) +{ + int rv; + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + rv = md_set_badblocks(&rdev->badblocks, + s, sectors, 0); + if (rv) { + /* Make sure they get written out promptly */ + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); + md_wakeup_thread(rdev->mddev->thread); + } + return rv; +} +EXPORT_SYMBOL_GPL(rdev_set_badblocks); + +/* + * Remove a range of bad blocks from the table. + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + */ +static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. */ - mdelay(1000*1); + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MD_MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} + +int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) +{ + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + return md_clear_badblocks(&rdev->badblocks, + s, sectors); +} +EXPORT_SYMBOL_GPL(rdev_clear_badblocks); + +/* + * Acknowledge all bad blocks in a list. + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void md_ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(md_ack_all_badblocks); + +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} + +#define DO_DEBUG 1 + +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) +{ + unsigned long long sector; + int length; + char newline; +#ifdef DO_DEBUG + /* Allow clearing via sysfs *only* for testing/debugging. + * Normally only a successful write may clear a badblock + */ + int clear = 0; + if (page[0] == '-') { + clear = 1; + page++; + } +#endif /* DO_DEBUG */ + + switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + +#ifdef DO_DEBUG + if (clear) { + md_clear_badblocks(bb, sector, length); + return len; + } +#endif /* DO_DEBUG */ + if (md_set_badblocks(bb, sector, length, !unack)) + return len; + else + return -ENOSPC; +} + +static int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct list_head *tmp; + struct mddev *mddev; + int need_delay = 0; + + for_each_mddev(mddev, tmp) { + if (mddev_trylock(mddev)) { + if (mddev->pers) + __md_stop_writes(mddev); + if (mddev->persistent) + mddev->safemode = 2; + mddev_unlock(mddev); + } + need_delay = 1; + } + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + if (need_delay) + mdelay(1000*1); + return NOTIFY_DONE; } @@ -7352,7 +8568,7 @@ static struct notifier_block md_notifier = { static void md_geninit(void) { - dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); } @@ -7361,7 +8577,7 @@ static int __init md_init(void) { int ret = -ENOMEM; - md_wq = alloc_workqueue("md", WQ_RESCUER, 0); + md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; @@ -7427,7 +8643,7 @@ void md_autodetect_dev(dev_t dev) static void autostart_arrays(int part) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; struct detected_devices_node *node_detected_dev; dev_t dev; int i_scanned, i_passed; @@ -7467,8 +8683,9 @@ static void autostart_arrays(int part) static __exit void md_exit(void) { - mddev_t *mddev; + struct mddev *mddev; struct list_head *tmp; + int delay = 1; blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); @@ -7477,7 +8694,19 @@ static __exit void md_exit(void) unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); + + /* We cannot unload the modules while some process is + * waiting for us in select() or poll() - wake them up + */ + md_unloading = 1; + while (waitqueue_active(&md_event_waiters)) { + /* not safe to leave yet */ + wake_up(&md_event_waiters); + msleep(delay); + delay += delay; + } remove_proc_entry("mdstat", NULL); + for_each_mddev(mddev, tmp) { export_array(mddev); mddev->hold_active = 0; @@ -7519,6 +8748,7 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_check_recovery); +EXPORT_SYMBOL(md_reap_sync_thread); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); |
