diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 3766 |
1 files changed, 3766 insertions, 0 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c new file mode 100644 index 00000000000..04562add192 --- /dev/null +++ b/drivers/md/md.c @@ -0,0 +1,3766 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> + - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> + - kerneld support by Boris Tobotras <boris@xtalk.msk.su> + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> + - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown <neilb@cse.unsw.edu.au>. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/linkage.h> +#include <linux/raid/md.h> +#include <linux/sysctl.h> +#include <linux/devfs_fs_kernel.h> +#include <linux/buffer_head.h> /* for invalidate_bdev */ +#include <linux/suspend.h> + +#include <linux/init.h> + +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +#include <asm/unaligned.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER + +/* 63 partitions with the alternate major number (mdp) */ +#define MdpMinorShift 6 + +#define DEBUG 0 +#define dprintk(x...) 
((void)(DEBUG && printk(x)))
+
+
+#ifndef MODULE
+static void autostart_arrays (int part);
+#endif
+/* Personality table with MAX_PERSONALITY slots; pers_lock is declared next to it (presumably guards it - TODO confirm). */
+static mdk_personality_t *pers[MAX_PERSONALITY];
+static DEFINE_SPINLOCK(pers_lock);
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 1000 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * You can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
+
+static int sysctl_speed_limit_min = 1000;
+static int sysctl_speed_limit_max = 200000;
+
+static struct ctl_table_header *raid_table_header;
+/* Leaf entries "speed_limit_min" and "speed_limit_max": int-sized, mode 0644, handled by proc_dointvec. */
+static ctl_table raid_table[] = {
+	{
+		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
+		.procname	= "speed_limit_min",
+		.data		= &sysctl_speed_limit_min,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
+		.procname	= "speed_limit_max",
+		.data		= &sysctl_speed_limit_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }	/* terminator */
+};
+/* "raid" directory node (mode 0555) whose children are the raid_table entries. */
+static ctl_table raid_dir_table[] = {
+	{
+		.ctl_name	= DEV_RAID,
+		.procname	= "raid",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= raid_table,
+	},
+	{ .ctl_name = 0 }	/* terminator */
+};
+/* "dev" node (mode 0555) with raid_dir_table as its child, giving dev/raid/speed_limit_{min,max}. */
+static ctl_table raid_root_table[] = {
+	{
+		.ctl_name	= CTL_DEV,
+		.procname	= "dev",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= raid_dir_table,
+	},
+	{ .ctl_name = 0 }	/* terminator */
+};
+
+static struct block_device_operations md_fops;
+
+/*
+ * List of all existing md arrays, used to iterate over them.
+ * all_mddevs_lock protects this list.
+ */
+static LIST_HEAD(all_mddevs);
+static DEFINE_SPINLOCK(all_mddevs_lock);
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating + * the list, and to always hold a refcount when unlocked. + * Any code which breaks out of this loop while own + * a reference to the current mddev and must mddev_put it. + */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (({ spin_lock(&all_mddevs_lock); \ + tmp = all_mddevs.next; \ + mddev = NULL;}); \ + ({ if (tmp != &all_mddevs) \ + mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ + spin_unlock(&all_mddevs_lock); \ + if (mddev) mddev_put(mddev); \ + mddev = list_entry(tmp, mddev_t, all_mddevs); \ + tmp != &all_mddevs;}); \ + ({ spin_lock(&all_mddevs_lock); \ + tmp = tmp->next;}) \ + ) + + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio, bio->bi_size); + return 0; +} + +static inline mddev_t *mddev_get(mddev_t *mddev) +{ + atomic_inc(&mddev->active); + return mddev; +} + +static void mddev_put(mddev_t *mddev) +{ + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) + return; + if (!mddev->raid_disks && list_empty(&mddev->disks)) { + list_del(&mddev->all_mddevs); + blk_put_queue(mddev->queue); + kfree(mddev); + } + spin_unlock(&all_mddevs_lock); +} + +static mddev_t * mddev_find(dev_t unit) +{ + mddev_t *mddev, *new = NULL; + + retry: + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + if (new) + kfree(new); + return mddev; + } + + if (new) { + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + return new; + } + spin_unlock(&all_mddevs_lock); + + new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + memset(new, 0, sizeof(*new)); + + new->unit = unit; + if (MAJOR(unit) == MD_MAJOR) + new->md_minor = MINOR(unit); + else + new->md_minor = MINOR(unit) >> MdpMinorShift; + + init_MUTEX(&new->reconfig_sem); + INIT_LIST_HEAD(&new->disks); + INIT_LIST_HEAD(&new->all_mddevs); + 
init_timer(&new->safemode_timer);
+	atomic_set(&new->active, 1);
+
+	new->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!new->queue) {
+		kfree(new);
+		return NULL;
+	}
+
+	/* until the array is set up, every request on this queue fails */
+	blk_queue_make_request(new->queue, md_fail_request);
+
+	goto retry;
+}
+
+/* Take mddev->reconfig_sem; returns non-zero if interrupted (down_interruptible). */
+static inline int mddev_lock(mddev_t * mddev)
+{
+	return down_interruptible(&mddev->reconfig_sem);
+}
+
+/* Take mddev->reconfig_sem without allowing interruption. */
+static inline void mddev_lock_uninterruptible(mddev_t * mddev)
+{
+	down(&mddev->reconfig_sem);
+}
+
+/* Try to take mddev->reconfig_sem; returns the down_trylock() result. */
+static inline int mddev_trylock(mddev_t * mddev)
+{
+	return down_trylock(&mddev->reconfig_sem);
+}
+
+/* Release mddev->reconfig_sem and wake the array's thread, if it has one. */
+static inline void mddev_unlock(mddev_t * mddev)
+{
+	up(&mddev->reconfig_sem);
+
+	if (mddev->thread)
+		md_wakeup_thread(mddev->thread);
+}
+
+/* Return the member of mddev whose desc_nr equals nr, or NULL. */
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+	mdk_rdev_t * rdev;
+	struct list_head *tmp;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->desc_nr == nr)
+			return rdev;
+	}
+	return NULL;
+}
+
+/* Return the member of mddev whose block device number equals dev, or NULL. */
+static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->bdev->bd_dev == dev)
+			return rdev;
+	}
+	return NULL;
+}
+
+/*
+ * Superblock offset for a device: its size in BLOCK_SIZE units
+ * (i_size >> BLOCK_SIZE_BITS) passed through MD_NEW_SIZE_BLOCKS().
+ */
+inline static sector_t calc_dev_sboffset(struct block_device *bdev)
+{
+	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+	return MD_NEW_SIZE_BLOCKS(size);
+}
+
+/*
+ * Usable size of rdev: rdev->sb_offset, masked down to a multiple of
+ * chunk_size/1024 when chunk_size is non-zero.
+ * NOTE(review): the mask only rounds correctly if chunk_size/1024 is a
+ * power of two - confirm callers guarantee that.
+ */
+static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+{
+	sector_t size;
+
+	size = rdev->sb_offset;
+
+	if (chunk_size)
+		size &= ~((sector_t)chunk_size/1024 - 1);
+	return size;
+}
+
+/*
+ * Allocate the page that holds rdev's in-memory superblock.
+ * Logs MD_BUG if a page is already attached.
+ * Returns 0 on success, -ENOMEM if no page could be allocated.
+ */
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+	if (rdev->sb_page)
+		MD_BUG();
+
+	rdev->sb_page = alloc_page(GFP_KERNEL);
+	if (!rdev->sb_page) {
+		printk(KERN_ALERT "md: out of memory.\n");
+		/* an allocation failure is not an invalid argument */
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Release rdev's superblock page, if any, and reset the fields derived from it. */
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+	if (rdev->sb_page) {
+		page_cache_release(rdev->sb_page);
+		rdev->sb_loaded = 0;
+		rdev->sb_page = NULL;
+		rdev->sb_offset = 0;
+		rdev->size = 0;
+	}
+}
+
+
+/* bio completion callback used by sync_page_io(). */
+static int bi_complete(struct bio *bio, unsigned int bytes_done, int
error)
+{
+	if (bio->bi_size)	/* bi_size still non-zero: return 1 without waking the waiter */
+		return 1;
+
+	complete((struct completion*)bio->bi_private);	/* sync_page_io() stores its completion in bi_private */
+	return 0;
+}
+/*
+ * Submit a one-page bio for 'size' bytes at 'sector' on bdev and wait
+ * until bi_complete() signals completion.  BIO_RW_SYNC is ORed into rw.
+ * Returns the BIO_UPTODATE test result: non-zero on success, 0 on failure.
+ * NOTE(review): the bio_alloc() result is used without a NULL check.
+ */
+static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+		   struct page *page, int rw)
+{
+	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	struct completion event;
+	int ret;
+
+	rw |= (1 << BIO_RW_SYNC);
+
+	bio->bi_bdev = bdev;
+	bio->bi_sector = sector;
+	bio_add_page(bio, page, size, 0);
+	init_completion(&event);
+	bio->bi_private = &event;
+	bio->bi_end_io = bi_complete;
+	submit_bio(rw, bio);
+	wait_for_completion(&event);
+
+	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_put(bio);
+	return ret;
+}
+/*
+ * Read MD_SB_BYTES of superblock into rdev->sb_page, unless sb_loaded is
+ * already set.  The sector passed to sync_page_io() is sb_offset << 1.
+ * Returns 0 on success, -EINVAL if no page is attached or the read fails.
+ */
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+	char b[BDEVNAME_SIZE];
+	if (!rdev->sb_page) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	if (rdev->sb_loaded)
+		return 0;
+
+
+	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
+		goto fail;
+	rdev->sb_loaded = 1;
+	return 0;
+
+fail:
+	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
+		bdevname(rdev->bdev,b));
+	return -EINVAL;
+}
+/* Return 1 if all four set_uuid words of the two superblocks match, else 0. */
+static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+	if (	(sb1->set_uuid0 == sb2->set_uuid0) &&
+		(sb1->set_uuid1 == sb2->set_uuid1) &&
+		(sb1->set_uuid2 == sb2->set_uuid2) &&
+		(sb1->set_uuid3 == sb2->set_uuid3))
+
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Compare the first MD_SB_GENERIC_CONSTANT_WORDS * 4 bytes of two
+ * superblocks, ignoring nr_disks.  The comparison runs on heap copies,
+ * so sb1 and sb2 themselves are not modified.
+ * Returns 1 if they match, 0 if they differ or an allocation fails.
+ * NOTE(review): on allocation failure the log message claims the
+ * superblocks are unequal, although nothing was compared.
+ */
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+	int ret;
+	mdp_super_t *tmp1, *tmp2;
+
+	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+	if (!tmp1 || !tmp2) {
+		ret = 0;
+		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+		goto abort;
+	}
+
+	*tmp1 = *sb1;
+	*tmp2 = *sb2;
+
+	/*
+	 * nr_disks is not constant
+	 */
+	tmp1->nr_disks = 0;
+	tmp2->nr_disks = 0;
+
+	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+		ret = 0;
+	else
+		ret = 1;
+
+abort:
+	if (tmp1)
+		kfree(tmp1);
+	if (tmp2)
+		kfree(tmp2);
+
+	return ret;
+}
+/* Checksum of a 0.90 superblock, computed over MD_SB_BYTES with sb_csum temporarily zeroed. */
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+	unsigned int disk_csum, csum;
+
+	disk_csum = sb->sb_csum;	/* remember the stored checksum */
+	sb->sb_csum = 0;		/* the sum is taken with the csum field zeroed */
+	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+	sb->sb_csum = disk_csum;	/* restore the stored value */
+	return csum;
+}
+
+
+/*
+ * Handle superblock details.
+ * We want to be able to handle multiple superblock formats
+ * so we have a common interface to them all, and an array of
+ * different handlers.
+ * We rely on user-space to write the initial superblock, and support
+ * reading and updating of superblocks.
+ * Interface methods are:
+ *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
+ *      loads and validates a superblock on dev.
+ *      if refdev != NULL, compare superblocks on both devices
+ *    Return:
+ *      0 - dev has a superblock that is compatible with refdev
+ *      1 - dev has a superblock that is compatible and newer than refdev
+ *          so dev should be used as the refdev in future
+ *     -EINVAL superblock incompatible or invalid
+ *     -othererror e.g. -EIO
+ *
+ *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
+ *      Verify that dev is acceptable into mddev.
+ *       The first time, mddev->raid_disks will be 0, and data from
+ *       dev should be merged in.  Subsequent calls check that dev
+ *       is new enough.  Return 0 or -EINVAL
+ *
+ *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
+ *     Update the superblock for rdev with data in mddev
+ *     This does not write to disk.
+ *
+ */
+/* One set of handlers per superblock format. */
+struct super_type  {
+	char 		*name;		/* format name string */
+	struct module	*owner;
+	int		(*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
+	int		(*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	void		(*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+};
+
+/*
+ * load_super for 0.90.0
+ *
+ * Reads the superblock located at calc_dev_sboffset() and rejects it
+ * (-EINVAL) on a bad magic, a version other than 0.90, raid_disks <= 0
+ * or a checksum mismatch.  With a refdev, the UUID and the generic part
+ * of the superblock must also match refdev's.
+ * Returns 1 when refdev is NULL or this device has the higher event
+ * count, 0 otherwise, or the read_disk_sb() error.
+ */
+static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+	mdp_super_t *sb;
+	int ret;
+	sector_t sb_offset;
+
+	/*
+	 * Calculate the position of the superblock,
+	 * it's at the end of the disk.
+	 *
+	 * It also happens to be a multiple of 4Kb.
+	 */
+	sb_offset = calc_dev_sboffset(rdev->bdev);
+	rdev->sb_offset = sb_offset;
+
+	ret = read_disk_sb(rdev);
+	if (ret) return ret;
+
+	ret = -EINVAL;	/* result for every "goto abort" below */
+
+	bdevname(rdev->bdev, b);
+	sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+	if (sb->md_magic != MD_SB_MAGIC) {
+		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
+		       b);
+		goto abort;
+	}
+
+	if (sb->major_version != 0 ||
+	    sb->minor_version != 90) {
+		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
+			sb->major_version, sb->minor_version,
+			b);
+		goto abort;
+	}
+
+	if (sb->raid_disks <= 0)
+		goto abort;
+
+	if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
+		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
+			b);
+		goto abort;
+	}
+
+	rdev->preferred_minor = sb->md_minor;
+	rdev->data_offset = 0;
+
+	if (sb->level == LEVEL_MULTIPATH)
+		rdev->desc_nr = -1;	/* bind_rdev_to_array() picks a free number when desc_nr < 0 */
+	else
+		rdev->desc_nr = sb->this_disk.number;
+
+	if (refdev == 0)
+		ret = 1;	/* nothing to compare against */
+	else {
+		__u64 ev1, ev2;
+		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+		if (!uuid_equal(refsb, sb)) {
+			printk(KERN_WARNING "md: %s has different UUID to %s\n",
+				b, bdevname(refdev->bdev,b2));
+			goto abort;
+		}
+		if (!sb_equal(refsb, sb)) {
+			printk(KERN_WARNING "md: %s has same UUID"
+			       " but different superblock to %s\n",
+			       b, bdevname(refdev->bdev, b2));
+			goto abort;
+		}
+		ev1 = md_event(sb);
+		ev2 = md_event(refsb);
+		if (ev1 > ev2)
+			ret = 1;	/* this device has the higher event count */
+		else
+			ret = 0;
+	}
+	rdev->size = calc_dev_size(rdev, sb->chunk_size);	/* set on both success results (0 and 1) */
+
+ abort:
+	return ret;
+}
+
+/*
+ * validate_super for 0.90.0
+ *
+ * For the first device (mddev->raid_disks == 0) the array-wide fields
+ * are copied from the superblock into mddev.  For later devices the
+ * function returns -EINVAL when the device's event count plus one is
+ * still below mddev->events.  For non-multipath arrays rdev->raid_disk,
+ * in_sync and faulty are then derived from this device's descriptor.
+ */
+static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdp_disk_t *desc;
+	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+
+	if (mddev->raid_disks == 0) {
+		mddev->major_version = 0;
+		mddev->minor_version = sb->minor_version;
+		mddev->patch_version = sb->patch_version;
+		mddev->persistent = !
sb->not_persistent;
+		mddev->chunk_size = sb->chunk_size;
+		mddev->ctime = sb->ctime;
+		mddev->utime = sb->utime;
+		mddev->level = sb->level;
+		mddev->layout = sb->layout;
+		mddev->raid_disks = sb->raid_disks;
+		mddev->size = sb->size;
+		mddev->events = md_event(sb);
+
+		if (sb->state & (1<<MD_SB_CLEAN))
+			mddev->recovery_cp = MaxSector;
+		else {
+			/* the stored checkpoint is only used if it was taken at the current event count */
+			if (sb->events_hi == sb->cp_events_hi &&
+				sb->events_lo == sb->cp_events_lo) {
+				mddev->recovery_cp = sb->recovery_cp;
+			} else
+				mddev->recovery_cp = 0;
+		}
+
+		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+
+		mddev->max_disks = MD_SB_DISKS;
+	} else {
+		__u64 ev1;
+		ev1 = md_event(sb);
+		++ev1;	/* a device exactly one event behind is still accepted */
+		if (ev1 < mddev->events)
+			return -EINVAL;
+	}
+	if (mddev->level != LEVEL_MULTIPATH) {
+		rdev->raid_disk = -1;
+		rdev->in_sync = rdev->faulty = 0;
+		desc = sb->disks + rdev->desc_nr;	/* NOTE(review): desc_nr is not range-checked here before indexing sb->disks */
+
+		if (desc->state & (1<<MD_DISK_FAULTY))
+			rdev->faulty = 1;
+		else if (desc->state & (1<<MD_DISK_SYNC) &&
+			 desc->raid_disk < mddev->raid_disks) {
+			rdev->in_sync = 1;
+			rdev->raid_disk = desc->raid_disk;
+		}
+	}
+	return 0;
+}
+
+/*
+ * sync_super for 0.90.0
+ *
+ * Rebuilds the in-memory superblock page of rdev from mddev and from
+ * the list of member devices, then recomputes its checksum.
+ */
+static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdp_super_t *sb;
+	struct list_head *tmp;
+	mdk_rdev_t *rdev2;
+	int next_spare = mddev->raid_disks;
+
+	/* make rdev->sb match mddev data..
+	 *
+	 * 1/ zero out disks
+	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
+	 * 3/ any empty disks < next_spare become removed
+	 *
+	 * disks[0] gets initialised to REMOVED because
+	 * we cannot be sure from other fields if it has
+	 * been initialised or not.
+ */ + int i; + int active=0, working=0,failed=0,spare=0,nr_disks=0; + + sb = (mdp_super_t*)page_address(rdev->sb_page); + + memset(sb, 0, sizeof(*sb)); + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = mddev->major_version; + sb->minor_version = mddev->minor_version; + sb->patch_version = mddev->patch_version; + sb->gvalid_words = 0; /* ignored */ + memcpy(&sb->set_uuid0, mddev->uuid+0, 4); + memcpy(&sb->set_uuid1, mddev->uuid+4, 4); + memcpy(&sb->set_uuid2, mddev->uuid+8, 4); + memcpy(&sb->set_uuid3, mddev->uuid+12,4); + + sb->ctime = mddev->ctime; + sb->level = mddev->level; + sb->size = mddev->size; + sb->raid_disks = mddev->raid_disks; + sb->md_minor = mddev->md_minor; + sb->not_persistent = !mddev->persistent; + sb->utime = mddev->utime; + sb->state = 0; + sb->events_hi = (mddev->events>>32); + sb->events_lo = (u32)mddev->events; + + if (mddev->in_sync) + { + sb->recovery_cp = mddev->recovery_cp; + sb->cp_events_hi = (mddev->events>>32); + sb->cp_events_lo = (u32)mddev->events; + if (mddev->recovery_cp == MaxSector) + sb->state = (1<< MD_SB_CLEAN); + } else + sb->recovery_cp = 0; + + sb->layout = mddev->layout; + sb->chunk_size = mddev->chunk_size; + + sb->disks[0].state = (1<<MD_DISK_REMOVED); + ITERATE_RDEV(mddev,rdev2,tmp) { + mdp_disk_t *d; + if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) + rdev2->desc_nr = rdev2->raid_disk; + else + rdev2->desc_nr = next_spare++; + d = &sb->disks[rdev2->desc_nr]; + nr_disks++; + d->number = rdev2->desc_nr; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty) + d->raid_disk = rdev2->raid_disk; + else + d->raid_disk = rdev2->desc_nr; /* compatibility */ + if (rdev2->faulty) { + d->state = (1<<MD_DISK_FAULTY); + failed++; + } else if (rdev2->in_sync) { + d->state = (1<<MD_DISK_ACTIVE); + d->state |= (1<<MD_DISK_SYNC); + active++; + working++; + } else { + d->state = 0; + spare++; + working++; + } + } + + /* now 
set the "removed" and "faulty" bits on any missing devices */
+	for (i=0 ; i < mddev->raid_disks ; i++) {
+		mdp_disk_t *d = &sb->disks[i];
+		if (d->state == 0 && d->number == 0) {	/* slot still zeroed: no member was written into it */
+			d->number = i;
+			d->raid_disk = i;
+			d->state = (1<<MD_DISK_REMOVED);
+			d->state |= (1<<MD_DISK_FAULTY);
+			failed++;
+		}
+	}
+	sb->nr_disks = nr_disks;
+	sb->active_disks = active;
+	sb->working_disks = working;
+	sb->failed_disks = failed;
+	sb->spare_disks = spare;
+
+	sb->this_disk = sb->disks[rdev->desc_nr];	/* copy of this device's own descriptor */
+	sb->sb_csum = calc_sb_csum(sb);
+}
+
+/*
+ * version 1 superblock
+ */
+/*
+ * Checksum of a version-1 superblock.  Covers 256 + 2 * max_dev bytes,
+ * summed as little-endian 32-bit words (plus one 16-bit word when two
+ * bytes remain) with sb_csum zeroed.  The 64-bit sum is folded to 32
+ * bits and returned through cpu_to_le32(); sb_csum is restored first.
+ */
+static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+{
+	unsigned int disk_csum, csum;
+	unsigned long long newcsum;
+	int size = 256 + le32_to_cpu(sb->max_dev)*2;
+	unsigned int *isuper = (unsigned int*)sb;
+	int i;
+
+	disk_csum = sb->sb_csum;
+	sb->sb_csum = 0;
+	newcsum = 0;
+	for (i=0; size>=4; size -= 4 )	/* i is only initialised here; size drives the loop */
+		newcsum += le32_to_cpu(*isuper++);
+
+	if (size == 2)
+		newcsum += le16_to_cpu(*(unsigned short*) isuper);
+
+	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
+	sb->sb_csum = disk_csum;
+	return cpu_to_le32(csum);
+}
+/*
+ * load_super for version 1: locates, reads and checks the superblock
+ * on rdev.  With a non-NULL refdev, set_uuid, level, layout and
+ * chunksize are also compared against refdev's superblock.
+ */
+static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+	struct mdp_superblock_1 *sb;
+	int ret;
+	sector_t sb_offset;
+	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+
+	/*
+	 * Calculate the position of the superblock.
+	 * It is always aligned to a 4K boundary and
+	 * depending on minor_version, it can be:
+	 * 0: At least 8K, but less than 12K, from end of device
+	 * 1: At start of device
+	 * 2: 4K from start of device.
+	 */
+	switch(minor_version) {
+	case 0:
+		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+		sb_offset -= 8*2;
+		sb_offset &= ~(4*2-1);
+		/* convert from sectors to K */
+		sb_offset /= 2;
+		break;
+	case 1:
+		sb_offset = 0;
+		break;
+	case 2:
+		sb_offset = 4;
+		break;
+	default:
+		return -EINVAL;
+	}
+	rdev->sb_offset = sb_offset;
+
+	ret = read_disk_sb(rdev);
+	if (ret) return ret;
+
+
+	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	/*
+	 * Basic sanity: magic, major version, a device table that fits in
+	 * the 4K superblock, a self-reference matching where we read it
+	 * from, and no feature bits set.
+	 */
+	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+	    sb->major_version != cpu_to_le32(1) ||
+	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+	    sb->feature_map != 0)
+		return -EINVAL;
+
+	if (calc_sb_1_csum(sb) != sb->sb_csum) {
+		printk("md: invalid superblock checksum on %s\n",
+			bdevname(rdev->bdev,b));
+		return -EINVAL;
+	}
+	if (le64_to_cpu(sb->data_size) < 10) {
+		printk("md: data_size too small on %s\n",
+		       bdevname(rdev->bdev,b));
+		return -EINVAL;
+	}
+	rdev->preferred_minor = 0xffff;
+	rdev->data_offset = le64_to_cpu(sb->data_offset);
+
+	/*
+	 * Decide the result first, but do not return yet.  The original
+	 * returned 1 straight from here, which skipped the rdev->size
+	 * computation and the data_size check below for the reference
+	 * device and for any device newer than it.  super_90_load() sets
+	 * rdev->size on both of its success results; do the same here.
+	 */
+	if (refdev == 0)
+		ret = 1;
+	else {
+		__u64 ev1, ev2;
+		struct mdp_superblock_1 *refsb =
+			(struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+		    sb->level != refsb->level ||
+		    sb->layout != refsb->layout ||
+		    sb->chunksize != refsb->chunksize) {
+			printk(KERN_WARNING "md: %s has strangely different"
+				" superblock to %s\n",
+				bdevname(rdev->bdev,b),
+				bdevname(refdev->bdev,b2));
+			return -EINVAL;
+		}
+		ev1 = le64_to_cpu(sb->events);
+		ev2 = le64_to_cpu(refsb->events);
+
+		if (ev1 > ev2)
+			ret = 1;
+		else
+			ret = 0;
+	}
+	/* space available for data, halved like sb_offset's sectors-to-K conversion above */
+	if (minor_version)
+		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+	else
+		rdev->size = rdev->sb_offset;
+	if (rdev->size < le64_to_cpu(sb->data_size)/2)
+		return -EINVAL;
+	rdev->size = le64_to_cpu(sb->data_size)/2;
+	if (le32_to_cpu(sb->chunksize))
+		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+	return ret;
+}
+
+/*
+ * validate_super for version 1.
+ *
+ * For the first device (mddev->raid_disks == 0) the array-wide fields
+ * are copied from the superblock into mddev.  For later devices the
+ * function returns -EINVAL when the device's event count plus one is
+ * still below mddev->events.  For non-multipath arrays the device's
+ * role (spare, faulty, or an active raid slot) is then taken from
+ * dev_roles[].  Returns 0 or -EINVAL.
+ */
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	if (mddev->raid_disks == 0) {
+		mddev->major_version = 1;
+		mddev->patch_version = 0;
+		mddev->persistent = 1;
+		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+		mddev->level = le32_to_cpu(sb->level);
+		mddev->layout = le32_to_cpu(sb->layout);
+		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+		mddev->size = le64_to_cpu(sb->size)/2;
+		mddev->events = le64_to_cpu(sb->events);
+
+		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+		memcpy(mddev->uuid, sb->set_uuid, 16);
+
+		mddev->max_disks =  (4096-256)/2;
+	} else {
+		__u64 ev1;
+		ev1 = le64_to_cpu(sb->events);
+		++ev1;	/* a device exactly one event behind is still accepted */
+		if (ev1 < mddev->events)
+			return -EINVAL;
+	}
+
+	if (mddev->level != LEVEL_MULTIPATH) {
+		int role;
+		rdev->desc_nr = le32_to_cpu(sb->dev_number);
+		/*
+		 * dev_number comes straight from disk.  Only index
+		 * dev_roles[] when it lies inside the max_dev entries the
+		 * superblock declares; the original indexed unconditionally
+		 * and could read past the table.  A device with no role
+		 * entry is treated as a spare.
+		 */
+		if (rdev->desc_nr < 0 ||
+		    rdev->desc_nr >= le32_to_cpu(sb->max_dev))
+			role = 0xffff;
+		else
+			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+		switch(role) {
+		case 0xffff: /* spare */
+			rdev->in_sync = 0;
+			rdev->faulty = 0;
+			rdev->raid_disk = -1;
+			break;
+		case 0xfffe: /* faulty */
+			rdev->in_sync = 0;
+			rdev->faulty = 1;
+			rdev->raid_disk = -1;
+			break;
+		default:
+			/* any other value is the raid slot of an active device */
+			rdev->in_sync = 1;
+			rdev->faulty = 0;
+			rdev->raid_disk = role;
+			break;
+		}
+	}
+	return 0;
+}
+
+/*
+ * sync_super for version 1: refresh the in-memory superblock page of
+ * rdev from mddev and the member list, then recompute its checksum.
+ */
+static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct mdp_superblock_1 *sb;
+	struct list_head *tmp;
+	mdk_rdev_t *rdev2;
+	int max_dev, i;
+	/* make rdev->sb match mddev and rdev data.
*/
+
+	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	sb->feature_map = 0;
+	sb->pad0 = 0;
+	memset(sb->pad1, 0, sizeof(sb->pad1));
+	memset(sb->pad2, 0, sizeof(sb->pad2));
+	memset(sb->pad3, 0, sizeof(sb->pad3));
+
+	sb->utime = cpu_to_le64((__u64)mddev->utime);
+	sb->events = cpu_to_le64(mddev->events);
+	if (mddev->in_sync)
+		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+	else
+		sb->resync_offset = cpu_to_le64(0);
+
+	max_dev = 0;	/* becomes highest desc_nr in the array, plus one */
+	ITERATE_RDEV(mddev,rdev2,tmp)
+		if (rdev2->desc_nr+1 > max_dev)
+			max_dev = rdev2->desc_nr+1;
+
+	sb->max_dev = cpu_to_le32(max_dev);
+	for (i=0; i<max_dev;i++)
+		sb->dev_roles[i] = cpu_to_le16(0xfffe);	/* every slot starts out as 0xfffe (faulty) */
+
+	ITERATE_RDEV(mddev,rdev2,tmp) {
+		i = rdev2->desc_nr;
+		if (rdev2->faulty)
+			sb->dev_roles[i] = cpu_to_le16(0xfffe);
+		else if (rdev2->in_sync)
+			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+		else
+			sb->dev_roles[i] = cpu_to_le16(0xffff);	/* neither faulty nor in sync: 0xffff (spare) */
+	}
+
+	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+	sb->sb_csum = calc_sb_1_csum(sb);
+}
+
+/* Superblock handlers by index: [0] is the 0.90.0 format, [1] the version-1 ("md-1") format. */
+struct super_type super_types[] = {
+	[0] = {
+		.name	= "0.90.0",
+		.owner	= THIS_MODULE,
+		.load_super	= super_90_load,
+		.validate_super	= super_90_validate,
+		.sync_super	= super_90_sync,
+	},
+	[1] = {
+		.name	= "md-1",
+		.owner	= THIS_MODULE,
+		.load_super	= super_1_load,
+		.validate_super	= super_1_validate,
+		.sync_super	= super_1_sync,
+	},
+};
+/* Return the first member of mddev whose bdev->bd_contains equals dev's, or NULL. */
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev,rdev,tmp)
+		if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
+			return rdev;
+
+	return NULL;
+}
+/* Return 1 if any member of mddev1 has a match_dev_unit() hit in mddev2, else 0. */
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev1,rdev,tmp)
+		if (match_dev_unit(mddev2, rdev))
+			return 1;
+
+	return 0;
+}
+
+static LIST_HEAD(pending_raid_disks);
+/*
+ * Attach rdev to mddev's list of disks.
+ * Returns 0 on success, -EINVAL if rdev already belongs to an array,
+ * -EBUSY if its desc_nr is already used by another member.  A warning
+ * is logged when another member shares the same bd_contains.
+ */
+static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+	mdk_rdev_t *same_pdev;
+	char
b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+
+	if (rdev->mddev) {	/* already bound to an array */
+		MD_BUG();
+		return -EINVAL;
+	}
+	same_pdev = match_dev_unit(mddev, rdev);
+	if (same_pdev)
+		printk(KERN_WARNING
+			"%s: WARNING: %s appears to be on the same physical"
+			" disk as %s. True\n protection against single-disk"
+			" failure might be compromised.\n",
+			mdname(mddev), bdevname(rdev->bdev,b),
+			bdevname(same_pdev->bdev,b2));
+
+	/* Verify rdev->desc_nr is unique.
+	 * If it is -1, assign a free number, else
+	 * check number is not in use
+	 */
+	if (rdev->desc_nr < 0) {
+		int choice = 0;
+		if (mddev->pers) choice = mddev->raid_disks;	/* array has a personality: start searching at raid_disks */
+		while (find_rdev_nr(mddev, choice))
+			choice++;
+		rdev->desc_nr = choice;
+	} else {
+		if (find_rdev_nr(mddev, rdev->desc_nr))
+			return -EBUSY;
+	}
+
+	list_add(&rdev->same_set, &mddev->disks);
+	rdev->mddev = mddev;
+	printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b));
+	return 0;
+}
+/* Unlink rdev from its array's disk list and clear rdev->mddev; logs MD_BUG if it was not bound. */
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+	char b[BDEVNAME_SIZE];
+	if (!rdev->mddev) {
+		MD_BUG();
+		return;
+	}
+	list_del_init(&rdev->same_set);
+	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
+	rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by bd_claiming the device.
+ */ +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +{ + int err = 0; + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + if (IS_ERR(bdev)) { + printk(KERN_ERR "md: could not open %s.\n", + __bdevname(dev, b)); + return PTR_ERR(bdev); + } + err = bd_claim(bdev, rdev); + if (err) { + printk(KERN_ERR "md: could not bd_claim %s.\n", + bdevname(bdev, b)); + blkdev_put(bdev); + return err; + } + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(mdk_rdev_t *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + bd_release(bdev); + blkdev_put(bdev); +} + +void md_autodetect_dev(dev_t dev); + +static void export_rdev(mdk_rdev_t * rdev) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_INFO "md: export_rdev(%s)\n", + bdevname(rdev->bdev,b)); + if (rdev->mddev) + MD_BUG(); + free_disk_sb(rdev); + list_del_init(&rdev->same_set); +#ifndef MODULE + md_autodetect_dev(rdev->bdev->bd_dev); +#endif + unlock_rdev(rdev); + kfree(rdev); +} + +static void kick_rdev_from_array(mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array(mddev_t *mddev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (!list_empty(&mddev->disks)) + MD_BUG(); + mddev->raid_disks = 0; + mddev->major_version = 0; +} + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(KERN_INFO + "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", + sb->level, sb->size, 
sb->nr_disks, sb->raid_disks,
+		sb->md_minor, sb->layout, sb->chunk_size);
+	printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
+		" FD:%d SD:%d CSUM:%08x E:%08lx\n",
+		sb->utime, sb->state, sb->active_disks, sb->working_disks,
+		sb->failed_disks, sb->spare_disks,
+		sb->sb_csum, (unsigned long)sb->events_lo);
+
+	printk(KERN_INFO);	/* prints only the level prefix - NOTE(review): presumably starts the line the " D nn:" output continues */
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mdp_disk_t *desc;
+
+		desc = sb->disks + i;
+		/* skip empty slots; NOTE(review): meaning of the literal state value 4 is not visible here - confirm */
+		if (desc->number || desc->major || desc->minor ||
+		    desc->raid_disk || (desc->state && (desc->state != 4))) {
+			printk(" D %2d: ", i);
+			print_desc(desc);
+		}
+	}
+	printk(KERN_INFO "md: THIS: ");
+	print_desc(&sb->this_disk);
+
+}
+/*
+ * Log one member device: name, size, faulty/in_sync flags and desc_nr,
+ * followed by its superblock when one is loaded.
+ * NOTE(review): desc_nr is tested for negative values elsewhere in this
+ * file but printed with %u here.
+ * NOTE(review): print_sb() treats the page as a 0.90 superblock
+ * regardless of the array's superblock format - confirm.
+ */
+static void print_rdev(mdk_rdev_t *rdev)
+{
+	char b[BDEVNAME_SIZE];
+	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
+		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+		rdev->faulty, rdev->in_sync, rdev->desc_nr);
+	if (rdev->sb_loaded) {
+		printk(KERN_INFO "md: rdev superblock:\n");
+		print_sb((mdp_super_t*)page_address(rdev->sb_page));
+	} else
+		printk(KERN_INFO "md: no rdev superblock!\n");
+}
+/* Log every md array with its member devices, then each member in detail. */
+void md_print_devices(void)
+{
+	struct list_head *tmp, *tmp2;
+	mdk_rdev_t *rdev;
+	mddev_t *mddev;
+	char b[BDEVNAME_SIZE];
+
+	printk("\n");
+	printk("md: **********************************\n");
+	printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+	printk("md: **********************************\n");
+	ITERATE_MDDEV(mddev,tmp) {
+		printk("%s: ", mdname(mddev));
+
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			printk("<%s>", bdevname(rdev->bdev,b));
+		printk("\n");
+
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			print_rdev(rdev);
+	}
+	printk("md: **********************************\n");
+	printk("\n");
+}
+
+/*
+ * Write rdev's in-memory superblock page to the device.
+ * Returns 1 (after MD_BUG) when no superblock is loaded or the device
+ * is marked faulty.
+ */
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+	char b[BDEVNAME_SIZE];
+	if (!rdev->sb_loaded) {
+		MD_BUG();
+		return 1;
+	}
+	if (rdev->faulty) {
+		MD_BUG();
+		return 1;
+	}
+
+	dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+		bdevname(rdev->bdev,b),
+	   (unsigned long long)rdev->sb_offset);
+
+	if
(sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) + return 0; + + printk("md: write_disk_sb failed for device %s\n", + bdevname(rdev->bdev,b)); + return 1; +} + +static void sync_sbs(mddev_t * mddev) +{ |