diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /drivers/md/raid10.c |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- | drivers/md/raid10.c | 1787 |
1 files changed, 1787 insertions, 0 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c new file mode 100644 index 00000000000..b100bfe4fdc --- /dev/null +++ b/drivers/md/raid10.c @@ -0,0 +1,1787 @@ +/* + * raid10.c : Multiple Devices driver for Linux + * + * Copyright (C) 2000-2004 Neil Brown + * + * RAID-10 support for md. + * + * Base on code in raid1.c. See raid1.c for futher copyright information. + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/raid/raid10.h> + +/* + * RAID10 provides a combination of RAID0 and RAID1 functionality. + * The layout of data is defined by + * chunk_size + * raid_disks + * near_copies (stored in low byte of layout) + * far_copies (stored in second byte of layout) + * + * The data to be stored is divided into chunks using chunksize. + * Each device is divided into far_copies sections. + * In each section, chunks are laid out in a style similar to raid0, but + * near_copies copies of each chunk is stored (each on a different drive). + * The starting device for each section is offset near_copies from the starting + * device of the previous section. + * Thus there are (near_copies*far_copies) of each chunk, and each is on a different + * drive. + * near_copies and far_copies must be at least one, and their product is at most + * raid_disks. + */ + +/* + * Number of guaranteed r10bios in case of extreme VM load: + */ +#define NR_RAID10_BIOS 256 + +static void unplug_slaves(mddev_t *mddev); + +static void * r10bio_pool_alloc(unsigned int __nocast gfp_flags, void *data) +{ + conf_t *conf = data; + r10bio_t *r10_bio; + int size = offsetof(struct r10bio_s, devs[conf->copies]); + + /* allocate a r10bio with room for raid_disks entries in the bios array */ + r10_bio = kmalloc(size, gfp_flags); + if (r10_bio) + memset(r10_bio, 0, size); + else + unplug_slaves(conf->mddev); + + return r10_bio; +} + +static void r10bio_pool_free(void *r10_bio, void *data) +{ + kfree(r10_bio); +} + +#define RESYNC_BLOCK_SIZE (64*1024) +//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +#define RESYNC_WINDOW (2048*1024) + +/* + * When performing a resync, we need to read and compare, so + * we need as many pages are there are copies. + * When performing a recovery, we need 2 bios, one for read, + * one for write (we recover only one drive per r10buf) + * + */ +static void * r10buf_pool_alloc(unsigned int __nocast gfp_flags, void *data) +{ + conf_t *conf = data; + struct page *page; + r10bio_t *r10_bio; + struct bio *bio; + int i, j; + int nalloc; + + r10_bio = r10bio_pool_alloc(gfp_flags, conf); + if (!r10_bio) { + unplug_slaves(conf->mddev); + return NULL; + } + + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) + nalloc = conf->copies; /* resync */ + else + nalloc = 2; /* recovery */ + + /* + * Allocate bios. + */ + for (j = nalloc ; j-- ; ) { + bio = bio_alloc(gfp_flags, RESYNC_PAGES); + if (!bio) + goto out_free_bio; + r10_bio->devs[j].bio = bio; + } + /* + * Allocate RESYNC_PAGES data pages and attach them + * where needed. + */ + for (j = 0 ; j < nalloc; j++) { + bio = r10_bio->devs[j].bio; + for (i = 0; i < RESYNC_PAGES; i++) { + page = alloc_page(gfp_flags); + if (unlikely(!page)) + goto out_free_pages; + + bio->bi_io_vec[i].bv_page = page; + } + } + + return r10_bio; + +out_free_pages: + for ( ; i > 0 ; i--) + __free_page(bio->bi_io_vec[i-1].bv_page); + while (j--) + for (i = 0; i < RESYNC_PAGES ; i++) + __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); + j = -1; +out_free_bio: + while ( ++j < nalloc ) + bio_put(r10_bio->devs[j].bio); + r10bio_pool_free(r10_bio, conf); + return NULL; +} + +static void r10buf_pool_free(void *__r10_bio, void *data) +{ + int i; + conf_t *conf = data; + r10bio_t *r10bio = __r10_bio; + int j; + + for (j=0; j < conf->copies; j++) { + struct bio *bio = r10bio->devs[j].bio; + if (bio) { + for (i = 0; i < RESYNC_PAGES; i++) { + __free_page(bio->bi_io_vec[i].bv_page); + bio->bi_io_vec[i].bv_page = NULL; + } + bio_put(bio); + } + } + r10bio_pool_free(r10bio, conf); +} + +static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) +{ + int i; + + for (i = 0; i < conf->copies; i++) { + struct bio **bio = & r10_bio->devs[i].bio; + if (*bio) + bio_put(*bio); + *bio = NULL; + } +} + +static inline void free_r10bio(r10bio_t *r10_bio) +{ + unsigned long flags; + + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + spin_lock_irqsave(&conf->resync_lock, flags); + if (!--conf->nr_pending) { + wake_up(&conf->wait_idle); + wake_up(&conf->wait_resume); + } + spin_unlock_irqrestore(&conf->resync_lock, flags); + + put_all_bios(conf, r10_bio); + mempool_free(r10_bio, conf->r10bio_pool); +} + +static inline void put_buf(r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(r10_bio->mddev); + unsigned long flags; + + mempool_free(r10_bio, conf->r10buf_pool); + + spin_lock_irqsave(&conf->resync_lock, flags); + if (!conf->barrier) + BUG(); + --conf->barrier; + wake_up(&conf->wait_resume); + wake_up(&conf->wait_idle); + + if (!--conf->nr_pending) { + wake_up(&conf->wait_idle); + wake_up(&conf->wait_resume); + } + spin_unlock_irqrestore(&conf->resync_lock, flags); +} + +static void reschedule_retry(r10bio_t *r10_bio) +{ + unsigned long flags; + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev_to_conf(mddev); + + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&r10_bio->retry_list, &conf->retry_list); + spin_unlock_irqrestore(&conf->device_lock, flags); + + md_wakeup_thread(mddev->thread); +} + +/* + * raid_end_bio_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid_end_bio_io(r10bio_t *r10_bio) +{ + struct bio *bio = r10_bio->master_bio; + + bio_endio(bio, bio->bi_size, + test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); + free_r10bio(r10_bio); +} + +/* + * Update disk head position estimator based on IRQ completion info. + */ +static inline void update_head_pos(int slot, r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + conf->mirrors[r10_bio->devs[slot].devnum].head_position = + r10_bio->devs[slot].addr + (r10_bio->sectors); +} + +static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + int slot, dev; + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + if (bio->bi_size) + return 1; + + slot = r10_bio->read_slot; + dev = r10_bio->devs[slot].devnum; + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) + md_error(r10_bio->mddev, conf->mirrors[dev].rdev); + else + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + set_bit(R10BIO_Uptodate, &r10_bio->state); + + update_head_pos(slot, r10_bio); + + /* + * we have only one bio on the read side + */ + if (uptodate) + raid_end_bio_io(r10_bio); + else { + /* + * oops, read error: + */ + char b[BDEVNAME_SIZE]; + if (printk_ratelimit()) + printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", + bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); + reschedule_retry(r10_bio); + } + + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + return 0; +} + +static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + int slot, dev; + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + if (bio->bi_size) + return 1; + + for (slot = 0; slot < conf->copies; slot++) + if (r10_bio->devs[slot].bio == bio) + break; + dev = r10_bio->devs[slot].devnum; + + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) + md_error(r10_bio->mddev, conf->mirrors[dev].rdev); + else + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code for to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + set_bit(R10BIO_Uptodate, &r10_bio->state); + + update_head_pos(slot, r10_bio); + + /* + * + * Let's see if all mirrored write operations have finished + * already. + */ + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_write_end(r10_bio->mddev); + raid_end_bio_io(r10_bio); + } + + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + return 0; +} + + +/* + * RAID10 layout manager + * Aswell as the chunksize and raid_disks count, there are two + * parameters: near_copies and far_copies. + * near_copies * far_copies must be <= raid_disks. + * Normally one of these will be 1. + * If both are 1, we get raid0. + * If near_copies == raid_disks, we get raid1. + * + * Chunks are layed out in raid0 style with near_copies copies of the + * first chunk, followed by near_copies copies of the next chunk and + * so on. + * If far_copies > 1, then after 1/far_copies of the array has been assigned + * as described above, we start again with a device offset of near_copies. + * So we effectively have another copy of the whole array further down all + * the drives, but with blocks on different drives. + * With this layout, and block is never stored twice on the one device. + * + * raid10_find_phys finds the sector offset of a given virtual sector + * on each device that it is on. If a block isn't on a device, + * that entry in the array is set to MaxSector. + * + * raid10_find_virt does the reverse mapping, from a device and a + * sector offset to a virtual address + */ + +static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) +{ + int n,f; + sector_t sector; + sector_t chunk; + sector_t stripe; + int dev; + + int slot = 0; + + /* now calculate first sector/dev */ + chunk = r10bio->sector >> conf->chunk_shift; + sector = r10bio->sector & conf->chunk_mask; + + chunk *= conf->near_copies; + stripe = chunk; + dev = sector_div(stripe, conf->raid_disks); + + sector += stripe << conf->chunk_shift; + + /* and calculate all the others */ + for (n=0; n < conf->near_copies; n++) { + int d = dev; + sector_t s = sector; + r10bio->devs[slot].addr = sector; + r10bio->devs[slot].devnum = d; + slot++; + + for (f = 1; f < conf->far_copies; f++) { + d += conf->near_copies; + if (d >= conf->raid_disks) + d -= conf->raid_disks; + s += conf->stride; + r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; + slot++; + } + dev++; + if (dev >= conf->raid_disks) { + dev = 0; + sector += (conf->chunk_mask + 1); + } + } + BUG_ON(slot != conf->copies); +} + +static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) +{ + sector_t offset, chunk, vchunk; + + while (sector > conf->stride) { + sector -= conf->stride; + if (dev < conf->near_copies) + dev += conf->raid_disks - conf->near_copies; + else + dev -= conf->near_copies; + } + + offset = sector & conf->chunk_mask; + chunk = sector >> conf->chunk_shift; + vchunk = chunk * conf->raid_disks + dev; + sector_div(vchunk, conf->near_copies); + return (vchunk << conf->chunk_shift) + offset; +} + +/** + * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged + * @q: request queue + * @bio: the buffer head that's been built up so far + * @biovec: the request that could be merged to it. + * + * Return amount of bytes we can accept at this offset + * If near_copies == raid_disk, there are no striping issues, + * but in that case, the function isn't called at all. + */ +static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio, + struct bio_vec *bio_vec) +{ + mddev_t *mddev = q->queuedata; + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + int max; + unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int bio_sectors = bio->bi_size >> 9; + + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; + if (max < 0) max = 0; /* bio_add cannot handle a negative return */ + if (max <= bio_vec->bv_len && bio_sectors == 0) + return bio_vec->bv_len; + else + return max; +} + +/* + * This routine returns the disk from which the requested read should + * be done. There is a per-array 'next expected sequential IO' sector + * number - if this matches on the next IO then we use the last disk. + * There is also a per-disk 'last know head position' sector that is + * maintained from IRQ contexts, both the normal and the resync IO + * completion handlers update this position correctly. If there is no + * perfect sequential match then we pick the disk whose head is closest. + * + * If there are 2 mirrors in the same 2 devices, performance degrades + * because position is mirror, not device based. + * + * The rdev for the device selected will have nr_pending incremented. + */ + +/* + * FIXME: possibly should rethink readbalancing and do it differently + * depending on near_copies / far_copies geometry. + */ +static int read_balance(conf_t *conf, r10bio_t *r10_bio) +{ + const unsigned long this_sector = r10_bio->sector; + int disk, slot, nslot; + const int sectors = r10_bio->sectors; + sector_t new_distance, current_distance; + + raid10_find_phys(conf, r10_bio); + rcu_read_lock(); + /* + * Check if we can balance. We can balance on the whole + * device if no resync is going on, or below the resync window. + * We take the first readable disk when above the resync window. + */ + if (conf->mddev->recovery_cp < MaxSector + && (this_sector + sectors >= conf->next_resync)) { + /* make sure that disk is operational */ + slot = 0; + disk = r10_bio->devs[slot].devnum; + + while (!conf->mirrors[disk].rdev || + !conf->mirrors[disk].rdev->in_sync) { + slot++; + if (slot == conf->copies) { + slot = 0; + disk = -1; + break; + } + disk = r10_bio->devs[slot].devnum; + } + goto rb_out; + } + + + /* make sure the disk is operational */ + slot = 0; + disk = r10_bio->devs[slot].devnum; + while (!conf->mirrors[disk].rdev || + !conf->mirrors[disk].rdev->in_sync) { + slot ++; + if (slot == conf->copies) { + disk = -1; + goto rb_out; + } + disk = r10_bio->devs[slot].devnum; + } + + + current_distance = abs(this_sector - conf->mirrors[disk].head_position); + + /* Find the disk whose head is closest */ + + for (nslot = slot; nslot < conf->copies; nslot++) { + int ndisk = r10_bio->devs[nslot].devnum; + + + if (!conf->mirrors[ndisk].rdev || + !conf->mirrors[ndisk].rdev->in_sync) + continue; + + if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) { + disk = ndisk; + slot = nslot; + break; + } + new_distance = abs(r10_bio->devs[nslot].addr - + conf->mirrors[ndisk].head_position); + if (new_distance < current_distance) { + current_distance = new_distance; + disk = ndisk; + slot = nslot; + } + } + +rb_out: + r10_bio->read_slot = slot; +/* conf->next_seq_sect = this_sector + sectors;*/ + + if (disk >= 0 && conf->mirrors[disk].rdev) + atomic_inc(&conf->mirrors[disk].rdev->nr_pending); + rcu_read_unlock(); + + return disk; +} + +static void unplug_slaves(mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); + int i; + + rcu_read_lock(); + for (i=0; i<mddev->raid_disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { + request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + if (r_queue->unplug_fn) + r_queue->unplug_fn(r_queue); + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + rcu_read_unlock(); +} + +static void raid10_unplug(request_queue_t *q) +{ + unplug_slaves(q->queuedata); +} + +static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i=0; i<mddev->raid_disks && ret == 0; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) + ret = -EOPNOTSUPP; + else { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, + error_sector); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + } + rcu_read_unlock(); + return ret; +} + +/* + * Throttle resync depth, so that we can both get proper overlapping of + * requests, but are still able to handle normal requests quickly. + */ +#define RESYNC_DEPTH 32 + +static void device_barrier(conf_t *conf, sector_t sect) +{ + spin_lock_irq(&conf->resync_lock); + wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), + conf->resync_lock, unplug_slaves(conf->mddev)); + + if (!conf->barrier++) { + wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, + conf->resync_lock, unplug_slaves(conf->mddev)); + if (conf->nr_pending) + BUG(); + } + wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, + conf->resync_lock, unplug_slaves(conf->mddev)); + conf->next_resync = sect; + spin_unlock_irq(&conf->resync_lock); +} + +static int make_request(request_queue_t *q, struct bio * bio) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + mirror_info_t *mirror; + r10bio_t *r10_bio; + struct bio *read_bio; + int i; + int chunk_sects = conf->chunk_mask + 1; + + /* If this request crosses a chunk boundary, we need to + * split it. This will only happen for 1 PAGE (or less) requests. + */ + if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) + > chunk_sects && + conf->near_copies < conf->raid_disks)) { + struct bio_pair *bp; + /* Sanity check -- queue functions should prevent this happening */ + if (bio->bi_vcnt != 1 || + bio->bi_idx != 0) + goto bad_map; + /* This is a one page bio that upper layers + * refuse to split for us, so we need to split it. + */ + bp = bio_split(bio, bio_split_pool, + chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); + if (make_request(q, &bp->bio1)) + generic_make_request(&bp->bio1); + if (make_request(q, &bp->bio2)) + generic_make_request(&bp->bio2); + + bio_pair_release(bp); + return 0; + bad_map: + printk("raid10_make_request bug: can't convert block across chunks" + " or bigger than %dk %llu %d\n", chunk_sects/2, + (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + + bio_io_error(bio, bio->bi_size); + return 0; + } + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ + spin_lock_irq(&conf->resync_lock); + wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); + conf->nr_pending++; + spin_unlock_irq(&conf->resync_lock); + + if (bio_data_dir(bio)==WRITE) { + disk_stat_inc(mddev->gendisk, writes); + disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); + } else { + disk_stat_inc(mddev->gendisk, reads); + disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); + } + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio->bi_size >> 9; + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector; + + if (bio_data_dir(bio) == READ) { + /* + * read balancing logic: + */ + int disk = read_balance(conf, r10_bio); + int slot = r10_bio->read_slot; + if (disk < 0) { + raid_end_bio_io(r10_bio); + return 0; + } + mirror = conf->mirrors + disk; + + read_bio = bio_clone(bio, GFP_NOIO); + + r10_bio->devs[slot].bio = read_bio; + + read_bio->bi_sector = r10_bio->devs[slot].addr + + mirror->rdev->data_offset; + read_bio->bi_bdev = mirror->rdev->bdev; + read_bio->bi_end_io = raid10_end_read_request; + read_bio->bi_rw = READ; + read_bio->bi_private = r10_bio; + + generic_make_request(read_bio); + return 0; + } + + /* + * WRITE: + */ + /* first select target devices under spinlock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + */ + raid10_find_phys(conf, r10_bio); + rcu_read_lock(); + for (i = 0; i < conf->copies; i++) { + int d = r10_bio->devs[i].devnum; + if (conf->mirrors[d].rdev && + !conf->mirrors[d].rdev->faulty) { + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + r10_bio->devs[i].bio = bio; + } else + r10_bio->devs[i].bio = NULL; + } + rcu_read_unlock(); + + atomic_set(&r10_bio->remaining, 1); + md_write_start(mddev); + for (i = 0; i < conf->copies; i++) { + struct bio *mbio; + int d = r10_bio->devs[i].devnum; + if (!r10_bio->devs[i].bio) + continue; + + mbio = bio_clone(bio, GFP_NOIO); + r10_bio->devs[i].bio = mbio; + + mbio->bi_sector = r10_bio->devs[i].addr+ + conf->mirrors[d].rdev->data_offset; + mbio->bi_bdev = conf->mirrors[d].rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE; + mbio->bi_private = r10_bio; + + atomic_inc(&r10_bio->remaining); + generic_make_request(mbio); + } + + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_write_end(mddev); + raid_end_bio_io(r10_bio); + } + + return 0; +} + +static void status(struct seq_file *seq, mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); + int i; + + if (conf->near_copies < conf->raid_disks) + seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); + if (conf->near_copies > 1) + seq_printf(seq, " %d near-copies", conf->near_copies); + if (conf->far_copies > 1) + seq_printf(seq, " %d far-copies", conf->far_copies); + + seq_printf(seq, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + seq_printf(seq, "%s", + conf->mirrors[i].rdev && + conf->mirrors[i].rdev->in_sync ? "U" : "_"); + seq_printf(seq, "]"); +} + +static void error(mddev_t *mddev, mdk_rdev_t *rdev) +{ + char b[BDEVNAME_SIZE]; + conf_t *conf = mddev_to_conf(mddev); + + /* + * If it is not operational, then we have already marked it as dead + * else if it is the last working disks, ignore the error, let the + * next level up know. + * else mark the drive as failed + */ + if (rdev->in_sync + && conf->working_disks == 1) + /* + * Don't fail the drive, just return an IO error. + * The test should really be more sophisticated than + * "working_disks == 1", but it isn't critical, and + * can wait until we do more sophisticated "is the drive + * really dead" tests... + */ + return; + if (rdev->in_sync) { + mddev->degraded++; + conf->working_disks--; + /* + * if recovery is running, make sure it aborts. + */ + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + } + rdev->in_sync = 0; + rdev->faulty = 1; + mddev->sb_dirty = 1; + printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" + " Operation continuing on %d devices\n", + bdevname(rdev->bdev,b), conf->working_disks); +} + +static void print_conf(conf_t *conf) +{ + int i; + mirror_info_t *tmp; + + printk("RAID10 conf printout:\n"); + if (!conf) { + printk("(!conf)\n"); + return; + } + printk(" --- wd:%d rd:%d\n", conf->working_disks, + conf->raid_disks); + + for (i = 0; i < conf->raid_disks; i++) { + char b[BDEVNAME_SIZE]; + tmp = conf->mirrors + i; + if (tmp->rdev) + printk(" disk %d, wo:%d, o:%d, dev:%s\n", + i, !tmp->rdev->in_sync, !tmp->rdev->faulty, + bdevname(tmp->rdev->bdev,b)); + } +} + +static void close_sync(conf_t *conf) +{ + spin_lock_irq(&conf->resync_lock); + wait_event_lock_irq(conf->wait_resume, !conf->barrier, + conf->resync_lock, unplug_slaves(conf->mddev)); + spin_unlock_irq(&conf->resync_lock); + + if (conf->barrier) BUG(); + if (waitqueue_active(&conf->wait_idle)) BUG(); + + mempool_destroy(conf->r10buf_pool); + conf->r10buf_pool = NULL; +} + +static int raid10_spare_active(mddev_t *mddev) +{ + int i; + conf_t *conf = mddev->private; + mirror_info_t *tmp; + + /* + * Find all non-in_sync disks within the RAID10 configuration + * and mark them in_sync + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if (tmp->rdev + && !tmp->rdev->faulty + && !tmp->rdev->in_sync) { + conf->working_disks++; + mddev->degraded--; + tmp->rdev->in_sync = 1; + } + } + + print_conf(conf); + return 0; +} + + +static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + conf_t *conf = mddev->private; + int found = 0; + int mirror; + mirror_info_t *p; + + if (mddev->recovery_cp < MaxSector) + /* only hot-add to in-sync arrays, as recovery is + * very different from resync + */ + return 0; + + for (mirror=0; mirror < mddev->raid_disks; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + blk_queue_stack_limits(mddev->queue, + rdev->bdev->bd_disk->queue); + /* as we don't honour merge_bvec_fn, we must never risk + * violating it, so limit ->max_sector to one PAGE, as + * a one page request is never in violation. + */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn && + mddev->queue->max_sectors > (PAGE_SIZE>>9)) + mddev->queue->max_sectors = (PAGE_SIZE>>9); + + p->head_position = 0; + rdev->raid_disk = mirror; + found = 1; + p->rdev = rdev; + break; + } + + print_conf(conf); + return found; +} + +static int raid10_remove_disk(mddev_t *mddev, int number) +{ + conf_t *conf = mddev->private; + int err = 0; + mdk_rdev_t *rdev; + mirror_info_t *p = conf->mirrors+ number; + + print_conf(conf); + rdev = p->rdev; + if (rdev) { + if (rdev->in_sync || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + p->rdev = NULL; + synchronize_kernel(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + } + } +abort: + + print_conf(conf); + return err; +} + + +static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + conf_t *conf = mddev_to_conf(r10_bio->mddev); + int i,d; + + if (bio->bi_size) + return 1; + + for (i=0; i<conf->copies; i++) + if (r10_bio->devs[i].bio == bio) + break; + if (i == conf->copies) + BUG(); + update_head_pos(i, r10_bio); + d = r10_bio->devs[i].devnum; + if (!uptodate) + md_error(r10_bio->mddev, + conf->mirrors[d].rdev); + + /* for reconstruct, we always reschedule after a read. + * for resync, only after all reads + */ + if (test_bit(R10BIO_IsRecover, &r10_bio->state) || + atomic_dec_and_test(&r10_bio->remaining)) { + /* we have read all the blocks, + * do the comparison in process context in raid10d + */ + reschedule_retry(r10_bio); + } + rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); + return 0; +} + +static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev_to_conf(mddev); + int i,d; + + if (bio->bi_size) + return 1; + + for (i = 0; i < conf->copies; i++) + if (r10_bio->devs[i].bio == bio) + break; + d = r10_bio->devs[i].devnum; + + if (!uptodate) + md_error(mddev, conf->mirrors[d].rdev); + update_head_pos(i, r10_bio); + + while (atomic_dec_and_test(&r10_bio->remaining)) { + if (r10_bio->master_bio == NULL) { + /* the primary of several recovery bios */ + md_done_sync(mddev, r10_bio->sectors, 1); + put_buf(r10_bio); + break; + } else { + r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; + put_buf(r10_bio); + r10_bio = r10_bio2; + } + } + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + return 0; +} + +/* + * Note: sync and recover and handled very differently for raid10 + * This code is for resync. + * For resync, we read through virtual addresses and read all blocks. + * If there is any error, we schedule a write. The lowest numbered + * drive is authoritative. + * However requests come for physical address, so we need to map. + * For every physical address there are raid_disks/copies virtual addresses, + * which is always are least one, but is not necessarly an integer. + * This means that a physical address can span multiple chunks, so we may + * have to submit multiple io requests for a single sync request. + */ +/* + * We check if all blocks are in-sync and only write to blocks that + * aren't in sync + */ +static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(mddev); + int i, first; + struct bio *tbio, *fbio; + + atomic_set(&r10_bio->remaining, 1); + + /* find the first device with a block */ + for (i=0; i<conf->copies; i++) + if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) + break; + + if (i == conf->copies) + goto done; + + first = i; + fbio = r10_bio->devs[i].bio; + + /* now find blocks with errors */ + for (i=first+1 ; i < conf->copies ; i++) { + int vcnt, j, d; + + if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) + continue; + /* We know that the bi_io_vec layout is the same for + * both 'first' and 'i', so we just compare them. + * All vec entries are PAGE_SIZE; + */ + tbio = r10_bio->devs[i].bio; + vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); + for (j = 0; j < vcnt; j++) + if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), + page_address(tbio->bi_io_vec[j].bv_page), + PAGE_SIZE)) + break; + if (j == vcnt) + continue; + /* Ok, we need to write this bio + * First we need to fixup bv_offset, bv_len and + * bi_vecs, as the read request might have corrupted these + */ + tbio->bi_vcnt = vcnt; + tbio->bi_size = r10_bio->sectors << 9; + tbio->bi_idx = 0; + tbio->bi_phys_segments = 0; + tbio->bi_hw_segments = 0; + tbio->bi_hw_front_size = 0; + tbio->bi_hw_back_size = 0; + tbio->bi_flags &= ~(BIO_POOL_MASK - 1); + tbio->bi_flags |= 1 << BIO_UPTODATE; + tbio->bi_next = NULL; + tbio->bi_rw = WRITE; + tbio->bi_private = r10_bio; + tbio->bi_sector = r10_bio->devs[i].addr; + + for (j=0; j < vcnt ; j++) { + tbio->bi_io_vec[j].bv_offset = 0; + tbio->bi_io_vec[j].bv_len = PAGE_SIZE; + + memcpy(page_address(tbio->bi_io_vec[j].bv_page), + page_address(fbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + } + tbio->bi_end_io = end_sync_write; + + d = r10_bio->devs[i].devnum; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); |