diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-04-03 09:08:19 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-04-03 09:08:19 -0700 |
commit | 223cdea4c4b5af5181b2da00ac85711d1e0c737c (patch) | |
tree | dfe7226c70ddabbf2e2e63924ba636345278e79c /drivers/md | |
parent | 31e6e2dac575c9d21a6ec56ca52ae89086baa705 (diff) | |
parent | c8f517c444e4f9f55b5b5ca202b8404691a35805 (diff) |
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (53 commits)
md/raid5 revise rules for when to update metadata during reshape
md/raid5: minor code cleanups in make_request.
md: remove CONFIG_MD_RAID_RESHAPE config option.
md/raid5: be more careful about write ordering when reshaping.
md: don't display meaningless values in sysfs files resync_start and sync_speed
md/raid5: allow layout and chunksize to be changed on active array.
md/raid5: reshape using largest of old and new chunk size
md/raid5: prepare for allowing reshape to change layout
md/raid5: prepare for allowing reshape to change chunksize.
md/raid5: clearly differentiate 'before' and 'after' stripes during reshape.
Documentation/md.txt update
md: allow number of drives in raid5 to be reduced
md/raid5: change reshape-progress measurement to cope with reshaping backwards.
md: add explicit method to signal the end of a reshape.
md/raid5: enhance raid5_size to work correctly with negative delta_disks
md/raid5: drop qd_idx from r6_state
md/raid6: move raid6 data processing to raid6_pq.ko
md: raid5 run(): Fix max_degraded for raid level 4.
md: 'array_size' sysfs attribute
md: centralize ->array_sectors modifications
...
Diffstat (limited to 'drivers/md')
31 files changed, 3324 insertions, 837 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2281b5098e9..36e0675be9f 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -121,6 +121,7 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD + select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR ---help--- @@ -151,34 +152,8 @@ config MD_RAID456 If unsure, say Y. -config MD_RAID5_RESHAPE - bool "Support adding drives to a raid-5 array" - depends on MD_RAID456 - default y - ---help--- - A RAID-5 set can be expanded by adding extra drives. This - requires "restriping" the array which means (almost) every - block must be written to a different place. - - This option allows such restriping to be done while the array - is online. - - You will need mdadm version 2.4.1 or later to use this - feature safely. During the early stage of reshape there is - a critical section where live data is being over-written. A - crash during this time needs extra care for recovery. The - newer mdadm takes a copy of the data in the critical section - and will restore it, if necessary, after a crash. - - The mdadm usage is e.g. - mdadm --grow /dev/md1 --raid-disks=6 - to grow '/dev/md1' to having 6 disks. - - Note: The array can only be expanded, not contracted. - There should be enough spares already present to make the new - array workable. - - If unsure, say Y. +config MD_RAID6_PQ + tristate config MD_MULTIPATH tristate "Multipath I/O support" diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 72880b7e28d..45cc5951d92 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -2,20 +2,21 @@ # Makefile for the kernel software RAID and LVM drivers. # -dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ +dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o -dm-multipath-objs := dm-path-selector.o dm-mpath.o -dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \ +dm-multipath-y += dm-path-selector.o dm-mpath.o +dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o -dm-mirror-objs := dm-raid1.o -md-mod-objs := md.o bitmap.o -raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ +dm-mirror-y += dm-raid1.o +md-mod-y += md.o bitmap.o +raid456-y += raid5.o +raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6altivec1.o raid6altivec2.o raid6altivec4.o \ raid6altivec8.o \ raid6mmx.o raid6sse1.o raid6sse2.o -hostprogs-y := mktables +hostprogs-y += mktables # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise @@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o +obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 71994376339..f8a9f7ab2cb 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -16,6 +16,7 @@ * wait if count gets too high, wake when it drops to half. */ +#include <linux/blkdev.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> @@ -26,8 +27,8 @@ #include <linux/file.h> #include <linux/mount.h> #include <linux/buffer_head.h> -#include <linux/raid/md.h> -#include <linux/raid/bitmap.h> +#include "md.h" +#include "bitmap.h" /* debug macros */ @@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat unsigned char *mappage; if (page >= bitmap->pages) { - printk(KERN_ALERT - "%s: invalid bitmap page request: %lu (> %lu)\n", - bmname(bitmap), page, bitmap->pages-1); + /* This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page. + * It is harmless. + */ return -EINVAL; } @@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) list_for_each_continue_rcu(pos, &mddev->disks) { rdev = list_entry(pos, mdk_rdev_t, same_set); if (rdev->raid_disk >= 0 && - test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ atomic_inc(&rdev->nr_pending); @@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) + size/512 > 0) /* bitmap runs in to metadata */ goto bad_alignment; - if (rdev->data_offset + mddev->size*2 + if (rdev->data_offset + mddev->dev_sectors > rdev->sb_start + bitmap->offset) /* data runs in to bitmap */ goto bad_alignment; @@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) reason = "unrecognized superblock version"; - else if (chunksize < PAGE_SIZE) + else if (chunksize < 512) reason = "bitmap chunksize too small"; else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; @@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); } + if (bitmap->mddev->degraded) + /* Never clear bits or update events_cleared when degraded */ + success = 0; while (sectors) { int blocks; @@ -1345,8 +1349,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto } } -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, - int degraded) +static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, + int degraded) { bitmap_counter_t *bmc; int rv; @@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, return rv; } +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, + int degraded) +{ + /* bitmap_start_sync must always report on multiples of whole + * pages, otherwise resync (which is very PAGE_SIZE based) will + * get confused. + * So call __bitmap_start_sync repeatedly (if needed) until + * At least PAGE_SIZE>>9 blocks are covered. + * Return the 'or' of the result. + */ + int rv = 0; + int blocks1; + + *blocks = 0; + while (*blocks < (PAGE_SIZE>>9)) { + rv |= __bitmap_start_sync(bitmap, offset, + &blocks1, degraded); + offset += blocks1; + *blocks += blocks1; + } + return rv; +} + void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) { bitmap_counter_t *bmc; @@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); + bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; + set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h new file mode 100644 index 00000000000..e98900671ca --- /dev/null +++ b/drivers/md/bitmap.h @@ -0,0 +1,288 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +#define BITMAP_MINOR 39 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared aswell, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? (this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ + BITMAP_HOSTENDIAN = 0x8000, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + + __u8 pad[256 - 64]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * count of dirty bits on the page + */ + unsigned int count:31; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. + */ + unsigned long syncchunk; + + __u64 events_cleared; + int need_sync; + + /* bitmap spinlock */ + spinlock_t lock; + + long offset; /* offset from superblock if file is NULL */ + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + int last_page_size; /* bytes in the last page */ + + unsigned long flags; + + int allclean; + + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long daemon_sleep; /* how many seconds between updates? */ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_flush(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); + +void bitmap_print_sb(struct bitmap *bitmap); +void bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); + +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); + +void bitmap_unplug(struct bitmap *bitmap); +void bitmap_daemon_work(struct bitmap *bitmap); +#endif + +#endif diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 86d9adf90e7..8695809b24b 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -62,7 +62,10 @@ #define ModeShift 5 #define MaxFault 50 -#include <linux/raid/md.h> +#include <linux/blkdev.h> +#include <linux/raid/md_u.h> +#include "md.h" +#include <linux/seq_file.h> static void faulty_fail(struct bio *bio, int error) @@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) return 0; } +static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(raid_disks, + "%s does not support generic reshape\n", __func__); + + if (sectors == 0) + return mddev->dev_sectors; + + return sectors; +} + static int run(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -298,7 +312,7 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) conf->rdev = rdev; - mddev->array_sectors = mddev->size * 2; + md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; reconfig(mddev, mddev->layout, -1); @@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality = .stop = stop, .status = status, .reconfig = reconfig, + .size = faulty_size, }; static int __init raid_init(void) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09658b21847..7a36e38393a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,7 +16,11 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/raid/linear.h> +#include <linux/blkdev.h> +#include <linux/raid/md_u.h> +#include <linux/seq_file.h> +#include "md.h" +#include "linear.h" /* * find which device holds a particular offset @@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits) return ret; } +static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return conf->array_sectors; +} + static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; @@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) mddev->queue->max_sectors > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - disk->num_sectors = rdev->size * 2; - conf->array_sectors += rdev->size * 2; + disk->num_sectors = rdev->sectors; + conf->array_sectors += rdev->sectors; cnt++; } @@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev) if (!conf) return 1; mddev->private = conf; - mddev->array_sectors = conf->array_sectors; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) newconf->prev = mddev_to_conf(mddev); mddev->private = newconf; mddev->raid_disks++; - mddev->array_sectors = newconf->array_sectors; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); return 0; } @@ -381,6 +395,7 @@ static struct mdk_personality linear_personality = .stop = linear_stop, .status = linear_status, .hot_add_disk = linear_add, + .size = linear_size, }; static int __init linear_init (void) diff --git a/drivers/md/linear.h b/drivers/md/linear.h new file mode 100644 index 00000000000..bf8179587f9 --- /dev/null +++ b/drivers/md/linear.h @@ -0,0 +1,29 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +struct dev_info { + mdk_rdev_t *rdev; + sector_t num_sectors; + sector_t start_sector; +}; + +typedef struct dev_info dev_info_t; + +struct linear_private_data +{ + struct linear_private_data *prev; /* earlier version */ + dev_info_t **hash_table; + sector_t spacing; + sector_t array_sectors; + int sector_shift; /* shift before dividing + * by spacing + */ + dev_info_t disks[0]; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index a307f87eb90..ed5727c089a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,9 +33,9 @@ */ #include <linux/kthread.h> -#include <linux/raid/md.h> -#include <linux/raid/bitmap.h> +#include <linux/blkdev.h> #include <linux/sysctl.h> +#include <linux/seq_file.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ #include <linux/poll.h> #include <linux/ctype.h> @@ -45,11 +45,10 @@ #include <linux/reboot.h> #include <linux/file.h> #include <linux/delay.h> - -#define MAJOR_NR MD_MAJOR - -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 +#include <linux/raid/md_p.h> +#include <linux/raid/md_u.h> +#include "md.h" +#include "bitmap.h" #define DEBUG 0 #define dprintk(x...) ((void)(DEBUG && printk(x))) @@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock); ) -static int md_fail_request(struct request_queue *q, struct bio *bio) +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. + */ +static int md_make_request(struct request_queue *q, struct bio *bio) { - bio_io_error(bio); - return 0; + mddev_t *mddev = q->queuedata; + int rv; + if (mddev == NULL || mddev->pers == NULL) { + bio_io_error(bio); + return 0; + } + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + rv = mddev->pers->make_request(q, bio); + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); + + return rv; +} + +static void mddev_suspend(mddev_t *mddev) +{ + BUG_ON(mddev->suspended); + mddev->suspended = 1; + synchronize_rcu(); + wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + mddev->pers->quiesce(mddev, 1); + md_unregister_thread(mddev->thread); + mddev->thread = NULL; + /* we now know that no code is executing in the personality module, + * except possibly the tail end of a ->bi_end_io function, but that + * is certain to complete before the module has a chance to get + * unloaded + */ +} + +static void mddev_resume(mddev_t *mddev) +{ + mddev->suspended = 0; + wake_up(&mddev->sb_wait); + mddev->pers->quiesce(mddev, 0); } + static inline mddev_t *mddev_get(mddev_t *mddev) { atomic_inc(&mddev->active); @@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit) init_timer(&new->safemode_timer); atomic_set(&new->active, 1); atomic_set(&new->openers, 0); + atomic_set(&new->active_io, 0); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); init_waitqueue_head(&new->recovery_wait); @@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev) return mutex_lock_interruptible(&mddev->reconfig_mutex); } +static inline int mddev_is_locked(mddev_t *mddev) +{ + return mutex_is_locked(& |